diff options
Diffstat (limited to 'fs')
48 files changed, 1511 insertions, 803 deletions
diff --git a/fs/btrfs/send.c b/fs/btrfs/send.c index d3f3b43cae0b..2e14fd89a8b4 100644 --- a/fs/btrfs/send.c +++ b/fs/btrfs/send.c @@ -219,7 +219,7 @@ static int fs_path_ensure_buf(struct fs_path *p, int len) len = PAGE_ALIGN(len); if (p->buf == p->inline_buf) { - tmp_buf = kmalloc(len, GFP_NOFS); + tmp_buf = kmalloc(len, GFP_NOFS | __GFP_NOWARN); if (!tmp_buf) { tmp_buf = vmalloc(len); if (!tmp_buf) diff --git a/fs/ext3/super.c b/fs/ext3/super.c index c47f14750722..c50c76190373 100644 --- a/fs/ext3/super.c +++ b/fs/ext3/super.c @@ -27,6 +27,7 @@ #include <linux/seq_file.h> #include <linux/log2.h> #include <linux/cleancache.h> +#include <linux/namei.h> #include <asm/uaccess.h> @@ -819,6 +820,7 @@ enum { Opt_user_xattr, Opt_nouser_xattr, Opt_acl, Opt_noacl, Opt_reservation, Opt_noreservation, Opt_noload, Opt_nobh, Opt_bh, Opt_commit, Opt_journal_update, Opt_journal_inum, Opt_journal_dev, + Opt_journal_path, Opt_abort, Opt_data_journal, Opt_data_ordered, Opt_data_writeback, Opt_data_err_abort, Opt_data_err_ignore, Opt_usrjquota, Opt_grpjquota, Opt_offusrjquota, Opt_offgrpjquota, @@ -860,6 +862,7 @@ static const match_table_t tokens = { {Opt_journal_update, "journal=update"}, {Opt_journal_inum, "journal=%u"}, {Opt_journal_dev, "journal_dev=%u"}, + {Opt_journal_path, "journal_path=%s"}, {Opt_abort, "abort"}, {Opt_data_journal, "data=journal"}, {Opt_data_ordered, "data=ordered"}, @@ -975,6 +978,11 @@ static int parse_options (char *options, struct super_block *sb, int option; kuid_t uid; kgid_t gid; + char *journal_path; + struct inode *journal_inode; + struct path path; + int error; + #ifdef CONFIG_QUOTA int qfmt; #endif @@ -1129,6 +1137,41 @@ static int parse_options (char *options, struct super_block *sb, return 0; *journal_devnum = option; break; + case Opt_journal_path: + if (is_remount) { + ext3_msg(sb, KERN_ERR, "error: cannot specify " + "journal on remount"); + return 0; + } + + journal_path = match_strdup(&args[0]); + if (!journal_path) { + ext3_msg(sb, KERN_ERR, "error: could not dup " + "journal device string"); + return 0; + } + + error = kern_path(journal_path, LOOKUP_FOLLOW, &path); + if (error) { + ext3_msg(sb, KERN_ERR, "error: could not find " + "journal device path: error %d", error); + kfree(journal_path); + return 0; + } + + journal_inode = path.dentry->d_inode; + if (!S_ISBLK(journal_inode->i_mode)) { + ext3_msg(sb, KERN_ERR, "error: journal path %s " + "is not a block device", journal_path); + path_put(&path); + kfree(journal_path); + return 0; + } + + *journal_devnum = new_encode_dev(journal_inode->i_rdev); + path_put(&path); + kfree(journal_path); + break; case Opt_noload: set_opt (sbi->s_mount_opt, NOLOAD); break; diff --git a/fs/ext4/super.c b/fs/ext4/super.c index 049c8a8bdc0e..2c2e6cbc6bed 100644 --- a/fs/ext4/super.c +++ b/fs/ext4/super.c @@ -162,7 +162,7 @@ void *ext4_kvmalloc(size_t size, gfp_t flags) { void *ret; - ret = kmalloc(size, flags); + ret = kmalloc(size, flags | __GFP_NOWARN); if (!ret) ret = __vmalloc(size, flags, PAGE_KERNEL); return ret; @@ -172,7 +172,7 @@ void *ext4_kvzalloc(size_t size, gfp_t flags) { void *ret; - ret = kzalloc(size, flags); + ret = kzalloc(size, flags | __GFP_NOWARN); if (!ret) ret = __vmalloc(size, flags | __GFP_ZERO, PAGE_KERNEL); return ret; diff --git a/fs/f2fs/checkpoint.c b/fs/f2fs/checkpoint.c index 66a6b85a51d8..bb312201ca95 100644 --- a/fs/f2fs/checkpoint.c +++ b/fs/f2fs/checkpoint.c @@ -182,7 +182,7 @@ const struct address_space_operations f2fs_meta_aops = { .set_page_dirty = f2fs_set_meta_page_dirty, }; -int check_orphan_space(struct f2fs_sb_info *sbi) +int acquire_orphan_inode(struct f2fs_sb_info *sbi) { unsigned int max_orphans; int err = 0; @@ -197,10 +197,19 @@ int check_orphan_space(struct f2fs_sb_info *sbi) mutex_lock(&sbi->orphan_inode_mutex); if (sbi->n_orphans >= max_orphans) err = -ENOSPC; + else + sbi->n_orphans++; mutex_unlock(&sbi->orphan_inode_mutex); return err; } +void release_orphan_inode(struct f2fs_sb_info *sbi) +{ + mutex_lock(&sbi->orphan_inode_mutex); + sbi->n_orphans--; + mutex_unlock(&sbi->orphan_inode_mutex); +} + void add_orphan_inode(struct f2fs_sb_info *sbi, nid_t ino) { struct list_head *head, *this; @@ -229,21 +238,18 @@ retry: list_add(&new->list, this->prev); else list_add_tail(&new->list, head); - - sbi->n_orphans++; out: mutex_unlock(&sbi->orphan_inode_mutex); } void remove_orphan_inode(struct f2fs_sb_info *sbi, nid_t ino) { - struct list_head *this, *next, *head; + struct list_head *head; struct orphan_inode_entry *orphan; mutex_lock(&sbi->orphan_inode_mutex); head = &sbi->orphan_inode_list; - list_for_each_safe(this, next, head) { - orphan = list_entry(this, struct orphan_inode_entry, list); + list_for_each_entry(orphan, head, list) { if (orphan->ino == ino) { list_del(&orphan->list); kmem_cache_free(orphan_entry_slab, orphan); @@ -373,7 +379,7 @@ static struct page *validate_checkpoint(struct f2fs_sb_info *sbi, if (!f2fs_crc_valid(crc, cp_block, crc_offset)) goto invalid_cp1; - pre_version = le64_to_cpu(cp_block->checkpoint_ver); + pre_version = cur_cp_version(cp_block); /* Read the 2nd cp block in this CP pack */ cp_addr += le32_to_cpu(cp_block->cp_pack_total_block_count) - 1; @@ -388,7 +394,7 @@ static struct page *validate_checkpoint(struct f2fs_sb_info *sbi, if (!f2fs_crc_valid(crc, cp_block, crc_offset)) goto invalid_cp2; - cur_version = le64_to_cpu(cp_block->checkpoint_ver); + cur_version = cur_cp_version(cp_block); if (cur_version == pre_version) { *version = cur_version; @@ -793,7 +799,7 @@ void write_checkpoint(struct f2fs_sb_info *sbi, bool is_umount) * Increase the version number so that * SIT entries and seg summaries are written at correct place */ - ckpt_ver = le64_to_cpu(ckpt->checkpoint_ver); + ckpt_ver = cur_cp_version(ckpt); ckpt->checkpoint_ver = cpu_to_le64(++ckpt_ver); /* write cached NAT/SIT entries to NAT/SIT area */ diff --git a/fs/f2fs/data.c b/fs/f2fs/data.c index 035f9a345cdf..941f9b9ca3a5 100644 --- a/fs/f2fs/data.c +++ b/fs/f2fs/data.c @@ -37,9 +37,9 @@ static void __set_data_blkaddr(struct dnode_of_data *dn, block_t new_addr) struct page *node_page = dn->node_page; unsigned int ofs_in_node = dn->ofs_in_node; - wait_on_page_writeback(node_page); + f2fs_wait_on_page_writeback(node_page, NODE, false); - rn = (struct f2fs_node *)page_address(node_page); + rn = F2FS_NODE(node_page); /* Get physical address of data block */ addr_array = blkaddr_in_node(rn); @@ -117,7 +117,8 @@ void update_extent_cache(block_t blk_addr, struct dnode_of_data *dn) block_t start_blkaddr, end_blkaddr; BUG_ON(blk_addr == NEW_ADDR); - fofs = start_bidx_of_node(ofs_of_node(dn->node_page)) + dn->ofs_in_node; + fofs = start_bidx_of_node(ofs_of_node(dn->node_page), fi) + + dn->ofs_in_node; /* Update the page address in the parent node */ __set_data_blkaddr(dn, blk_addr); @@ -176,7 +177,6 @@ void update_extent_cache(block_t blk_addr, struct dnode_of_data *dn) end_update: write_unlock(&fi->ext.ext_lock); sync_inode_page(dn); - return; } struct page *find_data_page(struct inode *inode, pgoff_t index, bool sync) @@ -260,8 +260,17 @@ repeat: if (PageUptodate(page)) return page; - BUG_ON(dn.data_blkaddr == NEW_ADDR); - BUG_ON(dn.data_blkaddr == NULL_ADDR); + /* + * A new dentry page is allocated but not able to be written, since its + * new inode page couldn't be allocated due to -ENOSPC. + * In such the case, its blkaddr can be remained as NEW_ADDR. + * see, f2fs_add_link -> get_new_data_page -> init_inode_metadata. + */ + if (dn.data_blkaddr == NEW_ADDR) { + zero_user_segment(page, 0, PAGE_CACHE_SIZE); + SetPageUptodate(page); + return page; + } err = f2fs_readpage(sbi, page, dn.data_blkaddr, READ_SYNC); if (err) @@ -365,7 +374,6 @@ static void read_end_io(struct bio *bio, int err) } unlock_page(page); } while (bvec >= bio->bi_io_vec); - kfree(bio->bi_private); bio_put(bio); } @@ -391,7 +399,6 @@ int f2fs_readpage(struct f2fs_sb_info *sbi, struct page *page, bio->bi_end_io = read_end_io; if (bio_add_page(bio, page, PAGE_CACHE_SIZE, 0) < PAGE_CACHE_SIZE) { - kfree(bio->bi_private); bio_put(bio); up_read(&sbi->bio_sem); f2fs_put_page(page, 1); @@ -442,7 +449,7 @@ static int get_data_block_ro(struct inode *inode, sector_t iblock, unsigned int end_offset; end_offset = IS_INODE(dn.node_page) ? - ADDRS_PER_INODE : + ADDRS_PER_INODE(F2FS_I(inode)) : ADDRS_PER_BLOCK; clear_buffer_new(bh_result); @@ -636,9 +643,6 @@ static int f2fs_write_begin(struct file *file, struct address_space *mapping, int err = 0; int ilock; - /* for nobh_write_end */ - *fsdata = NULL; - f2fs_balance_fs(sbi); repeat: page = grab_cache_page_write_begin(mapping, index, flags); diff --git a/fs/f2fs/debug.c b/fs/f2fs/debug.c index 0d6c6aafb235..a84b0a8e6854 100644 --- a/fs/f2fs/debug.c +++ b/fs/f2fs/debug.c @@ -29,7 +29,7 @@ static DEFINE_MUTEX(f2fs_stat_mutex); static void update_general_status(struct f2fs_sb_info *sbi) { - struct f2fs_stat_info *si = sbi->stat_info; + struct f2fs_stat_info *si = F2FS_STAT(sbi); int i; /* valid check of the segment numbers */ @@ -83,7 +83,7 @@ static void update_general_status(struct f2fs_sb_info *sbi) */ static void update_sit_info(struct f2fs_sb_info *sbi) { - struct f2fs_stat_info *si = sbi->stat_info; + struct f2fs_stat_info *si = F2FS_STAT(sbi); unsigned int blks_per_sec, hblks_per_sec, total_vblocks, bimodal, dist; struct sit_info *sit_i = SIT_I(sbi); unsigned int segno, vblocks; @@ -118,7 +118,7 @@ static void update_sit_info(struct f2fs_sb_info *sbi) */ static void update_mem_info(struct f2fs_sb_info *sbi) { - struct f2fs_stat_info *si = sbi->stat_info; + struct f2fs_stat_info *si = F2FS_STAT(sbi); unsigned npages; if (si->base_mem) @@ -253,21 +253,21 @@ static int stat_show(struct seq_file *s, void *v) si->nats, NM_WOUT_THRESHOLD); seq_printf(s, " - SITs: %5d\n - free_nids: %5d\n", si->sits, si->fnids); - seq_printf(s, "\nDistribution of User Blocks:"); - seq_printf(s, " [ valid | invalid | free ]\n"); - seq_printf(s, " ["); + seq_puts(s, "\nDistribution of User Blocks:"); + seq_puts(s, " [ valid | invalid | free ]\n"); + seq_puts(s, " ["); for (j = 0; j < si->util_valid; j++) - seq_printf(s, "-"); - seq_printf(s, "|"); + seq_putc(s, '-'); + seq_putc(s, '|'); for (j = 0; j < si->util_invalid; j++) - seq_printf(s, "-"); - seq_printf(s, "|"); + seq_putc(s, '-'); + seq_putc(s, '|'); for (j = 0; j < si->util_free; j++) - seq_printf(s, "-"); - seq_printf(s, "]\n\n"); + seq_putc(s, '-'); + seq_puts(s, "]\n\n"); seq_printf(s, "SSR: %u blocks in %u segments\n", si->block_count[SSR], si->segment_count[SSR]); seq_printf(s, "LFS: %u blocks in %u segments\n", @@ -305,11 +305,10 @@ int f2fs_build_stats(struct f2fs_sb_info *sbi) struct f2fs_super_block *raw_super = F2FS_RAW_SUPER(sbi); struct f2fs_stat_info *si; - sbi->stat_info = kzalloc(sizeof(struct f2fs_stat_info), GFP_KERNEL); - if (!sbi->stat_info) + si = kzalloc(sizeof(struct f2fs_stat_info), GFP_KERNEL); + if (!si) return -ENOMEM; - si = sbi->stat_info; si->all_area_segs = le32_to_cpu(raw_super->segment_count); si->sit_area_segs = le32_to_cpu(raw_super->segment_count_sit); si->nat_area_segs = le32_to_cpu(raw_super->segment_count_nat); @@ -319,6 +318,7 @@ int f2fs_build_stats(struct f2fs_sb_info *sbi) si->main_area_zones = si->main_area_sections / le32_to_cpu(raw_super->secs_per_zone); si->sbi = sbi; + sbi->stat_info = si; mutex_lock(&f2fs_stat_mutex); list_add_tail(&si->stat_list, &f2fs_stat_list); @@ -329,13 +329,13 @@ int f2fs_build_stats(struct f2fs_sb_info *sbi) void f2fs_destroy_stats(struct f2fs_sb_info *sbi) { - struct f2fs_stat_info *si = sbi->stat_info; + struct f2fs_stat_info *si = F2FS_STAT(sbi); mutex_lock(&f2fs_stat_mutex); list_del(&si->stat_list); mutex_unlock(&f2fs_stat_mutex); - kfree(sbi->stat_info); + kfree(si); } void __init f2fs_create_root_stats(void) diff --git a/fs/f2fs/dir.c b/fs/f2fs/dir.c index 62f0d5977c64..384c6daf9a89 100644 --- a/fs/f2fs/dir.c +++ b/fs/f2fs/dir.c @@ -270,12 +270,27 @@ static void init_dent_inode(const struct qstr *name, struct page *ipage) struct f2fs_node *rn; /* copy name info. to this inode page */ - rn = (struct f2fs_node *)page_address(ipage); + rn = F2FS_NODE(ipage); rn->i.i_namelen = cpu_to_le32(name->len); memcpy(rn->i.i_name, name->name, name->len); set_page_dirty(ipage); } +int update_dent_inode(struct inode *inode, const struct qstr *name) +{ + struct f2fs_sb_info *sbi = F2FS_SB(inode->i_sb); + struct page *page; + + page = get_node_page(sbi, inode->i_ino); + if (IS_ERR(page)) + return PTR_ERR(page); + + init_dent_inode(name, page); + f2fs_put_page(page, 1); + + return 0; +} + static int make_empty_dir(struct inode *inode, struct inode *parent, struct page *page) { @@ -557,6 +572,8 @@ void f2fs_delete_entry(struct f2fs_dir_entry *dentry, struct page *page, if (inode->i_nlink == 0) add_orphan_inode(sbi, inode->i_ino); + else + release_orphan_inode(sbi); } if (bit_pos == NR_DENTRY_IN_BLOCK) { diff --git a/fs/f2fs/f2fs.h b/fs/f2fs/f2fs.h index 467d42d65c48..608f0df5b919 100644 --- a/fs/f2fs/f2fs.h +++ b/fs/f2fs/f2fs.h @@ -17,6 +17,7 @@ #include <linux/slab.h> #include <linux/crc32.h> #include <linux/magic.h> +#include <linux/kobject.h> /* * For mount options @@ -28,6 +29,7 @@ #define F2FS_MOUNT_XATTR_USER 0x00000010 #define F2FS_MOUNT_POSIX_ACL 0x00000020 #define F2FS_MOUNT_DISABLE_EXT_IDENTIFY 0x00000040 +#define F2FS_MOUNT_INLINE_XATTR 0x00000080 #define clear_opt(sbi, option) (sbi->mount_opt.opt &= ~F2FS_MOUNT_##option) #define set_opt(sbi, option) (sbi->mount_opt.opt |= F2FS_MOUNT_##option) @@ -134,11 +136,13 @@ static inline int update_sits_in_cursum(struct f2fs_summary_block *rs, int i) /* * For INODE and NODE manager */ -#define XATTR_NODE_OFFSET (-1) /* - * store xattrs to one node block per - * file keeping -1 as its node offset to - * distinguish from index node blocks. - */ +/* + * XATTR_NODE_OFFSET stores xattrs to one node block per file keeping -1 + * as its node offset to distinguish from index node blocks. + * But some bits are used to mark the node block. + */ +#define XATTR_NODE_OFFSET ((((unsigned int)-1) << OFFSET_BIT_SHIFT) \ + >> OFFSET_BIT_SHIFT) enum { ALLOC_NODE, /* allocate a new node page if needed */ LOOKUP_NODE, /* look up a node without readahead */ @@ -178,6 +182,7 @@ struct f2fs_inode_info { f2fs_hash_t chash; /* hash value of given file name */ unsigned int clevel; /* maximum level of given file name */ nid_t i_xattr_nid; /* node id that contains xattrs */ + unsigned long long xattr_ver; /* cp version of xattr modification */ struct extent_info ext; /* in-memory extent cache entry */ }; @@ -296,15 +301,6 @@ struct f2fs_sm_info { }; /* - * For directory operation - */ -#define NODE_DIR1_BLOCK (ADDRS_PER_INODE + 1) -#define NODE_DIR2_BLOCK (ADDRS_PER_INODE + 2) -#define NODE_IND1_BLOCK (ADDRS_PER_INODE + 3) -#define NODE_IND2_BLOCK (ADDRS_PER_INODE + 4) -#define NODE_DIND_BLOCK (ADDRS_PER_INODE + 5) - -/* * For superblock */ /* @@ -350,6 +346,7 @@ enum page_type { struct f2fs_sb_info { struct super_block *sb; /* pointer to VFS super block */ + struct proc_dir_entry *s_proc; /* proc entry */ struct buffer_head *raw_super_buf; /* buffer head of raw sb */ struct f2fs_super_block *raw_super; /* raw super block pointer */ int s_dirty; /* dirty flag for checkpoint */ @@ -429,6 +426,10 @@ struct f2fs_sb_info { #endif unsigned int last_victim[2]; /* last victim segment # */ spinlock_t stat_lock; /* lock for stat operations */ + + /* For sysfs suppport */ + struct kobject s_kobj; + struct completion s_kobj_unregister; }; /* @@ -454,6 +455,11 @@ static inline struct f2fs_checkpoint *F2FS_CKPT(struct f2fs_sb_info *sbi) return (struct f2fs_checkpoint *)(sbi->ckpt); } +static inline struct f2fs_node *F2FS_NODE(struct page *page) +{ + return (struct f2fs_node *)page_address(page); +} + static inline struct f2fs_nm_info *NM_I(struct f2fs_sb_info *sbi) { return (struct f2fs_nm_info *)(sbi->nm_info); @@ -489,6 +495,11 @@ static inline void F2FS_RESET_SB_DIRT(struct f2fs_sb_info *sbi) sbi->s_dirty = 0; } +static inline unsigned long long cur_cp_version(struct f2fs_checkpoint *cp) +{ + return le64_to_cpu(cp->checkpoint_ver); +} + static inline bool is_set_ckpt_flags(struct f2fs_checkpoint *cp, unsigned int f) { unsigned int ckpt_flags = le32_to_cpu(cp->ckpt_flags); @@ -677,7 +688,7 @@ static inline block_t __start_cp_addr(struct f2fs_sb_info *sbi) { block_t start_addr; struct f2fs_checkpoint *ckpt = F2FS_CKPT(sbi); - unsigned long long ckpt_version = le64_to_cpu(ckpt->checkpoint_ver); + unsigned long long ckpt_version = cur_cp_version(ckpt); start_addr = le32_to_cpu(F2FS_RAW_SUPER(sbi)->cp_blkaddr); @@ -812,7 +823,7 @@ static inline struct kmem_cache *f2fs_kmem_cache_create(const char *name, static inline bool IS_INODE(struct page *page) { - struct f2fs_node *p = (struct f2fs_node *)page_address(page); + struct f2fs_node *p = F2FS_NODE(page); return RAW_IS_INODE(p); } @@ -826,7 +837,7 @@ static inline block_t datablock_addr(struct page *node_page, { struct f2fs_node *raw_node; __le32 *addr_array; - raw_node = (struct f2fs_node *)page_address(node_page); + raw_node = F2FS_NODE(node_page); addr_array = blkaddr_in_node(raw_node); return le32_to_cpu(addr_array[offset]); } @@ -873,6 +884,7 @@ enum { FI_NO_ALLOC, /* should not allocate any blocks */ FI_UPDATE_DIR, /* should update inode block for consistency */ FI_DELAY_IPUT, /* used for the recovery */ + FI_INLINE_XATTR, /* used for inline xattr */ }; static inline void set_inode_flag(struct f2fs_inode_info *fi, int flag) @@ -905,6 +917,45 @@ static inline int cond_clear_inode_flag(struct f2fs_inode_info *fi, int flag) return 0; } +static inline void get_inline_info(struct f2fs_inode_info *fi, + struct f2fs_inode *ri) +{ + if (ri->i_inline & F2FS_INLINE_XATTR) + set_inode_flag(fi, FI_INLINE_XATTR); +} + +static inline void set_raw_inline(struct f2fs_inode_info *fi, + struct f2fs_inode *ri) +{ + ri->i_inline = 0; + + if (is_inode_flag_set(fi, FI_INLINE_XATTR)) + ri->i_inline |= F2FS_INLINE_XATTR; +} + +static inline unsigned int addrs_per_inode(struct f2fs_inode_info *fi) +{ + if (is_inode_flag_set(fi, FI_INLINE_XATTR)) + return DEF_ADDRS_PER_INODE - F2FS_INLINE_XATTR_ADDRS; + return DEF_ADDRS_PER_INODE; +} + +static inline void *inline_xattr_addr(struct page *page) +{ + struct f2fs_inode *ri; + ri = (struct f2fs_inode *)page_address(page); + return (void *)&(ri->i_addr[DEF_ADDRS_PER_INODE - + F2FS_INLINE_XATTR_ADDRS]); +} + +static inline int inline_xattr_size(struct inode *inode) +{ + if (is_inode_flag_set(F2FS_I(inode), FI_INLINE_XATTR)) + return F2FS_INLINE_XATTR_ADDRS << 2; + else + return 0; +} + static inline int f2fs_readonly(struct super_block *sb) { return sb->s_flags & MS_RDONLY; @@ -947,6 +998,7 @@ struct f2fs_dir_entry *f2fs_parent_dir(struct inode *, struct page **); ino_t f2fs_inode_by_name(struct inode *, struct qstr *); void f2fs_set_link(struct inode *, struct f2fs_dir_entry *, struct page *, struct inode *); +int update_dent_inode(struct inode *, const struct qstr *); int __f2fs_add_link(struct inode *, const struct qstr *, struct inode *); void f2fs_delete_entry(struct f2fs_dir_entry *, struct page *, struct inode *); int f2fs_make_empty(struct inode *, struct inode *); @@ -980,6 +1032,7 @@ int is_checkpointed_node(struct f2fs_sb_info *, nid_t); void get_node_info(struct f2fs_sb_info *, nid_t, struct node_info *); int get_dnode_of_data(struct dnode_of_data *, pgoff_t, int); int truncate_inode_blocks(struct inode *, pgoff_t); +int truncate_xattr_node(struct inode *, struct page *); int remove_inode_page(struct inode *); struct page *new_inode_page(struct inode *, const struct qstr *); struct page *new_node_page(struct dnode_of_data *, unsigned int, struct page *); @@ -1012,7 +1065,8 @@ int npages_for_summary_flush(struct f2fs_sb_info *); void allocate_new_segments(struct f2fs_sb_info *); struct page *get_sum_page(struct f2fs_sb_info *, unsigned int); struct bio *f2fs_bio_alloc(struct block_device *, int); -void f2fs_submit_bio(struct f2fs_sb_info *, enum page_type, bool sync); +void f2fs_submit_bio(struct f2fs_sb_info *, enum page_type, bool); +void f2fs_wait_on_page_writeback(struct page *, enum page_type, bool); void write_meta_page(struct f2fs_sb_info *, struct page *); void write_node_page(struct f2fs_sb_info *, struct page *, unsigned int, block_t, block_t *); @@ -1037,7 +1091,8 @@ void destroy_segment_manager(struct f2fs_sb_info *); struct page *grab_meta_page(struct f2fs_sb_info *, pgoff_t); struct page *get_meta_page(struct f2fs_sb_info *, pgoff_t); long sync_meta_pages(struct f2fs_sb_info *, enum page_type, long); -int check_orphan_space(struct f2fs_sb_info *); +int acquire_orphan_inode(struct f2fs_sb_info *); +void release_orphan_inode(struct f2fs_sb_info *); void add_orphan_inode(struct f2fs_sb_info *, nid_t); void remove_orphan_inode(struct f2fs_sb_info *, nid_t); int recover_orphan_inodes(struct f2fs_sb_info *); @@ -1068,7 +1123,7 @@ int do_write_data_page(struct page *); */ int start_gc_thread(struct f2fs_sb_info *); void stop_gc_thread(struct f2fs_sb_info *); -block_t start_bidx_of_node(unsigned int); +block_t start_bidx_of_node(unsigned int, struct f2fs_inode_info *); int f2fs_gc(struct f2fs_sb_info *); void build_gc_manager(struct f2fs_sb_info *); int __init create_gc_caches(void); @@ -1112,11 +1167,16 @@ struct f2fs_stat_info { unsigned base_mem, cache_mem; }; +static inline struct f2fs_stat_info *F2FS_STAT(struct f2fs_sb_info *sbi) +{ + return (struct f2fs_stat_info*)sbi->stat_info; +} + #define stat_inc_call_count(si) ((si)->call_count++) #define stat_inc_seg_count(sbi, type) \ do { \ - struct f2fs_stat_info *si = sbi->stat_info; \ + struct f2fs_stat_info *si = F2FS_STAT(sbi); \ (si)->tot_segs++; \ if (type == SUM_TYPE_DATA) \ si->data_segs++; \ @@ -1129,14 +1189,14 @@ struct f2fs_stat_info { #define stat_inc_data_blk_count(sbi, blks) \ do { \ - struct f2fs_stat_info *si = sbi->stat_info; \ + struct f2fs_stat_info *si = F2FS_STAT(sbi); \ stat_inc_tot_blk_count(si, blks); \ si->data_blks += (blks); \ } while (0) #define stat_inc_node_blk_count(sbi, blks) \ do { \ - struct f2fs_stat_info *si = sbi->stat_info; \ + struct f2fs_stat_info *si = F2FS_STAT(sbi); \ stat_inc_tot_blk_count(si, blks); \ si->node_blks += (blks); \ } while (0) diff --git a/fs/f2fs/file.c b/fs/f2fs/file.c index d2d2b7dbdcc1..02c906971cc6 100644 --- a/fs/f2fs/file.c +++ b/fs/f2fs/file.c @@ -112,11 +112,13 @@ static int get_parent_ino(struct inode *inode, nid_t *pino) if (!dentry) return 0; - inode = igrab(dentry->d_parent->d_inode); - dput(dentry); + if (update_dent_inode(inode, &dentry->d_name)) { + dput(dentry); + return 0; + } - *pino = inode->i_ino; - iput(inode); + *pino = parent_ino(dentry); + dput(dentry); return 1; } @@ -147,9 +149,10 @@ int f2fs_sync_file(struct file *file, loff_t start, loff_t end, int datasync) mutex_lock(&inode->i_mutex); - if (datasync && !(inode->i_state & I_DIRTY_DATASYNC)) - goto out; - + /* + * Both of fdatasync() and fsync() are able to be recovered from + * sudden-power-off. + */ if (!S_ISREG(inode->i_mode) || inode->i_nlink != 1) need_cp = true; else if (file_wrong_pino(inode)) @@ -158,10 +161,14 @@ int f2fs_sync_file(struct file *file, loff_t start, loff_t end, int datasync) need_cp = true; else if (!is_checkpointed_node(sbi, F2FS_I(inode)->i_pino)) need_cp = true; + else if (F2FS_I(inode)->xattr_ver == cur_cp_version(F2FS_CKPT(sbi))) + need_cp = true; if (need_cp) { nid_t pino; + F2FS_I(inode)->xattr_ver = 0; + /* all the dirty node pages should be flushed for POR */ ret = f2fs_sync_fs(inode->i_sb, 1); if (file_wrong_pino(inode) && inode->i_nlink == 1 && @@ -205,7 +212,7 @@ int truncate_data_blocks_range(struct dnode_of_data *dn, int count) struct f2fs_node *raw_node; __le32 *addr; - raw_node = page_address(dn->node_page); + raw_node = F2FS_NODE(dn->node_page); addr = blkaddr_in_node(raw_node) + ofs; for ( ; count > 0; count--, addr++, dn->ofs_in_node++) { @@ -283,7 +290,7 @@ static int truncate_blocks(struct inode *inode, u64 from) } if (IS_INODE(dn.node_page)) - count = ADDRS_PER_INODE; + count = ADDRS_PER_INODE(F2FS_I(inode)); else count = ADDRS_PER_BLOCK; diff --git a/fs/f2fs/gc.c b/fs/f2fs/gc.c index 35f9b1a196aa..2f157e883687 100644 --- a/fs/f2fs/gc.c +++ b/fs/f2fs/gc.c @@ -29,10 +29,11 @@ static struct kmem_cache *winode_slab; static int gc_thread_func(void *data) { struct f2fs_sb_info *sbi = data; + struct f2fs_gc_kthread *gc_th = sbi->gc_thread; wait_queue_head_t *wq = &sbi->gc_thread->gc_wait_queue_head; long wait_ms; - wait_ms = GC_THREAD_MIN_SLEEP_TIME; + wait_ms = gc_th->min_sleep_time; do { if (try_to_freeze()) @@ -45,7 +46,7 @@ static int gc_thread_func(void *data) break; if (sbi->sb->s_writers.frozen >= SB_FREEZE_WRITE) { - wait_ms = GC_THREAD_MAX_SLEEP_TIME; + wait_ms = increase_sleep_time(gc_th, wait_ms); continue; } @@ -66,15 +67,15 @@ static int gc_thread_func(void *data) continue; if (!is_idle(sbi)) { - wait_ms = increase_sleep_time(wait_ms); + wait_ms = increase_sleep_time(gc_th, wait_ms); mutex_unlock(&sbi->gc_mutex); continue; } if (has_enough_invalid_blocks(sbi)) - wait_ms = decrease_sleep_time(wait_ms); + wait_ms = decrease_sleep_time(gc_th, wait_ms); else - wait_ms = increase_sleep_time(wait_ms); + wait_ms = increase_sleep_time(gc_th, wait_ms); #ifdef CONFIG_F2FS_STAT_FS sbi->bg_gc++; @@ -82,7 +83,7 @@ static int gc_thread_func(void *data) /* if return value is not zero, no victim was selected */ if (f2fs_gc(sbi)) - wait_ms = GC_THREAD_NOGC_SLEEP_TIME; + wait_ms = gc_th->no_gc_sleep_time; } while (!kthread_should_stop()); return 0; } @@ -101,6 +102,12 @@ int start_gc_thread(struct f2fs_sb_info *sbi) goto out; } + gc_th->min_sleep_time = DEF_GC_THREAD_MIN_SLEEP_TIME; + gc_th->max_sleep_time = DEF_GC_THREAD_MAX_SLEEP_TIME; + gc_th->no_gc_sleep_time = DEF_GC_THREAD_NOGC_SLEEP_TIME; + + gc_th->gc_idle = 0; + sbi->gc_thread = gc_th; init_waitqueue_head(&sbi->gc_thread->gc_wait_queue_head); sbi->gc_thread->f2fs_gc_task = kthread_run(gc_thread_func, sbi, @@ -125,9 +132,17 @@ void stop_gc_thread(struct f2fs_sb_info *sbi) sbi->gc_thread = NULL; } -static int select_gc_type(int gc_type) +static int select_gc_type(struct f2fs_gc_kthread *gc_th, int gc_type) { - return (gc_type == BG_GC) ? GC_CB : GC_GREEDY; + int gc_mode = (gc_type == BG_GC) ? GC_CB : GC_GREEDY; + + if (gc_th && gc_th->gc_idle) { + if (gc_th->gc_idle == 1) + gc_mode = GC_CB; + else if (gc_th->gc_idle == 2) + gc_mode = GC_GREEDY; + } + return gc_mode; } static void select_policy(struct f2fs_sb_info *sbi, int gc_type, @@ -138,12 +153,18 @@ static void select_policy(struct f2fs_sb_info *sbi, int gc_type, if (p->alloc_mode == SSR) { p->gc_mode = GC_GREEDY; p->dirty_segmap = dirty_i->dirty_segmap[type]; + p->max_search = dirty_i->nr_dirty[type]; p->ofs_unit = 1; } else { - p->gc_mode = select_gc_type(gc_type); + p->gc_mode = select_gc_type(sbi->gc_thread, gc_type); p->dirty_segmap = dirty_i->dirty_segmap[DIRTY]; + p->max_search = dirty_i->nr_dirty[DIRTY]; p->ofs_unit = sbi->segs_per_sec; } + + if (p->max_search > MAX_VICTIM_SEARCH) + p->max_search = MAX_VICTIM_SEARCH; + p->offset = sbi->last_victim[p->gc_mode]; } @@ -290,7 +311,7 @@ static int get_victim_by_default(struct f2fs_sb_info *sbi, if (cost == max_cost) continue; - if (nsearched++ >= MAX_VICTIM_SEARCH) { + if (nsearched++ >= p.max_search) { sbi->last_victim[p.gc_mode] = segno; break; } @@ -407,8 +428,7 @@ next_step: /* set page dirty and write it */ if (gc_type == FG_GC) { - f2fs_submit_bio(sbi, NODE, true); - wait_on_page_writeback(node_page); + f2fs_wait_on_page_writeback(node_page, NODE, true); set_page_dirty(node_page); } else { if (!PageWriteback(node_page)) @@ -447,7 +467,7 @@ next_step: * as indirect or double indirect node blocks, are given, it must be a caller's * bug. */ -block_t start_bidx_of_node(unsigned int node_ofs) +block_t start_bidx_of_node(unsigned int node_ofs, struct f2fs_inode_info *fi) { unsigned int indirect_blks = 2 * NIDS_PER_BLOCK + 4; unsigned int bidx; @@ -464,7 +484,7 @@ block_t start_bidx_of_node(unsigned int node_ofs) int dec = (node_ofs - indirect_blks - 3) / (NIDS_PER_BLOCK + 1); bidx = node_ofs - 5 - dec; } - return bidx * ADDRS_PER_BLOCK + ADDRS_PER_INODE; + return bidx * ADDRS_PER_BLOCK + ADDRS_PER_INODE(fi); } static int check_dnode(struct f2fs_sb_info *sbi, struct f2fs_summary *sum, @@ -508,10 +528,7 @@ static void move_data_page(struct inode *inode, struct page *page, int gc_type) } else { struct f2fs_sb_info *sbi = F2FS_SB(inode->i_sb); - if (PageWriteback(page)) { - f2fs_submit_bio(sbi, DATA, true); - wait_on_page_writeback(page); - } + f2fs_wait_on_page_writeback(page, DATA, true); if (clear_page_dirty_for_io(page) && S_ISDIR(inode->i_mode)) { @@ -575,7 +592,6 @@ next_step: continue; } - start_bidx = start_bidx_of_node(nofs); ofs_in_node = le16_to_cpu(entry->ofs_in_node); if (phase == 2) { @@ -583,6 +599,8 @@ next_step: if (IS_ERR(inode)) continue; + start_bidx = start_bidx_of_node(nofs, F2FS_I(inode)); + data_page = find_data_page(inode, start_bidx + ofs_in_node, false); if (IS_ERR(data_page)) @@ -593,6 +611,8 @@ next_step: } else { inode = find_gc_inode(dni.ino, ilist); if (inode) { + start_bidx = start_bidx_of_node(nofs, + F2FS_I(inode)); data_page = get_lock_data_page(inode, start_bidx + ofs_in_node); if (IS_ERR(data_page)) diff --git a/fs/f2fs/gc.h b/fs/f2fs/gc.h index 2c6a6bd08322..507056d22205 100644 --- a/fs/f2fs/gc.h +++ b/fs/f2fs/gc.h @@ -13,18 +13,26 @@ * whether IO subsystem is idle * or not */ -#define GC_THREAD_MIN_SLEEP_TIME 30000 /* milliseconds */ -#define GC_THREAD_MAX_SLEEP_TIME 60000 -#define GC_THREAD_NOGC_SLEEP_TIME 300000 /* wait 5 min */ +#define DEF_GC_THREAD_MIN_SLEEP_TIME 30000 /* milliseconds */ +#define DEF_GC_THREAD_MAX_SLEEP_TIME 60000 +#define DEF_GC_THREAD_NOGC_SLEEP_TIME 300000 /* wait 5 min */ #define LIMIT_INVALID_BLOCK 40 /* percentage over total user space */ #define LIMIT_FREE_BLOCK 40 /* percentage over invalid + free space */ /* Search max. number of dirty segments to select a victim segment */ -#define MAX_VICTIM_SEARCH 20 +#define MAX_VICTIM_SEARCH 4096 /* covers 8GB */ struct f2fs_gc_kthread { struct task_struct *f2fs_gc_task; wait_queue_head_t gc_wait_queue_head; + + /* for gc sleep time */ + unsigned int min_sleep_time; + unsigned int max_sleep_time; + unsigned int no_gc_sleep_time; + + /* for changing gc mode */ + unsigned int gc_idle; }; struct inode_entry { @@ -56,25 +64,25 @@ static inline block_t limit_free_user_blocks(struct f2fs_sb_info *sbi) return (long)(reclaimable_user_blocks * LIMIT_FREE_BLOCK) / 100; } -static inline long increase_sleep_time(long wait) +static inline long increase_sleep_time(struct f2fs_gc_kthread *gc_th, long wait) { - if (wait == GC_THREAD_NOGC_SLEEP_TIME) + if (wait == gc_th->no_gc_sleep_time) return wait; - wait += GC_THREAD_MIN_SLEEP_TIME; - if (wait > GC_THREAD_MAX_SLEEP_TIME) - wait = GC_THREAD_MAX_SLEEP_TIME; + wait += gc_th->min_sleep_time; + if (wait > gc_th->max_sleep_time) + wait = gc_th->max_sleep_time; return wait; } -static inline long decrease_sleep_time(long wait) +static inline long decrease_sleep_time(struct f2fs_gc_kthread *gc_th, long wait) { - if (wait == GC_THREAD_NOGC_SLEEP_TIME) - wait = GC_THREAD_MAX_SLEEP_TIME; + if (wait == gc_th->no_gc_sleep_time) + wait = gc_th->max_sleep_time; - wait -= GC_THREAD_MIN_SLEEP_TIME; - if (wait <= GC_THREAD_MIN_SLEEP_TIME) - wait = GC_THREAD_MIN_SLEEP_TIME; + wait -= gc_th->min_sleep_time; + if (wait <= gc_th->min_sleep_time) + wait = gc_th->min_sleep_time; return wait; } diff --git a/fs/f2fs/inode.c b/fs/f2fs/inode.c index 2b2d45d19e3e..9339cd292047 100644 --- a/fs/f2fs/inode.c +++ b/fs/f2fs/inode.c @@ -56,7 +56,7 @@ static int do_read_inode(struct inode *inode) if (IS_ERR(node_page)) return PTR_ERR(node_page); - rn = page_address(node_page); + rn = F2FS_NODE(node_page); ri = &(rn->i); inode->i_mode = le16_to_cpu(ri->i_mode); @@ -85,6 +85,7 @@ static int do_read_inode(struct inode *inode) fi->i_advise = ri->i_advise; fi->i_pino = le32_to_cpu(ri->i_pino); get_extent_info(&fi->ext, ri->i_ext); + get_inline_info(fi, ri); f2fs_put_page(node_page, 1); return 0; } @@ -151,9 +152,9 @@ void update_inode(struct inode *inode, struct page *node_page) struct f2fs_node *rn; struct f2fs_inode *ri; - wait_on_page_writeback(node_page); + f2fs_wait_on_page_writeback(node_page, NODE, false); - rn = page_address(node_page); + rn = F2FS_NODE(node_page); ri = &(rn->i); ri->i_mode = cpu_to_le16(inode->i_mode); @@ -164,6 +165,7 @@ void update_inode(struct inode *inode, struct page *node_page) ri->i_size = cpu_to_le64(i_size_read(inode)); ri->i_blocks = cpu_to_le64(inode->i_blocks); set_raw_extent(&F2FS_I(inode)->ext, &ri->i_ext); + set_raw_inline(F2FS_I(inode), ri); ri->i_atime = cpu_to_le64(inode->i_atime.tv_sec); ri->i_ctime = cpu_to_le64(inode->i_ctime.tv_sec); @@ -221,9 +223,6 @@ int f2fs_write_inode(struct inode *inode, struct writeback_control *wbc) if (!is_inode_flag_set(F2FS_I(inode), FI_DIRTY_INODE)) return 0; - if (wbc) - f2fs_balance_fs(sbi); - /* * We need to lock here to prevent from producing dirty node pages * during the urgent cleaning time when runing out of free sections. @@ -231,6 +230,10 @@ int f2fs_write_inode(struct inode *inode, struct writeback_control *wbc) ilock = mutex_lock_op(sbi); ret = update_inode_page(inode); mutex_unlock_op(sbi, ilock); + + if (wbc) + f2fs_balance_fs(sbi); + return ret; } diff --git a/fs/f2fs/namei.c b/fs/f2fs/namei.c index 64c07169df05..2a5359c990fc 100644 --- a/fs/f2fs/namei.c +++ b/fs/f2fs/namei.c @@ -83,21 +83,11 @@ static int is_multimedia_file(const unsigned char *s, const char *sub) { size_t slen = strlen(s); size_t sublen = strlen(sub); - int ret; if (sublen > slen) return 0; - ret = memcmp(s + slen - sublen, sub, sublen); - if (ret) { /* compare upper case */ - int i; - char upper_sub[8]; - for (i = 0; i < sublen && i < sizeof(upper_sub); i++) - upper_sub[i] = toupper(sub[i]); - return !memcmp(s + slen - sublen, upper_sub, sublen); - } - - return !ret; + return !strncasecmp(s + slen - sublen, sub, sublen); } /* @@ -239,7 +229,7 @@ static int f2fs_unlink(struct inode *dir, struct dentry *dentry) if (!de) goto fail; - err = check_orphan_space(sbi); + err = acquire_orphan_inode(sbi); if (err) { kunmap(page); f2fs_put_page(page, 0); @@ -393,7 +383,7 @@ static int f2fs_rename(struct inode *old_dir, struct dentry *old_dentry, struct inode *old_inode = old_dentry->d_inode; struct inode *new_inode = new_dentry->d_inode; struct page *old_dir_page; - struct page *old_page; + struct page *old_page, *new_page; struct f2fs_dir_entry *old_dir_entry = NULL; struct f2fs_dir_entry *old_entry; struct f2fs_dir_entry *new_entry; @@ -415,7 +405,6 @@ static int f2fs_rename(struct inode *old_dir, struct dentry *old_dentry, ilock = mutex_lock_op(sbi); if (new_inode) { - struct page *new_page; err = -ENOTEMPTY; if (old_dir_entry && !f2fs_empty_dir(new_inode)) @@ -427,14 +416,28 @@ static int f2fs_rename(struct inode *old_dir, struct dentry *old_dentry, if (!new_entry) goto out_dir; + err = acquire_orphan_inode(sbi); + if (err) + goto put_out_dir; + + if (update_dent_inode(old_inode, &new_dentry->d_name)) { + release_orphan_inode(sbi); + goto put_out_dir; + } + f2fs_set_link(new_dir, new_entry, new_page, old_inode); new_inode->i_ctime = CURRENT_TIME; if (old_dir_entry) drop_nlink(new_inode); drop_nlink(new_inode); + if (!new_inode->i_nlink) add_orphan_inode(sbi, new_inode->i_ino); + else + release_orphan_inode(sbi); + + update_inode_page(old_inode); update_inode_page(new_inode); } else { err = f2fs_add_link(new_dentry, old_inode); @@ -467,6 +470,8 @@ static int f2fs_rename(struct inode *old_dir, struct dentry *old_dentry, mutex_unlock_op(sbi, ilock); return 0; +put_out_dir: + f2fs_put_page(new_page, 1); out_dir: if (old_dir_entry) { kunmap(old_dir_page); diff --git a/fs/f2fs/node.c b/fs/f2fs/node.c index b418aee09573..51ef27894433 100644 --- a/fs/f2fs/node.c +++ b/fs/f2fs/node.c @@ -315,9 +315,10 @@ cache: * The maximum depth is four. * Offset[0] will have raw inode offset. */ -static int get_node_path(long block, int offset[4], unsigned int noffset[4]) +static int get_node_path(struct f2fs_inode_info *fi, long block, + int offset[4], unsigned int noffset[4]) { - const long direct_index = ADDRS_PER_INODE; + const long direct_index = ADDRS_PER_INODE(fi); const long direct_blks = ADDRS_PER_BLOCK; const long dptrs_per_blk = NIDS_PER_BLOCK; const long indirect_blks = ADDRS_PER_BLOCK * NIDS_PER_BLOCK; @@ -405,7 +406,7 @@ int get_dnode_of_data(struct dnode_of_data *dn, pgoff_t index, int mode) int level, i; int err = 0; - level = get_node_path(index, offset, noffset); + level = get_node_path(F2FS_I(dn->inode), index, offset, noffset); nids[0] = dn->inode->i_ino; npage[0] = dn->inode_page; @@ -565,7 +566,7 @@ static int truncate_nodes(struct dnode_of_data *dn, unsigned int nofs, return PTR_ERR(page); } - rn = (struct f2fs_node *)page_address(page); + rn = F2FS_NODE(page); if (depth < 3) { for (i = ofs; i < NIDS_PER_BLOCK; i++, freed++) { child_nid = le32_to_cpu(rn->in.nid[i]); @@ -687,7 +688,7 @@ int truncate_inode_blocks(struct inode *inode, pgoff_t from) trace_f2fs_truncate_inode_blocks_enter(inode, from); - level = get_node_path(from, offset, noffset); + level = get_node_path(F2FS_I(inode), from, offset, noffset); restart: page = get_node_page(sbi, inode->i_ino); if (IS_ERR(page)) { @@ -698,7 +699,7 @@ restart: set_new_dnode(&dn, inode, page, NULL, 0); unlock_page(page); - rn = page_address(page); + rn = F2FS_NODE(page); switch (level) { case 0: case 1: @@ -771,6 +772,33 @@ fail: return err > 0 ? 0 : err; } +int truncate_xattr_node(struct inode *inode, struct page *page) +{ + struct f2fs_sb_info *sbi = F2FS_SB(inode->i_sb); + nid_t nid = F2FS_I(inode)->i_xattr_nid; + struct dnode_of_data dn; + struct page *npage; + + if (!nid) + return 0; + + npage = get_node_page(sbi, nid); + if (IS_ERR(npage)) + return PTR_ERR(npage); + + F2FS_I(inode)->i_xattr_nid = 0; + + /* need to do checkpoint during fsync */ + F2FS_I(inode)->xattr_ver = cur_cp_version(F2FS_CKPT(sbi)); + + set_new_dnode(&dn, inode, page, npage, nid); + + if (page) + dn.inode_page_locked = 1; + truncate_node(&dn); + return 0; +} + /* * Caller should grab and release a mutex by calling mutex_lock_op() and * mutex_unlock_op(). @@ -781,22 +809,16 @@ int remove_inode_page(struct inode *inode) struct page *page; nid_t ino = inode->i_ino; struct dnode_of_data dn; + int err; page = get_node_page(sbi, ino); if (IS_ERR(page)) return PTR_ERR(page); - if (F2FS_I(inode)->i_xattr_nid) { - nid_t nid = F2FS_I(inode)->i_xattr_nid; - struct page *npage = get_node_page(sbi, nid); - - if (IS_ERR(npage)) - return PTR_ERR(npage); - - F2FS_I(inode)->i_xattr_nid = 0; - set_new_dnode(&dn, inode, page, npage, nid); - dn.inode_page_locked = 1; - truncate_node(&dn); + err = truncate_xattr_node(inode, page); + if (err) { + f2fs_put_page(page, 1); + return err; } /* 0 is possible, after f2fs_new_inode() is failed */ @@ -833,29 +855,32 @@ struct page *new_node_page(struct dnode_of_data *dn, if (!page) return ERR_PTR(-ENOMEM); - get_node_info(sbi, dn->nid, &old_ni); + if (!inc_valid_node_count(sbi, dn->inode, 1)) { + err = -ENOSPC; + goto fail; + } - SetPageUptodate(page); - fill_node_footer(page, dn->nid, dn->inode->i_ino, ofs, true); + get_node_info(sbi, dn->nid, &old_ni); /* Reinitialize old_ni with new node page */ BUG_ON(old_ni.blk_addr != NULL_ADDR); new_ni = old_ni; new_ni.ino = dn->inode->i_ino; - - if (!inc_valid_node_count(sbi, dn->inode, 1)) { - err = -ENOSPC; - goto fail; - } set_node_addr(sbi, &new_ni, NEW_ADDR); + + fill_node_footer(page, dn->nid, dn->inode->i_ino, ofs, true); set_cold_node(dn->inode, page); + SetPageUptodate(page); + set_page_dirty(page); + + if (ofs == XATTR_NODE_OFFSET) + F2FS_I(dn->inode)->i_xattr_nid = dn->nid; dn->node_page = page; if (ipage) update_inode(dn->inode, ipage); else sync_inode_page(dn); - set_page_dirty(page); if (ofs == 0) inc_valid_inode_count(sbi); @@ -916,7 +941,6 @@ void ra_node_page(struct f2fs_sb_info *sbi, nid_t nid) f2fs_put_page(apage, 0); else if (err == LOCKED_PAGE) f2fs_put_page(apage, 1); - return; } struct page *get_node_page(struct f2fs_sb_info *sbi, pgoff_t nid) @@ -1167,9 +1191,9 @@ static int f2fs_write_node_page(struct page *page, /* * It is very important to gather dirty pages and write at once, so that we can * submit a big bio without interfering other data writes. - * Be default, 512 pages (2MB), a segment size, is quite reasonable. + * Be default, 512 pages (2MB) * 3 node types, is more reasonable. */ -#define COLLECT_DIRTY_NODES 512 +#define COLLECT_DIRTY_NODES 1536 static int f2fs_write_node_pages(struct address_space *mapping, struct writeback_control *wbc) { @@ -1187,9 +1211,10 @@ static int f2fs_write_node_pages(struct address_space *mapping, return 0; /* if mounting is failed, skip writing node pages */ - wbc->nr_to_write = max_hw_blocks(sbi); + wbc->nr_to_write = 3 * max_hw_blocks(sbi); sync_node_pages(sbi, 0, wbc); - wbc->nr_to_write = nr_to_write - (max_hw_blocks(sbi) - wbc->nr_to_write); + wbc->nr_to_write = nr_to_write - (3 * max_hw_blocks(sbi) - + wbc->nr_to_write); return 0; } @@ -1444,6 +1469,9 @@ void alloc_nid_failed(struct f2fs_sb_info *sbi, nid_t nid) struct f2fs_nm_info *nm_i = NM_I(sbi); struct free_nid *i; + if (!nid) + return; + spin_lock(&nm_i->free_nid_list_lock); i = __lookup_free_nid_list(nid, &nm_i->free_nid_list); BUG_ON(!i || i->state != NID_ALLOC); @@ -1484,8 +1512,8 @@ int recover_inode_page(struct f2fs_sb_info *sbi, struct page *page) SetPageUptodate(ipage); fill_node_footer(ipage, ino, ino, 0, true); - src = (struct f2fs_node *)page_address(page); - dst = (struct f2fs_node *)page_address(ipage); + src = F2FS_NODE(page); + dst = F2FS_NODE(ipage); memcpy(dst, src, (unsigned long)&src->i.i_ext - (unsigned long)&src->i); dst->i.i_size = 0; @@ -1515,8 +1543,8 @@ int restore_node_summary(struct f2fs_sb_info *sbi, /* alloc temporal page for read node */ page = alloc_page(GFP_NOFS | __GFP_ZERO); - if (IS_ERR(page)) - return PTR_ERR(page); + if (!page) + return -ENOMEM; lock_page(page); /* scan the node segment */ @@ -1535,7 +1563,7 @@ int restore_node_summary(struct f2fs_sb_info *sbi, goto out; lock_page(page); - rn = (struct f2fs_node *)page_address(page); + rn = F2FS_NODE(page); sum_entry->nid = rn->footer.nid; sum_entry->version = 0; sum_entry->ofs_in_node = 0; diff --git a/fs/f2fs/node.h b/fs/f2fs/node.h index c65fb4f4230f..3496bb3e15dc 100644 --- a/fs/f2fs/node.h +++ b/fs/f2fs/node.h @@ -155,8 +155,7 @@ static inline void set_to_next_nat(struct f2fs_nm_info *nm_i, nid_t start_nid) static inline void fill_node_footer(struct page *page, nid_t nid, nid_t ino, unsigned int ofs, bool reset) { - void *kaddr = page_address(page); - struct f2fs_node *rn = (struct f2fs_node *)kaddr; + struct f2fs_node *rn = F2FS_NODE(page); if (reset) memset(rn, 0, sizeof(*rn)); rn->footer.nid = cpu_to_le32(nid); @@ -166,10 +165,8 @@ static inline void fill_node_footer(struct page *page, nid_t nid, static inline void copy_node_footer(struct page *dst, struct page *src) { - void *src_addr = page_address(src); - void *dst_addr = page_address(dst); - struct f2fs_node *src_rn = (struct f2fs_node *)src_addr; - struct f2fs_node *dst_rn = (struct f2fs_node *)dst_addr; + struct f2fs_node *src_rn = F2FS_NODE(src); + struct f2fs_node *dst_rn = F2FS_NODE(dst); memcpy(&dst_rn->footer, &src_rn->footer, sizeof(struct node_footer)); } @@ -177,45 +174,40 @@ static inline void fill_node_footer_blkaddr(struct page *page, block_t blkaddr) { struct f2fs_sb_info *sbi = F2FS_SB(page->mapping->host->i_sb); struct f2fs_checkpoint *ckpt = F2FS_CKPT(sbi); - void *kaddr = page_address(page); - struct f2fs_node *rn = (struct f2fs_node *)kaddr; + struct f2fs_node *rn = F2FS_NODE(page); + rn->footer.cp_ver = ckpt->checkpoint_ver; rn->footer.next_blkaddr = cpu_to_le32(blkaddr); } static inline nid_t ino_of_node(struct page *node_page) { - void *kaddr = page_address(node_page); - struct f2fs_node *rn = (struct f2fs_node *)kaddr; + struct f2fs_node *rn = F2FS_NODE(node_page); return le32_to_cpu(rn->footer.ino); } static inline nid_t nid_of_node(struct page *node_page) { - void *kaddr = page_address(node_page); - struct f2fs_node *rn = (struct f2fs_node *)kaddr; + struct f2fs_node *rn = F2FS_NODE(node_page); return le32_to_cpu(rn->footer.nid); } static inline unsigned int ofs_of_node(struct page *node_page) { - void *kaddr = page_address(node_page); - struct f2fs_node *rn = (struct f2fs_node *)kaddr; + struct f2fs_node *rn = F2FS_NODE(node_page); unsigned flag = le32_to_cpu(rn->footer.flag); return flag >> OFFSET_BIT_SHIFT; } static inline unsigned long long cpver_of_node(struct page *node_page) { - void *kaddr = page_address(node_page); - struct f2fs_node *rn = (struct f2fs_node *)kaddr; + struct f2fs_node *rn = F2FS_NODE(node_page); return le64_to_cpu(rn->footer.cp_ver); } static inline block_t next_blkaddr_of_node(struct page *node_page) { - void *kaddr = page_address(node_page); - struct f2fs_node *rn = (struct f2fs_node *)kaddr; + struct f2fs_node *rn = F2FS_NODE(node_page); return le32_to_cpu(rn->footer.next_blkaddr); } @@ -237,6 +229,10 @@ static inline block_t next_blkaddr_of_node(struct page *node_page) static inline bool IS_DNODE(struct page *node_page) { unsigned int ofs = ofs_of_node(node_page); + + if (ofs == XATTR_NODE_OFFSET) + return false; + if (ofs == 3 || ofs == 4 + NIDS_PER_BLOCK || ofs == 5 + 2 * NIDS_PER_BLOCK) return false; @@ -250,7 +246,7 @@ static inline bool IS_DNODE(struct page *node_page) static inline void set_nid(struct page *p, int off, nid_t nid, bool i) { - struct f2fs_node *rn = (struct f2fs_node *)page_address(p); + struct f2fs_node *rn = F2FS_NODE(p); wait_on_page_writeback(p); @@ -263,7 +259,8 @@ static inline void set_nid(struct page *p, int off, nid_t nid, bool i) static inline nid_t get_nid(struct page *p, int off, bool i) { - struct f2fs_node *rn = (struct f2fs_node *)page_address(p); + struct f2fs_node *rn = F2FS_NODE(p); + if (i) return le32_to_cpu(rn->i.i_nid[off - NODE_DIR1_BLOCK]); return le32_to_cpu(rn->in.nid[off]); @@ -314,8 +311,7 @@ static inline void clear_cold_data(struct page *page) static inline int is_node(struct page *page, int type) { - void *kaddr = page_address(page); - struct f2fs_node *rn = (struct f2fs_node *)kaddr; + struct f2fs_node *rn = F2FS_NODE(page); return le32_to_cpu(rn->footer.flag) & (1 << type); } @@ -325,7 +321,7 @@ static inline int is_node(struct page *page, int type) static inline void set_cold_node(struct inode *inode, struct page *page) { - struct f2fs_node *rn = (struct f2fs_node *)page_address(page); + struct f2fs_node *rn = F2FS_NODE(page); unsigned int flag = le32_to_cpu(rn->footer.flag); if (S_ISDIR(inode->i_mode)) @@ -337,7 +333,7 @@ static inline void set_cold_node(struct inode *inode, struct page *page) static inline void set_mark(struct page *page, int mark, int type) { - struct f2fs_node *rn = (struct f2fs_node *)page_address(page); + struct f2fs_node *rn = F2FS_NODE(page); unsigned int flag = le32_to_cpu(rn->footer.flag); if (mark) flag |= (0x1 << type); diff --git a/fs/f2fs/recovery.c b/fs/f2fs/recovery.c index d56d951c2253..51ef5eec33d7 100644 --- a/fs/f2fs/recovery.c +++ b/fs/f2fs/recovery.c @@ -40,8 +40,7 @@ static struct fsync_inode_entry *get_fsync_inode(struct list_head *head, static int recover_dentry(struct page *ipage, struct inode *inode) { - void *kaddr = page_address(ipage); - struct f2fs_node *raw_node = (struct f2fs_node *)kaddr; + struct f2fs_node *raw_node = F2FS_NODE(ipage); struct f2fs_inode *raw_inode = &(raw_node->i); nid_t pino = le32_to_cpu(raw_inode->i_pino); struct f2fs_dir_entry *de; @@ -93,8 +92,7 @@ out: static int recover_inode(struct inode *inode, struct page *node_page) { - void *kaddr = page_address(node_page); - struct f2fs_node *raw_node = (struct f2fs_node *)kaddr; + struct f2fs_node *raw_node = F2FS_NODE(node_page); struct f2fs_inode *raw_inode = &(raw_node->i); if (!IS_INODE(node_page)) @@ -119,7 +117,7 @@ static int recover_inode(struct inode *inode, struct page *node_page) static int find_fsync_dnodes(struct f2fs_sb_info *sbi, struct list_head *head) { - unsigned long long cp_ver = le64_to_cpu(sbi->ckpt->checkpoint_ver); + unsigned long long cp_ver = cur_cp_version(F2FS_CKPT(sbi)); struct curseg_info *curseg; struct page *page; block_t blkaddr; @@ -131,8 +129,8 @@ static int find_fsync_dnodes(struct f2fs_sb_info *sbi, struct list_head *head) /* read node page */ page = alloc_page(GFP_F2FS_ZERO); - if (IS_ERR(page)) - return PTR_ERR(page); + if (!page) + return -ENOMEM; lock_page(page); while (1) { @@ -215,6 +213,7 @@ static int check_index_in_prev_nodes(struct f2fs_sb_info *sbi, void *kaddr; struct inode *inode; struct page *node_page; + unsigned int offset; block_t bidx; int i; @@ -259,8 +258,8 @@ static int check_index_in_prev_nodes(struct f2fs_sb_info *sbi, node_page = get_node_page(sbi, nid); if (IS_ERR(node_page)) return PTR_ERR(node_page); - bidx = start_bidx_of_node(ofs_of_node(node_page)) + - le16_to_cpu(sum.ofs_in_node); + + offset = ofs_of_node(node_page); ino = ino_of_node(node_page); f2fs_put_page(node_page, 1); @@ -269,6 +268,9 @@ static int check_index_in_prev_nodes(struct f2fs_sb_info *sbi, if (IS_ERR(inode)) return PTR_ERR(inode); + bidx = start_bidx_of_node(offset, F2FS_I(inode)) + + le16_to_cpu(sum.ofs_in_node); + truncate_hole(inode, bidx, bidx + 1); iput(inode); return 0; @@ -277,6 +279,7 @@ static int check_index_in_prev_nodes(struct f2fs_sb_info *sbi, static int do_recover_data(struct f2fs_sb_info *sbi, struct inode *inode, struct page *page, block_t blkaddr) { + struct f2fs_inode_info *fi = F2FS_I(inode); unsigned int start, end; struct dnode_of_data dn; struct f2fs_summary sum; @@ -284,9 +287,9 @@ static int do_recover_data(struct f2fs_sb_info *sbi, struct inode *inode, int err = 0, recovered = 0; int ilock; - start = start_bidx_of_node(ofs_of_node(page)); + start = start_bidx_of_node(ofs_of_node(page), fi); if (IS_INODE(page)) - end = start + ADDRS_PER_INODE; + end = start + ADDRS_PER_INODE(fi); else end = start + ADDRS_PER_BLOCK; @@ -357,7 +360,7 @@ err: static int recover_data(struct f2fs_sb_info *sbi, struct list_head *head, int type) { - unsigned long long cp_ver = le64_to_cpu(sbi->ckpt->checkpoint_ver); + unsigned long long cp_ver = cur_cp_version(F2FS_CKPT(sbi)); struct curseg_info *curseg; struct page *page; int err = 0; @@ -369,7 +372,7 @@ static int recover_data(struct f2fs_sb_info *sbi, /* read node page */ page = alloc_page(GFP_NOFS | __GFP_ZERO); - if (IS_ERR(page)) + if (!page) return -ENOMEM; lock_page(page); diff --git a/fs/f2fs/segment.c b/fs/f2fs/segment.c index a86d125a9885..09af9c7b0f52 100644 --- a/fs/f2fs/segment.c +++ b/fs/f2fs/segment.c @@ -117,7 +117,6 @@ static void locate_dirty_segment(struct f2fs_sb_info *sbi, unsigned int segno) } mutex_unlock(&dirty_i->seglist_lock); - return; } /* @@ -261,7 +260,6 @@ static void __add_sum_entry(struct f2fs_sb_info *sbi, int type, void *addr = curseg->sum_blk; addr += curseg->next_blkoff * sizeof(struct f2fs_summary); memcpy(addr, sum, sizeof(struct f2fs_summary)); - return; } /* @@ -542,12 +540,9 @@ static void allocate_segment_by_default(struct f2fs_sb_info *sbi, { struct curseg_info *curseg = CURSEG_I(sbi, type); - if (force) { + if (force) new_curseg(sbi, type, true); - goto out; - } - - if (type == CURSEG_WARM_NODE) + else if (type == CURSEG_WARM_NODE) new_curseg(sbi, type, false); else if (curseg->alloc_type == LFS && is_next_segment_free(sbi, type)) new_curseg(sbi, type, false); @@ -555,11 +550,9 @@ static void allocate_segment_by_default(struct f2fs_sb_info *sbi, change_curseg(sbi, type, true); else new_curseg(sbi, type, false); -out: #ifdef CONFIG_F2FS_STAT_FS sbi->segment_count[curseg->alloc_type]++; #endif - return; } void allocate_new_segments(struct f2fs_sb_info *sbi) @@ -611,18 +604,12 @@ static void f2fs_end_io_write(struct bio *bio, int err) struct bio *f2fs_bio_alloc(struct block_device *bdev, int npages) { struct bio *bio; - struct bio_private *priv; -retry: - priv = kmalloc(sizeof(struct bio_private), GFP_NOFS); - if (!priv) { - cond_resched(); - goto retry; - } /* No failure on bio allocation */ bio = bio_alloc(GFP_NOIO, npages); bio->bi_bdev = bdev; - bio->bi_private = priv; + bio->bi_private = NULL; + return bio; } @@ -681,8 +668,17 @@ static void submit_write_page(struct f2fs_sb_info *sbi, struct page *page, do_submit_bio(sbi, type, false); alloc_new: if (sbi->bio[type] == NULL) { + struct bio_private *priv; +retry: + priv = kmalloc(sizeof(struct bio_private), GFP_NOFS); + if (!priv) { + cond_resched(); + goto retry; + } + sbi->bio[type] = f2fs_bio_alloc(bdev, max_hw_blocks(sbi)); sbi->bio[type]->bi_sector = SECTOR_FROM_BLOCK(sbi, blk_addr); + sbi->bio[type]->bi_private = priv; /* * The end_io will be assigned at the sumbission phase. * Until then, let bio_add_page() merge consecutive IOs as much @@ -702,6 +698,16 @@ alloc_new: trace_f2fs_submit_write_page(page, blk_addr, type); } +void f2fs_wait_on_page_writeback(struct page *page, + enum page_type type, bool sync) +{ + struct f2fs_sb_info *sbi = F2FS_SB(page->mapping->host->i_sb); + if (PageWriteback(page)) { + f2fs_submit_bio(sbi, type, sync); + wait_on_page_writeback(page); + } +} + static bool __has_curseg_space(struct f2fs_sb_info *sbi, int type) { struct curseg_info *curseg = CURSEG_I(sbi, type); @@ -1179,7 +1185,6 @@ void write_node_summaries(struct f2fs_sb_info *sbi, block_t start_blk) { if (is_set_ckpt_flags(F2FS_CKPT(sbi), CP_UMOUNT_FLAG)) write_normal_summaries(sbi, start_blk, CURSEG_HOT_NODE); - return; } int lookup_journal_in_cursum(struct f2fs_summary_block *sum, int type, diff --git a/fs/f2fs/segment.h b/fs/f2fs/segment.h index 062424a0e4c3..bdd10eab8c40 100644 --- a/fs/f2fs/segment.h +++ b/fs/f2fs/segment.h @@ -142,6 +142,7 @@ struct victim_sel_policy { int alloc_mode; /* LFS or SSR */ int gc_mode; /* GC_CB or GC_GREEDY */ unsigned long *dirty_segmap; /* dirty segment bitmap */ + unsigned int max_search; /* maximum # of segments to search */ unsigned int offset; /* last scanned bitmap offset */ unsigned int ofs_unit; /* bitmap search unit */ unsigned int min_cost; /* minimum cost */ @@ -453,7 +454,8 @@ static inline int reserved_sections(struct f2fs_sb_info *sbi) static inline bool need_SSR(struct f2fs_sb_info *sbi) { - return (free_sections(sbi) < overprovision_sections(sbi)); + return ((prefree_segments(sbi) / sbi->segs_per_sec) + + free_sections(sbi) < overprovision_sections(sbi)); } static inline bool has_not_enough_free_secs(struct f2fs_sb_info *sbi, int freed) @@ -470,7 +472,7 @@ static inline bool has_not_enough_free_secs(struct f2fs_sb_info *sbi, int freed) static inline int utilization(struct f2fs_sb_info *sbi) { - return div_u64(valid_user_blocks(sbi) * 100, sbi->user_block_count); + return div_u64((u64)valid_user_blocks(sbi) * 100, sbi->user_block_count); } /* diff --git a/fs/f2fs/super.c b/fs/f2fs/super.c index 75c7dc363e92..13d0a0fe49dd 100644 --- a/fs/f2fs/super.c +++ b/fs/f2fs/super.c @@ -18,20 +18,25 @@ #include <linux/parser.h> #include <linux/mount.h> #include <linux/seq_file.h> +#include <linux/proc_fs.h> #include <linux/random.h> #include <linux/exportfs.h> #include <linux/blkdev.h> #include <linux/f2fs_fs.h> +#include <linux/sysfs.h> #include "f2fs.h" #include "node.h" #include "segment.h" #include "xattr.h" +#include "gc.h" #define CREATE_TRACE_POINTS #include <trace/events/f2fs.h> +static struct proc_dir_entry *f2fs_proc_root; static struct kmem_cache *f2fs_inode_cachep; +static struct kset *f2fs_kset; enum { Opt_gc_background, @@ -42,6 +47,7 @@ enum { Opt_noacl, Opt_active_logs, Opt_disable_ext_identify, + Opt_inline_xattr, Opt_err, }; @@ -54,9 +60,117 @@ static match_table_t f2fs_tokens = { {Opt_noacl, "noacl"}, {Opt_active_logs, "active_logs=%u"}, {Opt_disable_ext_identify, "disable_ext_identify"}, + {Opt_inline_xattr, "inline_xattr"}, {Opt_err, NULL}, }; +/* Sysfs support for f2fs */ +struct f2fs_attr { + struct attribute attr; + ssize_t (*show)(struct f2fs_attr *, struct f2fs_sb_info *, char *); + ssize_t (*store)(struct f2fs_attr *, struct f2fs_sb_info *, + const char *, size_t); + int offset; +}; + +static ssize_t f2fs_sbi_show(struct f2fs_attr *a, + struct f2fs_sb_info *sbi, char *buf) +{ + struct f2fs_gc_kthread *gc_kth = sbi->gc_thread; + unsigned int *ui; + + if (!gc_kth) + return -EINVAL; + + ui = (unsigned int *)(((char *)gc_kth) + a->offset); + + return snprintf(buf, PAGE_SIZE, "%u\n", *ui); +} + +static ssize_t f2fs_sbi_store(struct f2fs_attr *a, + struct f2fs_sb_info *sbi, + const char *buf, size_t count) +{ + struct f2fs_gc_kthread *gc_kth = sbi->gc_thread; + unsigned long t; + unsigned int *ui; + ssize_t ret; + + if (!gc_kth) + return -EINVAL; + + ui = (unsigned int *)(((char *)gc_kth) + a->offset); + + ret = kstrtoul(skip_spaces(buf), 0, &t); + if (ret < 0) + return ret; + *ui = t; + return count; +} + +static ssize_t f2fs_attr_show(struct kobject *kobj, + struct attribute *attr, char *buf) +{ + struct f2fs_sb_info *sbi = container_of(kobj, struct f2fs_sb_info, + s_kobj); + struct f2fs_attr *a = container_of(attr, struct f2fs_attr, attr); + + return a->show ? a->show(a, sbi, buf) : 0; +} + +static ssize_t f2fs_attr_store(struct kobject *kobj, struct attribute *attr, + const char *buf, size_t len) +{ + struct f2fs_sb_info *sbi = container_of(kobj, struct f2fs_sb_info, + s_kobj); + struct f2fs_attr *a = container_of(attr, struct f2fs_attr, attr); + + return a->store ? a->store(a, sbi, buf, len) : 0; +} + +static void f2fs_sb_release(struct kobject *kobj) +{ + struct f2fs_sb_info *sbi = container_of(kobj, struct f2fs_sb_info, + s_kobj); + complete(&sbi->s_kobj_unregister); +} + +#define F2FS_ATTR_OFFSET(_name, _mode, _show, _store, _elname) \ +static struct f2fs_attr f2fs_attr_##_name = { \ + .attr = {.name = __stringify(_name), .mode = _mode }, \ + .show = _show, \ + .store = _store, \ + .offset = offsetof(struct f2fs_gc_kthread, _elname), \ +} + +#define F2FS_RW_ATTR(name, elname) \ + F2FS_ATTR_OFFSET(name, 0644, f2fs_sbi_show, f2fs_sbi_store, elname) + +F2FS_RW_ATTR(gc_min_sleep_time, min_sleep_time); +F2FS_RW_ATTR(gc_max_sleep_time, max_sleep_time); +F2FS_RW_ATTR(gc_no_gc_sleep_time, no_gc_sleep_time); +F2FS_RW_ATTR(gc_idle, gc_idle); + +#define ATTR_LIST(name) (&f2fs_attr_##name.attr) +static struct attribute *f2fs_attrs[] = { + ATTR_LIST(gc_min_sleep_time), + ATTR_LIST(gc_max_sleep_time), + ATTR_LIST(gc_no_gc_sleep_time), + ATTR_LIST(gc_idle), + NULL, +}; + +static const struct sysfs_ops f2fs_attr_ops = { + .show = f2fs_attr_show, + .store = f2fs_attr_store, +}; + +static struct kobj_type f2fs_ktype = { + .default_attrs = f2fs_attrs, + .sysfs_ops = &f2fs_attr_ops, + .release = f2fs_sb_release, +}; + void f2fs_msg(struct super_block *sb, const char *level, const char *fmt, ...) { struct va_format vaf; @@ -126,11 +240,18 @@ static int parse_options(struct super_block *sb, char *options) case Opt_nouser_xattr: clear_opt(sbi, XATTR_USER); break; + case Opt_inline_xattr: + set_opt(sbi, INLINE_XATTR); + break; #else case Opt_nouser_xattr: f2fs_msg(sb, KERN_INFO, "nouser_xattr options not supported"); break; + case Opt_inline_xattr: + f2fs_msg(sb, KERN_INFO, + "inline_xattr options not supported"); + break; #endif #ifdef CONFIG_F2FS_FS_POSIX_ACL case Opt_noacl: @@ -180,6 +301,9 @@ static struct inode *f2fs_alloc_inode(struct super_block *sb) set_inode_flag(fi, FI_NEW_INODE); + if (test_opt(F2FS_SB(sb), INLINE_XATTR)) + set_inode_flag(fi, FI_INLINE_XATTR); + return &fi->vfs_inode; } @@ -205,7 +329,6 @@ static int f2fs_drop_inode(struct inode *inode) static void f2fs_dirty_inode(struct inode *inode, int flags) { set_inode_flag(F2FS_I(inode), FI_DIRTY_INODE); - return; } static void f2fs_i_callback(struct rcu_head *head) @@ -223,6 +346,12 @@ static void f2fs_put_super(struct super_block *sb) { struct f2fs_sb_info *sbi = F2FS_SB(sb); + if (sbi->s_proc) { + remove_proc_entry("segment_info", sbi->s_proc); + remove_proc_entry(sb->s_id, f2fs_proc_root); + } + kobject_del(&sbi->s_kobj); + f2fs_destroy_stats(sbi); stop_gc_thread(sbi); @@ -236,6 +365,8 @@ static void f2fs_put_super(struct super_block *sb) destroy_segment_manager(sbi); kfree(sbi->ckpt); + kobject_put(&sbi->s_kobj); + wait_for_completion(&sbi->s_kobj_unregister); sb->s_fs_info = NULL; brelse(sbi->raw_super_buf); @@ -325,6 +456,8 @@ static int f2fs_show_options(struct seq_file *seq, struct dentry *root) seq_puts(seq, ",user_xattr"); else seq_puts(seq, ",nouser_xattr"); + if (test_opt(sbi, INLINE_XATTR)) + seq_puts(seq, ",inline_xattr"); #endif #ifdef CONFIG_F2FS_FS_POSIX_ACL if (test_opt(sbi, POSIX_ACL)) @@ -340,6 +473,36 @@ static int f2fs_show_options(struct seq_file *seq, struct dentry *root) return 0; } +static int segment_info_seq_show(struct seq_file *seq, void *offset) +{ + struct super_block *sb = seq->private; + struct f2fs_sb_info *sbi = F2FS_SB(sb); + unsigned int total_segs = le32_to_cpu(sbi->raw_super->segment_count_main); + int i; + + for (i = 0; i < total_segs; i++) { + seq_printf(seq, "%u", get_valid_blocks(sbi, i, 1)); + if (i != 0 && (i % 10) == 0) + seq_puts(seq, "\n"); + else + seq_puts(seq, " "); + } + return 0; +} + +static int segment_info_open_fs(struct inode *inode, struct file *file) +{ + return single_open(file, segment_info_seq_show, PDE_DATA(inode)); +} + +static const struct file_operations f2fs_seq_segment_info_fops = { + .owner = THIS_MODULE, + .open = segment_info_open_fs, + .read = seq_read, + .llseek = seq_lseek, + .release = single_release, +}; + static int f2fs_remount(struct super_block *sb, int *flags, char *data) { struct f2fs_sb_info *sbi = F2FS_SB(sb); @@ -455,7 +618,7 @@ static const struct export_operations f2fs_export_ops = { static loff_t max_file_size(unsigned bits) { - loff_t result = ADDRS_PER_INODE; + loff_t result = (DEF_ADDRS_PER_INODE - F2FS_INLINE_XATTR_ADDRS); loff_t leaf_count = ADDRS_PER_BLOCK; /* two direct node blocks */ @@ -766,6 +929,13 @@ static int f2fs_fill_super(struct super_block *sb, void *data, int silent) if (err) goto fail; + if (f2fs_proc_root) + sbi->s_proc = proc_mkdir(sb->s_id, f2fs_proc_root); + + if (sbi->s_proc) + proc_create_data("segment_info", S_IRUGO, sbi->s_proc, + &f2fs_seq_segment_info_fops, sb); + if (test_opt(sbi, DISCARD)) { struct request_queue *q = bdev_get_queue(sb->s_bdev); if (!blk_queue_discard(q)) @@ -774,6 +944,13 @@ static int f2fs_fill_super(struct super_block *sb, void *data, int silent) "the device does not support discard"); } + sbi->s_kobj.kset = f2fs_kset; + init_completion(&sbi->s_kobj_unregister); + err = kobject_init_and_add(&sbi->s_kobj, &f2fs_ktype, NULL, + "%s", sb->s_id); + if (err) + goto fail; + return 0; fail: stop_gc_thread(sbi); @@ -841,29 +1018,49 @@ static int __init init_f2fs_fs(void) goto fail; err = create_node_manager_caches(); if (err) - goto fail; + goto free_inodecache; err = create_gc_caches(); if (err) - goto fail; + goto free_node_manager_caches; err = create_checkpoint_caches(); if (err) - goto fail; + goto free_gc_caches; + f2fs_kset = kset_create_and_add("f2fs", NULL, fs_kobj); + if (!f2fs_kset) { + err = -ENOMEM; + goto free_checkpoint_caches; + } err = register_filesystem(&f2fs_fs_type); if (err) - goto fail; + goto free_kset; f2fs_create_root_stats(); + f2fs_proc_root = proc_mkdir("fs/f2fs", NULL); + return 0; + +free_kset: + kset_unregister(f2fs_kset); +free_checkpoint_caches: + destroy_checkpoint_caches(); +free_gc_caches: + destroy_gc_caches(); +free_node_manager_caches: + destroy_node_manager_caches(); +free_inodecache: + destroy_inodecache(); fail: return err; } static void __exit exit_f2fs_fs(void) { + remove_proc_entry("fs/f2fs", NULL); f2fs_destroy_root_stats(); unregister_filesystem(&f2fs_fs_type); destroy_checkpoint_caches(); destroy_gc_caches(); destroy_node_manager_caches(); destroy_inodecache(); + kset_unregister(f2fs_kset); } module_init(init_f2fs_fs) diff --git a/fs/f2fs/xattr.c b/fs/f2fs/xattr.c index 3ab07ecd86ca..1ac8a5f6e380 100644 --- a/fs/f2fs/xattr.c +++ b/fs/f2fs/xattr.c @@ -246,40 +246,170 @@ static inline const struct xattr_handler *f2fs_xattr_handler(int name_index) return handler; } +static struct f2fs_xattr_entry *__find_xattr(void *base_addr, int name_index, + size_t name_len, const char *name) +{ + struct f2fs_xattr_entry *entry; + + list_for_each_xattr(entry, base_addr) { + if (entry->e_name_index != name_index) + continue; + if (entry->e_name_len != name_len) + continue; + if (!memcmp(entry->e_name, name, name_len)) + break; + } + return entry; +} + +static void *read_all_xattrs(struct inode *inode, struct page *ipage) +{ + struct f2fs_sb_info *sbi = F2FS_SB(inode->i_sb); + struct f2fs_xattr_header *header; + size_t size = PAGE_SIZE, inline_size = 0; + void *txattr_addr; + + inline_size = inline_xattr_size(inode); + + txattr_addr = kzalloc(inline_size + size, GFP_KERNEL); + if (!txattr_addr) + return NULL; + + /* read from inline xattr */ + if (inline_size) { + struct page *page = NULL; + void *inline_addr; + + if (ipage) { + inline_addr = inline_xattr_addr(ipage); + } else { + page = get_node_page(sbi, inode->i_ino); + if (IS_ERR(page)) + goto fail; + inline_addr = inline_xattr_addr(page); + } + memcpy(txattr_addr, inline_addr, inline_size); + f2fs_put_page(page, 1); + } + + /* read from xattr node block */ + if (F2FS_I(inode)->i_xattr_nid) { + struct page *xpage; + void *xattr_addr; + + /* The inode already has an extended attribute block. */ + xpage = get_node_page(sbi, F2FS_I(inode)->i_xattr_nid); + if (IS_ERR(xpage)) + goto fail; + + xattr_addr = page_address(xpage); + memcpy(txattr_addr + inline_size, xattr_addr, PAGE_SIZE); + f2fs_put_page(xpage, 1); + } + + header = XATTR_HDR(txattr_addr); + + /* never been allocated xattrs */ + if (le32_to_cpu(header->h_magic) != F2FS_XATTR_MAGIC) { + header->h_magic = cpu_to_le32(F2FS_XATTR_MAGIC); + header->h_refcount = cpu_to_le32(1); + } + return txattr_addr; +fail: + kzfree(txattr_addr); + return NULL; +} + +static inline int write_all_xattrs(struct inode *inode, __u32 hsize, + void *txattr_addr, struct page *ipage) +{ + struct f2fs_sb_info *sbi = F2FS_SB(inode->i_sb); + size_t inline_size = 0; + void *xattr_addr; + struct page *xpage; + nid_t new_nid = 0; + int err; + + inline_size = inline_xattr_size(inode); + + if (hsize > inline_size && !F2FS_I(inode)->i_xattr_nid) + if (!alloc_nid(sbi, &new_nid)) + return -ENOSPC; + + /* write to inline xattr */ + if (inline_size) { + struct page *page = NULL; + void *inline_addr; + + if (ipage) { + inline_addr = inline_xattr_addr(ipage); + } else { + page = get_node_page(sbi, inode->i_ino); + if (IS_ERR(page)) { + alloc_nid_failed(sbi, new_nid); + return PTR_ERR(page); + } + inline_addr = inline_xattr_addr(page); + } + memcpy(inline_addr, txattr_addr, inline_size); + f2fs_put_page(page, 1); + + /* no need to use xattr node block */ + if (hsize <= inline_size) { + err = truncate_xattr_node(inode, ipage); + alloc_nid_failed(sbi, new_nid); + return err; + } + } + + /* write to xattr node block */ + if (F2FS_I(inode)->i_xattr_nid) { + xpage = get_node_page(sbi, F2FS_I(inode)->i_xattr_nid); + if (IS_ERR(xpage)) { + alloc_nid_failed(sbi, new_nid); + return PTR_ERR(xpage); + } + BUG_ON(new_nid); + } else { + struct dnode_of_data dn; + set_new_dnode(&dn, inode, NULL, NULL, new_nid); + xpage = new_node_page(&dn, XATTR_NODE_OFFSET, ipage); + if (IS_ERR(xpage)) { + alloc_nid_failed(sbi, new_nid); + return PTR_ERR(xpage); + } + alloc_nid_done(sbi, new_nid); + } + + xattr_addr = page_address(xpage); + memcpy(xattr_addr, txattr_addr + inline_size, PAGE_SIZE - + sizeof(struct node_footer)); + set_page_dirty(xpage); + f2fs_put_page(xpage, 1); + + /* need to checkpoint during fsync */ + F2FS_I(inode)->xattr_ver = cur_cp_version(F2FS_CKPT(sbi)); + return 0; +} + int f2fs_getxattr(struct inode *inode, int name_index, const char *name, void *buffer, size_t buffer_size) { - struct f2fs_sb_info *sbi = F2FS_SB(inode->i_sb); - struct f2fs_inode_info *fi = F2FS_I(inode); struct f2fs_xattr_entry *entry; - struct page *page; void *base_addr; - int error = 0, found = 0; + int error = 0; size_t value_len, name_len; if (name == NULL) return -EINVAL; name_len = strlen(name); - if (!fi->i_xattr_nid) - return -ENODATA; + base_addr = read_all_xattrs(inode, NULL); + if (!base_addr) + return -ENOMEM; - page = get_node_page(sbi, fi->i_xattr_nid); - if (IS_ERR(page)) - return PTR_ERR(page); - base_addr = page_address(page); - - list_for_each_xattr(entry, base_addr) { - if (entry->e_name_index != name_index) - continue; - if (entry->e_name_len != name_len) - continue; - if (!memcmp(entry->e_name, name, name_len)) { - found = 1; - break; - } - } - if (!found) { + entry = __find_xattr(base_addr, name_index, name_len, name); + if (IS_XATTR_LAST_ENTRY(entry)) { error = -ENODATA; goto cleanup; } @@ -298,28 +428,21 @@ int f2fs_getxattr(struct inode *inode, int name_index, const char *name, error = value_len; cleanup: - f2fs_put_page(page, 1); + kzfree(base_addr); return error; } ssize_t f2fs_listxattr(struct dentry *dentry, char *buffer, size_t buffer_size) { struct inode *inode = dentry->d_inode; - struct f2fs_sb_info *sbi = F2FS_SB(inode->i_sb); - struct f2fs_inode_info *fi = F2FS_I(inode); struct f2fs_xattr_entry *entry; - struct page *page; void *base_addr; int error = 0; size_t rest = buffer_size; - if (!fi->i_xattr_nid) - return 0; - - page = get_node_page(sbi, fi->i_xattr_nid); - if (IS_ERR(page)) - return PTR_ERR(page); - base_addr = page_address(page); + base_addr = read_all_xattrs(inode, NULL); + if (!base_addr) + return -ENOMEM; list_for_each_xattr(entry, base_addr) { const struct xattr_handler *handler = @@ -342,7 +465,7 @@ ssize_t f2fs_listxattr(struct dentry *dentry, char *buffer, size_t buffer_size) } error = buffer_size - rest; cleanup: - f2fs_put_page(page, 1); + kzfree(base_addr); return error; } @@ -351,14 +474,13 @@ int f2fs_setxattr(struct inode *inode, int name_index, const char *name, { struct f2fs_sb_info *sbi = F2FS_SB(inode->i_sb); struct f2fs_inode_info *fi = F2FS_I(inode); - struct f2fs_xattr_header *header = NULL; struct f2fs_xattr_entry *here, *last; - struct page *page; void *base_addr; - int error, found, free, newsize; + int found, newsize; size_t name_len; - char *pval; int ilock; + __u32 new_hsize; + int error = -ENOMEM; if (name == NULL) return -EINVAL; @@ -368,67 +490,21 @@ int f2fs_setxattr(struct inode *inode, int name_index, const char *name, name_len = strlen(name); - if (name_len > F2FS_NAME_LEN || value_len > MAX_VALUE_LEN) + if (name_len > F2FS_NAME_LEN || value_len > MAX_VALUE_LEN(inode)) return -ERANGE; f2fs_balance_fs(sbi); ilock = mutex_lock_op(sbi); - if (!fi->i_xattr_nid) { - /* Allocate new attribute block */ - struct dnode_of_data dn; - - if (!alloc_nid(sbi, &fi->i_xattr_nid)) { - error = -ENOSPC; - goto exit; - } - set_new_dnode(&dn, inode, NULL, NULL, fi->i_xattr_nid); - mark_inode_dirty(inode); - - page = new_node_page(&dn, XATTR_NODE_OFFSET, ipage); - if (IS_ERR(page)) { - alloc_nid_failed(sbi, fi->i_xattr_nid); - fi->i_xattr_nid = 0; - error = PTR_ERR(page); - goto exit; - } - - alloc_nid_done(sbi, fi->i_xattr_nid); - base_addr = page_address(page); - header = XATTR_HDR(base_addr); - header->h_magic = cpu_to_le32(F2FS_XATTR_MAGIC); - header->h_refcount = cpu_to_le32(1); - } else { - /* The inode already has an extended attribute block. */ - page = get_node_page(sbi, fi->i_xattr_nid); - if (IS_ERR(page)) { - error = PTR_ERR(page); - goto exit; - } - - base_addr = page_address(page); - header = XATTR_HDR(base_addr); - } - - if (le32_to_cpu(header->h_magic) != F2FS_XATTR_MAGIC) { - error = -EIO; - goto cleanup; - } + base_addr = read_all_xattrs(inode, ipage); + if (!base_addr) + goto exit; /* find entry with wanted name. */ - found = 0; - list_for_each_xattr(here, base_addr) { - if (here->e_name_index != name_index) - continue; - if (here->e_name_len != name_len) - continue; - if (!memcmp(here->e_name, name, name_len)) { - found = 1; - break; - } - } + here = __find_xattr(base_addr, name_index, name_len, name); + found = IS_XATTR_LAST_ENTRY(here) ? 0 : 1; last = here; while (!IS_XATTR_LAST_ENTRY(last)) @@ -439,22 +515,25 @@ int f2fs_setxattr(struct inode *inode, int name_index, const char *name, /* 1. Check space */ if (value) { - /* If value is NULL, it is remove operation. + int free; + /* + * If value is NULL, it is remove operation. * In case of update operation, we caculate free. */ - free = MIN_OFFSET - ((char *)last - (char *)header); + free = MIN_OFFSET(inode) - ((char *)last - (char *)base_addr); if (found) free = free - ENTRY_SIZE(here); if (free < newsize) { error = -ENOSPC; - goto cleanup; + goto exit; } } /* 2. Remove old entry */ if (found) { - /* If entry is found, remove old entry. + /* + * If entry is found, remove old entry. * If not found, remove operation is not needed. */ struct f2fs_xattr_entry *next = XATTR_NEXT_ENTRY(here); @@ -465,10 +544,15 @@ int f2fs_setxattr(struct inode *inode, int name_index, const char *name, memset(last, 0, oldsize); } + new_hsize = (char *)last - (char *)base_addr; + /* 3. Write new entry */ if (value) { - /* Before we come here, old entry is removed. - * We just write new entry. */ + char *pval; + /* + * Before we come here, old entry is removed. + * We just write new entry. + */ memset(last, 0, newsize); last->e_name_index = name_index; last->e_name_len = name_len; @@ -476,26 +560,25 @@ int f2fs_setxattr(struct inode *inode, int name_index, const char *name, pval = last->e_name + name_len; memcpy(pval, value, value_len); last->e_value_size = cpu_to_le16(value_len); + new_hsize += newsize; } - set_page_dirty(page); - f2fs_put_page(page, 1); + error = write_all_xattrs(inode, new_hsize, base_addr, ipage); + if (error) + goto exit; if (is_inode_flag_set(fi, FI_ACL_MODE)) { inode->i_mode = fi->i_acl_mode; inode->i_ctime = CURRENT_TIME; clear_inode_flag(fi, FI_ACL_MODE); } + if (ipage) update_inode(inode, ipage); else update_inode_page(inode); - mutex_unlock_op(sbi, ilock); - - return 0; -cleanup: - f2fs_put_page(page, 1); exit: mutex_unlock_op(sbi, ilock); + kzfree(base_addr); return error; } diff --git a/fs/f2fs/xattr.h b/fs/f2fs/xattr.h index 3c0817bef25d..02a08fb88a15 100644 --- a/fs/f2fs/xattr.h +++ b/fs/f2fs/xattr.h @@ -51,7 +51,7 @@ struct f2fs_xattr_entry { #define XATTR_HDR(ptr) ((struct f2fs_xattr_header *)(ptr)) #define XATTR_ENTRY(ptr) ((struct f2fs_xattr_entry *)(ptr)) -#define XATTR_FIRST_ENTRY(ptr) (XATTR_ENTRY(XATTR_HDR(ptr)+1)) +#define XATTR_FIRST_ENTRY(ptr) (XATTR_ENTRY(XATTR_HDR(ptr) + 1)) #define XATTR_ROUND (3) #define XATTR_ALIGN(size) ((size + XATTR_ROUND) & ~XATTR_ROUND) @@ -69,17 +69,16 @@ struct f2fs_xattr_entry { !IS_XATTR_LAST_ENTRY(entry);\ entry = XATTR_NEXT_ENTRY(entry)) +#define MIN_OFFSET(i) XATTR_ALIGN(inline_xattr_size(i) + PAGE_SIZE - \ + sizeof(struct node_footer) - sizeof(__u32)) -#define MIN_OFFSET XATTR_ALIGN(PAGE_SIZE - \ - sizeof(struct node_footer) - \ - sizeof(__u32)) - -#define MAX_VALUE_LEN (MIN_OFFSET - sizeof(struct f2fs_xattr_header) - \ - sizeof(struct f2fs_xattr_entry)) +#define MAX_VALUE_LEN(i) (MIN_OFFSET(i) - \ + sizeof(struct f2fs_xattr_header) - \ + sizeof(struct f2fs_xattr_entry)) /* * On-disk structure of f2fs_xattr - * We use only 1 block for xattr. + * We use inline xattrs space + 1 block for xattr. * * +--------------------+ * | f2fs_xattr_header | diff --git a/fs/gfs2/dir.c b/fs/gfs2/dir.c index 0cb4c1557f20..2e5fc268d324 100644 --- a/fs/gfs2/dir.c +++ b/fs/gfs2/dir.c @@ -1859,7 +1859,7 @@ static int leaf_dealloc(struct gfs2_inode *dip, u32 index, u32 len, memset(&rlist, 0, sizeof(struct gfs2_rgrp_list)); - ht = kzalloc(size, GFP_NOFS); + ht = kzalloc(size, GFP_NOFS | __GFP_NOWARN); if (ht == NULL) ht = vzalloc(size); if (!ht) diff --git a/fs/isofs/inode.c b/fs/isofs/inode.c index c348d6d88624..e5d408a7ea4a 100644 --- a/fs/isofs/inode.c +++ b/fs/isofs/inode.c @@ -117,8 +117,8 @@ static void destroy_inodecache(void) static int isofs_remount(struct super_block *sb, int *flags, char *data) { - /* we probably want a lot more here */ - *flags |= MS_RDONLY; + if (!(*flags & MS_RDONLY)) + return -EROFS; return 0; } @@ -763,15 +763,6 @@ root_found: */ s->s_maxbytes = 0x80000000000LL; - /* - * The CDROM is read-only, has no nodes (devices) on it, and since - * all of the files appear to be owned by root, we really do not want - * to allow suid. (suid or devices will not show up unless we have - * Rock Ridge extensions) - */ - - s->s_flags |= MS_RDONLY /* | MS_NODEV | MS_NOSUID */; - /* Set this for reference. Its not currently used except on write which we don't have .. */ @@ -1530,6 +1521,9 @@ struct inode *isofs_iget(struct super_block *sb, static struct dentry *isofs_mount(struct file_system_type *fs_type, int flags, const char *dev_name, void *data) { + /* We don't support read-write mounts */ + if (!(flags & MS_RDONLY)) + return ERR_PTR(-EACCES); return mount_bdev(fs_type, flags, dev_name, data, isofs_fill_super); } diff --git a/fs/jbd/commit.c b/fs/jbd/commit.c index 11bb11f48b3a..bb217dcb41af 100644 --- a/fs/jbd/commit.c +++ b/fs/jbd/commit.c @@ -340,13 +340,13 @@ void journal_commit_transaction(journal_t *journal) J_ASSERT(journal->j_committing_transaction == NULL); commit_transaction = journal->j_running_transaction; - J_ASSERT(commit_transaction->t_state == T_RUNNING); trace_jbd_start_commit(journal, commit_transaction); jbd_debug(1, "JBD: starting commit of transaction %d\n", commit_transaction->t_tid); spin_lock(&journal->j_state_lock); + J_ASSERT(commit_transaction->t_state == T_RUNNING); commit_transaction->t_state = T_LOCKED; trace_jbd_commit_locking(journal, commit_transaction); diff --git a/fs/jbd/journal.c b/fs/jbd/journal.c index 6510d6355729..2d04f9afafd7 100644 --- a/fs/jbd/journal.c +++ b/fs/jbd/journal.c @@ -90,6 +90,24 @@ static int journal_convert_superblock_v1(journal_t *, journal_superblock_t *); static void __journal_abort_soft (journal_t *journal, int errno); static const char *journal_dev_name(journal_t *journal, char *buffer); +#ifdef CONFIG_JBD_DEBUG +void __jbd_debug(int level, const char *file, const char *func, + unsigned int line, const char *fmt, ...) +{ + struct va_format vaf; + va_list args; + + if (level > journal_enable_debug) + return; + va_start(args, fmt); + vaf.fmt = fmt; + vaf.va = &args; + printk(KERN_DEBUG "%s: (%s, %u): %pV\n", file, func, line, &vaf); + va_end(args); +} +EXPORT_SYMBOL(__jbd_debug); +#endif + /* * Helper function used to manage commit timeouts */ diff --git a/fs/namespace.c b/fs/namespace.c index 5997887cc64a..fc2b5226278d 100644 --- a/fs/namespace.c +++ b/fs/namespace.c @@ -830,6 +830,10 @@ static struct mount *clone_mnt(struct mount *old, struct dentry *root, if ((flag & CL_UNPRIVILEGED) && (mnt->mnt.mnt_flags & MNT_READONLY)) mnt->mnt.mnt_flags |= MNT_LOCK_READONLY; + /* Don't allow unprivileged users to reveal what is under a mount */ + if ((flag & CL_UNPRIVILEGED) && list_empty(&old->mnt_expire)) + mnt->mnt.mnt_flags |= MNT_LOCKED; + atomic_inc(&sb->s_active); mnt->mnt.mnt_sb = sb; mnt->mnt.mnt_root = dget(root); @@ -1326,6 +1330,8 @@ SYSCALL_DEFINE2(umount, char __user *, name, int, flags) goto dput_and_out; if (!check_mnt(mnt)) goto dput_and_out; + if (mnt->mnt.mnt_flags & MNT_LOCKED) + goto dput_and_out; retval = do_umount(mnt, flags); dput_and_out: @@ -1348,14 +1354,11 @@ SYSCALL_DEFINE1(oldumount, char __user *, name) #endif -static bool mnt_ns_loop(struct path *path) +static bool is_mnt_ns_file(struct dentry *dentry) { - /* Could bind mounting the mount namespace inode cause a - * mount namespace loop? - */ - struct inode *inode = path->dentry->d_inode; + /* Is this a proxy for a mount namespace? */ + struct inode *inode = dentry->d_inode; struct proc_ns *ei; - struct mnt_namespace *mnt_ns; if (!proc_ns_inode(inode)) return false; @@ -1364,7 +1367,19 @@ static bool mnt_ns_loop(struct path *path) if (ei->ns_ops != &mntns_operations) return false; - mnt_ns = ei->ns; + return true; +} + +static bool mnt_ns_loop(struct dentry *dentry) +{ + /* Could bind mounting the mount namespace inode cause a + * mount namespace loop? + */ + struct mnt_namespace *mnt_ns; + if (!is_mnt_ns_file(dentry)) + return false; + + mnt_ns = get_proc_ns(dentry->d_inode)->ns; return current->nsproxy->mnt_ns->seq >= mnt_ns->seq; } @@ -1373,13 +1388,17 @@ struct mount *copy_tree(struct mount *mnt, struct dentry *dentry, { struct mount *res, *p, *q, *r, *parent; - if (!(flag & CL_COPY_ALL) && IS_MNT_UNBINDABLE(mnt)) + if (!(flag & CL_COPY_UNBINDABLE) && IS_MNT_UNBINDABLE(mnt)) + return ERR_PTR(-EINVAL); + + if (!(flag & CL_COPY_MNT_NS_FILE) && is_mnt_ns_file(dentry)) return ERR_PTR(-EINVAL); res = q = clone_mnt(mnt, dentry, flag); if (IS_ERR(q)) return q; + q->mnt.mnt_flags &= ~MNT_LOCKED; q->mnt_mountpoint = mnt->mnt_mountpoint; p = mnt; @@ -1389,7 +1408,13 @@ struct mount *copy_tree(struct mount *mnt, struct dentry *dentry, continue; for (s = r; s; s = next_mnt(s, r)) { - if (!(flag & CL_COPY_ALL) && IS_MNT_UNBINDABLE(s)) { + if (!(flag & CL_COPY_UNBINDABLE) && + IS_MNT_UNBINDABLE(s)) { + s = skip_mnt_tree(s); + continue; + } + if (!(flag & CL_COPY_MNT_NS_FILE) && + is_mnt_ns_file(s->mnt.mnt_root)) { s = skip_mnt_tree(s); continue; } @@ -1695,6 +1720,19 @@ static int do_change_type(struct path *path, int flag) return err; } +static bool has_locked_children(struct mount *mnt, struct dentry *dentry) +{ + struct mount *child; + list_for_each_entry(child, &mnt->mnt_mounts, mnt_child) { + if (!is_subdir(child->mnt_mountpoint, dentry)) + continue; + + if (child->mnt.mnt_flags & MNT_LOCKED) + return true; + } + return false; +} + /* * do loopback mount. */ @@ -1712,7 +1750,7 @@ static int do_loopback(struct path *path, const char *old_name, return err; err = -EINVAL; - if (mnt_ns_loop(&old_path)) + if (mnt_ns_loop(old_path.dentry)) goto out; mp = lock_mount(path); @@ -1730,8 +1768,11 @@ static int do_loopback(struct path *path, const char *old_name, if (!check_mnt(parent) || !check_mnt(old)) goto out2; + if (!recurse && has_locked_children(old, old_path.dentry)) + goto out2; + if (recurse) - mnt = copy_tree(old, old_path.dentry, 0); + mnt = copy_tree(old, old_path.dentry, CL_COPY_MNT_NS_FILE); else mnt = clone_mnt(old, old_path.dentry, 0); @@ -1740,6 +1781,8 @@ static int do_loopback(struct path *path, const char *old_name, goto out2; } + mnt->mnt.mnt_flags &= ~MNT_LOCKED; + err = graft_tree(mnt, parent, mp); if (err) { br_write_lock(&vfsmount_lock); @@ -1852,6 +1895,9 @@ static int do_move_mount(struct path *path, const char *old_name) if (!check_mnt(p) || !check_mnt(old)) goto out1; + if (old->mnt.mnt_flags & MNT_LOCKED) + goto out1; + err = -EINVAL; if (old_path.dentry != old_path.mnt->mnt_root) goto out1; @@ -2388,7 +2434,7 @@ static struct mnt_namespace *dup_mnt_ns(struct mnt_namespace *mnt_ns, namespace_lock(); /* First pass: copy the tree topology */ - copy_flags = CL_COPY_ALL | CL_EXPIRE; + copy_flags = CL_COPY_UNBINDABLE | CL_EXPIRE; if (user_ns != mnt_ns->user_ns) copy_flags |= CL_SHARED_TO_SLAVE | CL_UNPRIVILEGED; new = copy_tree(old, old->mnt.mnt_root, copy_flags); @@ -2423,6 +2469,10 @@ static struct mnt_namespace *dup_mnt_ns(struct mnt_namespace *mnt_ns, } p = next_mnt(p, old); q = next_mnt(q, new); + if (!q) + break; + while (p->mnt.mnt_root != q->mnt.mnt_root) + p = next_mnt(p, old); } namespace_unlock(); @@ -2629,6 +2679,8 @@ SYSCALL_DEFINE2(pivot_root, const char __user *, new_root, goto out4; if (!check_mnt(root_mnt) || !check_mnt(new_mnt)) goto out4; + if (new_mnt->mnt.mnt_flags & MNT_LOCKED) + goto out4; error = -ENOENT; if (d_unlinked(new.dentry)) goto out4; @@ -2652,6 +2704,10 @@ SYSCALL_DEFINE2(pivot_root, const char __user *, new_root, br_write_lock(&vfsmount_lock); detach_mnt(new_mnt, &parent_path); detach_mnt(root_mnt, &root_parent); + if (root_mnt->mnt.mnt_flags & MNT_LOCKED) { + new_mnt->mnt.mnt_flags |= MNT_LOCKED; + root_mnt->mnt.mnt_flags &= ~MNT_LOCKED; + } /* mount old root on put_old */ attach_mnt(root_mnt, old_mnt, old_mp); /* mount new_root on / */ @@ -2810,25 +2866,38 @@ bool current_chrooted(void) return chrooted; } -void update_mnt_policy(struct user_namespace *userns) +bool fs_fully_visible(struct file_system_type *type) { struct mnt_namespace *ns = current->nsproxy->mnt_ns; struct mount *mnt; + bool visible = false; - down_read(&namespace_sem); + if (unlikely(!ns)) + return false; + + namespace_lock(); list_for_each_entry(mnt, &ns->list, mnt_list) { - switch (mnt->mnt.mnt_sb->s_magic) { - case SYSFS_MAGIC: - userns->may_mount_sysfs = true; - break; - case PROC_SUPER_MAGIC: - userns->may_mount_proc = true; - break; + struct mount *child; + if (mnt->mnt.mnt_sb->s_type != type) + continue; + + /* This mount is not fully visible if there are any child mounts + * that cover anything except for empty directories. + */ + list_for_each_entry(child, &mnt->mnt_mounts, mnt_child) { + struct inode *inode = child->mnt_mountpoint->d_inode; + if (!S_ISDIR(inode->i_mode)) + goto next; + if (inode->i_nlink != 2) + goto next; } - if (userns->may_mount_sysfs && userns->may_mount_proc) - break; + visible = true; + goto found; + next: ; } - up_read(&namespace_sem); +found: + namespace_unlock(); + return visible; } static void *mntns_get(struct task_struct *task) @@ -2859,8 +2928,8 @@ static int mntns_install(struct nsproxy *nsproxy, void *ns) struct path root; if (!ns_capable(mnt_ns->user_ns, CAP_SYS_ADMIN) || - !nsown_capable(CAP_SYS_CHROOT) || - !nsown_capable(CAP_SYS_ADMIN)) + !ns_capable(current_user_ns(), CAP_SYS_CHROOT) || + !ns_capable(current_user_ns(), CAP_SYS_ADMIN)) return -EPERM; if (fs->users != 1) diff --git a/fs/ocfs2/xattr.h b/fs/ocfs2/xattr.h index e5c7f15465b4..19f134e896a9 100644 --- a/fs/ocfs2/xattr.h +++ b/fs/ocfs2/xattr.h @@ -32,7 +32,7 @@ enum ocfs2_xattr_type { struct ocfs2_security_xattr_info { int enable; - char *name; + const char *name; void *value; size_t value_len; }; diff --git a/fs/open.c b/fs/open.c index 8070825b285b..2a731b0d08bc 100644 --- a/fs/open.c +++ b/fs/open.c @@ -443,7 +443,7 @@ retry: goto dput_and_out; error = -EPERM; - if (!nsown_capable(CAP_SYS_CHROOT)) + if (!ns_capable(current_user_ns(), CAP_SYS_CHROOT)) goto dput_and_out; error = security_path_chroot(&path); if (error) diff --git a/fs/pnode.h b/fs/pnode.h index b091445c1c4a..59e7eda1851e 100644 --- a/fs/pnode.h +++ b/fs/pnode.h @@ -19,11 +19,14 @@ #define CL_EXPIRE 0x01 #define CL_SLAVE 0x02 -#define CL_COPY_ALL 0x04 +#define CL_COPY_UNBINDABLE 0x04 #define CL_MAKE_SHARED 0x08 #define CL_PRIVATE 0x10 #define CL_SHARED_TO_SLAVE 0x20 #define CL_UNPRIVILEGED 0x40 +#define CL_COPY_MNT_NS_FILE 0x80 + +#define CL_COPY_ALL (CL_COPY_UNBINDABLE | CL_COPY_MNT_NS_FILE) static inline void set_mnt_shared(struct mount *mnt) { diff --git a/fs/proc/inode.c b/fs/proc/inode.c index 073aea60cf8f..9f8ef9b7674d 100644 --- a/fs/proc/inode.c +++ b/fs/proc/inode.c @@ -285,6 +285,20 @@ static int proc_reg_mmap(struct file *file, struct vm_area_struct *vma) return rv; } +static unsigned long proc_reg_get_unmapped_area(struct file *file, unsigned long orig_addr, unsigned long len, unsigned long pgoff, unsigned long flags) +{ + struct proc_dir_entry *pde = PDE(file_inode(file)); + int rv = -EIO; + unsigned long (*get_unmapped_area)(struct file *, unsigned long, unsigned long, unsigned long, unsigned long); + if (use_pde(pde)) { + get_unmapped_area = pde->proc_fops->get_unmapped_area; + if (get_unmapped_area) + rv = get_unmapped_area(file, orig_addr, len, pgoff, flags); + unuse_pde(pde); + } + return rv; +} + static int proc_reg_open(struct inode *inode, struct file *file) { struct proc_dir_entry *pde = PDE(inode); @@ -356,6 +370,7 @@ static const struct file_operations proc_reg_file_ops = { .compat_ioctl = proc_reg_compat_ioctl, #endif .mmap = proc_reg_mmap, + .get_unmapped_area = proc_reg_get_unmapped_area, .open = proc_reg_open, .release = proc_reg_release, }; @@ -368,6 +383,7 @@ static const struct file_operations proc_reg_file_ops_no_compat = { .poll = proc_reg_poll, .unlocked_ioctl = proc_reg_unlocked_ioctl, .mmap = proc_reg_mmap, + .get_unmapped_area = proc_reg_get_unmapped_area, .open = proc_reg_open, .release = proc_reg_release, }; diff --git a/fs/proc/root.c b/fs/proc/root.c index e0a790da726d..87dbcbef7fe4 100644 --- a/fs/proc/root.c +++ b/fs/proc/root.c @@ -110,7 +110,11 @@ static struct dentry *proc_mount(struct file_system_type *fs_type, ns = task_active_pid_ns(current); options = data; - if (!current_user_ns()->may_mount_proc) + if (!capable(CAP_SYS_ADMIN) && !fs_fully_visible(fs_type)) + return ERR_PTR(-EPERM); + + /* Does the mounter have privilege over the pid namespace? */ + if (!ns_capable(ns->user_ns, CAP_SYS_ADMIN)) return ERR_PTR(-EPERM); } diff --git a/fs/reiserfs/bitmap.c b/fs/reiserfs/bitmap.c index a98b7740a0fc..dc9a6829f7c6 100644 --- a/fs/reiserfs/bitmap.c +++ b/fs/reiserfs/bitmap.c @@ -423,8 +423,11 @@ static void _reiserfs_free_block(struct reiserfs_transaction_handle *th, set_sb_free_blocks(rs, sb_free_blocks(rs) + 1); journal_mark_dirty(th, s, sbh); - if (for_unformatted) + if (for_unformatted) { + int depth = reiserfs_write_unlock_nested(s); dquot_free_block_nodirty(inode, 1); + reiserfs_write_lock_nested(s, depth); + } } void reiserfs_free_block(struct reiserfs_transaction_handle *th, @@ -1128,6 +1131,7 @@ static inline int blocknrs_and_prealloc_arrays_from_search_start b_blocknr_t finish = SB_BLOCK_COUNT(s) - 1; int passno = 0; int nr_allocated = 0; + int depth; determine_prealloc_size(hint); if (!hint->formatted_node) { @@ -1137,10 +1141,13 @@ static inline int blocknrs_and_prealloc_arrays_from_search_start "reiserquota: allocating %d blocks id=%u", amount_needed, hint->inode->i_uid); #endif + depth = reiserfs_write_unlock_nested(s); quota_ret = dquot_alloc_block_nodirty(hint->inode, amount_needed); - if (quota_ret) /* Quota exceeded? */ + if (quota_ret) { /* Quota exceeded? */ + reiserfs_write_lock_nested(s, depth); return QUOTA_EXCEEDED; + } if (hint->preallocate && hint->prealloc_size) { #ifdef REISERQUOTA_DEBUG reiserfs_debug(s, REISERFS_DEBUG_CODE, @@ -1153,6 +1160,7 @@ static inline int blocknrs_and_prealloc_arrays_from_search_start hint->preallocate = hint->prealloc_size = 0; } /* for unformatted nodes, force large allocations */ + reiserfs_write_lock_nested(s, depth); } do { @@ -1181,9 +1189,11 @@ static inline int blocknrs_and_prealloc_arrays_from_search_start hint->inode->i_uid); #endif /* Free not allocated blocks */ + depth = reiserfs_write_unlock_nested(s); dquot_free_block_nodirty(hint->inode, amount_needed + hint->prealloc_size - nr_allocated); + reiserfs_write_lock_nested(s, depth); } while (nr_allocated--) reiserfs_free_block(hint->th, hint->inode, @@ -1214,10 +1224,13 @@ static inline int blocknrs_and_prealloc_arrays_from_search_start REISERFS_I(hint->inode)->i_prealloc_count, hint->inode->i_uid); #endif + + depth = reiserfs_write_unlock_nested(s); dquot_free_block_nodirty(hint->inode, amount_needed + hint->prealloc_size - nr_allocated - REISERFS_I(hint->inode)-> i_prealloc_count); + reiserfs_write_lock_nested(s, depth); } return CARRY_ON; @@ -1340,10 +1353,11 @@ struct buffer_head *reiserfs_read_bitmap_block(struct super_block *sb, "reading failed", __func__, block); else { if (buffer_locked(bh)) { + int depth; PROC_INFO_INC(sb, scan_bitmap.wait); - reiserfs_write_unlock(sb); + depth = reiserfs_write_unlock_nested(sb); __wait_on_buffer(bh); - reiserfs_write_lock(sb); + reiserfs_write_lock_nested(sb, depth); } BUG_ON(!buffer_uptodate(bh)); BUG_ON(atomic_read(&bh->b_count) == 0); diff --git a/fs/reiserfs/dir.c b/fs/reiserfs/dir.c index 03e4ca5624d6..1fd2051109a3 100644 --- a/fs/reiserfs/dir.c +++ b/fs/reiserfs/dir.c @@ -71,6 +71,7 @@ int reiserfs_readdir_inode(struct inode *inode, struct dir_context *ctx) char small_buf[32]; /* avoid kmalloc if we can */ struct reiserfs_dir_entry de; int ret = 0; + int depth; reiserfs_write_lock(inode->i_sb); @@ -181,17 +182,17 @@ int reiserfs_readdir_inode(struct inode *inode, struct dir_context *ctx) * Since filldir might sleep, we can release * the write lock here for other waiters */ - reiserfs_write_unlock(inode->i_sb); + depth = reiserfs_write_unlock_nested(inode->i_sb); if (!dir_emit (ctx, local_buf, d_reclen, d_ino, DT_UNKNOWN)) { - reiserfs_write_lock(inode->i_sb); + reiserfs_write_lock_nested(inode->i_sb, depth); if (local_buf != small_buf) { kfree(local_buf); } goto end; } - reiserfs_write_lock(inode->i_sb); + reiserfs_write_lock_nested(inode->i_sb, depth); if (local_buf != small_buf) { kfree(local_buf); } diff --git a/fs/reiserfs/fix_node.c b/fs/reiserfs/fix_node.c index 430e0658704c..dc4d41530316 100644 --- a/fs/reiserfs/fix_node.c +++ b/fs/reiserfs/fix_node.c @@ -1022,9 +1022,9 @@ static int get_far_parent(struct tree_balance *tb, if (buffer_locked(*pcom_father)) { /* Release the write lock while the buffer is busy */ - reiserfs_write_unlock(tb->tb_sb); + int depth = reiserfs_write_unlock_nested(tb->tb_sb); __wait_on_buffer(*pcom_father); - reiserfs_write_lock(tb->tb_sb); + reiserfs_write_lock_nested(tb->tb_sb, depth); if (FILESYSTEM_CHANGED_TB(tb)) { brelse(*pcom_father); return REPEAT_SEARCH; @@ -1929,9 +1929,9 @@ static int get_direct_parent(struct tree_balance *tb, int h) return REPEAT_SEARCH; if (buffer_locked(bh)) { - reiserfs_write_unlock(tb->tb_sb); + int depth = reiserfs_write_unlock_nested(tb->tb_sb); __wait_on_buffer(bh); - reiserfs_write_lock(tb->tb_sb); + reiserfs_write_lock_nested(tb->tb_sb, depth); if (FILESYSTEM_CHANGED_TB(tb)) return REPEAT_SEARCH; } @@ -1952,6 +1952,7 @@ static int get_neighbors(struct tree_balance *tb, int h) unsigned long son_number; struct super_block *sb = tb->tb_sb; struct buffer_head *bh; + int depth; PROC_INFO_INC(sb, get_neighbors[h]); @@ -1969,9 +1970,9 @@ static int get_neighbors(struct tree_balance *tb, int h) tb->FL[h]) ? tb->lkey[h] : B_NR_ITEMS(tb-> FL[h]); son_number = B_N_CHILD_NUM(tb->FL[h], child_position); - reiserfs_write_unlock(sb); + depth = reiserfs_write_unlock_nested(tb->tb_sb); bh = sb_bread(sb, son_number); - reiserfs_write_lock(sb); + reiserfs_write_lock_nested(tb->tb_sb, depth); if (!bh) return IO_ERROR; if (FILESYSTEM_CHANGED_TB(tb)) { @@ -2009,9 +2010,9 @@ static int get_neighbors(struct tree_balance *tb, int h) child_position = (bh == tb->FR[h]) ? tb->rkey[h] + 1 : 0; son_number = B_N_CHILD_NUM(tb->FR[h], child_position); - reiserfs_write_unlock(sb); + depth = reiserfs_write_unlock_nested(tb->tb_sb); bh = sb_bread(sb, son_number); - reiserfs_write_lock(sb); + reiserfs_write_lock_nested(tb->tb_sb, depth); if (!bh) return IO_ERROR; if (FILESYSTEM_CHANGED_TB(tb)) { @@ -2272,6 +2273,7 @@ static int wait_tb_buffers_until_unlocked(struct tree_balance *tb) } if (locked) { + int depth; #ifdef CONFIG_REISERFS_CHECK repeat_counter++; if ((repeat_counter % 10000) == 0) { @@ -2286,9 +2288,9 @@ static int wait_tb_buffers_until_unlocked(struct tree_balance *tb) REPEAT_SEARCH : CARRY_ON; } #endif - reiserfs_write_unlock(tb->tb_sb); + depth = reiserfs_write_unlock_nested(tb->tb_sb); __wait_on_buffer(locked); - reiserfs_write_lock(tb->tb_sb); + reiserfs_write_lock_nested(tb->tb_sb, depth); if (FILESYSTEM_CHANGED_TB(tb)) return REPEAT_SEARCH; } @@ -2359,9 +2361,9 @@ int fix_nodes(int op_mode, struct tree_balance *tb, /* if it possible in indirect_to_direct conversion */ if (buffer_locked(tbS0)) { - reiserfs_write_unlock(tb->tb_sb); + int depth = reiserfs_write_unlock_nested(tb->tb_sb); __wait_on_buffer(tbS0); - reiserfs_write_lock(tb->tb_sb); + reiserfs_write_lock_nested(tb->tb_sb, depth); if (FILESYSTEM_CHANGED_TB(tb)) return REPEAT_SEARCH; } diff --git a/fs/reiserfs/inode.c b/fs/reiserfs/inode.c index 0048cc16a6a8..ad62bdbb451e 100644 --- a/fs/reiserfs/inode.c +++ b/fs/reiserfs/inode.c @@ -30,7 +30,6 @@ void reiserfs_evict_inode(struct inode *inode) JOURNAL_PER_BALANCE_CNT * 2 + 2 * REISERFS_QUOTA_INIT_BLOCKS(inode->i_sb); struct reiserfs_transaction_handle th; - int depth; int err; if (!inode->i_nlink && !is_bad_inode(inode)) @@ -40,12 +39,13 @@ void reiserfs_evict_inode(struct inode *inode) if (inode->i_nlink) goto no_delete; - depth = reiserfs_write_lock_once(inode->i_sb); - /* The = 0 happens when we abort creating a new inode for some reason like lack of space.. */ if (!(inode->i_state & I_NEW) && INODE_PKEY(inode)->k_objectid != 0) { /* also handles bad_inode case */ + reiserfs_delete_xattrs(inode); + reiserfs_write_lock(inode->i_sb); + if (journal_begin(&th, inode->i_sb, jbegin_count)) goto out; reiserfs_update_inode_transaction(inode); @@ -57,8 +57,11 @@ void reiserfs_evict_inode(struct inode *inode) /* Do quota update inside a transaction for journaled quotas. We must do that * after delete_object so that quota updates go into the same transaction as * stat data deletion */ - if (!err) + if (!err) { + int depth = reiserfs_write_unlock_nested(inode->i_sb); dquot_free_inode(inode); + reiserfs_write_lock_nested(inode->i_sb, depth); + } if (journal_end(&th, inode->i_sb, jbegin_count)) goto out; @@ -72,12 +75,12 @@ void reiserfs_evict_inode(struct inode *inode) /* all items of file are deleted, so we can remove "save" link */ remove_save_link(inode, 0 /* not truncate */ ); /* we can't do anything * about an error here */ +out: + reiserfs_write_unlock(inode->i_sb); } else { /* no object items are in the tree */ ; } - out: - reiserfs_write_unlock_once(inode->i_sb, depth); clear_inode(inode); /* note this must go after the journal_end to prevent deadlock */ dquot_drop(inode); inode->i_blocks = 0; @@ -610,7 +613,6 @@ int reiserfs_get_block(struct inode *inode, sector_t block, __le32 *item; int done; int fs_gen; - int lock_depth; struct reiserfs_transaction_handle *th = NULL; /* space reserved in transaction batch: . 3 balancings in direct->indirect conversion @@ -626,11 +628,11 @@ int reiserfs_get_block(struct inode *inode, sector_t block, loff_t new_offset = (((loff_t) block) << inode->i_sb->s_blocksize_bits) + 1; - lock_depth = reiserfs_write_lock_once(inode->i_sb); + reiserfs_write_lock(inode->i_sb); version = get_inode_item_key_version(inode); if (!file_capable(inode, block)) { - reiserfs_write_unlock_once(inode->i_sb, lock_depth); + reiserfs_write_unlock(inode->i_sb); return -EFBIG; } @@ -642,7 +644,7 @@ int reiserfs_get_block(struct inode *inode, sector_t block, /* find number of block-th logical block of the file */ ret = _get_block_create_0(inode, block, bh_result, create | GET_BLOCK_READ_DIRECT); - reiserfs_write_unlock_once(inode->i_sb, lock_depth); + reiserfs_write_unlock(inode->i_sb); return ret; } /* @@ -760,7 +762,7 @@ int reiserfs_get_block(struct inode *inode, sector_t block, if (!dangle && th) retval = reiserfs_end_persistent_transaction(th); - reiserfs_write_unlock_once(inode->i_sb, lock_depth); + reiserfs_write_unlock(inode->i_sb); /* the item was found, so new blocks were not added to the file ** there is no need to make sure the inode is updated with this @@ -1011,11 +1013,7 @@ int reiserfs_get_block(struct inode *inode, sector_t block, * long time. reschedule if needed and also release the write * lock for others. */ - if (need_resched()) { - reiserfs_write_unlock_once(inode->i_sb, lock_depth); - schedule(); - lock_depth = reiserfs_write_lock_once(inode->i_sb); - } + reiserfs_cond_resched(inode->i_sb); retval = search_for_position_by_key(inode->i_sb, &key, &path); if (retval == IO_ERROR) { @@ -1050,7 +1048,7 @@ int reiserfs_get_block(struct inode *inode, sector_t block, retval = err; } - reiserfs_write_unlock_once(inode->i_sb, lock_depth); + reiserfs_write_unlock(inode->i_sb); reiserfs_check_path(&path); return retval; } @@ -1509,14 +1507,15 @@ struct inode *reiserfs_iget(struct super_block *s, const struct cpu_key *key) { struct inode *inode; struct reiserfs_iget_args args; + int depth; args.objectid = key->on_disk_key.k_objectid; args.dirid = key->on_disk_key.k_dir_id; - reiserfs_write_unlock(s); + depth = reiserfs_write_unlock_nested(s); inode = iget5_locked(s, key->on_disk_key.k_objectid, reiserfs_find_actor, reiserfs_init_locked_inode, (void *)(&args)); - reiserfs_write_lock(s); + reiserfs_write_lock_nested(s, depth); if (!inode) return ERR_PTR(-ENOMEM); @@ -1772,7 +1771,7 @@ int reiserfs_new_inode(struct reiserfs_transaction_handle *th, struct inode *inode, struct reiserfs_security_handle *security) { - struct super_block *sb; + struct super_block *sb = dir->i_sb; struct reiserfs_iget_args args; INITIALIZE_PATH(path_to_key); struct cpu_key key; @@ -1780,12 +1779,13 @@ int reiserfs_new_inode(struct reiserfs_transaction_handle *th, struct stat_data sd; int retval; int err; + int depth; BUG_ON(!th->t_trans_id); - reiserfs_write_unlock(inode->i_sb); + depth = reiserfs_write_unlock_nested(sb); err = dquot_alloc_inode(inode); - reiserfs_write_lock(inode->i_sb); + reiserfs_write_lock_nested(sb, depth); if (err) goto out_end_trans; if (!dir->i_nlink) { @@ -1793,8 +1793,6 @@ int reiserfs_new_inode(struct reiserfs_transaction_handle *th, goto out_bad_inode; } - sb = dir->i_sb; - /* item head of new item */ ih.ih_key.k_dir_id = reiserfs_choose_packing(dir); ih.ih_key.k_objectid = cpu_to_le32(reiserfs_get_unused_objectid(th)); @@ -1812,10 +1810,10 @@ int reiserfs_new_inode(struct reiserfs_transaction_handle *th, memcpy(INODE_PKEY(inode), &(ih.ih_key), KEY_SIZE); args.dirid = le32_to_cpu(ih.ih_key.k_dir_id); - reiserfs_write_unlock(inode->i_sb); + depth = reiserfs_write_unlock_nested(inode->i_sb); err = insert_inode_locked4(inode, args.objectid, reiserfs_find_actor, &args); - reiserfs_write_lock(inode->i_sb); + reiserfs_write_lock_nested(inode->i_sb, depth); if (err) { err = -EINVAL; goto out_bad_inode; @@ -1941,7 +1939,9 @@ int reiserfs_new_inode(struct reiserfs_transaction_handle *th, } if (reiserfs_posixacl(inode->i_sb)) { + reiserfs_write_unlock(inode->i_sb); retval = reiserfs_inherit_default_acl(th, dir, dentry, inode); + reiserfs_write_lock(inode->i_sb); if (retval) { err = retval; reiserfs_check_path(&path_to_key); @@ -1956,7 +1956,9 @@ int reiserfs_new_inode(struct reiserfs_transaction_handle *th, inode->i_flags |= S_PRIVATE; if (security->name) { + reiserfs_write_unlock(inode->i_sb); retval = reiserfs_security_write(th, inode, security); + reiserfs_write_lock(inode->i_sb); if (retval) { err = retval; reiserfs_check_path(&path_to_key); @@ -1982,14 +1984,16 @@ int reiserfs_new_inode(struct reiserfs_transaction_handle *th, INODE_PKEY(inode)->k_objectid = 0; /* Quota change must be inside a transaction for journaling */ + depth = reiserfs_write_unlock_nested(inode->i_sb); dquot_free_inode(inode); + reiserfs_write_lock_nested(inode->i_sb, depth); out_end_trans: journal_end(th, th->t_super, th->t_blocks_allocated); - reiserfs_write_unlock(inode->i_sb); /* Drop can be outside and it needs more credits so it's better to have it outside */ + depth = reiserfs_write_unlock_nested(inode->i_sb); dquot_drop(inode); - reiserfs_write_lock(inode->i_sb); + reiserfs_write_lock_nested(inode->i_sb, depth); inode->i_flags |= S_NOQUOTA; make_bad_inode(inode); @@ -2103,9 +2107,8 @@ int reiserfs_truncate_file(struct inode *inode, int update_timestamps) int error; struct buffer_head *bh = NULL; int err2; - int lock_depth; - lock_depth = reiserfs_write_lock_once(inode->i_sb); + reiserfs_write_lock(inode->i_sb); if (inode->i_size > 0) { error = grab_tail_page(inode, &page, &bh); @@ -2174,7 +2177,7 @@ int reiserfs_truncate_file(struct inode *inode, int update_timestamps) page_cache_release(page); } - reiserfs_write_unlock_once(inode->i_sb, lock_depth); + reiserfs_write_unlock(inode->i_sb); return 0; out: @@ -2183,7 +2186,7 @@ int reiserfs_truncate_file(struct inode *inode, int update_timestamps) page_cache_release(page); } - reiserfs_write_unlock_once(inode->i_sb, lock_depth); + reiserfs_write_unlock(inode->i_sb); return error; } @@ -2648,10 +2651,11 @@ int __reiserfs_write_begin(struct page *page, unsigned from, unsigned len) struct inode *inode = page->mapping->host; int ret; int old_ref = 0; + int depth; - reiserfs_write_unlock(inode->i_sb); + depth = reiserfs_write_unlock_nested(inode->i_sb); reiserfs_wait_on_write_block(inode->i_sb); - reiserfs_write_lock(inode->i_sb); + reiserfs_write_lock_nested(inode->i_sb, depth); fix_tail_page_for_writing(page); if (reiserfs_transaction_running(inode->i_sb)) { @@ -2708,7 +2712,6 @@ static int reiserfs_write_end(struct file *file, struct address_space *mapping, int update_sd = 0; struct reiserfs_transaction_handle *th; unsigned start; - int lock_depth = 0; bool locked = false; if ((unsigned long)fsdata & AOP_FLAG_CONT_EXPAND) @@ -2737,7 +2740,7 @@ static int reiserfs_write_end(struct file *file, struct address_space *mapping, */ if (pos + copied > inode->i_size) { struct reiserfs_transaction_handle myth; - lock_depth = reiserfs_write_lock_once(inode->i_sb); + reiserfs_write_lock(inode->i_sb); locked = true; /* If the file have grown beyond the border where it can have a tail, unmark it as needing a tail @@ -2768,7 +2771,7 @@ static int reiserfs_write_end(struct file *file, struct address_space *mapping, } if (th) { if (!locked) { - lock_depth = reiserfs_write_lock_once(inode->i_sb); + reiserfs_write_lock(inode->i_sb); locked = true; } if (!update_sd) @@ -2780,7 +2783,7 @@ static int reiserfs_write_end(struct file *file, struct address_space *mapping, out: if (locked) - reiserfs_write_unlock_once(inode->i_sb, lock_depth); + reiserfs_write_unlock(inode->i_sb); unlock_page(page); page_cache_release(page); @@ -2790,7 +2793,7 @@ static int reiserfs_write_end(struct file *file, struct address_space *mapping, return ret == 0 ? copied : ret; journal_error: - reiserfs_write_unlock_once(inode->i_sb, lock_depth); + reiserfs_write_unlock(inode->i_sb); locked = false; if (th) { if (!update_sd) @@ -2808,10 +2811,11 @@ int reiserfs_commit_write(struct file *f, struct page *page, int ret = 0; int update_sd = 0; struct reiserfs_transaction_handle *th = NULL; + int depth; - reiserfs_write_unlock(inode->i_sb); + depth = reiserfs_write_unlock_nested(inode->i_sb); reiserfs_wait_on_write_block(inode->i_sb); - reiserfs_write_lock(inode->i_sb); + reiserfs_write_lock_nested(inode->i_sb, depth); if (reiserfs_transaction_running(inode->i_sb)) { th = current->journal_info; @@ -3110,7 +3114,6 @@ int reiserfs_setattr(struct dentry *dentry, struct iattr *attr) { struct inode *inode = dentry->d_inode; unsigned int ia_valid; - int depth; int error; error = inode_change_ok(inode, attr); @@ -3122,13 +3125,14 @@ int reiserfs_setattr(struct dentry *dentry, struct iattr *attr) if (is_quota_modification(inode, attr)) dquot_initialize(inode); - depth = reiserfs_write_lock_once(inode->i_sb); + reiserfs_write_lock(inode->i_sb); if (attr->ia_valid & ATTR_SIZE) { /* version 2 items will be caught by the s_maxbytes check ** done for us in vmtruncate */ if (get_inode_item_key_version(inode) == KEY_FORMAT_3_5 && attr->ia_size > MAX_NON_LFS) { + reiserfs_write_unlock(inode->i_sb); error = -EFBIG; goto out; } @@ -3150,8 +3154,10 @@ int reiserfs_setattr(struct dentry *dentry, struct iattr *attr) if (err) error = err; } - if (error) + if (error) { + reiserfs_write_unlock(inode->i_sb); goto out; + } /* * file size is changed, ctime and mtime are * to be updated @@ -3159,6 +3165,7 @@ int reiserfs_setattr(struct dentry *dentry, struct iattr *attr) attr->ia_valid |= (ATTR_MTIME | ATTR_CTIME); } } + reiserfs_write_unlock(inode->i_sb); if ((((attr->ia_valid & ATTR_UID) && (from_kuid(&init_user_ns, attr->ia_uid) & ~0xffff)) || ((attr->ia_valid & ATTR_GID) && (from_kgid(&init_user_ns, attr->ia_gid) & ~0xffff))) && @@ -3183,14 +3190,16 @@ int reiserfs_setattr(struct dentry *dentry, struct iattr *attr) return error; /* (user+group)*(old+new) structure - we count quota info and , inode write (sb, inode) */ + reiserfs_write_lock(inode->i_sb); error = journal_begin(&th, inode->i_sb, jbegin_count); + reiserfs_write_unlock(inode->i_sb); if (error) goto out; - reiserfs_write_unlock_once(inode->i_sb, depth); error = dquot_transfer(inode, attr); - depth = reiserfs_write_lock_once(inode->i_sb); + reiserfs_write_lock(inode->i_sb); if (error) { journal_end(&th, inode->i_sb, jbegin_count); + reiserfs_write_unlock(inode->i_sb); goto out; } @@ -3202,17 +3211,11 @@ int reiserfs_setattr(struct dentry *dentry, struct iattr *attr) inode->i_gid = attr->ia_gid; mark_inode_dirty(inode); error = journal_end(&th, inode->i_sb, jbegin_count); + reiserfs_write_unlock(inode->i_sb); if (error) goto out; } - /* - * Relax the lock here, as it might truncate the - * inode pages and wait for inode pages locks. - * To release such page lock, the owner needs the - * reiserfs lock - */ - reiserfs_write_unlock_once(inode->i_sb, depth); if ((attr->ia_valid & ATTR_SIZE) && attr->ia_size != i_size_read(inode)) { error = inode_newsize_ok(inode, attr->ia_size); @@ -3226,16 +3229,13 @@ int reiserfs_setattr(struct dentry *dentry, struct iattr *attr) setattr_copy(inode, attr); mark_inode_dirty(inode); } - depth = reiserfs_write_lock_once(inode->i_sb); if (!error && reiserfs_posixacl(inode->i_sb)) { if (attr->ia_valid & ATTR_MODE) error = reiserfs_acl_chmod(inode); } - out: - reiserfs_write_unlock_once(inode->i_sb, depth); - +out: return error; } diff --git a/fs/reiserfs/ioctl.c b/fs/reiserfs/ioctl.c index 15cb5fe6b425..946ccbf5b5a1 100644 --- a/fs/reiserfs/ioctl.c +++ b/fs/reiserfs/ioctl.c @@ -167,7 +167,6 @@ int reiserfs_commit_write(struct file *f, struct page *page, int reiserfs_unpack(struct inode *inode, struct file *filp) { int retval = 0; - int depth; int index; struct page *page; struct address_space *mapping; @@ -183,11 +182,11 @@ int reiserfs_unpack(struct inode *inode, struct file *filp) return 0; } - depth = reiserfs_write_lock_once(inode->i_sb); - /* we need to make sure nobody is changing the file size beneath us */ reiserfs_mutex_lock_safe(&inode->i_mutex, inode->i_sb); + reiserfs_write_lock(inode->i_sb); + write_from = inode->i_size & (blocksize - 1); /* if we are on a block boundary, we are already unpacked. */ if (write_from == 0) { @@ -221,6 +220,6 @@ int reiserfs_unpack(struct inode *inode, struct file *filp) out: mutex_unlock(&inode->i_mutex); - reiserfs_write_unlock_once(inode->i_sb, depth); + reiserfs_write_unlock(inode->i_sb); return retval; } diff --git a/fs/reiserfs/journal.c b/fs/reiserfs/journal.c index 742fdd4c209a..73feacc49b2e 100644 --- a/fs/reiserfs/journal.c +++ b/fs/reiserfs/journal.c @@ -947,9 +947,11 @@ static int reiserfs_async_progress_wait(struct super_block *s) struct reiserfs_journal *j = SB_JOURNAL(s); if (atomic_read(&j->j_async_throttle)) { - reiserfs_write_unlock(s); + int depth; + + depth = reiserfs_write_unlock_nested(s); congestion_wait(BLK_RW_ASYNC, HZ / 10); - reiserfs_write_lock(s); + reiserfs_write_lock_nested(s, depth); } return 0; @@ -972,6 +974,7 @@ static int flush_commit_list(struct super_block *s, struct reiserfs_journal *journal = SB_JOURNAL(s); int retval = 0; int write_len; + int depth; reiserfs_check_lock_depth(s, "flush_commit_list"); @@ -1018,12 +1021,12 @@ static int flush_commit_list(struct super_block *s, * We might sleep in numerous places inside * write_ordered_buffers. Relax the write lock. */ - reiserfs_write_unlock(s); + depth = reiserfs_write_unlock_nested(s); ret = write_ordered_buffers(&journal->j_dirty_buffers_lock, journal, jl, &jl->j_bh_list); if (ret < 0 && retval == 0) retval = ret; - reiserfs_write_lock(s); + reiserfs_write_lock_nested(s, depth); } BUG_ON(!list_empty(&jl->j_bh_list)); /* @@ -1043,9 +1046,9 @@ static int flush_commit_list(struct super_block *s, tbh = journal_find_get_block(s, bn); if (tbh) { if (buffer_dirty(tbh)) { - reiserfs_write_unlock(s); + depth = reiserfs_write_unlock_nested(s); ll_rw_block(WRITE, 1, &tbh); - reiserfs_write_lock(s); + reiserfs_write_lock_nested(s, depth); } put_bh(tbh) ; } @@ -1057,17 +1060,17 @@ static int flush_commit_list(struct super_block *s, (jl->j_start + i) % SB_ONDISK_JOURNAL_SIZE(s); tbh = journal_find_get_block(s, bn); - reiserfs_write_unlock(s); - wait_on_buffer(tbh); - reiserfs_write_lock(s); + depth = reiserfs_write_unlock_nested(s); + __wait_on_buffer(tbh); + reiserfs_write_lock_nested(s, depth); // since we're using ll_rw_blk above, it might have skipped over // a locked buffer. Double check here // /* redundant, sync_dirty_buffer() checks */ if (buffer_dirty(tbh)) { - reiserfs_write_unlock(s); + depth = reiserfs_write_unlock_nested(s); sync_dirty_buffer(tbh); - reiserfs_write_lock(s); + reiserfs_write_lock_nested(s, depth); } if (unlikely(!buffer_uptodate(tbh))) { #ifdef CONFIG_REISERFS_CHECK @@ -1091,12 +1094,12 @@ static int flush_commit_list(struct super_block *s, if (buffer_dirty(jl->j_commit_bh)) BUG(); mark_buffer_dirty(jl->j_commit_bh) ; - reiserfs_write_unlock(s); + depth = reiserfs_write_unlock_nested(s); if (reiserfs_barrier_flush(s)) __sync_dirty_buffer(jl->j_commit_bh, WRITE_FLUSH_FUA); else sync_dirty_buffer(jl->j_commit_bh); - reiserfs_write_lock(s); + reiserfs_write_lock_nested(s, depth); } /* If there was a write error in the journal - we can't commit this @@ -1228,15 +1231,16 @@ static int _update_journal_header_block(struct super_block *sb, { struct reiserfs_journal_header *jh; struct reiserfs_journal *journal = SB_JOURNAL(sb); + int depth; if (reiserfs_is_journal_aborted(journal)) return -EIO; if (trans_id >= journal->j_last_flush_trans_id) { if (buffer_locked((journal->j_header_bh))) { - reiserfs_write_unlock(sb); - wait_on_buffer((journal->j_header_bh)); - reiserfs_write_lock(sb); + depth = reiserfs_write_unlock_nested(sb); + __wait_on_buffer(journal->j_header_bh); + reiserfs_write_lock_nested(sb, depth); if (unlikely(!buffer_uptodate(journal->j_header_bh))) { #ifdef CONFIG_REISERFS_CHECK reiserfs_warning(sb, "journal-699", @@ -1254,14 +1258,14 @@ static int _update_journal_header_block(struct super_block *sb, jh->j_mount_id = cpu_to_le32(journal->j_mount_id); set_buffer_dirty(journal->j_header_bh); - reiserfs_write_unlock(sb); + depth = reiserfs_write_unlock_nested(sb); if (reiserfs_barrier_flush(sb)) __sync_dirty_buffer(journal->j_header_bh, WRITE_FLUSH_FUA); else sync_dirty_buffer(journal->j_header_bh); - reiserfs_write_lock(sb); + reiserfs_write_lock_nested(sb, depth); if (!buffer_uptodate(journal->j_header_bh)) { reiserfs_warning(sb, "journal-837", "IO error during journal replay"); @@ -1341,6 +1345,7 @@ static int flush_journal_list(struct super_block *s, unsigned long j_len_saved = jl->j_len; struct reiserfs_journal *journal = SB_JOURNAL(s); int err = 0; + int depth; BUG_ON(j_len_saved <= 0); @@ -1495,9 +1500,9 @@ static int flush_journal_list(struct super_block *s, "cn->bh is NULL"); } - reiserfs_write_unlock(s); - wait_on_buffer(cn->bh); - reiserfs_write_lock(s); + depth = reiserfs_write_unlock_nested(s); + __wait_on_buffer(cn->bh); + reiserfs_write_lock_nested(s, depth); if (!cn->bh) { reiserfs_panic(s, "journal-1012", @@ -1974,6 +1979,7 @@ static int journal_compare_desc_commit(struct super_block *sb, /* returns 0 if it did not find a description block ** returns -1 if it found a corrupt commit block ** returns 1 if both desc and commit were valid +** NOTE: only called during fs mount */ static int journal_transaction_is_valid(struct super_block *sb, struct buffer_head *d_bh, @@ -2073,8 +2079,9 @@ static void brelse_array(struct buffer_head **heads, int num) /* ** given the start, and values for the oldest acceptable transactions, -** this either reads in a replays a transaction, or returns because the transaction -** is invalid, or too old. +** this either reads in a replays a transaction, or returns because the +** transaction is invalid, or too old. +** NOTE: only called during fs mount */ static int journal_read_transaction(struct super_block *sb, unsigned long cur_dblock, @@ -2208,10 +2215,7 @@ static int journal_read_transaction(struct super_block *sb, ll_rw_block(READ, get_desc_trans_len(desc), log_blocks); for (i = 0; i < get_desc_trans_len(desc); i++) { - reiserfs_write_unlock(sb); wait_on_buffer(log_blocks[i]); - reiserfs_write_lock(sb); - if (!buffer_uptodate(log_blocks[i])) { reiserfs_warning(sb, "journal-1212", "REPLAY FAILURE fsck required! " @@ -2318,12 +2322,13 @@ static struct buffer_head *reiserfs_breada(struct block_device *dev, /* ** read and replay the log -** on a clean unmount, the journal header's next unflushed pointer will be to an invalid -** transaction. This tests that before finding all the transactions in the log, which makes normal mount times fast. -** -** After a crash, this starts with the next unflushed transaction, and replays until it finds one too old, or invalid. -** +** on a clean unmount, the journal header's next unflushed pointer will +** be to an invalid transaction. This tests that before finding all the +** transactions in the log, which makes normal mount times fast. +** After a crash, this starts with the next unflushed transaction, and +** replays until it finds one too old, or invalid. ** On exit, it sets things up so the first transaction will work correctly. +** NOTE: only called during fs mount */ static int journal_read(struct super_block *sb) { @@ -2501,14 +2506,18 @@ static int journal_read(struct super_block *sb) "replayed %d transactions in %lu seconds\n", replay_count, get_seconds() - start); } + /* needed to satisfy the locking in _update_journal_header_block */ + reiserfs_write_lock(sb); if (!bdev_read_only(sb->s_bdev) && _update_journal_header_block(sb, journal->j_start, journal->j_last_flush_trans_id)) { + reiserfs_write_unlock(sb); /* replay failed, caller must call free_journal_ram and abort ** the mount */ return -1; } + reiserfs_write_unlock(sb); return 0; } @@ -2828,13 +2837,7 @@ int journal_init(struct super_block *sb, const char *j_dev_name, goto free_and_return; } - /* - * Journal_read needs to be inspected in order to push down - * the lock further inside (or even remove it). - */ - reiserfs_write_lock(sb); ret = journal_read(sb); - reiserfs_write_unlock(sb); if (ret < 0) { reiserfs_warning(sb, "reiserfs-2006", "Replay Failure, unable to mount"); @@ -2923,9 +2926,9 @@ static void queue_log_writer(struct super_block *s) add_wait_queue(&journal->j_join_wait, &wait); set_current_state(TASK_UNINTERRUPTIBLE); if (test_bit(J_WRITERS_QUEUED, &journal->j_state)) { - reiserfs_write_unlock(s); + int depth = reiserfs_write_unlock_nested(s); schedule(); - reiserfs_write_lock(s); + reiserfs_write_lock_nested(s, depth); } __set_current_state(TASK_RUNNING); remove_wait_queue(&journal->j_join_wait, &wait); @@ -2943,9 +2946,12 @@ static void let_transaction_grow(struct super_block *sb, unsigned int trans_id) struct reiserfs_journal *journal = SB_JOURNAL(sb); unsigned long bcount = journal->j_bcount; while (1) { - reiserfs_write_unlock(sb); + int depth; + + depth = reiserfs_write_unlock_nested(sb); schedule_timeout_uninterruptible(1); - reiserfs_write_lock(sb); + reiserfs_write_lock_nested(sb, depth); + journal->j_current_jl->j_state |= LIST_COMMIT_PENDING; while ((atomic_read(&journal->j_wcount) > 0 || atomic_read(&journal->j_jlock)) && @@ -2976,6 +2982,7 @@ static int do_journal_begin_r(struct reiserfs_transaction_handle *th, struct reiserfs_transaction_handle myth; int sched_count = 0; int retval; + int depth; reiserfs_check_lock_depth(sb, "journal_begin"); BUG_ON(nblocks > journal->j_trans_max); @@ -2996,9 +3003,9 @@ static int do_journal_begin_r(struct reiserfs_transaction_handle *th, if (test_bit(J_WRITERS_BLOCKED, &journal->j_state)) { unlock_journal(sb); - reiserfs_write_unlock(sb); + depth = reiserfs_write_unlock_nested(sb); reiserfs_wait_on_write_block(sb); - reiserfs_write_lock(sb); + reiserfs_write_lock_nested(sb, depth); PROC_INFO_INC(sb, journal.journal_relock_writers); goto relock; } @@ -3821,6 +3828,7 @@ void reiserfs_restore_prepared_buffer(struct super_block *sb, if (test_clear_buffer_journal_restore_dirty(bh) && buffer_journal_dirty(bh)) { struct reiserfs_journal_cnode *cn; + reiserfs_write_lock(sb); cn = get_journal_hash_dev(sb, journal->j_list_hash_table, bh->b_blocknr); @@ -3828,6 +3836,7 @@ void reiserfs_restore_prepared_buffer(struct super_block *sb, set_buffer_journal_test(bh); mark_buffer_dirty(bh); } + reiserfs_write_unlock(sb); } clear_buffer_journal_prepared(bh); } @@ -3911,6 +3920,7 @@ static int do_journal_end(struct reiserfs_transaction_handle *th, unsigned long jindex; unsigned int commit_trans_id; int trans_half; + int depth; BUG_ON(th->t_refcount > 1); BUG_ON(!th->t_trans_id); @@ -4116,9 +4126,7 @@ static int do_journal_end(struct reiserfs_transaction_handle *th, next = cn->next; free_cnode(sb, cn); cn = next; - reiserfs_write_unlock(sb); - cond_resched(); - reiserfs_write_lock(sb); + reiserfs_cond_resched(sb); } /* we are done with both the c_bh and d_bh, but @@ -4165,10 +4173,10 @@ static int do_journal_end(struct reiserfs_transaction_handle *th, * is lost. */ if (!list_empty(&jl->j_tail_bh_list)) { - reiserfs_write_unlock(sb); + depth = reiserfs_write_unlock_nested(sb); write_ordered_buffers(&journal->j_dirty_buffers_lock, journal, jl, &jl->j_tail_bh_list); - reiserfs_write_lock(sb); + reiserfs_write_lock_nested(sb, depth); } BUG_ON(!list_empty(&jl->j_tail_bh_list)); mutex_unlock(&jl->j_commit_mutex); diff --git a/fs/reiserfs/lock.c b/fs/reiserfs/lock.c index d735bc8470e3..045b83ef9fd9 100644 --- a/fs/reiserfs/lock.c +++ b/fs/reiserfs/lock.c @@ -48,30 +48,35 @@ void reiserfs_write_unlock(struct super_block *s) } } -/* - * If we already own the lock, just exit and don't increase the depth. - * Useful when we don't want to lock more than once. - * - * We always return the lock_depth we had before calling - * this function. - */ -int reiserfs_write_lock_once(struct super_block *s) +int __must_check reiserfs_write_unlock_nested(struct super_block *s) { struct reiserfs_sb_info *sb_i = REISERFS_SB(s); + int depth; - if (sb_i->lock_owner != current) { - mutex_lock(&sb_i->lock); - sb_i->lock_owner = current; - return sb_i->lock_depth++; - } + /* this can happen when the lock isn't always held */ + if (sb_i->lock_owner != current) + return -1; + + depth = sb_i->lock_depth; + + sb_i->lock_depth = -1; + sb_i->lock_owner = NULL; + mutex_unlock(&sb_i->lock); - return sb_i->lock_depth; + return depth; } -void reiserfs_write_unlock_once(struct super_block *s, int lock_depth) +void reiserfs_write_lock_nested(struct super_block *s, int depth) { - if (lock_depth == -1) - reiserfs_write_unlock(s); + struct reiserfs_sb_info *sb_i = REISERFS_SB(s); + + /* this can happen when the lock isn't always held */ + if (depth == -1) + return; + + mutex_lock(&sb_i->lock); + sb_i->lock_owner = current; + sb_i->lock_depth = depth; } /* @@ -82,9 +87,7 @@ void reiserfs_check_lock_depth(struct super_block *sb, char *caller) { struct reiserfs_sb_info *sb_i = REISERFS_SB(sb); - if (sb_i->lock_depth < 0) - reiserfs_panic(sb, "%s called without kernel lock held %d", - caller); + WARN_ON(sb_i->lock_depth < 0); } #ifdef CONFIG_REISERFS_CHECK diff --git a/fs/reiserfs/namei.c b/fs/reiserfs/namei.c index 8567fb847601..dc5236f6de1b 100644 --- a/fs/reiserfs/namei.c +++ b/fs/reiserfs/namei.c @@ -325,7 +325,6 @@ static struct dentry *reiserfs_lookup(struct inode *dir, struct dentry *dentry, unsigned int flags) { int retval; - int lock_depth; struct inode *inode = NULL; struct reiserfs_dir_entry de; INITIALIZE_PATH(path_to_entry); @@ -333,12 +332,7 @@ static struct dentry *reiserfs_lookup(struct inode *dir, struct dentry *dentry, if (REISERFS_MAX_NAME(dir->i_sb->s_blocksize) < dentry->d_name.len) return ERR_PTR(-ENAMETOOLONG); - /* - * Might be called with or without the write lock, must be careful - * to not recursively hold it in case we want to release the lock - * before rescheduling. - */ - lock_depth = reiserfs_write_lock_once(dir->i_sb); + reiserfs_write_lock(dir->i_sb); de.de_gen_number_bit_string = NULL; retval = @@ -349,7 +343,7 @@ static struct dentry *reiserfs_lookup(struct inode *dir, struct dentry *dentry, inode = reiserfs_iget(dir->i_sb, (struct cpu_key *)&(de.de_dir_id)); if (!inode || IS_ERR(inode)) { - reiserfs_write_unlock_once(dir->i_sb, lock_depth); + reiserfs_write_unlock(dir->i_sb); return ERR_PTR(-EACCES); } @@ -358,7 +352,7 @@ static struct dentry *reiserfs_lookup(struct inode *dir, struct dentry *dentry, if (IS_PRIVATE(dir)) inode->i_flags |= S_PRIVATE; } - reiserfs_write_unlock_once(dir->i_sb, lock_depth); + reiserfs_write_unlock(dir->i_sb); if (retval == IO_ERROR) { return ERR_PTR(-EIO); } @@ -727,7 +721,6 @@ static int reiserfs_mkdir(struct inode *dir, struct dentry *dentry, umode_t mode struct inode *inode; struct reiserfs_transaction_handle th; struct reiserfs_security_handle security; - int lock_depth; /* We need blocks for transaction + (user+group)*(quotas for new inode + update of quota for directory owner) */ int jbegin_count = JOURNAL_PER_BALANCE_CNT * 3 + @@ -753,7 +746,7 @@ static int reiserfs_mkdir(struct inode *dir, struct dentry *dentry, umode_t mode return retval; } jbegin_count += retval; - lock_depth = reiserfs_write_lock_once(dir->i_sb); + reiserfs_write_lock(dir->i_sb); retval = journal_begin(&th, dir->i_sb, jbegin_count); if (retval) { @@ -804,7 +797,7 @@ static int reiserfs_mkdir(struct inode *dir, struct dentry *dentry, umode_t mode d_instantiate(dentry, inode); retval = journal_end(&th, dir->i_sb, jbegin_count); out_failed: - reiserfs_write_unlock_once(dir->i_sb, lock_depth); + reiserfs_write_unlock(dir->i_sb); return retval; } @@ -920,7 +913,6 @@ static int reiserfs_unlink(struct inode *dir, struct dentry *dentry) struct reiserfs_transaction_handle th; int jbegin_count; unsigned long savelink; - int depth; dquot_initialize(dir); @@ -934,7 +926,7 @@ static int reiserfs_unlink(struct inode *dir, struct dentry *dentry) JOURNAL_PER_BALANCE_CNT * 2 + 2 + 4 * REISERFS_QUOTA_TRANS_BLOCKS(dir->i_sb); - depth = reiserfs_write_lock_once(dir->i_sb); + reiserfs_write_lock(dir->i_sb); retval = journal_begin(&th, dir->i_sb, jbegin_count); if (retval) goto out_unlink; @@ -995,7 +987,7 @@ static int reiserfs_unlink(struct inode *dir, struct dentry *dentry) retval = journal_end(&th, dir->i_sb, jbegin_count); reiserfs_check_path(&path); - reiserfs_write_unlock_once(dir->i_sb, depth); + reiserfs_write_unlock(dir->i_sb); return retval; end_unlink: @@ -1005,7 +997,7 @@ static int reiserfs_unlink(struct inode *dir, struct dentry *dentry) if (err) retval = err; out_unlink: - reiserfs_write_unlock_once(dir->i_sb, depth); + reiserfs_write_unlock(dir->i_sb); return retval; } diff --git a/fs/reiserfs/prints.c b/fs/reiserfs/prints.c index c0b1112ab7e3..54944d5a4a6e 100644 --- a/fs/reiserfs/prints.c +++ b/fs/reiserfs/prints.c @@ -358,12 +358,13 @@ void __reiserfs_panic(struct super_block *sb, const char *id, dump_stack(); #endif if (sb) - panic(KERN_WARNING "REISERFS panic (device %s): %s%s%s: %s\n", + printk(KERN_WARNING "REISERFS panic (device %s): %s%s%s: %s\n", sb->s_id, id ? id : "", id ? " " : "", function, error_buf); else - panic(KERN_WARNING "REISERFS panic: %s%s%s: %s\n", + printk(KERN_WARNING "REISERFS panic: %s%s%s: %s\n", id ? id : "", id ? " " : "", function, error_buf); + BUG(); } void __reiserfs_error(struct super_block *sb, const char *id, diff --git a/fs/reiserfs/reiserfs.h b/fs/reiserfs/reiserfs.h index 3df5ce6c724d..f8adaee537c2 100644 --- a/fs/reiserfs/reiserfs.h +++ b/fs/reiserfs/reiserfs.h @@ -630,8 +630,8 @@ static inline int __reiserfs_is_journal_aborted(struct reiserfs_journal */ void reiserfs_write_lock(struct super_block *s); void reiserfs_write_unlock(struct super_block *s); -int reiserfs_write_lock_once(struct super_block *s); -void reiserfs_write_unlock_once(struct super_block *s, int lock_depth); +int __must_check reiserfs_write_unlock_nested(struct super_block *s); +void reiserfs_write_lock_nested(struct super_block *s, int depth); #ifdef CONFIG_REISERFS_CHECK void reiserfs_lock_check_recursive(struct super_block *s); @@ -667,31 +667,33 @@ static inline void reiserfs_lock_check_recursive(struct super_block *s) { } * - The inode mutex */ static inline void reiserfs_mutex_lock_safe(struct mutex *m, - struct super_block *s) + struct super_block *s) { - reiserfs_lock_check_recursive(s); - reiserfs_write_unlock(s); + int depth; + + depth = reiserfs_write_unlock_nested(s); mutex_lock(m); - reiserfs_write_lock(s); + reiserfs_write_lock_nested(s, depth); } static inline void reiserfs_mutex_lock_nested_safe(struct mutex *m, unsigned int subclass, - struct super_block *s) + struct super_block *s) { - reiserfs_lock_check_recursive(s); - reiserfs_write_unlock(s); + int depth; + + depth = reiserfs_write_unlock_nested(s); mutex_lock_nested(m, subclass); - reiserfs_write_lock(s); + reiserfs_write_lock_nested(s, depth); } static inline void reiserfs_down_read_safe(struct rw_semaphore *sem, struct super_block *s) { - reiserfs_lock_check_recursive(s); - reiserfs_write_unlock(s); - down_read(sem); - reiserfs_write_lock(s); + int depth; + depth = reiserfs_write_unlock_nested(s); + down_read(sem); + reiserfs_write_lock_nested(s, depth); } /* @@ -701,9 +703,11 @@ reiserfs_down_read_safe(struct rw_semaphore *sem, struct super_block *s) static inline void reiserfs_cond_resched(struct super_block *s) { if (need_resched()) { - reiserfs_write_unlock(s); + int depth; + + depth = reiserfs_write_unlock_nested(s); schedule(); - reiserfs_write_lock(s); + reiserfs_write_lock_nested(s, depth); } } diff --git a/fs/reiserfs/resize.c b/fs/reiserfs/resize.c index 3ce02cff5e90..a4ef5cd606eb 100644 --- a/fs/reiserfs/resize.c +++ b/fs/reiserfs/resize.c @@ -34,6 +34,7 @@ int reiserfs_resize(struct super_block *s, unsigned long block_count_new) unsigned long int block_count, free_blocks; int i; int copy_size; + int depth; sb = SB_DISK_SUPER_BLOCK(s); @@ -43,7 +44,9 @@ int reiserfs_resize(struct super_block *s, unsigned long block_count_new) } /* check the device size */ + depth = reiserfs_write_unlock_nested(s); bh = sb_bread(s, block_count_new - 1); + reiserfs_write_lock_nested(s, depth); if (!bh) { printk("reiserfs_resize: can\'t read last block\n"); return -EINVAL; @@ -125,9 +128,12 @@ int reiserfs_resize(struct super_block *s, unsigned long block_count_new) * transaction begins, and the new bitmaps don't matter if the * transaction fails. */ for (i = bmap_nr; i < bmap_nr_new; i++) { + int depth; /* don't use read_bitmap_block since it will cache * the uninitialized bitmap */ + depth = reiserfs_write_unlock_nested(s); bh = sb_bread(s, i * s->s_blocksize * 8); + reiserfs_write_lock_nested(s, depth); if (!bh) { vfree(bitmap); return -EIO; @@ -138,9 +144,9 @@ int reiserfs_resize(struct super_block *s, unsigned long block_count_new) set_buffer_uptodate(bh); mark_buffer_dirty(bh); - reiserfs_write_unlock(s); + depth = reiserfs_write_unlock_nested(s); sync_dirty_buffer(bh); - reiserfs_write_lock(s); + reiserfs_write_lock_nested(s, depth); // update bitmap_info stuff bitmap[i].free_count = sb_blocksize(sb) * 8 - 1; brelse(bh); diff --git a/fs/reiserfs/stree.c b/fs/reiserfs/stree.c index 2f40a4c70a4d..b14706a05d52 100644 --- a/fs/reiserfs/stree.c +++ b/fs/reiserfs/stree.c @@ -524,14 +524,14 @@ static int is_tree_node(struct buffer_head *bh, int level) * the caller (search_by_key) will perform other schedule-unsafe * operations just after calling this function. * - * @return true if we have unlocked + * @return depth of lock to be restored after read completes */ -static bool search_by_key_reada(struct super_block *s, +static int search_by_key_reada(struct super_block *s, struct buffer_head **bh, b_blocknr_t *b, int num) { int i, j; - bool unlocked = false; + int depth = -1; for (i = 0; i < num; i++) { bh[i] = sb_getblk(s, b[i]); @@ -549,15 +549,13 @@ static bool search_by_key_reada(struct super_block *s, * you have to make sure the prepared bit isn't set on this buffer */ if (!buffer_uptodate(bh[j])) { - if (!unlocked) { - reiserfs_write_unlock(s); - unlocked = true; - } + if (depth == -1) + depth = reiserfs_write_unlock_nested(s); ll_rw_block(READA, 1, bh + j); } brelse(bh[j]); } - return unlocked; + return depth; } /************************************************************************** @@ -645,26 +643,26 @@ int search_by_key(struct super_block *sb, const struct cpu_key *key, /* Key to s have a pointer to it. */ if ((bh = last_element->pe_buffer = sb_getblk(sb, block_number))) { - bool unlocked = false; - if (!buffer_uptodate(bh) && reada_count > 1) - /* may unlock the write lock */ - unlocked = search_by_key_reada(sb, reada_bh, - reada_blocks, reada_count); /* - * If we haven't already unlocked the write lock, - * then we need to do that here before reading - * the current block + * We'll need to drop the lock if we encounter any + * buffers that need to be read. If all of them are + * already up to date, we don't need to drop the lock. */ - if (!buffer_uptodate(bh) && !unlocked) { - reiserfs_write_unlock(sb); - unlocked = true; - } + int depth = -1; + + if (!buffer_uptodate(bh) && reada_count > 1) + depth = search_by_key_reada(sb, reada_bh, + reada_blocks, reada_count); + + if (!buffer_uptodate(bh) && depth == -1) + depth = reiserfs_write_unlock_nested(sb); + ll_rw_block(READ, 1, &bh); wait_on_buffer(bh); - if (unlocked) - reiserfs_write_lock(sb); + if (depth != -1) + reiserfs_write_lock_nested(sb, depth); if (!buffer_uptodate(bh)) goto io_error; } else { @@ -1059,9 +1057,7 @@ static char prepare_for_delete_or_cut(struct reiserfs_transaction_handle *th, st reiserfs_free_block(th, inode, block, 1); } - reiserfs_write_unlock(sb); - cond_resched(); - reiserfs_write_lock(sb); + reiserfs_cond_resched(sb); if (item_moved (&s_ih, path)) { need_re_search = 1; @@ -1190,6 +1186,7 @@ int reiserfs_delete_item(struct reiserfs_transaction_handle *th, struct item_head *q_ih; int quota_cut_bytes; int ret_value, del_size, removed; + int depth; #ifdef CONFIG_REISERFS_CHECK char mode; @@ -1299,7 +1296,9 @@ int reiserfs_delete_item(struct reiserfs_transaction_handle *th, "reiserquota delete_item(): freeing %u, id=%u type=%c", quota_cut_bytes, inode->i_uid, head2type(&s_ih)); #endif + depth = reiserfs_write_unlock_nested(inode->i_sb); dquot_free_space_nodirty(inode, quota_cut_bytes); + reiserfs_write_lock_nested(inode->i_sb, depth); /* Return deleted body length */ return ret_value; @@ -1325,6 +1324,7 @@ int reiserfs_delete_item(struct reiserfs_transaction_handle *th, void reiserfs_delete_solid_item(struct reiserfs_transaction_handle *th, struct inode *inode, struct reiserfs_key *key) { + struct super_block *sb = th->t_super; struct tree_balance tb; INITIALIZE_PATH(path); int item_len = 0; @@ -1377,14 +1377,17 @@ void reiserfs_delete_solid_item(struct reiserfs_transaction_handle *th, if (retval == CARRY_ON) { do_balance(&tb, NULL, NULL, M_DELETE); if (inode) { /* Should we count quota for item? (we don't count quotas for save-links) */ + int depth; #ifdef REISERQUOTA_DEBUG reiserfs_debug(th->t_super, REISERFS_DEBUG_CODE, "reiserquota delete_solid_item(): freeing %u id=%u type=%c", quota_cut_bytes, inode->i_uid, key2type(key)); #endif + depth = reiserfs_write_unlock_nested(sb); dquot_free_space_nodirty(inode, quota_cut_bytes); + reiserfs_write_lock_nested(sb, depth); } break; } @@ -1561,6 +1564,7 @@ int reiserfs_cut_from_item(struct reiserfs_transaction_handle *th, int retval2 = -1; int quota_cut_bytes; loff_t tail_pos = 0; + int depth; BUG_ON(!th->t_trans_id); @@ -1733,7 +1737,9 @@ int reiserfs_cut_from_item(struct reiserfs_transaction_handle *th, "reiserquota cut_from_item(): freeing %u id=%u type=%c", quota_cut_bytes, inode->i_uid, '?'); #endif + depth = reiserfs_write_unlock_nested(sb); dquot_free_space_nodirty(inode, quota_cut_bytes); + reiserfs_write_lock_nested(sb, depth); return ret_value; } @@ -1953,9 +1959,11 @@ int reiserfs_paste_into_item(struct reiserfs_transaction_handle *th, struct tree const char *body, /* Pointer to the bytes to paste. */ int pasted_size) { /* Size of pasted bytes. */ + struct super_block *sb = inode->i_sb; struct tree_balance s_paste_balance; int retval; int fs_gen; + int depth; BUG_ON(!th->t_trans_id); @@ -1968,9 +1976,9 @@ int reiserfs_paste_into_item(struct reiserfs_transaction_handle *th, struct tree key2type(&(key->on_disk_key))); #endif - reiserfs_write_unlock(inode->i_sb); + depth = reiserfs_write_unlock_nested(sb); retval = dquot_alloc_space_nodirty(inode, pasted_size); - reiserfs_write_lock(inode->i_sb); + reiserfs_write_lock_nested(sb, depth); if (retval) { pathrelse(search_path); return retval; @@ -2027,7 +2035,9 @@ int reiserfs_paste_into_item(struct reiserfs_transaction_handle *th, struct tree pasted_size, inode->i_uid, key2type(&(key->on_disk_key))); #endif + depth = reiserfs_write_unlock_nested(sb); dquot_free_space_nodirty(inode, pasted_size); + reiserfs_write_lock_nested(sb, depth); return retval; } @@ -2050,6 +2060,7 @@ int reiserfs_insert_item(struct reiserfs_transaction_handle *th, BUG_ON(!th->t_trans_id); if (inode) { /* Do we count quotas for item? */ + int depth; fs_gen = get_generation(inode->i_sb); quota_bytes = ih_item_len(ih); @@ -2063,11 +2074,11 @@ int reiserfs_insert_item(struct reiserfs_transaction_handle *th, "reiserquota insert_item(): allocating %u id=%u type=%c", quota_bytes, inode->i_uid, head2type(ih)); #endif - reiserfs_write_unlock(inode->i_sb); /* We can't dirty inode here. It would be immediately written but * appropriate stat item isn't inserted yet... */ + depth = reiserfs_write_unlock_nested(inode->i_sb); retval = dquot_alloc_space_nodirty(inode, quota_bytes); - reiserfs_write_lock(inode->i_sb); + reiserfs_write_lock_nested(inode->i_sb, depth); if (retval) { pathrelse(path); return retval; @@ -2118,7 +2129,10 @@ int reiserfs_insert_item(struct reiserfs_transaction_handle *th, "reiserquota insert_item(): freeing %u id=%u type=%c", quota_bytes, inode->i_uid, head2type(ih)); #endif - if (inode) + if (inode) { + int depth = reiserfs_write_unlock_nested(inode->i_sb); dquot_free_space_nodirty(inode, quota_bytes); + reiserfs_write_lock_nested(inode->i_sb, depth); + } return retval; } diff --git a/fs/reiserfs/super.c b/fs/reiserfs/super.c index e2e202a07b31..3ead145dadc4 100644 --- a/fs/reiserfs/super.c +++ b/fs/reiserfs/super.c @@ -243,6 +243,7 @@ static int finish_unfinished(struct super_block *s) done = 0; REISERFS_SB(s)->s_is_unlinked_ok = 1; while (!retval) { + int depth; retval = search_item(s, &max_cpu_key, &path); if (retval != ITEM_NOT_FOUND) { reiserfs_error(s, "vs-2140", @@ -298,9 +299,9 @@ static int finish_unfinished(struct super_block *s) retval = remove_save_link_only(s, &save_link_key, 0); continue; } - reiserfs_write_unlock(s); + depth = reiserfs_write_unlock_nested(inode->i_sb); dquot_initialize(inode); - reiserfs_write_lock(s); + reiserfs_write_lock_nested(inode->i_sb, depth); if (truncate && S_ISDIR(inode->i_mode)) { /* We got a truncate request for a dir which is impossible. @@ -356,10 +357,12 @@ static int finish_unfinished(struct super_block *s) #ifdef CONFIG_QUOTA /* Turn quotas off */ + reiserfs_write_unlock(s); for (i = 0; i < MAXQUOTAS; i++) { if (sb_dqopt(s)->files[i] && quota_enabled[i]) dquot_quota_off(s, i); } + reiserfs_write_lock(s); if (ms_active_set) /* Restore the flag back */ s->s_flags &= ~MS_ACTIVE; @@ -623,7 +626,6 @@ static void reiserfs_dirty_inode(struct inode *inode, int flags) struct reiserfs_transaction_handle th; int err = 0; - int lock_depth; if (inode->i_sb->s_flags & MS_RDONLY) { reiserfs_warning(inode->i_sb, "clm-6006", @@ -631,7 +633,7 @@ static void reiserfs_dirty_inode(struct inode *inode, int flags) inode->i_ino); return; } - lock_depth = reiserfs_write_lock_once(inode->i_sb); + reiserfs_write_lock(inode->i_sb); /* this is really only used for atime updates, so they don't have ** to be included in O_SYNC or fsync @@ -644,7 +646,7 @@ static void reiserfs_dirty_inode(struct inode *inode, int flags) journal_end(&th, inode->i_sb, 1); out: - reiserfs_write_unlock_once(inode->i_sb, lock_depth); + reiserfs_write_unlock(inode->i_sb); } static int reiserfs_show_options(struct seq_file *seq, struct dentry *root) @@ -1334,7 +1336,7 @@ static int reiserfs_remount(struct super_block *s, int *mount_flags, char *arg) kfree(qf_names[i]); #endif err = -EINVAL; - goto out_unlock; + goto out_err_unlock; } #ifdef CONFIG_QUOTA handle_quota_files(s, qf_names, &qfmt); @@ -1378,35 +1380,32 @@ static int reiserfs_remount(struct super_block *s, int *mount_flags, char *arg) if (blocks) { err = reiserfs_resize(s, blocks); if (err != 0) - goto out_unlock; + goto out_err_unlock; } if (*mount_flags & MS_RDONLY) { + reiserfs_write_unlock(s); reiserfs_xattr_init(s, *mount_flags); /* remount read-only */ if (s->s_flags & MS_RDONLY) /* it is read-only already */ - goto out_ok; + goto out_ok_unlocked; - /* - * Drop write lock. Quota will retake it when needed and lock - * ordering requires calling dquot_suspend() without it. - */ - reiserfs_write_unlock(s); err = dquot_suspend(s, -1); if (err < 0) goto out_err; - reiserfs_write_lock(s); /* try to remount file system with read-only permissions */ if (sb_umount_state(rs) == REISERFS_VALID_FS || REISERFS_SB(s)->s_mount_state != REISERFS_VALID_FS) { - goto out_ok; + goto out_ok_unlocked; } + reiserfs_write_lock(s); + err = journal_begin(&th, s, 10); if (err) - goto out_unlock; + goto out_err_unlock; /* Mounting a rw partition read-only. */ reiserfs_prepare_for_journal(s, SB_BUFFER_WITH_SB(s), 1); @@ -1415,13 +1414,14 @@ static int reiserfs_remount(struct super_block *s, int *mount_flags, char *arg) } else { /* remount read-write */ if (!(s->s_flags & MS_RDONLY)) { + reiserfs_write_unlock(s); reiserfs_xattr_init(s, *mount_flags); - goto out_ok; /* We are read-write already */ + goto out_ok_unlocked; /* We are read-write already */ } if (reiserfs_is_journal_aborted(journal)) { err = journal->j_errno; - goto out_unlock; + goto out_err_unlock; } handle_data_mode(s, mount_options); @@ -1430,7 +1430,7 @@ static int reiserfs_remount(struct super_block *s, int *mount_flags, char *arg) s->s_flags &= ~MS_RDONLY; /* now it is safe to call journal_begin */ err = journal_begin(&th, s, 10); if (err) - goto out_unlock; + goto out_err_unlock; /* Mount a partition which is read-only, read-write */ reiserfs_prepare_for_journal(s, SB_BUFFER_WITH_SB(s), 1); @@ -1447,26 +1447,22 @@ static int reiserfs_remount(struct super_block *s, int *mount_flags, char *arg) SB_JOURNAL(s)->j_must_wait = 1; err = journal_end(&th, s, 10); if (err) - goto out_unlock; + goto out_err_unlock; + reiserfs_write_unlock(s); if (!(*mount_flags & MS_RDONLY)) { - /* - * Drop write lock. Quota will retake it when needed and lock - * ordering requires calling dquot_resume() without it. - */ - reiserfs_write_unlock(s); dquot_resume(s, -1); reiserfs_write_lock(s); finish_unfinished(s); + reiserfs_write_unlock(s); reiserfs_xattr_init(s, *mount_flags); } -out_ok: +out_ok_unlocked: replace_mount_options(s, new_opts); - reiserfs_write_unlock(s); return 0; -out_unlock: +out_err_unlock: reiserfs_write_unlock(s); out_err: kfree(new_opts); @@ -2013,12 +2009,14 @@ static int reiserfs_fill_super(struct super_block *s, void *data, int silent) goto error; } + reiserfs_write_unlock(s); if ((errval = reiserfs_lookup_privroot(s)) || (errval = reiserfs_xattr_init(s, s->s_flags))) { dput(s->s_root); s->s_root = NULL; - goto error; + goto error_unlocked; } + reiserfs_write_lock(s); /* look for files which were to be removed in previous session */ finish_unfinished(s); @@ -2027,12 +2025,14 @@ static int reiserfs_fill_super(struct super_block *s, void *data, int silent) reiserfs_info(s, "using 3.5.x disk format\n"); } + reiserfs_write_unlock(s); if ((errval = reiserfs_lookup_privroot(s)) || (errval = reiserfs_xattr_init(s, s->s_flags))) { dput(s->s_root); s->s_root = NULL; - goto error; + goto error_unlocked; } + reiserfs_write_lock(s); } // mark hash in super block: it could be unset. overwrite should be ok set_sb_hash_function_code(rs, function2code(sbi->s_hash_function)); @@ -2100,6 +2100,7 @@ static int reiserfs_write_dquot(struct dquot *dquot) { struct reiserfs_transaction_handle th; int ret, err; + int depth; reiserfs_write_lock(dquot->dq_sb); ret = @@ -2107,9 +2108,9 @@ static int reiserfs_write_dquot(struct dquot *dquot) REISERFS_QUOTA_TRANS_BLOCKS(dquot->dq_sb)); if (ret) goto out; - reiserfs_write_unlock(dquot->dq_sb); + depth = reiserfs_write_unlock_nested(dquot->dq_sb); ret = dquot_commit(dquot); - reiserfs_write_lock(dquot->dq_sb); + reiserfs_write_lock_nested(dquot->dq_sb, depth); err = journal_end(&th, dquot->dq_sb, REISERFS_QUOTA_TRANS_BLOCKS(dquot->dq_sb)); @@ -2124,6 +2125,7 @@ static int reiserfs_acquire_dquot(struct dquot *dquot) { struct reiserfs_transaction_handle th; int ret, err; + int depth; reiserfs_write_lock(dquot->dq_sb); ret = @@ -2131,9 +2133,9 @@ static int reiserfs_acquire_dquot(struct dquot *dquot) REISERFS_QUOTA_INIT_BLOCKS(dquot->dq_sb)); if (ret) goto out; - reiserfs_write_unlock(dquot->dq_sb); + depth = reiserfs_write_unlock_nested(dquot->dq_sb); ret = dquot_acquire(dquot); - reiserfs_write_lock(dquot->dq_sb); + reiserfs_write_lock_nested(dquot->dq_sb, depth); err = journal_end(&th, dquot->dq_sb, REISERFS_QUOTA_INIT_BLOCKS(dquot->dq_sb)); @@ -2186,15 +2188,16 @@ static int reiserfs_write_info(struct super_block *sb, int type) { struct reiserfs_transaction_handle th; int ret, err; + int depth; /* Data block + inode block */ reiserfs_write_lock(sb); ret = journal_begin(&th, sb, 2); if (ret) goto out; - reiserfs_write_unlock(sb); + depth = reiserfs_write_unlock_nested(sb); ret = dquot_commit_info(sb, type); - reiserfs_write_lock(sb); + reiserfs_write_lock_nested(sb, depth); err = journal_end(&th, sb, 2); if (!ret && err) ret = err; diff --git a/fs/reiserfs/xattr.c b/fs/reiserfs/xattr.c index c69cdd749f09..8a9e2dcfe004 100644 --- a/fs/reiserfs/xattr.c +++ b/fs/reiserfs/xattr.c @@ -81,8 +81,7 @@ static int xattr_unlink(struct inode *dir, struct dentry *dentry) int error; BUG_ON(!mutex_is_locked(&dir->i_mutex)); - reiserfs_mutex_lock_nested_safe(&dentry->d_inode->i_mutex, - I_MUTEX_CHILD, dir->i_sb); + mutex_lock_nested(&dentry->d_inode->i_mutex, I_MUTEX_CHILD); error = dir->i_op->unlink(dir, dentry); mutex_unlock(&dentry->d_inode->i_mutex); @@ -96,8 +95,7 @@ static int xattr_rmdir(struct inode *dir, struct dentry *dentry) int error; BUG_ON(!mutex_is_locked(&dir->i_mutex)); - reiserfs_mutex_lock_nested_safe(&dentry->d_inode->i_mutex, - I_MUTEX_CHILD, dir->i_sb); + mutex_lock_nested(&dentry->d_inode->i_mutex, I_MUTEX_CHILD); error = dir->i_op->rmdir(dir, dentry); if (!error) dentry->d_inode->i_flags |= S_DEAD; @@ -232,22 +230,17 @@ static int reiserfs_for_each_xattr(struct inode *inode, if (IS_PRIVATE(inode) || get_inode_sd_version(inode) == STAT_DATA_V1) return 0; - reiserfs_write_unlock(inode->i_sb); dir = open_xa_dir(inode, XATTR_REPLACE); if (IS_ERR(dir)) { err = PTR_ERR(dir); - reiserfs_write_lock(inode->i_sb); goto out; } else if (!dir->d_inode) { err = 0; - reiserfs_write_lock(inode->i_sb); goto out_dir; } mutex_lock_nested(&dir->d_inode->i_mutex, I_MUTEX_XATTR); - reiserfs_write_lock(inode->i_sb); - buf.xadir = dir; while (1) { err = reiserfs_readdir_inode(dir->d_inode, &buf.ctx); @@ -281,14 +274,17 @@ static int reiserfs_for_each_xattr(struct inode *inode, int blocks = JOURNAL_PER_BALANCE_CNT * 2 + 2 + 4 * REISERFS_QUOTA_TRANS_BLOCKS(inode->i_sb); struct reiserfs_transaction_handle th; + reiserfs_write_lock(inode->i_sb); err = journal_begin(&th, inode->i_sb, blocks); + reiserfs_write_unlock(inode->i_sb); if (!err) { int jerror; - reiserfs_mutex_lock_nested_safe( - &dir->d_parent->d_inode->i_mutex, - I_MUTEX_XATTR, inode->i_sb); + mutex_lock_nested(&dir->d_parent->d_inode->i_mutex, + I_MUTEX_XATTR); err = action(dir, data); + reiserfs_write_lock(inode->i_sb); jerror = journal_end(&th, inode->i_sb, blocks); + reiserfs_write_unlock(inode->i_sb); mutex_unlock(&dir->d_parent->d_inode->i_mutex); err = jerror ?: err; } @@ -455,9 +451,7 @@ static int lookup_and_delete_xattr(struct inode *inode, const char *name) } if (dentry->d_inode) { - reiserfs_write_lock(inode->i_sb); err = xattr_unlink(xadir->d_inode, dentry); - reiserfs_write_unlock(inode->i_sb); update_ctime(inode); } @@ -491,24 +485,17 @@ reiserfs_xattr_set_handle(struct reiserfs_transaction_handle *th, if (get_inode_sd_version(inode) == STAT_DATA_V1) return -EOPNOTSUPP; - reiserfs_write_unlock(inode->i_sb); - if (!buffer) { err = lookup_and_delete_xattr(inode, name); - reiserfs_write_lock(inode->i_sb); return err; } dentry = xattr_lookup(inode, name, flags); - if (IS_ERR(dentry)) { - reiserfs_write_lock(inode->i_sb); + if (IS_ERR(dentry)) return PTR_ERR(dentry); - } down_write(&REISERFS_I(inode)->i_xattr_sem); - reiserfs_write_lock(inode->i_sb); - xahash = xattr_hash(buffer, buffer_size); while (buffer_pos < buffer_size || buffer_pos == 0) { size_t chunk; @@ -538,6 +525,7 @@ reiserfs_xattr_set_handle(struct reiserfs_transaction_handle *th, rxh->h_hash = cpu_to_le32(xahash); } + reiserfs_write_lock(inode->i_sb); err = __reiserfs_write_begin(page, page_offset, chunk + skip); if (!err) { if (buffer) @@ -546,6 +534,7 @@ reiserfs_xattr_set_handle(struct reiserfs_transaction_handle *th, page_offset + chunk + skip); } + reiserfs_write_unlock(inode->i_sb); unlock_page(page); reiserfs_put_page(page); buffer_pos += chunk; @@ -563,10 +552,8 @@ reiserfs_xattr_set_handle(struct reiserfs_transaction_handle *th, .ia_valid = ATTR_SIZE | ATTR_CTIME, }; - reiserfs_write_unlock(inode->i_sb); mutex_lock_nested(&dentry->d_inode->i_mutex, I_MUTEX_XATTR); inode_dio_wait(dentry->d_inode); - reiserfs_write_lock(inode->i_sb); err = reiserfs_setattr(dentry, &newattrs); mutex_unlock(&dentry->d_inode->i_mutex); @@ -592,18 +579,19 @@ int reiserfs_xattr_set(struct inode *inode, const char *name, reiserfs_write_lock(inode->i_sb); error = journal_begin(&th, inode->i_sb, jbegin_count); + reiserfs_write_unlock(inode->i_sb); if (error) { - reiserfs_write_unlock(inode->i_sb); return error; } error = reiserfs_xattr_set_handle(&th, inode, name, buffer, buffer_size, flags); + reiserfs_write_lock(inode->i_sb); error2 = journal_end(&th, inode->i_sb, jbegin_count); + reiserfs_write_unlock(inode->i_sb); if (error == 0) error = error2; - reiserfs_write_unlock(inode->i_sb); return error; } @@ -968,7 +956,7 @@ int reiserfs_lookup_privroot(struct super_block *s) int err = 0; /* If we don't have the privroot located yet - go find it */ - reiserfs_mutex_lock_safe(&s->s_root->d_inode->i_mutex, s); + mutex_lock(&s->s_root->d_inode->i_mutex); dentry = lookup_one_len(PRIVROOT_NAME, s->s_root, strlen(PRIVROOT_NAME)); if (!IS_ERR(dentry)) { @@ -996,14 +984,14 @@ int reiserfs_xattr_init(struct super_block *s, int mount_flags) goto error; if (!privroot->d_inode && !(mount_flags & MS_RDONLY)) { - reiserfs_mutex_lock_safe(&s->s_root->d_inode->i_mutex, s); + mutex_lock(&s->s_root->d_inode->i_mutex); err = create_privroot(REISERFS_SB(s)->priv_root); mutex_unlock(&s->s_root->d_inode->i_mutex); } if (privroot->d_inode) { s->s_xattr = reiserfs_xattr_handlers; - reiserfs_mutex_lock_safe(&privroot->d_inode->i_mutex, s); + mutex_lock(&privroot->d_inode->i_mutex); if (!REISERFS_SB(s)->xattr_root) { struct dentry *dentry; dentry = lookup_one_len(XAROOT_NAME, privroot, diff --git a/fs/reiserfs/xattr_acl.c b/fs/reiserfs/xattr_acl.c index 6c8767fdfc6a..06c04f73da65 100644 --- a/fs/reiserfs/xattr_acl.c +++ b/fs/reiserfs/xattr_acl.c @@ -49,13 +49,15 @@ posix_acl_set(struct dentry *dentry, const char *name, const void *value, reiserfs_write_lock(inode->i_sb); error = journal_begin(&th, inode->i_sb, jcreate_blocks); + reiserfs_write_unlock(inode->i_sb); if (error == 0) { error = reiserfs_set_acl(&th, inode, type, acl); + reiserfs_write_lock(inode->i_sb); error2 = journal_end(&th, inode->i_sb, jcreate_blocks); + reiserfs_write_unlock(inode->i_sb); if (error2) error = error2; } - reiserfs_write_unlock(inode->i_sb); release_and_out: posix_acl_release(acl); @@ -435,12 +437,14 @@ int reiserfs_cache_default_acl(struct inode *inode) return nblocks; } +/* + * Called under i_mutex + */ int reiserfs_acl_chmod(struct inode *inode) { struct reiserfs_transaction_handle th; struct posix_acl *acl; size_t size; - int depth; int error; if (IS_PRIVATE(inode)) @@ -454,9 +458,7 @@ int reiserfs_acl_chmod(struct inode *inode) return 0; } - reiserfs_write_unlock(inode->i_sb); acl = reiserfs_get_acl(inode, ACL_TYPE_ACCESS); - reiserfs_write_lock(inode->i_sb); if (!acl) return 0; if (IS_ERR(acl)) @@ -466,16 +468,18 @@ int reiserfs_acl_chmod(struct inode *inode) return error; size = reiserfs_xattr_nblocks(inode, reiserfs_acl_size(acl->a_count)); - depth = reiserfs_write_lock_once(inode->i_sb); + reiserfs_write_lock(inode->i_sb); error = journal_begin(&th, inode->i_sb, size * 2); + reiserfs_write_unlock(inode->i_sb); if (!error) { int error2; error = reiserfs_set_acl(&th, inode, ACL_TYPE_ACCESS, acl); + reiserfs_write_lock(inode->i_sb); error2 = journal_end(&th, inode->i_sb, size * 2); + reiserfs_write_unlock(inode->i_sb); if (error2) error = error2; } - reiserfs_write_unlock_once(inode->i_sb, depth); posix_acl_release(acl); return error; } diff --git a/fs/sysfs/mount.c b/fs/sysfs/mount.c index fd7ce7a39f91..834ec2cdb7a3 100644 --- a/fs/sysfs/mount.c +++ b/fs/sysfs/mount.c @@ -112,8 +112,15 @@ static struct dentry *sysfs_mount(struct file_system_type *fs_type, struct super_block *sb; int error; - if (!(flags & MS_KERNMOUNT) && !current_user_ns()->may_mount_sysfs) - return ERR_PTR(-EPERM); + if (!(flags & MS_KERNMOUNT)) { + if (!capable(CAP_SYS_ADMIN) && !fs_fully_visible(fs_type)) + return ERR_PTR(-EPERM); + + for (type = KOBJ_NS_TYPE_NONE; type < KOBJ_NS_TYPES; type++) { + if (!kobj_ns_current_may_mount(type)) + return ERR_PTR(-EPERM); + } + } info = kzalloc(sizeof(*info), GFP_KERNEL); if (!info) diff --git a/fs/udf/super.c b/fs/udf/super.c index 9ac4057a86c9..839a2bad7f45 100644 --- a/fs/udf/super.c +++ b/fs/udf/super.c @@ -630,6 +630,12 @@ static int udf_remount_fs(struct super_block *sb, int *flags, char *options) struct udf_sb_info *sbi = UDF_SB(sb); int error = 0; + if (sbi->s_lvid_bh) { + int write_rev = le16_to_cpu(udf_sb_lvidiu(sbi)->minUDFWriteRev); + if (write_rev > UDF_MAX_WRITE_VERSION && !(*flags & MS_RDONLY)) + return -EACCES; + } + uopt.flags = sbi->s_flags; uopt.uid = sbi->s_uid; uopt.gid = sbi->s_gid; @@ -649,12 +655,6 @@ static int udf_remount_fs(struct super_block *sb, int *flags, char *options) sbi->s_dmode = uopt.dmode; write_unlock(&sbi->s_cred_lock); - if (sbi->s_lvid_bh) { - int write_rev = le16_to_cpu(udf_sb_lvidiu(sbi)->minUDFWriteRev); - if (write_rev > UDF_MAX_WRITE_VERSION) - *flags |= MS_RDONLY; - } - if ((*flags & MS_RDONLY) == (sb->s_flags & MS_RDONLY)) goto out_unlock; @@ -843,27 +843,38 @@ static int udf_find_fileset(struct super_block *sb, return 1; } +/* + * Load primary Volume Descriptor Sequence + * + * Return <0 on error, 0 on success. -EAGAIN is special meaning next sequence + * should be tried. + */ static int udf_load_pvoldesc(struct super_block *sb, sector_t block) { struct primaryVolDesc *pvoldesc; struct ustr *instr, *outstr; struct buffer_head *bh; uint16_t ident; - int ret = 1; + int ret = -ENOMEM; instr = kmalloc(sizeof(struct ustr), GFP_NOFS); if (!instr) - return 1; + return -ENOMEM; outstr = kmalloc(sizeof(struct ustr), GFP_NOFS); if (!outstr) goto out1; bh = udf_read_tagged(sb, block, block, &ident); - if (!bh) + if (!bh) { + ret = -EAGAIN; goto out2; + } - BUG_ON(ident != TAG_IDENT_PVD); + if (ident != TAG_IDENT_PVD) { + ret = -EIO; + goto out_bh; + } pvoldesc = (struct primaryVolDesc *)bh->b_data; @@ -889,8 +900,9 @@ static int udf_load_pvoldesc(struct super_block *sb, sector_t block) if (udf_CS0toUTF8(outstr, instr)) udf_debug("volSetIdent[] = '%s'\n", outstr->u_name); - brelse(bh); ret = 0; +out_bh: + brelse(bh); out2: kfree(outstr); out1: @@ -947,7 +959,7 @@ static int udf_load_metadata_files(struct super_block *sb, int partition) if (mdata->s_mirror_fe == NULL) { udf_err(sb, "Both metadata and mirror metadata inode efe can not found\n"); - goto error_exit; + return -EIO; } } @@ -964,23 +976,18 @@ static int udf_load_metadata_files(struct super_block *sb, int partition) addr.logicalBlockNum, addr.partitionReferenceNum); mdata->s_bitmap_fe = udf_iget(sb, &addr); - if (mdata->s_bitmap_fe == NULL) { if (sb->s_flags & MS_RDONLY) udf_warn(sb, "bitmap inode efe not found but it's ok since the disc is mounted read-only\n"); else { udf_err(sb, "bitmap inode efe not found and attempted read-write mount\n"); - goto error_exit; + return -EIO; } } } udf_debug("udf_load_metadata_files Ok\n"); - return 0; - -error_exit: - return 1; } static void udf_load_fileset(struct super_block *sb, struct buffer_head *bh, @@ -1069,7 +1076,7 @@ static int udf_fill_partdesc_info(struct super_block *sb, if (!map->s_uspace.s_table) { udf_debug("cannot load unallocSpaceTable (part %d)\n", p_index); - return 1; + return -EIO; } map->s_partition_flags |= UDF_PART_FLAG_UNALLOC_TABLE; udf_debug("unallocSpaceTable (part %d) @ %ld\n", @@ -1079,7 +1086,7 @@ static int udf_fill_partdesc_info(struct super_block *sb, if (phd->unallocSpaceBitmap.extLength) { struct udf_bitmap *bitmap = udf_sb_alloc_bitmap(sb, p_index); if (!bitmap) - return 1; + return -ENOMEM; map->s_uspace.s_bitmap = bitmap; bitmap->s_extPosition = le32_to_cpu( phd->unallocSpaceBitmap.extPosition); @@ -1102,7 +1109,7 @@ static int udf_fill_partdesc_info(struct super_block *sb, if (!map->s_fspace.s_table) { udf_debug("cannot load freedSpaceTable (part %d)\n", p_index); - return 1; + return -EIO; } map->s_partition_flags |= UDF_PART_FLAG_FREED_TABLE; @@ -1113,7 +1120,7 @@ static int udf_fill_partdesc_info(struct super_block *sb, if (phd->freedSpaceBitmap.extLength) { struct udf_bitmap *bitmap = udf_sb_alloc_bitmap(sb, p_index); if (!bitmap) - return 1; + return -ENOMEM; map->s_fspace.s_bitmap = bitmap; bitmap->s_extPosition = le32_to_cpu( phd->freedSpaceBitmap.extPosition); @@ -1165,7 +1172,7 @@ static int udf_load_vat(struct super_block *sb, int p_index, int type1_index) udf_find_vat_block(sb, p_index, type1_index, blocks - 1); } if (!sbi->s_vat_inode) - return 1; + return -EIO; if (map->s_partition_type == UDF_VIRTUAL_MAP15) { map->s_type_specific.s_virtual.s_start_offset = 0; @@ -1177,7 +1184,7 @@ static int udf_load_vat(struct super_block *sb, int p_index, int type1_index) pos = udf_block_map(sbi->s_vat_inode, 0); bh = sb_bread(sb, pos); if (!bh) - return 1; + return -EIO; vat20 = (struct virtualAllocationTable20 *)bh->b_data; } else { vat20 = (struct virtualAllocationTable20 *) @@ -1195,6 +1202,12 @@ static int udf_load_vat(struct super_block *sb, int p_index, int type1_index) return 0; } +/* + * Load partition descriptor block + * + * Returns <0 on error, 0 on success, -EAGAIN is special - try next descriptor + * sequence. + */ static int udf_load_partdesc(struct super_block *sb, sector_t block) { struct buffer_head *bh; @@ -1204,13 +1217,15 @@ static int udf_load_partdesc(struct super_block *sb, sector_t block) int i, type1_idx; uint16_t partitionNumber; uint16_t ident; - int ret = 0; + int ret; bh = udf_read_tagged(sb, block, block, &ident); if (!bh) - return 1; - if (ident != TAG_IDENT_PD) + return -EAGAIN; + if (ident != TAG_IDENT_PD) { + ret = 0; goto out_bh; + } p = (struct partitionDesc *)bh->b_data; partitionNumber = le16_to_cpu(p->partitionNumber); @@ -1229,10 +1244,13 @@ static int udf_load_partdesc(struct super_block *sb, sector_t block) if (i >= sbi->s_partitions) { udf_debug("Partition (%d) not found in partition map\n", partitionNumber); + ret = 0; goto out_bh; } ret = udf_fill_partdesc_info(sb, p, i); + if (ret < 0) + goto out_bh; /* * Now rescan for VIRTUAL or METADATA partitions when SPARABLE and @@ -1249,32 +1267,37 @@ static int udf_load_partdesc(struct super_block *sb, sector_t block) break; } - if (i >= sbi->s_partitions) + if (i >= sbi->s_partitions) { + ret = 0; goto out_bh; + } ret = udf_fill_partdesc_info(sb, p, i); - if (ret) + if (ret < 0) goto out_bh; if (map->s_partition_type == UDF_METADATA_MAP25) { ret = udf_load_metadata_files(sb, i); - if (ret) { + if (ret < 0) { udf_err(sb, "error loading MetaData partition map %d\n", i); goto out_bh; } } else { - ret = udf_load_vat(sb, i, type1_idx); - if (ret) - goto out_bh; /* - * Mark filesystem read-only if we have a partition with - * virtual map since we don't handle writing to it (we - * overwrite blocks instead of relocating them). + * If we have a partition with virtual map, we don't handle + * writing to it (we overwrite blocks instead of relocating + * them). */ - sb->s_flags |= MS_RDONLY; - pr_notice("Filesystem marked read-only because writing to pseudooverwrite partition is not implemented\n"); + if (!(sb->s_flags & MS_RDONLY)) { + ret = -EACCES; + goto out_bh; + } + ret = udf_load_vat(sb, i, type1_idx); + if (ret < 0) + goto out_bh; } + ret = 0; out_bh: /* In case loading failed, we handle cleanup in udf_fill_super */ brelse(bh); @@ -1340,11 +1363,11 @@ static int udf_load_logicalvol(struct super_block *sb, sector_t block, uint16_t ident; struct buffer_head *bh; unsigned int table_len; - int ret = 0; + int ret; bh = udf_read_tagged(sb, block, block, &ident); if (!bh) - return 1; + return -EAGAIN; BUG_ON(ident != TAG_IDENT_LVD); lvd = (struct logicalVolDesc *)bh->b_data; table_len = le32_to_cpu(lvd->mapTableLength); @@ -1352,7 +1375,7 @@ static int udf_load_logicalvol(struct super_block *sb, sector_t block, udf_err(sb, "error loading logical volume descriptor: " "Partition table too long (%u > %lu)\n", table_len, sb->s_blocksize - sizeof(*lvd)); - ret = 1; + ret = -EIO; goto out_bh; } @@ -1396,11 +1419,10 @@ static int udf_load_logicalvol(struct super_block *sb, sector_t block, } else if (!strncmp(upm2->partIdent.ident, UDF_ID_SPARABLE, strlen(UDF_ID_SPARABLE))) { - if (udf_load_sparable_map(sb, map, - (struct sparablePartitionMap *)gpm) < 0) { - ret = 1; + ret = udf_load_sparable_map(sb, map, + (struct sparablePartitionMap *)gpm); + if (ret < 0) goto out_bh; - } } else if (!strncmp(upm2->partIdent.ident, UDF_ID_METADATA, strlen(UDF_ID_METADATA))) { @@ -1465,7 +1487,7 @@ static int udf_load_logicalvol(struct super_block *sb, sector_t block, } if (lvd->integritySeqExt.extLength) udf_load_logicalvolint(sb, leea_to_cpu(lvd->integritySeqExt)); - + ret = 0; out_bh: brelse(bh); return ret; @@ -1503,22 +1525,18 @@ static void udf_load_logicalvolint(struct super_block *sb, struct kernel_extent_ } /* - * udf_process_sequence - * - * PURPOSE - * Process a main/reserve volume descriptor sequence. - * - * PRE-CONDITIONS - * sb Pointer to _locked_ superblock. - * block First block of first extent of the sequence. - * lastblock Lastblock of first extent of the sequence. + * Process a main/reserve volume descriptor sequence. + * @block First block of first extent of the sequence. + * @lastblock Lastblock of first extent of the sequence. + * @fileset There we store extent containing root fileset * - * HISTORY - * July 1, 1997 - Andrew E. Mileski - * Written, tested, and released. + * Returns <0 on error, 0 on success. -EAGAIN is special - try next descriptor + * sequence */ -static noinline int udf_process_sequence(struct super_block *sb, long block, - long lastblock, struct kernel_lb_addr *fileset) +static noinline int udf_process_sequence( + struct super_block *sb, + sector_t block, sector_t lastblock, + struct kernel_lb_addr *fileset) { struct buffer_head *bh = NULL; struct udf_vds_record vds[VDS_POS_LENGTH]; @@ -1529,6 +1547,7 @@ static noinline int udf_process_sequence(struct super_block *sb, long block, uint32_t vdsn; uint16_t ident; long next_s = 0, next_e = 0; + int ret; memset(vds, 0, sizeof(struct udf_vds_record) * VDS_POS_LENGTH); @@ -1543,7 +1562,7 @@ static noinline int udf_process_sequence(struct super_block *sb, long block, udf_err(sb, "Block %llu of volume descriptor sequence is corrupted or we could not read it\n", (unsigned long long)block); - return 1; + return -EAGAIN; } /* Process each descriptor (ISO 13346 3/8.3-8.4) */ @@ -1616,14 +1635,19 @@ static noinline int udf_process_sequence(struct super_block *sb, long block, */ if (!vds[VDS_POS_PRIMARY_VOL_DESC].block) { udf_err(sb, "Primary Volume Descriptor not found!\n"); - return 1; + return -EAGAIN; + } + ret = udf_load_pvoldesc(sb, vds[VDS_POS_PRIMARY_VOL_DESC].block); + if (ret < 0) + return ret; + + if (vds[VDS_POS_LOGICAL_VOL_DESC].block) { + ret = udf_load_logicalvol(sb, + vds[VDS_POS_LOGICAL_VOL_DESC].block, + fileset); + if (ret < 0) + return ret; } - if (udf_load_pvoldesc(sb, vds[VDS_POS_PRIMARY_VOL_DESC].block)) - return 1; - - if (vds[VDS_POS_LOGICAL_VOL_DESC].block && udf_load_logicalvol(sb, - vds[VDS_POS_LOGICAL_VOL_DESC].block, fileset)) - return 1; if (vds[VDS_POS_PARTITION_DESC].block) { /* @@ -1632,19 +1656,27 @@ static noinline int udf_process_sequence(struct super_block *sb, long block, */ for (block = vds[VDS_POS_PARTITION_DESC].block; block < vds[VDS_POS_TERMINATING_DESC].block; - block++) - if (udf_load_partdesc(sb, block)) - return 1; + block++) { + ret = udf_load_partdesc(sb, block); + if (ret < 0) + return ret; + } } return 0; } +/* + * Load Volume Descriptor Sequence described by anchor in bh + * + * Returns <0 on error, 0 on success + */ static int udf_load_sequence(struct super_block *sb, struct buffer_head *bh, struct kernel_lb_addr *fileset) { struct anchorVolDescPtr *anchor; - long main_s, main_e, reserve_s, reserve_e; + sector_t main_s, main_e, reserve_s, reserve_e; + int ret; anchor = (struct anchorVolDescPtr *)bh->b_data; @@ -1662,18 +1694,26 @@ static int udf_load_sequence(struct super_block *sb, struct buffer_head *bh, /* Process the main & reserve sequences */ /* responsible for finding the PartitionDesc(s) */ - if (!udf_process_sequence(sb, main_s, main_e, fileset)) - return 1; - udf_sb_free_partitions(sb); - if (!udf_process_sequence(sb, reserve_s, reserve_e, fileset)) - return 1; + ret = udf_process_sequence(sb, main_s, main_e, fileset); + if (ret != -EAGAIN) + return ret; udf_sb_free_partitions(sb); - return 0; + ret = udf_process_sequence(sb, reserve_s, reserve_e, fileset); + if (ret < 0) { + udf_sb_free_partitions(sb); + /* No sequence was OK, return -EIO */ + if (ret == -EAGAIN) + ret = -EIO; + } + return ret; } /* * Check whether there is an anchor block in the given block and * load Volume Descriptor Sequence if so. + * + * Returns <0 on error, 0 on success, -EAGAIN is special - try next anchor + * block */ static int udf_check_anchor_block(struct super_block *sb, sector_t block, struct kernel_lb_addr *fileset) @@ -1685,33 +1725,40 @@ static int udf_check_anchor_block(struct super_block *sb, sector_t block, if (UDF_QUERY_FLAG(sb, UDF_FLAG_VARCONV) && udf_fixed_to_variable(block) >= sb->s_bdev->bd_inode->i_size >> sb->s_blocksize_bits) - return 0; + return -EAGAIN; bh = udf_read_tagged(sb, block, block, &ident); if (!bh) - return 0; + return -EAGAIN; if (ident != TAG_IDENT_AVDP) { brelse(bh); - return 0; + return -EAGAIN; } ret = udf_load_sequence(sb, bh, fileset); brelse(bh); return ret; } -/* Search for an anchor volume descriptor pointer */ -static sector_t udf_scan_anchors(struct super_block *sb, sector_t lastblock, - struct kernel_lb_addr *fileset) +/* + * Search for an anchor volume descriptor pointer. + * + * Returns < 0 on error, 0 on success. -EAGAIN is special - try next set + * of anchors. + */ +static int udf_scan_anchors(struct super_block *sb, sector_t *lastblock, + struct kernel_lb_addr *fileset) { sector_t last[6]; int i; struct udf_sb_info *sbi = UDF_SB(sb); int last_count = 0; + int ret; /* First try user provided anchor */ if (sbi->s_anchor) { - if (udf_check_anchor_block(sb, sbi->s_anchor, fileset)) - return lastblock; + ret = udf_check_anchor_block(sb, sbi->s_anchor, fileset); + if (ret != -EAGAIN) + return ret; } /* * according to spec, anchor is in either: @@ -1720,39 +1767,46 @@ static sector_t udf_scan_anchors(struct super_block *sb, sector_t lastblock, * lastblock * however, if the disc isn't closed, it could be 512. */ - if (udf_check_anchor_block(sb, sbi->s_session + 256, fileset)) - return lastblock; + ret = udf_check_anchor_block(sb, sbi->s_session + 256, fileset); + if (ret != -EAGAIN) + return ret; /* * The trouble is which block is the last one. Drives often misreport * this so we try various possibilities. */ - last[last_count++] = lastblock; - if (lastblock >= 1) - last[last_count++] = lastblock - 1; - last[last_count++] = lastblock + 1; - if (lastblock >= 2) - last[last_count++] = lastblock - 2; - if (lastblock >= 150) - last[last_count++] = lastblock - 150; - if (lastblock >= 152) - last[last_count++] = lastblock - 152; + last[last_count++] = *lastblock; + if (*lastblock >= 1) + last[last_count++] = *lastblock - 1; + last[last_count++] = *lastblock + 1; + if (*lastblock >= 2) + last[last_count++] = *lastblock - 2; + if (*lastblock >= 150) + last[last_count++] = *lastblock - 150; + if (*lastblock >= 152) + last[last_count++] = *lastblock - 152; for (i = 0; i < last_count; i++) { if (last[i] >= sb->s_bdev->bd_inode->i_size >> sb->s_blocksize_bits) continue; - if (udf_check_anchor_block(sb, last[i], fileset)) - return last[i]; + ret = udf_check_anchor_block(sb, last[i], fileset); + if (ret != -EAGAIN) { + if (!ret) + *lastblock = last[i]; + return ret; + } if (last[i] < 256) continue; - if (udf_check_anchor_block(sb, last[i] - 256, fileset)) - return last[i]; + ret = udf_check_anchor_block(sb, last[i] - 256, fileset); + if (ret != -EAGAIN) { + if (!ret) + *lastblock = last[i]; + return ret; + } } /* Finally try block 512 in case media is open */ - if (udf_check_anchor_block(sb, sbi->s_session + 512, fileset)) - return last[0]; - return 0; + return udf_check_anchor_block(sb, sbi->s_session + 512, fileset); } /* @@ -1760,54 +1814,59 @@ static sector_t udf_scan_anchors(struct super_block *sb, sector_t lastblock, * area specified by it. The function expects sbi->s_lastblock to be the last * block on the media. * - * Return 1 if ok, 0 if not found. - * + * Return <0 on error, 0 if anchor found. -EAGAIN is special meaning anchor + * was not found. */ static int udf_find_anchor(struct super_block *sb, struct kernel_lb_addr *fileset) { - sector_t lastblock; struct udf_sb_info *sbi = UDF_SB(sb); + sector_t lastblock = sbi->s_last_block; + int ret; - lastblock = udf_scan_anchors(sb, sbi->s_last_block, fileset); - if (lastblock) + ret = udf_scan_anchors(sb, &lastblock, fileset); + if (ret != -EAGAIN) goto out; /* No anchor found? Try VARCONV conversion of block numbers */ UDF_SET_FLAG(sb, UDF_FLAG_VARCONV); + lastblock = udf_variable_to_fixed(sbi->s_last_block); /* Firstly, we try to not convert number of the last block */ - lastblock = udf_scan_anchors(sb, - udf_variable_to_fixed(sbi->s_last_block), - fileset); - if (lastblock) + ret = udf_scan_anchors(sb, &lastblock, fileset); + if (ret != -EAGAIN) goto out; + lastblock = sbi->s_last_block; /* Secondly, we try with converted number of the last block */ - lastblock = udf_scan_anchors(sb, sbi->s_last_block, fileset); - if (!lastblock) { + ret = udf_scan_anchors(sb, &lastblock, fileset); + if (ret < 0) { /* VARCONV didn't help. Clear it. */ UDF_CLEAR_FLAG(sb, UDF_FLAG_VARCONV); - return 0; } out: - sbi->s_last_block = lastblock; - return 1; + if (ret == 0) + sbi->s_last_block = lastblock; + return ret; } /* * Check Volume Structure Descriptor, find Anchor block and load Volume - * Descriptor Sequence + * Descriptor Sequence. + * + * Returns < 0 on error, 0 on success. -EAGAIN is special meaning anchor + * block was not found. */ static int udf_load_vrs(struct super_block *sb, struct udf_options *uopt, int silent, struct kernel_lb_addr *fileset) { struct udf_sb_info *sbi = UDF_SB(sb); loff_t nsr_off; + int ret; if (!sb_set_blocksize(sb, uopt->blocksize)) { if (!silent) udf_warn(sb, "Bad block size\n"); - return 0; + return -EINVAL; } sbi->s_last_block = uopt->lastblock; if (!uopt->novrs) { @@ -1828,12 +1887,13 @@ static int udf_load_vrs(struct super_block *sb, struct udf_options *uopt, /* Look for anchor block and load Volume Descriptor Sequence */ sbi->s_anchor = uopt->anchor; - if (!udf_find_anchor(sb, fileset)) { - if (!silent) + ret = udf_find_anchor(sb, fileset); + if (ret < 0) { + if (!silent && ret == -EAGAIN) udf_warn(sb, "No anchor found\n"); - return 0; + return ret; } - return 1; + return 0; } static void udf_open_lvid(struct super_block *sb) @@ -1939,7 +1999,7 @@ u64 lvid_get_unique_id(struct super_block *sb) static int udf_fill_super(struct super_block *sb, void *options, int silent) { - int ret; + int ret = -EINVAL; struct inode *inode = NULL; struct udf_options uopt; struct kernel_lb_addr rootdir, fileset; @@ -2011,7 +2071,7 @@ static int udf_fill_super(struct super_block *sb, void *options, int silent) } else { uopt.blocksize = bdev_logical_block_size(sb->s_bdev); ret = udf_load_vrs(sb, &uopt, silent, &fileset); - if (!ret && uopt.blocksize != UDF_DEFAULT_BLOCKSIZE) { + if (ret == -EAGAIN && uopt.blocksize != UDF_DEFAULT_BLOCKSIZE) { if (!silent) pr_notice("Rescanning with blocksize %d\n", UDF_DEFAULT_BLOCKSIZE); @@ -2021,8 +2081,11 @@ static int udf_fill_super(struct super_block *sb, void *options, int silent) ret = udf_load_vrs(sb, &uopt, silent, &fileset); } } - if (!ret) { - udf_warn(sb, "No partition found (1)\n"); + if (ret < 0) { + if (ret == -EAGAIN) { + udf_warn(sb, "No partition found (1)\n"); + ret = -EINVAL; + } goto error_out; } @@ -2040,9 +2103,13 @@ static int udf_fill_super(struct super_block *sb, void *options, int silent) udf_err(sb, "minUDFReadRev=%x (max is %x)\n", le16_to_cpu(lvidiu->minUDFReadRev), UDF_MAX_READ_VERSION); + ret = -EINVAL; + goto error_out; + } else if (minUDFWriteRev > UDF_MAX_WRITE_VERSION && + !(sb->s_flags & MS_RDONLY)) { + ret = -EACCES; goto error_out; - } else if (minUDFWriteRev > UDF_MAX_WRITE_VERSION) - sb->s_flags |= MS_RDONLY; + } sbi->s_udfrev = minUDFWriteRev; @@ -2054,17 +2121,20 @@ static int udf_fill_super(struct super_block *sb, void *options, int silent) if (!sbi->s_partitions) { udf_warn(sb, "No partition found (2)\n"); + ret = -EINVAL; goto error_out; } if (sbi->s_partmaps[sbi->s_partition].s_partition_flags & - UDF_PART_FLAG_READ_ONLY) { - pr_notice("Partition marked readonly; forcing readonly mount\n"); - sb->s_flags |= MS_RDONLY; + UDF_PART_FLAG_READ_ONLY && + !(sb->s_flags & MS_RDONLY)) { + ret = -EACCES; + goto error_out; } if (udf_find_fileset(sb, &fileset, &rootdir)) { udf_warn(sb, "No fileset found\n"); + ret = -EINVAL; goto error_out; } @@ -2086,6 +2156,7 @@ static int udf_fill_super(struct super_block *sb, void *options, int silent) if (!inode) { udf_err(sb, "Error in udf_iget, block=%d, partition=%d\n", rootdir.logicalBlockNum, rootdir.partitionReferenceNum); + ret = -EIO; goto error_out; } @@ -2093,6 +2164,7 @@ static int udf_fill_super(struct super_block *sb, void *options, int silent) sb->s_root = d_make_root(inode); if (!sb->s_root) { udf_err(sb, "Couldn't allocate root dentry\n"); + ret = -ENOMEM; goto error_out; } sb->s_maxbytes = MAX_LFS_FILESIZE; @@ -2113,7 +2185,7 @@ error_out: kfree(sbi); sb->s_fs_info = NULL; - return -EINVAL; + return ret; } void _udf_err(struct super_block *sb, const char *function, |