diff options
-rw-r--r-- | Documentation/filesystems/zonefs.rst | 15 | ||||
-rw-r--r-- | fs/zonefs/super.c | 221 | ||||
-rw-r--r-- | fs/zonefs/zonefs.h | 10 |
3 files changed, 233 insertions, 13 deletions
diff --git a/Documentation/filesystems/zonefs.rst b/Documentation/filesystems/zonefs.rst index 6c18bc8ce332..6b213fe9a33e 100644 --- a/Documentation/filesystems/zonefs.rst +++ b/Documentation/filesystems/zonefs.rst @@ -326,6 +326,21 @@ discover the amount of data that has been written to the zone. In the case of a read-only zone discovered at run-time, as indicated in the previous section. The size of the zone file is left unchanged from its last updated value. +A zoned block device (e.g. an NVMe Zoned Namespace device) may have limits on +the number of zones that can be active, that is, zones that are in the +implicit open, explicit open or closed conditions. This potential limitation +translates into a risk for applications to see write IO errors due to this +limit being exceeded if the zone of a file is not already active when a write +request is issued by the user. + +To avoid these potential errors, the "explicit-open" mount option forces zones +to be made active using an open zone command when a file is opened for writing +for the first time. If the zone open command succeeds, the application is then +guaranteed that write requests can be processed. Conversely, the +"explicit-open" mount option will result in a zone close command being issued +to the device on the last close() of a zone file if the zone is not full nor +empty. + Zonefs User Space Tools ======================= diff --git a/fs/zonefs/super.c b/fs/zonefs/super.c index 8ec7c8f109d7..64cc2a9c38c8 100644 --- a/fs/zonefs/super.c +++ b/fs/zonefs/super.c @@ -24,6 +24,39 @@ #include "zonefs.h" +static inline int zonefs_zone_mgmt(struct inode *inode, + enum req_opf op) +{ + struct zonefs_inode_info *zi = ZONEFS_I(inode); + int ret; + + lockdep_assert_held(&zi->i_truncate_mutex); + + ret = blkdev_zone_mgmt(inode->i_sb->s_bdev, op, zi->i_zsector, + zi->i_zone_size >> SECTOR_SHIFT, GFP_NOFS); + if (ret) { + zonefs_err(inode->i_sb, + "Zone management operation %s at %llu failed %d\n", + blk_op_str(op), zi->i_zsector, ret); + return ret; + } + + return 0; +} + +static inline void zonefs_i_size_write(struct inode *inode, loff_t isize) +{ + struct zonefs_inode_info *zi = ZONEFS_I(inode); + + i_size_write(inode, isize); + /* + * A full zone is no longer open/active and does not need + * explicit closing. + */ + if (isize >= zi->i_max_size) + zi->i_flags &= ~ZONEFS_ZONE_OPEN; +} + static int zonefs_iomap_begin(struct inode *inode, loff_t offset, loff_t length, unsigned int flags, struct iomap *iomap, struct iomap *srcmap) @@ -302,6 +335,17 @@ static int zonefs_io_error_cb(struct blk_zone *zone, unsigned int idx, } /* + * If the filesystem is mounted with the explicit-open mount option, we + * need to clear the ZONEFS_ZONE_OPEN flag if the zone transitioned to + * the read-only or offline condition, to avoid attempting an explicit + * close of the zone when the inode file is closed. + */ + if ((sbi->s_mount_opts & ZONEFS_MNTOPT_EXPLICIT_OPEN) && + (zone->cond == BLK_ZONE_COND_OFFLINE || + zone->cond == BLK_ZONE_COND_READONLY)) + zi->i_flags &= ~ZONEFS_ZONE_OPEN; + + /* * If error=remount-ro was specified, any error result in remounting * the volume as read-only. */ @@ -315,7 +359,7 @@ static int zonefs_io_error_cb(struct blk_zone *zone, unsigned int idx, * invalid data. */ zonefs_update_stats(inode, data_size); - i_size_write(inode, data_size); + zonefs_i_size_write(inode, data_size); zi->i_wpoffset = data_size; return 0; @@ -328,7 +372,7 @@ static int zonefs_io_error_cb(struct blk_zone *zone, unsigned int idx, * eventually correct the file size and zonefs inode write pointer offset * (which can be out of sync with the drive due to partial write failures). */ -static void zonefs_io_error(struct inode *inode, bool write) +static void __zonefs_io_error(struct inode *inode, bool write) { struct zonefs_inode_info *zi = ZONEFS_I(inode); struct super_block *sb = inode->i_sb; @@ -342,8 +386,6 @@ static void zonefs_io_error(struct inode *inode, bool write) }; int ret; - mutex_lock(&zi->i_truncate_mutex); - /* * Memory allocations in blkdev_report_zones() can trigger a memory * reclaim which may in turn cause a recursion into zonefs as well as @@ -359,7 +401,14 @@ static void zonefs_io_error(struct inode *inode, bool write) zonefs_err(sb, "Get inode %lu zone information failed %d\n", inode->i_ino, ret); memalloc_noio_restore(noio_flag); +} +static void zonefs_io_error(struct inode *inode, bool write) +{ + struct zonefs_inode_info *zi = ZONEFS_I(inode); + + mutex_lock(&zi->i_truncate_mutex); + __zonefs_io_error(inode, write); mutex_unlock(&zi->i_truncate_mutex); } @@ -397,13 +446,27 @@ static int zonefs_file_truncate(struct inode *inode, loff_t isize) if (isize == old_isize) goto unlock; - ret = blkdev_zone_mgmt(inode->i_sb->s_bdev, op, zi->i_zsector, - zi->i_zone_size >> SECTOR_SHIFT, GFP_NOFS); - if (ret) { - zonefs_err(inode->i_sb, - "Zone management operation at %llu failed %d", - zi->i_zsector, ret); + ret = zonefs_zone_mgmt(inode, op); + if (ret) goto unlock; + + /* + * If the mount option ZONEFS_MNTOPT_EXPLICIT_OPEN is set, + * take care of open zones. + */ + if (zi->i_flags & ZONEFS_ZONE_OPEN) { + /* + * Truncating a zone to EMPTY or FULL is the equivalent of + * closing the zone. For a truncation to 0, we need to + * re-open the zone to ensure new writes can be processed. + * For a truncation to the maximum file size, the zone is + * closed and writes cannot be accepted anymore, so clear + * the open flag. + */ + if (!isize) + ret = zonefs_zone_mgmt(inode, REQ_OP_ZONE_OPEN); + else + zi->i_flags &= ~ZONEFS_ZONE_OPEN; } zonefs_update_stats(inode, isize); @@ -584,7 +647,7 @@ static int zonefs_file_write_dio_end_io(struct kiocb *iocb, ssize_t size, mutex_lock(&zi->i_truncate_mutex); if (i_size_read(inode) < iocb->ki_pos + size) { zonefs_update_stats(inode, iocb->ki_pos + size); - i_size_write(inode, iocb->ki_pos + size); + zonefs_i_size_write(inode, iocb->ki_pos + size); } mutex_unlock(&zi->i_truncate_mutex); } @@ -865,8 +928,128 @@ inode_unlock: return ret; } +static inline bool zonefs_file_use_exp_open(struct inode *inode, struct file *file) +{ + struct zonefs_inode_info *zi = ZONEFS_I(inode); + struct zonefs_sb_info *sbi = ZONEFS_SB(inode->i_sb); + + if (!(sbi->s_mount_opts & ZONEFS_MNTOPT_EXPLICIT_OPEN)) + return false; + + if (zi->i_ztype != ZONEFS_ZTYPE_SEQ) + return false; + + if (!(file->f_mode & FMODE_WRITE)) + return false; + + return true; +} + +static int zonefs_open_zone(struct inode *inode) +{ + struct zonefs_inode_info *zi = ZONEFS_I(inode); + struct zonefs_sb_info *sbi = ZONEFS_SB(inode->i_sb); + int ret = 0; + + mutex_lock(&zi->i_truncate_mutex); + + zi->i_wr_refcnt++; + if (zi->i_wr_refcnt == 1) { + + if (atomic_inc_return(&sbi->s_open_zones) > sbi->s_max_open_zones) { + atomic_dec(&sbi->s_open_zones); + ret = -EBUSY; + goto unlock; + } + + if (i_size_read(inode) < zi->i_max_size) { + ret = zonefs_zone_mgmt(inode, REQ_OP_ZONE_OPEN); + if (ret) { + zi->i_wr_refcnt--; + atomic_dec(&sbi->s_open_zones); + goto unlock; + } + zi->i_flags |= ZONEFS_ZONE_OPEN; + } + } + +unlock: + mutex_unlock(&zi->i_truncate_mutex); + + return ret; +} + +static int zonefs_file_open(struct inode *inode, struct file *file) +{ + int ret; + + ret = generic_file_open(inode, file); + if (ret) + return ret; + + if (zonefs_file_use_exp_open(inode, file)) + return zonefs_open_zone(inode); + + return 0; +} + +static void zonefs_close_zone(struct inode *inode) +{ + struct zonefs_inode_info *zi = ZONEFS_I(inode); + int ret = 0; + + mutex_lock(&zi->i_truncate_mutex); + zi->i_wr_refcnt--; + if (!zi->i_wr_refcnt) { + struct zonefs_sb_info *sbi = ZONEFS_SB(inode->i_sb); + struct super_block *sb = inode->i_sb; + + /* + * If the file zone is full, it is not open anymore and we only + * need to decrement the open count. + */ + if (!(zi->i_flags & ZONEFS_ZONE_OPEN)) + goto dec; + + ret = zonefs_zone_mgmt(inode, REQ_OP_ZONE_CLOSE); + if (ret) { + __zonefs_io_error(inode, false); + /* + * Leaving zones explicitly open may lead to a state + * where most zones cannot be written (zone resources + * exhausted). So take preventive action by remounting + * read-only. + */ + if (zi->i_flags & ZONEFS_ZONE_OPEN && + !(sb->s_flags & SB_RDONLY)) { + zonefs_warn(sb, "closing zone failed, remounting filesystem read-only\n"); + sb->s_flags |= SB_RDONLY; + } + } + zi->i_flags &= ~ZONEFS_ZONE_OPEN; +dec: + atomic_dec(&sbi->s_open_zones); + } + mutex_unlock(&zi->i_truncate_mutex); +} + +static int zonefs_file_release(struct inode *inode, struct file *file) +{ + /* + * If we explicitly open a zone we must close it again as well, but the + * zone management operation can fail (either due to an IO error or as + * the zone has gone offline or read-only). Make sure we don't fail the + * close(2) for user-space. + */ + if (zonefs_file_use_exp_open(inode, file)) + zonefs_close_zone(inode); + + return 0; +} + static const struct file_operations zonefs_file_operations = { - .open = generic_file_open, + .open = zonefs_file_open, + .release = zonefs_file_release, .fsync = zonefs_file_fsync, .mmap = zonefs_file_mmap, .llseek = zonefs_file_llseek, @@ -890,6 +1073,7 @@ static struct inode *zonefs_alloc_inode(struct super_block *sb) inode_init_once(&zi->i_vnode); mutex_init(&zi->i_truncate_mutex); init_rwsem(&zi->i_mmap_sem); + zi->i_wr_refcnt = 0; return &zi->i_vnode; } @@ -940,7 +1124,7 @@ static int zonefs_statfs(struct dentry *dentry, struct kstatfs *buf) enum { Opt_errors_ro, Opt_errors_zro, Opt_errors_zol, Opt_errors_repair, - Opt_err, + Opt_explicit_open, Opt_err, }; static const match_table_t tokens = { @@ -948,6 +1132,7 @@ static const match_table_t tokens = { { Opt_errors_zro, "errors=zone-ro"}, { Opt_errors_zol, "errors=zone-offline"}, { Opt_errors_repair, "errors=repair"}, + { Opt_explicit_open, "explicit-open" }, { Opt_err, NULL} }; @@ -984,6 +1169,9 @@ static int zonefs_parse_options(struct super_block *sb, char *options) sbi->s_mount_opts &= ~ZONEFS_MNTOPT_ERRORS_MASK; sbi->s_mount_opts |= ZONEFS_MNTOPT_ERRORS_REPAIR; break; + case Opt_explicit_open: + sbi->s_mount_opts |= ZONEFS_MNTOPT_EXPLICIT_OPEN; + break; default: return -EINVAL; } @@ -1403,6 +1591,13 @@ static int zonefs_fill_super(struct super_block *sb, void *data, int silent) sbi->s_gid = GLOBAL_ROOT_GID; sbi->s_perm = 0640; sbi->s_mount_opts = ZONEFS_MNTOPT_ERRORS_RO; + sbi->s_max_open_zones = bdev_max_open_zones(sb->s_bdev); + atomic_set(&sbi->s_open_zones, 0); + if (!sbi->s_max_open_zones && + sbi->s_mount_opts & ZONEFS_MNTOPT_EXPLICIT_OPEN) { + zonefs_info(sb, "No open zones limit. Ignoring explicit_open mount option\n"); + sbi->s_mount_opts &= ~ZONEFS_MNTOPT_EXPLICIT_OPEN; + } ret = zonefs_read_super(sb); if (ret) diff --git a/fs/zonefs/zonefs.h b/fs/zonefs/zonefs.h index 55b39970acb2..51141907097c 100644 --- a/fs/zonefs/zonefs.h +++ b/fs/zonefs/zonefs.h @@ -38,6 +38,8 @@ static inline enum zonefs_ztype zonefs_zone_type(struct blk_zone *zone) return ZONEFS_ZTYPE_SEQ; } +#define ZONEFS_ZONE_OPEN (1 << 0) + /* * In-memory inode data. */ @@ -74,6 +76,10 @@ struct zonefs_inode_info { */ struct mutex i_truncate_mutex; struct rw_semaphore i_mmap_sem; + + /* guarded by i_truncate_mutex */ + unsigned int i_wr_refcnt; + unsigned int i_flags; }; static inline struct zonefs_inode_info *ZONEFS_I(struct inode *inode) @@ -154,6 +160,7 @@ enum zonefs_features { #define ZONEFS_MNTOPT_ERRORS_MASK \ (ZONEFS_MNTOPT_ERRORS_RO | ZONEFS_MNTOPT_ERRORS_ZRO | \ ZONEFS_MNTOPT_ERRORS_ZOL | ZONEFS_MNTOPT_ERRORS_REPAIR) +#define ZONEFS_MNTOPT_EXPLICIT_OPEN (1 << 4) /* Explicit open/close of zones on open/close */ /* * In-memory Super block information. @@ -175,6 +182,9 @@ struct zonefs_sb_info { loff_t s_blocks; loff_t s_used_blocks; + + unsigned int s_max_open_zones; + atomic_t s_open_zones; }; static inline struct zonefs_sb_info *ZONEFS_SB(struct super_block *sb) |