summaryrefslogtreecommitdiffstats
path: root/fs
diff options
context:
space:
mode:
Diffstat (limited to 'fs')
-rw-r--r--fs/Kconfig1
-rw-r--r--fs/bfs/inode.c4
-rw-r--r--fs/block_dev.c66
-rw-r--r--fs/btrfs/backref.c41
-rw-r--r--fs/btrfs/btrfs_inode.h7
-rw-r--r--fs/btrfs/compression.c18
-rw-r--r--fs/btrfs/ctree.c20
-rw-r--r--fs/btrfs/ctree.h34
-rw-r--r--fs/btrfs/delayed-inode.c46
-rw-r--r--fs/btrfs/delayed-inode.h6
-rw-r--r--fs/btrfs/delayed-ref.c8
-rw-r--r--fs/btrfs/delayed-ref.h8
-rw-r--r--fs/btrfs/dev-replace.c9
-rw-r--r--fs/btrfs/disk-io.c13
-rw-r--r--fs/btrfs/disk-io.h4
-rw-r--r--fs/btrfs/extent-tree.c35
-rw-r--r--fs/btrfs/extent_io.c59
-rw-r--r--fs/btrfs/extent_io.h8
-rw-r--r--fs/btrfs/extent_map.c10
-rw-r--r--fs/btrfs/extent_map.h3
-rw-r--r--fs/btrfs/file.c82
-rw-r--r--fs/btrfs/free-space-cache.c2
-rw-r--r--fs/btrfs/inode.c289
-rw-r--r--fs/btrfs/ioctl.c33
-rw-r--r--fs/btrfs/ordered-data.c20
-rw-r--r--fs/btrfs/ordered-data.h2
-rw-r--r--fs/btrfs/qgroup.c102
-rw-r--r--fs/btrfs/qgroup.h51
-rw-r--r--fs/btrfs/raid56.c38
-rw-r--r--fs/btrfs/reada.c37
-rw-r--r--fs/btrfs/root-tree.c3
-rw-r--r--fs/btrfs/scrub.c331
-rw-r--r--fs/btrfs/send.c23
-rw-r--r--fs/btrfs/super.c3
-rw-r--r--fs/btrfs/tests/btrfs-tests.c1
-rw-r--r--fs/btrfs/transaction.c48
-rw-r--r--fs/btrfs/transaction.h6
-rw-r--r--fs/btrfs/tree-log.c2
-rw-r--r--fs/btrfs/volumes.c854
-rw-r--r--fs/btrfs/volumes.h8
-rw-r--r--fs/ceph/addr.c10
-rw-r--r--fs/ceph/caps.c25
-rw-r--r--fs/ceph/debugfs.c21
-rw-r--r--fs/ceph/dir.c23
-rw-r--r--fs/ceph/file.c68
-rw-r--r--fs/ceph/inode.c17
-rw-r--r--fs/ceph/mds_client.c75
-rw-r--r--fs/ceph/mds_client.h15
-rw-r--r--fs/ceph/mdsmap.c44
-rw-r--r--fs/ceph/snap.c2
-rw-r--r--fs/ceph/super.c7
-rw-r--r--fs/ceph/super.h31
-rw-r--r--fs/ceph/xattr.c3
-rw-r--r--fs/dax.c95
-rw-r--r--fs/ext2/super.c1
-rw-r--r--fs/ext4/file.c21
-rw-r--r--fs/ext4/super.c1
-rw-r--r--fs/fuse/dev.c24
-rw-r--r--fs/fuse/file.c32
-rw-r--r--fs/fuse/fuse_i.h11
-rw-r--r--fs/fuse/inode.c9
-rw-r--r--fs/jbd2/journal.c2
-rw-r--r--fs/jffs2/readinode.c2
-rw-r--r--fs/lockd/clntlock.c1
-rw-r--r--fs/lockd/clntproc.c26
-rw-r--r--fs/lockd/svc.c6
-rw-r--r--fs/lockd/svclock.c18
-rw-r--r--fs/locks.c2
-rw-r--r--fs/namei.c48
-rw-r--r--fs/namespace.c18
-rw-r--r--fs/nfs/Kconfig5
-rw-r--r--fs/nfs/Makefile1
-rw-r--r--fs/nfs/callback.c26
-rw-r--r--fs/nfs/callback_proc.c47
-rw-r--r--fs/nfs/callback_xdr.c109
-rw-r--r--fs/nfs/client.c67
-rw-r--r--fs/nfs/dir.c104
-rw-r--r--fs/nfs/direct.c21
-rw-r--r--fs/nfs/file.c30
-rw-r--r--fs/nfs/filelayout/filelayout.c8
-rw-r--r--fs/nfs/flexfilelayout/flexfilelayout.c24
-rw-r--r--fs/nfs/flexfilelayout/flexfilelayoutdev.c10
-rw-r--r--fs/nfs/inode.c5
-rw-r--r--fs/nfs/internal.h5
-rw-r--r--fs/nfs/namespace.c34
-rw-r--r--fs/nfs/nfs3proc.c54
-rw-r--r--fs/nfs/nfs42proc.c24
-rw-r--r--fs/nfs/nfs42xdr.c22
-rw-r--r--fs/nfs/nfs4client.c283
-rw-r--r--fs/nfs/nfs4getroot.c3
-rw-r--r--fs/nfs/nfs4namespace.c7
-rw-r--r--fs/nfs/nfs4proc.c99
-rw-r--r--fs/nfs/nfs4state.c10
-rw-r--r--fs/nfs/nfs4xdr.c94
-rw-r--r--fs/nfs/objlayout/Kbuild5
-rw-r--r--fs/nfs/objlayout/objio_osd.c675
-rw-r--r--fs/nfs/objlayout/objlayout.c706
-rw-r--r--fs/nfs/objlayout/objlayout.h183
-rw-r--r--fs/nfs/objlayout/pnfs_osd_xdr_cli.c415
-rw-r--r--fs/nfs/pagelist.c77
-rw-r--r--fs/nfs/pnfs.c62
-rw-r--r--fs/nfs/pnfs.h6
-rw-r--r--fs/nfs/pnfs_nfs.c24
-rw-r--r--fs/nfs/proc.c2
-rw-r--r--fs/nfs/read.c9
-rw-r--r--fs/nfs/write.c121
-rw-r--r--fs/nfsd/nfs3xdr.c23
-rw-r--r--fs/nfsd/nfs4proc.c3
-rw-r--r--fs/nfsd/nfs4state.c25
-rw-r--r--fs/nfsd/nfs4xdr.c19
-rw-r--r--fs/nfsd/nfsxdr.c13
-rw-r--r--fs/nfsd/vfs.c24
-rw-r--r--fs/open.c12
-rw-r--r--fs/overlayfs/copy_up.c82
-rw-r--r--fs/overlayfs/dir.c37
-rw-r--r--fs/overlayfs/inode.c103
-rw-r--r--fs/overlayfs/namei.c141
-rw-r--r--fs/overlayfs/overlayfs.h41
-rw-r--r--fs/overlayfs/ovl_entry.h2
-rw-r--r--fs/overlayfs/super.c40
-rw-r--r--fs/overlayfs/util.c19
-rw-r--r--fs/pstore/ram.c2
-rw-r--r--fs/signalfd.c2
-rw-r--r--fs/ubifs/Kconfig13
-rw-r--r--fs/ubifs/debug.c4
-rw-r--r--fs/ubifs/ioctl.c6
-rw-r--r--fs/ubifs/recovery.c1
-rw-r--r--fs/ubifs/ubifs.h14
-rw-r--r--fs/ubifs/xattr.c6
-rw-r--r--fs/xfs/xfs_super.c1
130 files changed, 3024 insertions, 3979 deletions
diff --git a/fs/Kconfig b/fs/Kconfig
index 83eab52fb3f6..b0e42b6a96b9 100644
--- a/fs/Kconfig
+++ b/fs/Kconfig
@@ -39,6 +39,7 @@ config FS_DAX
depends on MMU
depends on !(ARM || MIPS || SPARC)
select FS_IOMAP
+ select DAX
help
Direct Access (DAX) can be used on memory-backed block devices.
If the block device supports DAX and the filesystem supports DAX,
diff --git a/fs/bfs/inode.c b/fs/bfs/inode.c
index f2deec0a62f0..25e312cb6071 100644
--- a/fs/bfs/inode.c
+++ b/fs/bfs/inode.c
@@ -1,7 +1,7 @@
/*
* fs/bfs/inode.c
* BFS superblock and inode operations.
- * Copyright (C) 1999-2006 Tigran Aivazian <tigran@aivazian.fsnet.co.uk>
+ * Copyright (C) 1999-2006 Tigran Aivazian <aivazian.tigran@gmail.com>
* From fs/minix, Copyright (C) 1991, 1992 Linus Torvalds.
*
* Made endianness-clean by Andrew Stribblehill <ads@wompom.org>, 2005.
@@ -19,7 +19,7 @@
#include <linux/uaccess.h>
#include "bfs.h"
-MODULE_AUTHOR("Tigran Aivazian <tigran@aivazian.fsnet.co.uk>");
+MODULE_AUTHOR("Tigran Aivazian <aivazian.tigran@gmail.com>");
MODULE_DESCRIPTION("SCO UnixWare BFS filesystem for Linux");
MODULE_LICENSE("GPL");
diff --git a/fs/block_dev.c b/fs/block_dev.c
index 2a305c1a2d88..519599dddd36 100644
--- a/fs/block_dev.c
+++ b/fs/block_dev.c
@@ -717,72 +717,6 @@ int bdev_write_page(struct block_device *bdev, sector_t sector,
}
EXPORT_SYMBOL_GPL(bdev_write_page);
-int bdev_dax_pgoff(struct block_device *bdev, sector_t sector, size_t size,
- pgoff_t *pgoff)
-{
- phys_addr_t phys_off = (get_start_sect(bdev) + sector) * 512;
-
- if (pgoff)
- *pgoff = PHYS_PFN(phys_off);
- if (phys_off % PAGE_SIZE || size % PAGE_SIZE)
- return -EINVAL;
- return 0;
-}
-EXPORT_SYMBOL(bdev_dax_pgoff);
-
-/**
- * bdev_dax_supported() - Check if the device supports dax for filesystem
- * @sb: The superblock of the device
- * @blocksize: The block size of the device
- *
- * This is a library function for filesystems to check if the block device
- * can be mounted with dax option.
- *
- * Return: negative errno if unsupported, 0 if supported.
- */
-int bdev_dax_supported(struct super_block *sb, int blocksize)
-{
- struct block_device *bdev = sb->s_bdev;
- struct dax_device *dax_dev;
- pgoff_t pgoff;
- int err, id;
- void *kaddr;
- pfn_t pfn;
- long len;
-
- if (blocksize != PAGE_SIZE) {
- vfs_msg(sb, KERN_ERR, "error: unsupported blocksize for dax");
- return -EINVAL;
- }
-
- err = bdev_dax_pgoff(bdev, 0, PAGE_SIZE, &pgoff);
- if (err) {
- vfs_msg(sb, KERN_ERR, "error: unaligned partition for dax");
- return err;
- }
-
- dax_dev = dax_get_by_host(bdev->bd_disk->disk_name);
- if (!dax_dev) {
- vfs_msg(sb, KERN_ERR, "error: device does not support dax");
- return -EOPNOTSUPP;
- }
-
- id = dax_read_lock();
- len = dax_direct_access(dax_dev, pgoff, 1, &kaddr, &pfn);
- dax_read_unlock(id);
-
- put_dax(dax_dev);
-
- if (len < 1) {
- vfs_msg(sb, KERN_ERR,
- "error: dax access failed (%ld)", len);
- return len < 0 ? len : -EIO;
- }
-
- return 0;
-}
-EXPORT_SYMBOL_GPL(bdev_dax_supported);
-
/*
* pseudo-fs
*/
diff --git a/fs/btrfs/backref.c b/fs/btrfs/backref.c
index 7699e16784d3..24865da63d8f 100644
--- a/fs/btrfs/backref.c
+++ b/fs/btrfs/backref.c
@@ -26,6 +26,11 @@
#include "delayed-ref.h"
#include "locking.h"
+enum merge_mode {
+ MERGE_IDENTICAL_KEYS = 1,
+ MERGE_IDENTICAL_PARENTS,
+};
+
/* Just an arbitrary number so we can be sure this happened */
#define BACKREF_FOUND_SHARED 6
@@ -533,7 +538,7 @@ static int add_all_parents(struct btrfs_root *root, struct btrfs_path *path,
* slot==nritems. In that case, go to the next leaf before we continue.
*/
if (path->slots[0] >= btrfs_header_nritems(path->nodes[0])) {
- if (time_seq == (u64)-1)
+ if (time_seq == SEQ_LAST)
ret = btrfs_next_leaf(root, path);
else
ret = btrfs_next_old_leaf(root, path, time_seq);
@@ -577,7 +582,7 @@ static int add_all_parents(struct btrfs_root *root, struct btrfs_path *path,
eie = NULL;
}
next:
- if (time_seq == (u64)-1)
+ if (time_seq == SEQ_LAST)
ret = btrfs_next_item(root, path);
else
ret = btrfs_next_old_item(root, path, time_seq);
@@ -629,7 +634,7 @@ static int __resolve_indirect_ref(struct btrfs_fs_info *fs_info,
if (path->search_commit_root)
root_level = btrfs_header_level(root->commit_root);
- else if (time_seq == (u64)-1)
+ else if (time_seq == SEQ_LAST)
root_level = btrfs_header_level(root->node);
else
root_level = btrfs_old_root_level(root, time_seq);
@@ -640,7 +645,7 @@ static int __resolve_indirect_ref(struct btrfs_fs_info *fs_info,
}
path->lowest_level = level;
- if (time_seq == (u64)-1)
+ if (time_seq == SEQ_LAST)
ret = btrfs_search_slot(NULL, root, &ref->key_for_search, path,
0, 0);
else
@@ -809,14 +814,12 @@ static int __add_missing_keys(struct btrfs_fs_info *fs_info,
/*
* merge backrefs and adjust counts accordingly
*
- * mode = 1: merge identical keys, if key is set
- * FIXME: if we add more keys in __add_prelim_ref, we can merge more here.
- * additionally, we could even add a key range for the blocks we
- * looked into to merge even more (-> replace unresolved refs by those
- * having a parent).
- * mode = 2: merge identical parents
+ * FIXME: For MERGE_IDENTICAL_KEYS, if we add more keys in __add_prelim_ref
+ * then we can merge more here. Additionally, we could even add a key
+ * range for the blocks we looked into to merge even more (-> replace
+ * unresolved refs by those having a parent).
*/
-static void __merge_refs(struct list_head *head, int mode)
+static void __merge_refs(struct list_head *head, enum merge_mode mode)
{
struct __prelim_ref *pos1;
@@ -829,7 +832,7 @@ static void __merge_refs(struct list_head *head, int mode)
if (!ref_for_same_block(ref1, ref2))
continue;
- if (mode == 1) {
+ if (mode == MERGE_IDENTICAL_KEYS) {
if (!ref1->parent && ref2->parent)
swap(ref1, ref2);
} else {
@@ -1196,7 +1199,7 @@ static int __add_keyed_refs(struct btrfs_fs_info *fs_info,
*
* NOTE: This can return values > 0
*
- * If time_seq is set to (u64)-1, it will not search delayed_refs, and behave
+ * If time_seq is set to SEQ_LAST, it will not search delayed_refs, and behave
* much like trans == NULL case, the difference only lies in it will not
* commit root.
* The special case is for qgroup to search roots in commit_transaction().
@@ -1243,7 +1246,7 @@ static int find_parent_nodes(struct btrfs_trans_handle *trans,
path->skip_locking = 1;
}
- if (time_seq == (u64)-1)
+ if (time_seq == SEQ_LAST)
path->skip_locking = 1;
/*
@@ -1273,9 +1276,9 @@ again:
#ifdef CONFIG_BTRFS_FS_RUN_SANITY_TESTS
if (trans && likely(trans->type != __TRANS_DUMMY) &&
- time_seq != (u64)-1) {
+ time_seq != SEQ_LAST) {
#else
- if (trans && time_seq != (u64)-1) {
+ if (trans && time_seq != SEQ_LAST) {
#endif
/*
* look if there are updates for this ref queued and lock the
@@ -1286,7 +1289,7 @@ again:
head = btrfs_find_delayed_ref_head(delayed_refs, bytenr);
if (head) {
if (!mutex_trylock(&head->mutex)) {
- atomic_inc(&head->node.refs);
+ refcount_inc(&head->node.refs);
spin_unlock(&delayed_refs->lock);
btrfs_release_path(path);
@@ -1374,7 +1377,7 @@ again:
if (ret)
goto out;
- __merge_refs(&prefs, 1);
+ __merge_refs(&prefs, MERGE_IDENTICAL_KEYS);
ret = __resolve_indirect_refs(fs_info, path, time_seq, &prefs,
extent_item_pos, total_refs,
@@ -1382,7 +1385,7 @@ again:
if (ret)
goto out;
- __merge_refs(&prefs, 2);
+ __merge_refs(&prefs, MERGE_IDENTICAL_PARENTS);
while (!list_empty(&prefs)) {
ref = list_first_entry(&prefs, struct __prelim_ref, list);
diff --git a/fs/btrfs/btrfs_inode.h b/fs/btrfs/btrfs_inode.h
index 0c6baaba0651..b8622e4d1744 100644
--- a/fs/btrfs/btrfs_inode.h
+++ b/fs/btrfs/btrfs_inode.h
@@ -125,6 +125,13 @@ struct btrfs_inode {
u64 delalloc_bytes;
/*
+ * Total number of bytes pending delalloc that fall within a file
+ * range that is either a hole or beyond EOF (and no prealloc extent
+ * exists in the range). This is always <= delalloc_bytes.
+ */
+ u64 new_delalloc_bytes;
+
+ /*
* total number of bytes pending defrag, used by stat to check whether
* it needs COW.
*/
diff --git a/fs/btrfs/compression.c b/fs/btrfs/compression.c
index c7721a6aa3bb..10e6b282d09d 100644
--- a/fs/btrfs/compression.c
+++ b/fs/btrfs/compression.c
@@ -44,7 +44,7 @@
struct compressed_bio {
/* number of bios pending for this compressed extent */
- atomic_t pending_bios;
+ refcount_t pending_bios;
/* the pages with the compressed data on them */
struct page **compressed_pages;
@@ -161,7 +161,7 @@ static void end_compressed_bio_read(struct bio *bio)
/* if there are more bios still pending for this compressed
* extent, just exit
*/
- if (!atomic_dec_and_test(&cb->pending_bios))
+ if (!refcount_dec_and_test(&cb->pending_bios))
goto out;
inode = cb->inode;
@@ -274,7 +274,7 @@ static void end_compressed_bio_write(struct bio *bio)
/* if there are more bios still pending for this compressed
* extent, just exit
*/
- if (!atomic_dec_and_test(&cb->pending_bios))
+ if (!refcount_dec_and_test(&cb->pending_bios))
goto out;
/* ok, we're the last bio for this extent, step one is to
@@ -342,7 +342,7 @@ int btrfs_submit_compressed_write(struct inode *inode, u64 start,
cb = kmalloc(compressed_bio_size(fs_info, compressed_len), GFP_NOFS);
if (!cb)
return -ENOMEM;
- atomic_set(&cb->pending_bios, 0);
+ refcount_set(&cb->pending_bios, 0);
cb->errors = 0;
cb->inode = inode;
cb->start = start;
@@ -363,7 +363,7 @@ int btrfs_submit_compressed_write(struct inode *inode, u64 start,
bio_set_op_attrs(bio, REQ_OP_WRITE, 0);
bio->bi_private = cb;
bio->bi_end_io = end_compressed_bio_write;
- atomic_inc(&cb->pending_bios);
+ refcount_set(&cb->pending_bios, 1);
/* create and submit bios for the compressed pages */
bytes_left = compressed_len;
@@ -388,7 +388,7 @@ int btrfs_submit_compressed_write(struct inode *inode, u64 start,
* we inc the count. Otherwise, the cb might get
* freed before we're done setting it up
*/
- atomic_inc(&cb->pending_bios);
+ refcount_inc(&cb->pending_bios);
ret = btrfs_bio_wq_end_io(fs_info, bio,
BTRFS_WQ_ENDIO_DATA);
BUG_ON(ret); /* -ENOMEM */
@@ -607,7 +607,7 @@ int btrfs_submit_compressed_read(struct inode *inode, struct bio *bio,
if (!cb)
goto out;
- atomic_set(&cb->pending_bios, 0);
+ refcount_set(&cb->pending_bios, 0);
cb->errors = 0;
cb->inode = inode;
cb->mirror_num = mirror_num;
@@ -656,7 +656,7 @@ int btrfs_submit_compressed_read(struct inode *inode, struct bio *bio,
bio_set_op_attrs (comp_bio, REQ_OP_READ, 0);
comp_bio->bi_private = cb;
comp_bio->bi_end_io = end_compressed_bio_read;
- atomic_inc(&cb->pending_bios);
+ refcount_set(&cb->pending_bios, 1);
for (pg_index = 0; pg_index < nr_pages; pg_index++) {
page = cb->compressed_pages[pg_index];
@@ -685,7 +685,7 @@ int btrfs_submit_compressed_read(struct inode *inode, struct bio *bio,
* we inc the count. Otherwise, the cb might get
* freed before we're done setting it up
*/
- atomic_inc(&cb->pending_bios);
+ refcount_inc(&cb->pending_bios);
if (!(BTRFS_I(inode)->flags & BTRFS_INODE_NODATASUM)) {
ret = btrfs_lookup_bio_sums(inode, comp_bio,
diff --git a/fs/btrfs/ctree.c b/fs/btrfs/ctree.c
index 1c3b6c54d5ee..a3a75f1de002 100644
--- a/fs/btrfs/ctree.c
+++ b/fs/btrfs/ctree.c
@@ -567,7 +567,7 @@ tree_mod_log_insert_key(struct btrfs_fs_info *fs_info,
static noinline int
tree_mod_log_insert_move(struct btrfs_fs_info *fs_info,
struct extent_buffer *eb, int dst_slot, int src_slot,
- int nr_items, gfp_t flags)
+ int nr_items)
{
struct tree_mod_elem *tm = NULL;
struct tree_mod_elem **tm_list = NULL;
@@ -578,11 +578,11 @@ tree_mod_log_insert_move(struct btrfs_fs_info *fs_info,
if (!tree_mod_need_log(fs_info, eb))
return 0;
- tm_list = kcalloc(nr_items, sizeof(struct tree_mod_elem *), flags);
+ tm_list = kcalloc(nr_items, sizeof(struct tree_mod_elem *), GFP_NOFS);
if (!tm_list)
return -ENOMEM;
- tm = kzalloc(sizeof(*tm), flags);
+ tm = kzalloc(sizeof(*tm), GFP_NOFS);
if (!tm) {
ret = -ENOMEM;
goto free_tms;
@@ -596,7 +596,7 @@ tree_mod_log_insert_move(struct btrfs_fs_info *fs_info,
for (i = 0; i + dst_slot < src_slot && i < nr_items; i++) {
tm_list[i] = alloc_tree_mod_elem(eb, i + dst_slot,
- MOD_LOG_KEY_REMOVE_WHILE_MOVING, flags);
+ MOD_LOG_KEY_REMOVE_WHILE_MOVING, GFP_NOFS);
if (!tm_list[i]) {
ret = -ENOMEM;
goto free_tms;
@@ -663,7 +663,7 @@ __tree_mod_log_free_eb(struct btrfs_fs_info *fs_info,
static noinline int
tree_mod_log_insert_root(struct btrfs_fs_info *fs_info,
struct extent_buffer *old_root,
- struct extent_buffer *new_root, gfp_t flags,
+ struct extent_buffer *new_root,
int log_removal)
{
struct tree_mod_elem *tm = NULL;
@@ -678,14 +678,14 @@ tree_mod_log_insert_root(struct btrfs_fs_info *fs_info,
if (log_removal && btrfs_header_level(old_root) > 0) {
nritems = btrfs_header_nritems(old_root);
tm_list = kcalloc(nritems, sizeof(struct tree_mod_elem *),
- flags);
+ GFP_NOFS);
if (!tm_list) {
ret = -ENOMEM;
goto free_tms;
}
for (i = 0; i < nritems; i++) {
tm_list[i] = alloc_tree_mod_elem(old_root, i,
- MOD_LOG_KEY_REMOVE_WHILE_FREEING, flags);
+ MOD_LOG_KEY_REMOVE_WHILE_FREEING, GFP_NOFS);
if (!tm_list[i]) {
ret = -ENOMEM;
goto free_tms;
@@ -693,7 +693,7 @@ tree_mod_log_insert_root(struct btrfs_fs_info *fs_info,
}
}
- tm = kzalloc(sizeof(*tm), flags);
+ tm = kzalloc(sizeof(*tm), GFP_NOFS);
if (!tm) {
ret = -ENOMEM;
goto free_tms;
@@ -873,7 +873,7 @@ tree_mod_log_eb_move(struct btrfs_fs_info *fs_info, struct extent_buffer *dst,
{
int ret;
ret = tree_mod_log_insert_move(fs_info, dst, dst_offset, src_offset,
- nr_items, GFP_NOFS);
+ nr_items);
BUG_ON(ret < 0);
}
@@ -943,7 +943,7 @@ tree_mod_log_set_root_pointer(struct btrfs_root *root,
{
int ret;
ret = tree_mod_log_insert_root(root->fs_info, root->node,
- new_root_node, GFP_NOFS, log_removal);
+ new_root_node, log_removal);
BUG_ON(ret < 0);
}
diff --git a/fs/btrfs/ctree.h b/fs/btrfs/ctree.h
index 3e21211e99c3..643c70d2b2e6 100644
--- a/fs/btrfs/ctree.h
+++ b/fs/btrfs/ctree.h
@@ -39,6 +39,7 @@
#include <linux/security.h>
#include <linux/sizes.h>
#include <linux/dynamic_debug.h>
+#include <linux/refcount.h>
#include "extent_io.h"
#include "extent_map.h"
#include "async-thread.h"
@@ -518,7 +519,7 @@ struct btrfs_caching_control {
struct btrfs_work work;
struct btrfs_block_group_cache *block_group;
u64 progress;
- atomic_t count;
+ refcount_t count;
};
/* Once caching_thread() finds this much free space, it will wake up waiters. */
@@ -538,6 +539,14 @@ struct btrfs_io_ctl {
unsigned check_crcs:1;
};
+/*
+ * Tree to record all locked full stripes of a RAID5/6 block group
+ */
+struct btrfs_full_stripe_locks_tree {
+ struct rb_root root;
+ struct mutex lock;
+};
+
struct btrfs_block_group_cache {
struct btrfs_key key;
struct btrfs_block_group_item item;
@@ -648,6 +657,9 @@ struct btrfs_block_group_cache {
* Protected by free_space_lock.
*/
int needs_free_space;
+
+ /* Record locked full stripes for RAID5/6 block group */
+ struct btrfs_full_stripe_locks_tree full_stripe_locks_root;
};
/* delayed seq elem */
@@ -658,6 +670,8 @@ struct seq_list {
#define SEQ_LIST_INIT(name) { .list = LIST_HEAD_INIT((name).list), .seq = 0 }
+#define SEQ_LAST ((u64)-1)
+
enum btrfs_orphan_cleanup_state {
ORPHAN_CLEANUP_STARTED = 1,
ORPHAN_CLEANUP_DONE = 2,
@@ -702,6 +716,11 @@ struct btrfs_delayed_root;
#define BTRFS_FS_BTREE_ERR 11
#define BTRFS_FS_LOG1_ERR 12
#define BTRFS_FS_LOG2_ERR 13
+/*
+ * Indicate that a whole-filesystem exclusive operation is running
+ * (device replace, resize, device add/delete, balance)
+ */
+#define BTRFS_FS_EXCL_OP 14
struct btrfs_fs_info {
u8 fsid[BTRFS_FSID_SIZE];
@@ -1066,8 +1085,6 @@ struct btrfs_fs_info {
/* device replace state */
struct btrfs_dev_replace dev_replace;
- atomic_t mutually_exclusive_operation_running;
-
struct percpu_counter bio_counter;
wait_queue_head_t replace_wait;
@@ -1220,7 +1237,7 @@ struct btrfs_root {
dev_t anon_dev;
spinlock_t root_item_lock;
- atomic_t refs;
+ refcount_t refs;
struct mutex delalloc_mutex;
spinlock_t delalloc_lock;
@@ -3646,6 +3663,12 @@ int btrfs_scrub_cancel_dev(struct btrfs_fs_info *info,
struct btrfs_device *dev);
int btrfs_scrub_progress(struct btrfs_fs_info *fs_info, u64 devid,
struct btrfs_scrub_progress *progress);
+static inline void btrfs_init_full_stripe_locks_tree(
+ struct btrfs_full_stripe_locks_tree *locks_root)
+{
+ locks_root->root = RB_ROOT;
+ mutex_init(&locks_root->lock);
+}
/* dev-replace.c */
void btrfs_bio_counter_inc_blocked(struct btrfs_fs_info *fs_info);
@@ -3670,8 +3693,7 @@ struct reada_control *btrfs_reada_add(struct btrfs_root *root,
struct btrfs_key *start, struct btrfs_key *end);
int btrfs_reada_wait(void *handle);
void btrfs_reada_detach(void *handle);
-int btree_readahead_hook(struct btrfs_fs_info *fs_info,
- struct extent_buffer *eb, int err);
+int btree_readahead_hook(struct extent_buffer *eb, int err);
static inline int is_fstree(u64 rootid)
{
diff --git a/fs/btrfs/delayed-inode.c b/fs/btrfs/delayed-inode.c
index 1aff676f0e5b..8ae409b5a61d 100644
--- a/fs/btrfs/delayed-inode.c
+++ b/fs/btrfs/delayed-inode.c
@@ -52,7 +52,7 @@ static inline void btrfs_init_delayed_node(
{
delayed_node->root = root;
delayed_node->inode_id = inode_id;
- atomic_set(&delayed_node->refs, 0);
+ refcount_set(&delayed_node->refs, 0);
delayed_node->ins_root = RB_ROOT;
delayed_node->del_root = RB_ROOT;
mutex_init(&delayed_node->mutex);
@@ -81,7 +81,7 @@ static struct btrfs_delayed_node *btrfs_get_delayed_node(
node = READ_ONCE(btrfs_inode->delayed_node);
if (node) {
- atomic_inc(&node->refs);
+ refcount_inc(&node->refs);
return node;
}
@@ -89,14 +89,14 @@ static struct btrfs_delayed_node *btrfs_get_delayed_node(
node = radix_tree_lookup(&root->delayed_nodes_tree, ino);
if (node) {
if (btrfs_inode->delayed_node) {
- atomic_inc(&node->refs); /* can be accessed */
+ refcount_inc(&node->refs); /* can be accessed */
BUG_ON(btrfs_inode->delayed_node != node);
spin_unlock(&root->inode_lock);
return node;
}
btrfs_inode->delayed_node = node;
/* can be accessed and cached in the inode */
- atomic_add(2, &node->refs);
+ refcount_add(2, &node->refs);
spin_unlock(&root->inode_lock);
return node;
}
@@ -125,7 +125,7 @@ again:
btrfs_init_delayed_node(node, root, ino);
/* cached in the btrfs inode and can be accessed */
- atomic_add(2, &node->refs);
+ refcount_set(&node->refs, 2);
ret = radix_tree_preload(GFP_NOFS);
if (ret) {
@@ -166,7 +166,7 @@ static void btrfs_queue_delayed_node(struct btrfs_delayed_root *root,
} else {
list_add_tail(&node->n_list, &root->node_list);
list_add_tail(&node->p_list, &root->prepare_list);
- atomic_inc(&node->refs); /* inserted into list */
+ refcount_inc(&node->refs); /* inserted into list */
root->nodes++;
set_bit(BTRFS_DELAYED_NODE_IN_LIST, &node->flags);
}
@@ -180,7 +180,7 @@ static void btrfs_dequeue_delayed_node(struct btrfs_delayed_root *root,
spin_lock(&root->lock);
if (test_bit(BTRFS_DELAYED_NODE_IN_LIST, &node->flags)) {
root->nodes--;
- atomic_dec(&node->refs); /* not in the list */
+ refcount_dec(&node->refs); /* not in the list */
list_del_init(&node->n_list);
if (!list_empty(&node->p_list))
list_del_init(&node->p_list);
@@ -201,7 +201,7 @@ static struct btrfs_delayed_node *btrfs_first_delayed_node(
p = delayed_root->node_list.next;
node = list_entry(p, struct btrfs_delayed_node, n_list);
- atomic_inc(&node->refs);
+ refcount_inc(&node->refs);
out:
spin_unlock(&delayed_root->lock);
@@ -228,7 +228,7 @@ static struct btrfs_delayed_node *btrfs_next_delayed_node(
p = node->n_list.next;
next = list_entry(p, struct btrfs_delayed_node, n_list);
- atomic_inc(&next->refs);
+ refcount_inc(&next->refs);
out:
spin_unlock(&delayed_root->lock);
@@ -253,11 +253,11 @@ static void __btrfs_release_delayed_node(
btrfs_dequeue_delayed_node(delayed_root, delayed_node);
mutex_unlock(&delayed_node->mutex);
- if (atomic_dec_and_test(&delayed_node->refs)) {
+ if (refcount_dec_and_test(&delayed_node->refs)) {
bool free = false;
struct btrfs_root *root = delayed_node->root;
spin_lock(&root->inode_lock);
- if (atomic_read(&delayed_node->refs) == 0) {
+ if (refcount_read(&delayed_node->refs) == 0) {
radix_tree_delete(&root->delayed_nodes_tree,
delayed_node->inode_id);
free = true;
@@ -286,7 +286,7 @@ static struct btrfs_delayed_node *btrfs_first_prepared_delayed_node(
p = delayed_root->prepare_list.next;
list_del_init(p);
node = list_entry(p, struct btrfs_delayed_node, p_list);
- atomic_inc(&node->refs);
+ refcount_inc(&node->refs);
out:
spin_unlock(&delayed_root->lock);
@@ -308,7 +308,7 @@ static struct btrfs_delayed_item *btrfs_alloc_delayed_item(u32 data_len)
item->ins_or_del = 0;
item->bytes_reserved = 0;
item->delayed_node = NULL;
- atomic_set(&item->refs, 1);
+ refcount_set(&item->refs, 1);
}
return item;
}
@@ -483,7 +483,7 @@ static void btrfs_release_delayed_item(struct btrfs_delayed_item *item)
{
if (item) {
__btrfs_remove_delayed_item(item);
- if (atomic_dec_and_test(&item->refs))
+ if (refcount_dec_and_test(&item->refs))
kfree(item);
}
}
@@ -1600,14 +1600,14 @@ bool btrfs_readdir_get_delayed_items(struct inode *inode,
mutex_lock(&delayed_node->mutex);
item = __btrfs_first_delayed_insertion_item(delayed_node);
while (item) {
- atomic_inc(&item->refs);
+ refcount_inc(&item->refs);
list_add_tail(&item->readdir_list, ins_list);
item = __btrfs_next_delayed_item(item);
}
item = __btrfs_first_delayed_deletion_item(delayed_node);
while (item) {
- atomic_inc(&item->refs);
+ refcount_inc(&item->refs);
list_add_tail(&item->readdir_list, del_list);
item = __btrfs_next_delayed_item(item);
}
@@ -1621,7 +1621,7 @@ bool btrfs_readdir_get_delayed_items(struct inode *inode,
* insert/delete delayed items in this period. So we also needn't
* requeue or dequeue this delayed node.
*/
- atomic_dec(&delayed_node->refs);
+ refcount_dec(&delayed_node->refs);
return true;
}
@@ -1634,13 +1634,13 @@ void btrfs_readdir_put_delayed_items(struct inode *inode,
list_for_each_entry_safe(curr, next, ins_list, readdir_list) {
list_del(&curr->readdir_list);
- if (atomic_dec_and_test(&curr->refs))
+ if (refcount_dec_and_test(&curr->refs))
kfree(curr);
}
list_for_each_entry_safe(curr, next, del_list, readdir_list) {
list_del(&curr->readdir_list);
- if (atomic_dec_and_test(&curr->refs))
+ if (refcount_dec_and_test(&curr->refs))
kfree(curr);
}
@@ -1667,7 +1667,7 @@ int btrfs_should_delete_dir_index(struct list_head *del_list,
list_del(&curr->readdir_list);
ret = (curr->key.offset == index);
- if (atomic_dec_and_test(&curr->refs))
+ if (refcount_dec_and_test(&curr->refs))
kfree(curr);
if (ret)
@@ -1705,7 +1705,7 @@ int btrfs_readdir_delayed_dir_index(struct dir_context *ctx,
list_del(&curr->readdir_list);
if (curr->key.offset < ctx->pos) {
- if (atomic_dec_and_test(&curr->refs))
+ if (refcount_dec_and_test(&curr->refs))
kfree(curr);
continue;
}
@@ -1722,7 +1722,7 @@ int btrfs_readdir_delayed_dir_index(struct dir_context *ctx,
over = !dir_emit(ctx, name, name_len,
location.objectid, d_type);
- if (atomic_dec_and_test(&curr->refs))
+ if (refcount_dec_and_test(&curr->refs))
kfree(curr);
if (over)
@@ -1963,7 +1963,7 @@ void btrfs_kill_all_delayed_nodes(struct btrfs_root *root)
inode_id = delayed_nodes[n - 1]->inode_id + 1;
for (i = 0; i < n; i++)
- atomic_inc(&delayed_nodes[i]->refs);
+ refcount_inc(&delayed_nodes[i]->refs);
spin_unlock(&root->inode_lock);
for (i = 0; i < n; i++) {
diff --git a/fs/btrfs/delayed-inode.h b/fs/btrfs/delayed-inode.h
index 40327cc3b99a..c4189d495934 100644
--- a/fs/btrfs/delayed-inode.h
+++ b/fs/btrfs/delayed-inode.h
@@ -26,7 +26,7 @@
#include <linux/list.h>
#include <linux/wait.h>
#include <linux/atomic.h>
-
+#include <linux/refcount.h>
#include "ctree.h"
/* types of the delayed item */
@@ -67,7 +67,7 @@ struct btrfs_delayed_node {
struct rb_root del_root;
struct mutex mutex;
struct btrfs_inode_item inode_item;
- atomic_t refs;
+ refcount_t refs;
u64 index_cnt;
unsigned long flags;
int count;
@@ -80,7 +80,7 @@ struct btrfs_delayed_item {
struct list_head readdir_list; /* used for readdir items */
u64 bytes_reserved;
struct btrfs_delayed_node *delayed_node;
- atomic_t refs;
+ refcount_t refs;
int ins_or_del;
u32 data_len;
char data[0];
diff --git a/fs/btrfs/delayed-ref.c b/fs/btrfs/delayed-ref.c
index 6eb80952efb3..be70d90dfee5 100644
--- a/fs/btrfs/delayed-ref.c
+++ b/fs/btrfs/delayed-ref.c
@@ -164,7 +164,7 @@ int btrfs_delayed_ref_lock(struct btrfs_trans_handle *trans,
if (mutex_trylock(&head->mutex))
return 0;
- atomic_inc(&head->node.refs);
+ refcount_inc(&head->node.refs);
spin_unlock(&delayed_refs->lock);
mutex_lock(&head->mutex);
@@ -590,7 +590,7 @@ add_delayed_ref_head(struct btrfs_fs_info *fs_info,
delayed_refs = &trans->transaction->delayed_refs;
/* first set the basic ref node struct up */
- atomic_set(&ref->refs, 1);
+ refcount_set(&ref->refs, 1);
ref->bytenr = bytenr;
ref->num_bytes = num_bytes;
ref->ref_mod = count_mod;
@@ -682,7 +682,7 @@ add_delayed_tree_ref(struct btrfs_fs_info *fs_info,
delayed_refs = &trans->transaction->delayed_refs;
/* first set the basic ref node struct up */
- atomic_set(&ref->refs, 1);
+ refcount_set(&ref->refs, 1);
ref->bytenr = bytenr;
ref->num_bytes = num_bytes;
ref->ref_mod = 1;
@@ -739,7 +739,7 @@ add_delayed_data_ref(struct btrfs_fs_info *fs_info,
seq = atomic64_read(&fs_info->tree_mod_seq);
/* first set the basic ref node struct up */
- atomic_set(&ref->refs, 1);
+ refcount_set(&ref->refs, 1);
ref->bytenr = bytenr;
ref->num_bytes = num_bytes;
ref->ref_mod = 1;
diff --git a/fs/btrfs/delayed-ref.h b/fs/btrfs/delayed-ref.h
index 0e537f98f1a1..c0264ff01b53 100644
--- a/fs/btrfs/delayed-ref.h
+++ b/fs/btrfs/delayed-ref.h
@@ -18,6 +18,8 @@
#ifndef __DELAYED_REF__
#define __DELAYED_REF__
+#include <linux/refcount.h>
+
/* these are the possible values of struct btrfs_delayed_ref_node->action */
#define BTRFS_ADD_DELAYED_REF 1 /* add one backref to the tree */
#define BTRFS_DROP_DELAYED_REF 2 /* delete one backref from the tree */
@@ -53,7 +55,7 @@ struct btrfs_delayed_ref_node {
u64 seq;
/* ref count on this data structure */
- atomic_t refs;
+ refcount_t refs;
/*
* how many refs is this entry adding or deleting. For
@@ -220,8 +222,8 @@ btrfs_free_delayed_extent_op(struct btrfs_delayed_extent_op *op)
static inline void btrfs_put_delayed_ref(struct btrfs_delayed_ref_node *ref)
{
- WARN_ON(atomic_read(&ref->refs) == 0);
- if (atomic_dec_and_test(&ref->refs)) {
+ WARN_ON(refcount_read(&ref->refs) == 0);
+ if (refcount_dec_and_test(&ref->refs)) {
WARN_ON(ref->in_tree);
switch (ref->type) {
case BTRFS_TREE_BLOCK_REF_KEY:
diff --git a/fs/btrfs/dev-replace.c b/fs/btrfs/dev-replace.c
index e653921f05d9..5fe1ca8abc70 100644
--- a/fs/btrfs/dev-replace.c
+++ b/fs/btrfs/dev-replace.c
@@ -546,8 +546,10 @@ static int btrfs_dev_replace_finishing(struct btrfs_fs_info *fs_info,
mutex_unlock(&fs_info->chunk_mutex);
mutex_unlock(&fs_info->fs_devices->device_list_mutex);
mutex_unlock(&uuid_mutex);
+ btrfs_rm_dev_replace_blocked(fs_info);
if (tgt_device)
btrfs_destroy_dev_replace_tgtdev(fs_info, tgt_device);
+ btrfs_rm_dev_replace_unblocked(fs_info);
mutex_unlock(&dev_replace->lock_finishing_cancel_unmount);
return scrub_ret;
@@ -665,7 +667,7 @@ void btrfs_dev_replace_status(struct btrfs_fs_info *fs_info,
case BTRFS_IOCTL_DEV_REPLACE_STATE_STARTED:
case BTRFS_IOCTL_DEV_REPLACE_STATE_SUSPENDED:
srcdev = dev_replace->srcdev;
- args->status.progress_1000 = div_u64(dev_replace->cursor_left,
+ args->status.progress_1000 = div64_u64(dev_replace->cursor_left,
div_u64(btrfs_device_get_total_bytes(srcdev), 1000));
break;
}
@@ -784,8 +786,7 @@ int btrfs_resume_dev_replace_async(struct btrfs_fs_info *fs_info)
}
btrfs_dev_replace_unlock(dev_replace, 1);
- WARN_ON(atomic_xchg(
- &fs_info->mutually_exclusive_operation_running, 1));
+ WARN_ON(test_and_set_bit(BTRFS_FS_EXCL_OP, &fs_info->flags));
task = kthread_run(btrfs_dev_replace_kthread, fs_info, "btrfs-devrepl");
return PTR_ERR_OR_ZERO(task);
}
@@ -814,7 +815,7 @@ static int btrfs_dev_replace_kthread(void *data)
(unsigned int)progress);
}
btrfs_dev_replace_continue_on_mount(fs_info);
- atomic_set(&fs_info->mutually_exclusive_operation_running, 0);
+ clear_bit(BTRFS_FS_EXCL_OP, &fs_info->flags);
return 0;
}
diff --git a/fs/btrfs/disk-io.c b/fs/btrfs/disk-io.c
index 061c1d1f774f..8685d67185d0 100644
--- a/fs/btrfs/disk-io.c
+++ b/fs/btrfs/disk-io.c
@@ -762,7 +762,7 @@ static int btree_readpage_end_io_hook(struct btrfs_io_bio *io_bio,
err:
if (reads_done &&
test_and_clear_bit(EXTENT_BUFFER_READAHEAD, &eb->bflags))
- btree_readahead_hook(fs_info, eb, ret);
+ btree_readahead_hook(eb, ret);
if (ret) {
/*
@@ -787,7 +787,7 @@ static int btree_io_failed_hook(struct page *page, int failed_mirror)
eb->read_mirror = failed_mirror;
atomic_dec(&eb->io_pages);
if (test_and_clear_bit(EXTENT_BUFFER_READAHEAD, &eb->bflags))
- btree_readahead_hook(eb->fs_info, eb, -EIO);
+ btree_readahead_hook(eb, -EIO);
return -EIO; /* we fixed nothing */
}
@@ -1340,7 +1340,7 @@ static void __setup_root(struct btrfs_root *root, struct btrfs_fs_info *fs_info,
atomic_set(&root->log_writers, 0);
atomic_set(&root->log_batch, 0);
atomic_set(&root->orphan_inodes, 0);
- atomic_set(&root->refs, 1);
+ refcount_set(&root->refs, 1);
atomic_set(&root->will_be_snapshoted, 0);
atomic64_set(&root->qgroup_meta_rsv, 0);
root->log_transid = 0;
@@ -3497,10 +3497,11 @@ static void btrfs_end_empty_barrier(struct bio *bio)
*/
static int write_dev_flush(struct btrfs_device *device, int wait)
{
+ struct request_queue *q = bdev_get_queue(device->bdev);
struct bio *bio;
int ret = 0;
- if (device->nobarriers)
+ if (!test_bit(QUEUE_FLAG_WC, &q->queue_flags))
return 0;
if (wait) {
@@ -4321,7 +4322,7 @@ static int btrfs_destroy_delayed_refs(struct btrfs_transaction *trans,
head = rb_entry(node, struct btrfs_delayed_ref_head,
href_node);
if (!mutex_trylock(&head->mutex)) {
- atomic_inc(&head->node.refs);
+ refcount_inc(&head->node.refs);
spin_unlock(&delayed_refs->lock);
mutex_lock(&head->mutex);
@@ -4593,7 +4594,7 @@ static int btrfs_cleanup_transaction(struct btrfs_fs_info *fs_info)
t = list_first_entry(&fs_info->trans_list,
struct btrfs_transaction, list);
if (t->state >= TRANS_STATE_COMMIT_START) {
- atomic_inc(&t->use_count);
+ refcount_inc(&t->use_count);
spin_unlock(&fs_info->trans_lock);
btrfs_wait_for_commit(fs_info, t->transid);
btrfs_put_transaction(t);
diff --git a/fs/btrfs/disk-io.h b/fs/btrfs/disk-io.h
index 2e0ec29bfd69..21f1ceb85b76 100644
--- a/fs/btrfs/disk-io.h
+++ b/fs/btrfs/disk-io.h
@@ -101,14 +101,14 @@ struct btrfs_root *btrfs_alloc_dummy_root(struct btrfs_fs_info *fs_info);
*/
static inline struct btrfs_root *btrfs_grab_fs_root(struct btrfs_root *root)
{
- if (atomic_inc_not_zero(&root->refs))
+ if (refcount_inc_not_zero(&root->refs))
return root;
return NULL;
}
static inline void btrfs_put_fs_root(struct btrfs_root *root)
{
- if (atomic_dec_and_test(&root->refs))
+ if (refcount_dec_and_test(&root->refs))
kfree(root);
}
diff --git a/fs/btrfs/extent-tree.c b/fs/btrfs/extent-tree.c
index be5477676cc8..e390451c72e6 100644
--- a/fs/btrfs/extent-tree.c
+++ b/fs/btrfs/extent-tree.c
@@ -131,6 +131,16 @@ void btrfs_put_block_group(struct btrfs_block_group_cache *cache)
if (atomic_dec_and_test(&cache->count)) {
WARN_ON(cache->pinned > 0);
WARN_ON(cache->reserved > 0);
+
+ /*
+ * If not empty, someone is still holding mutex of
+ * full_stripe_lock, which can only be released by caller.
+ * And it will definitely cause use-after-free when caller
+ * tries to release full stripe lock.
+ *
+ * No better way to resolve, but only to warn.
+ */
+ WARN_ON(!RB_EMPTY_ROOT(&cache->full_stripe_locks_root.root));
kfree(cache->free_space_ctl);
kfree(cache);
}
@@ -316,14 +326,14 @@ get_caching_control(struct btrfs_block_group_cache *cache)
}
ctl = cache->caching_ctl;
- atomic_inc(&ctl->count);
+ refcount_inc(&ctl->count);
spin_unlock(&cache->lock);
return ctl;
}
static void put_caching_control(struct btrfs_caching_control *ctl)
{
- if (atomic_dec_and_test(&ctl->count))
+ if (refcount_dec_and_test(&ctl->count))
kfree(ctl);
}
@@ -599,7 +609,7 @@ static int cache_block_group(struct btrfs_block_group_cache *cache,
init_waitqueue_head(&caching_ctl->wait);
caching_ctl->block_group = cache;
caching_ctl->progress = cache->key.objectid;
- atomic_set(&caching_ctl->count, 1);
+ refcount_set(&caching_ctl->count, 1);
btrfs_init_work(&caching_ctl->work, btrfs_cache_helper,
caching_thread, NULL, NULL);
@@ -620,7 +630,7 @@ static int cache_block_group(struct btrfs_block_group_cache *cache,
struct btrfs_caching_control *ctl;
ctl = cache->caching_ctl;
- atomic_inc(&ctl->count);
+ refcount_inc(&ctl->count);
prepare_to_wait(&ctl->wait, &wait, TASK_UNINTERRUPTIBLE);
spin_unlock(&cache->lock);
@@ -707,7 +717,7 @@ static int cache_block_group(struct btrfs_block_group_cache *cache,
}
down_write(&fs_info->commit_root_sem);
- atomic_inc(&caching_ctl->count);
+ refcount_inc(&caching_ctl->count);
list_add_tail(&caching_ctl->list, &fs_info->caching_block_groups);
up_write(&fs_info->commit_root_sem);
@@ -892,7 +902,7 @@ search_again:
head = btrfs_find_delayed_ref_head(delayed_refs, bytenr);
if (head) {
if (!mutex_trylock(&head->mutex)) {
- atomic_inc(&head->node.refs);
+ refcount_inc(&head->node.refs);
spin_unlock(&delayed_refs->lock);
btrfs_release_path(path);
@@ -2980,7 +2990,7 @@ again:
struct btrfs_delayed_ref_node *ref;
ref = &head->node;
- atomic_inc(&ref->refs);
+ refcount_inc(&ref->refs);
spin_unlock(&delayed_refs->lock);
/*
@@ -3003,7 +3013,6 @@ again:
goto again;
}
out:
- assert_qgroups_uptodate(trans);
trans->can_flush_pending_bgs = can_flush_pending_bgs;
return 0;
}
@@ -3057,7 +3066,7 @@ static noinline int check_delayed_ref(struct btrfs_root *root,
}
if (!mutex_trylock(&head->mutex)) {
- atomic_inc(&head->node.refs);
+ refcount_inc(&head->node.refs);
spin_unlock(&delayed_refs->lock);
btrfs_release_path(path);
@@ -3443,7 +3452,8 @@ again:
/*
* don't bother trying to write stuff out _if_
* a) we're not cached,
- * b) we're with nospace_cache mount option.
+ * b) we're with nospace_cache mount option,
+ * c) we're with v2 space_cache (FREE_SPACE_TREE).
*/
dcs = BTRFS_DC_WRITTEN;
spin_unlock(&block_group->lock);
@@ -9917,6 +9927,7 @@ btrfs_create_block_group_cache(struct btrfs_fs_info *fs_info,
btrfs_init_free_space_ctl(cache);
atomic_set(&cache->trimming, 0);
mutex_init(&cache->free_space_lock);
+ btrfs_init_full_stripe_locks_tree(&cache->full_stripe_locks_root);
return cache;
}
@@ -10416,7 +10427,7 @@ int btrfs_remove_block_group(struct btrfs_trans_handle *trans,
&fs_info->caching_block_groups, list)
if (ctl->block_group == block_group) {
caching_ctl = ctl;
- atomic_inc(&caching_ctl->count);
+ refcount_inc(&caching_ctl->count);
break;
}
}
@@ -10850,7 +10861,7 @@ static int btrfs_trim_free_extents(struct btrfs_device *device,
spin_lock(&fs_info->trans_lock);
trans = fs_info->running_transaction;
if (trans)
- atomic_inc(&trans->use_count);
+ refcount_inc(&trans->use_count);
spin_unlock(&fs_info->trans_lock);
ret = find_free_dev_extent_start(trans, device, minlen, start,
diff --git a/fs/btrfs/extent_io.c b/fs/btrfs/extent_io.c
index 27fdb250b446..d8da3edf2ac3 100644
--- a/fs/btrfs/extent_io.c
+++ b/fs/btrfs/extent_io.c
@@ -68,7 +68,7 @@ void btrfs_leak_debug_check(void)
pr_err("BTRFS: state leak: start %llu end %llu state %u in tree %d refs %d\n",
state->start, state->end, state->state,
extent_state_in_tree(state),
- atomic_read(&state->refs));
+ refcount_read(&state->refs));
list_del(&state->leak_list);
kmem_cache_free(extent_state_cache, state);
}
@@ -238,7 +238,7 @@ static struct extent_state *alloc_extent_state(gfp_t mask)
state->failrec = NULL;
RB_CLEAR_NODE(&state->rb_node);
btrfs_leak_debug_add(&state->leak_list, &states);
- atomic_set(&state->refs, 1);
+ refcount_set(&state->refs, 1);
init_waitqueue_head(&state->wq);
trace_alloc_extent_state(state, mask, _RET_IP_);
return state;
@@ -248,7 +248,7 @@ void free_extent_state(struct extent_state *state)
{
if (!state)
return;
- if (atomic_dec_and_test(&state->refs)) {
+ if (refcount_dec_and_test(&state->refs)) {
WARN_ON(extent_state_in_tree(state));
btrfs_leak_debug_del(&state->leak_list);
trace_free_extent_state(state, _RET_IP_);
@@ -641,7 +641,7 @@ again:
if (cached && extent_state_in_tree(cached) &&
cached->start <= start && cached->end > start) {
if (clear)
- atomic_dec(&cached->refs);
+ refcount_dec(&cached->refs);
state = cached;
goto hit_next;
}
@@ -793,7 +793,7 @@ process_node:
if (state->state & bits) {
start = state->start;
- atomic_inc(&state->refs);
+ refcount_inc(&state->refs);
wait_on_state(tree, state);
free_extent_state(state);
goto again;
@@ -834,7 +834,7 @@ static void cache_state_if_flags(struct extent_state *state,
if (cached_ptr && !(*cached_ptr)) {
if (!flags || (state->state & flags)) {
*cached_ptr = state;
- atomic_inc(&state->refs);
+ refcount_inc(&state->refs);
}
}
}
@@ -1538,7 +1538,7 @@ static noinline u64 find_delalloc_range(struct extent_io_tree *tree,
if (!found) {
*start = state->start;
*cached_state = state;
- atomic_inc(&state->refs);
+ refcount_inc(&state->refs);
}
found++;
*end = state->end;
@@ -2004,16 +2004,11 @@ int repair_io_failure(struct btrfs_inode *inode, u64 start, u64 length,
u64 map_length = 0;
u64 sector;
struct btrfs_bio *bbio = NULL;
- struct btrfs_mapping_tree *map_tree = &fs_info->mapping_tree;
int ret;
ASSERT(!(fs_info->sb->s_flags & MS_RDONLY));
BUG_ON(!mirror_num);
- /* we can't repair anything in raid56 yet */
- if (btrfs_is_parity_mirror(map_tree, logical, length, mirror_num))
- return 0;
-
bio = btrfs_io_bio_alloc(GFP_NOFS, 1);
if (!bio)
return -EIO;
@@ -2026,17 +2021,35 @@ int repair_io_failure(struct btrfs_inode *inode, u64 start, u64 length,
* read repair operation.
*/
btrfs_bio_counter_inc_blocked(fs_info);
- ret = btrfs_map_block(fs_info, BTRFS_MAP_WRITE, logical,
- &map_length, &bbio, mirror_num);
- if (ret) {
- btrfs_bio_counter_dec(fs_info);
- bio_put(bio);
- return -EIO;
+ if (btrfs_is_parity_mirror(fs_info, logical, length, mirror_num)) {
+ /*
+ * Note that we don't use BTRFS_MAP_WRITE because it's supposed
+ * to update all raid stripes, but here we just want to correct
+ * bad stripe, thus BTRFS_MAP_READ is abused to only get the bad
+ * stripe's dev and sector.
+ */
+ ret = btrfs_map_block(fs_info, BTRFS_MAP_READ, logical,
+ &map_length, &bbio, 0);
+ if (ret) {
+ btrfs_bio_counter_dec(fs_info);
+ bio_put(bio);
+ return -EIO;
+ }
+ ASSERT(bbio->mirror_num == 1);
+ } else {
+ ret = btrfs_map_block(fs_info, BTRFS_MAP_WRITE, logical,
+ &map_length, &bbio, mirror_num);
+ if (ret) {
+ btrfs_bio_counter_dec(fs_info);
+ bio_put(bio);
+ return -EIO;
+ }
+ BUG_ON(mirror_num != bbio->mirror_num);
}
- BUG_ON(mirror_num != bbio->mirror_num);
- sector = bbio->stripes[mirror_num-1].physical >> 9;
+
+ sector = bbio->stripes[bbio->mirror_num - 1].physical >> 9;
bio->bi_iter.bi_sector = sector;
- dev = bbio->stripes[mirror_num-1].dev;
+ dev = bbio->stripes[bbio->mirror_num - 1].dev;
btrfs_put_bbio(bbio);
if (!dev || !dev->bdev || !dev->writeable) {
btrfs_bio_counter_dec(fs_info);
@@ -2859,7 +2872,7 @@ __get_extent_map(struct inode *inode, struct page *page, size_t pg_offset,
em = *em_cached;
if (extent_map_in_tree(em) && start >= em->start &&
start < extent_map_end(em)) {
- atomic_inc(&em->refs);
+ refcount_inc(&em->refs);
return em;
}
@@ -2870,7 +2883,7 @@ __get_extent_map(struct inode *inode, struct page *page, size_t pg_offset,
em = get_extent(BTRFS_I(inode), page, pg_offset, start, len, 0);
if (em_cached && !IS_ERR_OR_NULL(em)) {
BUG_ON(*em_cached);
- atomic_inc(&em->refs);
+ refcount_inc(&em->refs);
*em_cached = em;
}
return em;
diff --git a/fs/btrfs/extent_io.h b/fs/btrfs/extent_io.h
index 3e4fad4a909d..1eafa2f0ede3 100644
--- a/fs/btrfs/extent_io.h
+++ b/fs/btrfs/extent_io.h
@@ -2,6 +2,7 @@
#define __EXTENTIO__
#include <linux/rbtree.h>
+#include <linux/refcount.h>
#include "ulist.h"
/* bits for the extent state */
@@ -14,14 +15,17 @@
#define EXTENT_DEFRAG (1U << 6)
#define EXTENT_BOUNDARY (1U << 9)
#define EXTENT_NODATASUM (1U << 10)
-#define EXTENT_DO_ACCOUNTING (1U << 11)
+#define EXTENT_CLEAR_META_RESV (1U << 11)
#define EXTENT_FIRST_DELALLOC (1U << 12)
#define EXTENT_NEED_WAIT (1U << 13)
#define EXTENT_DAMAGED (1U << 14)
#define EXTENT_NORESERVE (1U << 15)
#define EXTENT_QGROUP_RESERVED (1U << 16)
#define EXTENT_CLEAR_DATA_RESV (1U << 17)
+#define EXTENT_DELALLOC_NEW (1U << 18)
#define EXTENT_IOBITS (EXTENT_LOCKED | EXTENT_WRITEBACK)
+#define EXTENT_DO_ACCOUNTING (EXTENT_CLEAR_META_RESV | \
+ EXTENT_CLEAR_DATA_RESV)
#define EXTENT_CTLBITS (EXTENT_DO_ACCOUNTING | EXTENT_FIRST_DELALLOC)
/*
@@ -143,7 +147,7 @@ struct extent_state {
/* ADD NEW ELEMENTS AFTER THIS */
wait_queue_head_t wq;
- atomic_t refs;
+ refcount_t refs;
unsigned state;
struct io_failure_record *failrec;
diff --git a/fs/btrfs/extent_map.c b/fs/btrfs/extent_map.c
index 26f9ac719d20..69850155870c 100644
--- a/fs/btrfs/extent_map.c
+++ b/fs/btrfs/extent_map.c
@@ -55,7 +55,7 @@ struct extent_map *alloc_extent_map(void)
em->flags = 0;
em->compress_type = BTRFS_COMPRESS_NONE;
em->generation = 0;
- atomic_set(&em->refs, 1);
+ refcount_set(&em->refs, 1);
INIT_LIST_HEAD(&em->list);
return em;
}
@@ -71,8 +71,8 @@ void free_extent_map(struct extent_map *em)
{
if (!em)
return;
- WARN_ON(atomic_read(&em->refs) == 0);
- if (atomic_dec_and_test(&em->refs)) {
+ WARN_ON(refcount_read(&em->refs) == 0);
+ if (refcount_dec_and_test(&em->refs)) {
WARN_ON(extent_map_in_tree(em));
WARN_ON(!list_empty(&em->list));
if (test_bit(EXTENT_FLAG_FS_MAPPING, &em->flags))
@@ -322,7 +322,7 @@ static inline void setup_extent_mapping(struct extent_map_tree *tree,
struct extent_map *em,
int modified)
{
- atomic_inc(&em->refs);
+ refcount_inc(&em->refs);
em->mod_start = em->start;
em->mod_len = em->len;
@@ -381,7 +381,7 @@ __lookup_extent_mapping(struct extent_map_tree *tree,
if (strict && !(end > em->start && start < extent_map_end(em)))
return NULL;
- atomic_inc(&em->refs);
+ refcount_inc(&em->refs);
return em;
}
diff --git a/fs/btrfs/extent_map.h b/fs/btrfs/extent_map.h
index eb8b8fae036b..a67b2def5413 100644
--- a/fs/btrfs/extent_map.h
+++ b/fs/btrfs/extent_map.h
@@ -2,6 +2,7 @@
#define __EXTENTMAP__
#include <linux/rbtree.h>
+#include <linux/refcount.h>
#define EXTENT_MAP_LAST_BYTE ((u64)-4)
#define EXTENT_MAP_HOLE ((u64)-3)
@@ -41,7 +42,7 @@ struct extent_map {
*/
struct map_lookup *map_lookup;
};
- atomic_t refs;
+ refcount_t refs;
unsigned int compress_type;
struct list_head list;
};
diff --git a/fs/btrfs/file.c b/fs/btrfs/file.c
index 520cb7230b2d..da1096eb1a40 100644
--- a/fs/btrfs/file.c
+++ b/fs/btrfs/file.c
@@ -1404,6 +1404,47 @@ fail:
}
+static int btrfs_find_new_delalloc_bytes(struct btrfs_inode *inode,
+ const u64 start,
+ const u64 len,
+ struct extent_state **cached_state)
+{
+ u64 search_start = start;
+ const u64 end = start + len - 1;
+
+ while (search_start < end) {
+ const u64 search_len = end - search_start + 1;
+ struct extent_map *em;
+ u64 em_len;
+ int ret = 0;
+
+ em = btrfs_get_extent(inode, NULL, 0, search_start,
+ search_len, 0);
+ if (IS_ERR(em))
+ return PTR_ERR(em);
+
+ if (em->block_start != EXTENT_MAP_HOLE)
+ goto next;
+
+ em_len = em->len;
+ if (em->start < search_start)
+ em_len -= search_start - em->start;
+ if (em_len > search_len)
+ em_len = search_len;
+
+ ret = set_extent_bit(&inode->io_tree, search_start,
+ search_start + em_len - 1,
+ EXTENT_DELALLOC_NEW,
+ NULL, cached_state, GFP_NOFS);
+next:
+ search_start = extent_map_end(em);
+ free_extent_map(em);
+ if (ret)
+ return ret;
+ }
+ return 0;
+}
+
/*
* This function locks the extent and properly waits for data=ordered extents
* to finish before allowing the pages to be modified if need.
@@ -1432,8 +1473,11 @@ lock_and_cleanup_extent_if_need(struct btrfs_inode *inode, struct page **pages,
+ round_up(pos + write_bytes - start_pos,
fs_info->sectorsize) - 1;
- if (start_pos < inode->vfs_inode.i_size) {
+ if (start_pos < inode->vfs_inode.i_size ||
+ (inode->flags & BTRFS_INODE_PREALLOC)) {
struct btrfs_ordered_extent *ordered;
+ unsigned int clear_bits;
+
lock_extent_bits(&inode->io_tree, start_pos, last_pos,
cached_state);
ordered = btrfs_lookup_ordered_range(inode, start_pos,
@@ -1454,11 +1498,19 @@ lock_and_cleanup_extent_if_need(struct btrfs_inode *inode, struct page **pages,
}
if (ordered)
btrfs_put_ordered_extent(ordered);
-
+ ret = btrfs_find_new_delalloc_bytes(inode, start_pos,
+ last_pos - start_pos + 1,
+ cached_state);
+ clear_bits = EXTENT_DIRTY | EXTENT_DELALLOC |
+ EXTENT_DO_ACCOUNTING | EXTENT_DEFRAG;
+ if (ret)
+ clear_bits |= EXTENT_DELALLOC_NEW | EXTENT_LOCKED;
clear_extent_bit(&inode->io_tree, start_pos,
- last_pos, EXTENT_DIRTY | EXTENT_DELALLOC |
- EXTENT_DO_ACCOUNTING | EXTENT_DEFRAG,
- 0, 0, cached_state, GFP_NOFS);
+ last_pos, clear_bits,
+ (clear_bits & EXTENT_LOCKED) ? 1 : 0,
+ 0, cached_state, GFP_NOFS);
+ if (ret)
+ return ret;
*lockstart = start_pos;
*lockend = last_pos;
ret = 1;
@@ -2342,13 +2394,8 @@ static int find_first_non_hole(struct inode *inode, u64 *start, u64 *len)
int ret = 0;
em = btrfs_get_extent(BTRFS_I(inode), NULL, 0, *start, *len, 0);
- if (IS_ERR_OR_NULL(em)) {
- if (!em)
- ret = -ENOMEM;
- else
- ret = PTR_ERR(em);
- return ret;
- }
+ if (IS_ERR(em))
+ return PTR_ERR(em);
/* Hole or vacuum extent(only exists in no-hole mode) */
if (em->block_start == EXTENT_MAP_HOLE) {
@@ -2835,11 +2882,8 @@ static long btrfs_fallocate(struct file *file, int mode,
while (1) {
em = btrfs_get_extent(BTRFS_I(inode), NULL, 0, cur_offset,
alloc_end - cur_offset, 0);
- if (IS_ERR_OR_NULL(em)) {
- if (!em)
- ret = -ENOMEM;
- else
- ret = PTR_ERR(em);
+ if (IS_ERR(em)) {
+ ret = PTR_ERR(em);
break;
}
last_byte = min(extent_map_end(em), alloc_end);
@@ -2856,8 +2900,10 @@ static long btrfs_fallocate(struct file *file, int mode,
}
ret = btrfs_qgroup_reserve_data(inode, cur_offset,
last_byte - cur_offset);
- if (ret < 0)
+ if (ret < 0) {
+ free_extent_map(em);
break;
+ }
} else {
/*
* Do not need to reserve unwritten extent for this
diff --git a/fs/btrfs/free-space-cache.c b/fs/btrfs/free-space-cache.c
index da6841efac26..c5e6180cdb8c 100644
--- a/fs/btrfs/free-space-cache.c
+++ b/fs/btrfs/free-space-cache.c
@@ -355,7 +355,7 @@ static void io_ctl_map_page(struct btrfs_io_ctl *io_ctl, int clear)
io_ctl->orig = io_ctl->cur;
io_ctl->size = PAGE_SIZE;
if (clear)
- memset(io_ctl->cur, 0, PAGE_SIZE);
+ clear_page(io_ctl->cur);
}
static void io_ctl_drop_pages(struct btrfs_io_ctl *io_ctl)
diff --git a/fs/btrfs/inode.c b/fs/btrfs/inode.c
index 5e71f1ea3391..17cbe9306faf 100644
--- a/fs/btrfs/inode.c
+++ b/fs/btrfs/inode.c
@@ -115,6 +115,31 @@ static struct extent_map *create_io_em(struct inode *inode, u64 start, u64 len,
u64 ram_bytes, int compress_type,
int type);
+static void __endio_write_update_ordered(struct inode *inode,
+ const u64 offset, const u64 bytes,
+ const bool uptodate);
+
+/*
+ * Cleanup all submitted ordered extents in specified range to handle errors
+ * from the fill_dellaloc() callback.
+ *
+ * NOTE: caller must ensure that when an error happens, it can not call
+ * extent_clear_unlock_delalloc() to clear both the bits EXTENT_DO_ACCOUNTING
+ * and EXTENT_DELALLOC simultaneously, because that causes the reserved metadata
+ * to be released, which we want to happen only when finishing the ordered
+ * extent (btrfs_finish_ordered_io()). Also note that the caller of the
+ * fill_delalloc() callback already does proper cleanup for the first page of
+ * the range, that is, it invokes the callback writepage_end_io_hook() for the
+ * range of the first page.
+ */
+static inline void btrfs_cleanup_ordered_extents(struct inode *inode,
+ const u64 offset,
+ const u64 bytes)
+{
+ return __endio_write_update_ordered(inode, offset + PAGE_SIZE,
+ bytes - PAGE_SIZE, false);
+}
+
static int btrfs_dirty_inode(struct inode *inode);
#ifdef CONFIG_BTRFS_FS_RUN_SANITY_TESTS
@@ -547,7 +572,7 @@ cont:
}
if (ret <= 0) {
unsigned long clear_flags = EXTENT_DELALLOC |
- EXTENT_DEFRAG;
+ EXTENT_DELALLOC_NEW | EXTENT_DEFRAG;
unsigned long page_error_op;
clear_flags |= (ret < 0) ? EXTENT_DO_ACCOUNTING : 0;
@@ -565,8 +590,10 @@ cont:
PAGE_SET_WRITEBACK |
page_error_op |
PAGE_END_WRITEBACK);
- btrfs_free_reserved_data_space_noquota(inode, start,
- end - start + 1);
+ if (ret == 0)
+ btrfs_free_reserved_data_space_noquota(inode,
+ start,
+ end - start + 1);
goto free_pages_out;
}
}
@@ -852,6 +879,7 @@ out_free:
async_extent->start +
async_extent->ram_size - 1,
NULL, EXTENT_LOCKED | EXTENT_DELALLOC |
+ EXTENT_DELALLOC_NEW |
EXTENT_DEFRAG | EXTENT_DO_ACCOUNTING,
PAGE_UNLOCK | PAGE_CLEAR_DIRTY |
PAGE_SET_WRITEBACK | PAGE_END_WRITEBACK |
@@ -918,10 +946,13 @@ static noinline int cow_file_range(struct inode *inode,
u64 num_bytes;
unsigned long ram_size;
u64 disk_num_bytes;
- u64 cur_alloc_size;
+ u64 cur_alloc_size = 0;
u64 blocksize = fs_info->sectorsize;
struct btrfs_key ins;
struct extent_map *em;
+ unsigned clear_bits;
+ unsigned long page_ops;
+ bool extent_reserved = false;
int ret = 0;
if (btrfs_is_free_space_inode(BTRFS_I(inode))) {
@@ -944,6 +975,7 @@ static noinline int cow_file_range(struct inode *inode,
extent_clear_unlock_delalloc(inode, start, end,
delalloc_end, NULL,
EXTENT_LOCKED | EXTENT_DELALLOC |
+ EXTENT_DELALLOC_NEW |
EXTENT_DEFRAG, PAGE_UNLOCK |
PAGE_CLEAR_DIRTY | PAGE_SET_WRITEBACK |
PAGE_END_WRITEBACK);
@@ -966,14 +998,14 @@ static noinline int cow_file_range(struct inode *inode,
start + num_bytes - 1, 0);
while (disk_num_bytes > 0) {
- unsigned long op;
-
cur_alloc_size = disk_num_bytes;
ret = btrfs_reserve_extent(root, cur_alloc_size, cur_alloc_size,
fs_info->sectorsize, 0, alloc_hint,
&ins, 1, 1);
if (ret < 0)
goto out_unlock;
+ cur_alloc_size = ins.offset;
+ extent_reserved = true;
ram_size = ins.offset;
em = create_io_em(inode, start, ins.offset, /* len */
@@ -988,7 +1020,6 @@ static noinline int cow_file_range(struct inode *inode,
goto out_reserve;
free_extent_map(em);
- cur_alloc_size = ins.offset;
ret = btrfs_add_ordered_extent(inode, start, ins.objectid,
ram_size, cur_alloc_size, 0);
if (ret)
@@ -998,15 +1029,24 @@ static noinline int cow_file_range(struct inode *inode,
BTRFS_DATA_RELOC_TREE_OBJECTID) {
ret = btrfs_reloc_clone_csums(inode, start,
cur_alloc_size);
+ /*
+ * Only drop cache here, and process as normal.
+ *
+ * We must not allow extent_clear_unlock_delalloc()
+ * at out_unlock label to free meta of this ordered
+ * extent, as its meta should be freed by
+ * btrfs_finish_ordered_io().
+ *
+ * So we must continue until @start is increased to
+ * skip current ordered extent.
+ */
if (ret)
- goto out_drop_extent_cache;
+ btrfs_drop_extent_cache(BTRFS_I(inode), start,
+ start + ram_size - 1, 0);
}
btrfs_dec_block_group_reservations(fs_info, ins.objectid);
- if (disk_num_bytes < cur_alloc_size)
- break;
-
/* we're not doing compressed IO, don't unlock the first
* page (which the caller expects to stay locked), don't
* clear any dirty bits and don't set any writeback bits
@@ -1014,18 +1054,30 @@ static noinline int cow_file_range(struct inode *inode,
* Do set the Private2 bit so we know this page was properly
* setup for writepage
*/
- op = unlock ? PAGE_UNLOCK : 0;
- op |= PAGE_SET_PRIVATE2;
+ page_ops = unlock ? PAGE_UNLOCK : 0;
+ page_ops |= PAGE_SET_PRIVATE2;
extent_clear_unlock_delalloc(inode, start,
start + ram_size - 1,
delalloc_end, locked_page,
EXTENT_LOCKED | EXTENT_DELALLOC,
- op);
- disk_num_bytes -= cur_alloc_size;
+ page_ops);
+ if (disk_num_bytes < cur_alloc_size)
+ disk_num_bytes = 0;
+ else
+ disk_num_bytes -= cur_alloc_size;
num_bytes -= cur_alloc_size;
alloc_hint = ins.objectid + ins.offset;
start += cur_alloc_size;
+ extent_reserved = false;
+
+ /*
+ * btrfs_reloc_clone_csums() error, since start is increased
+ * extent_clear_unlock_delalloc() at out_unlock label won't
+ * free metadata of current ordered extent, we're OK to exit.
+ */
+ if (ret)
+ goto out_unlock;
}
out:
return ret;
@@ -1036,12 +1088,35 @@ out_reserve:
btrfs_dec_block_group_reservations(fs_info, ins.objectid);
btrfs_free_reserved_extent(fs_info, ins.objectid, ins.offset, 1);
out_unlock:
+ clear_bits = EXTENT_LOCKED | EXTENT_DELALLOC | EXTENT_DELALLOC_NEW |
+ EXTENT_DEFRAG | EXTENT_CLEAR_META_RESV;
+ page_ops = PAGE_UNLOCK | PAGE_CLEAR_DIRTY | PAGE_SET_WRITEBACK |
+ PAGE_END_WRITEBACK;
+ /*
+ * If we reserved an extent for our delalloc range (or a subrange) and
+ * failed to create the respective ordered extent, then it means that
+ * when we reserved the extent we decremented the extent's size from
+ * the data space_info's bytes_may_use counter and incremented the
+ * space_info's bytes_reserved counter by the same amount. We must make
+ * sure extent_clear_unlock_delalloc() does not try to decrement again
+ * the data space_info's bytes_may_use counter, therefore we do not pass
+ * it the flag EXTENT_CLEAR_DATA_RESV.
+ */
+ if (extent_reserved) {
+ extent_clear_unlock_delalloc(inode, start,
+ start + cur_alloc_size,
+ start + cur_alloc_size,
+ locked_page,
+ clear_bits,
+ page_ops);
+ start += cur_alloc_size;
+ if (start >= end)
+ goto out;
+ }
extent_clear_unlock_delalloc(inode, start, end, delalloc_end,
locked_page,
- EXTENT_LOCKED | EXTENT_DO_ACCOUNTING |
- EXTENT_DELALLOC | EXTENT_DEFRAG,
- PAGE_UNLOCK | PAGE_CLEAR_DIRTY |
- PAGE_SET_WRITEBACK | PAGE_END_WRITEBACK);
+ clear_bits | EXTENT_CLEAR_DATA_RESV,
+ page_ops);
goto out;
}
@@ -1414,15 +1489,14 @@ out_check:
BUG_ON(ret); /* -ENOMEM */
if (root->root_key.objectid ==
- BTRFS_DATA_RELOC_TREE_OBJECTID) {
+ BTRFS_DATA_RELOC_TREE_OBJECTID)
+ /*
+ * Error handled later, as we must prevent
+ * extent_clear_unlock_delalloc() in error handler
+ * from freeing metadata of created ordered extent.
+ */
ret = btrfs_reloc_clone_csums(inode, cur_offset,
num_bytes);
- if (ret) {
- if (!nolock && nocow)
- btrfs_end_write_no_snapshoting(root);
- goto error;
- }
- }
extent_clear_unlock_delalloc(inode, cur_offset,
cur_offset + num_bytes - 1, end,
@@ -1434,6 +1508,14 @@ out_check:
if (!nolock && nocow)
btrfs_end_write_no_snapshoting(root);
cur_offset = extent_end;
+
+ /*
+ * btrfs_reloc_clone_csums() error, now we're OK to call error
+ * handler, as metadata for created ordered extent will only
+ * be freed by btrfs_finish_ordered_io().
+ */
+ if (ret)
+ goto error;
if (cur_offset > end)
break;
}
@@ -1509,6 +1591,8 @@ static int run_delalloc_range(struct inode *inode, struct page *locked_page,
ret = cow_file_range_async(inode, locked_page, start, end,
page_started, nr_written);
}
+ if (ret)
+ btrfs_cleanup_ordered_extents(inode, start, end - start + 1);
return ret;
}
@@ -1693,6 +1777,14 @@ static void btrfs_set_bit_hook(struct inode *inode,
btrfs_add_delalloc_inodes(root, inode);
spin_unlock(&BTRFS_I(inode)->lock);
}
+
+ if (!(state->state & EXTENT_DELALLOC_NEW) &&
+ (*bits & EXTENT_DELALLOC_NEW)) {
+ spin_lock(&BTRFS_I(inode)->lock);
+ BTRFS_I(inode)->new_delalloc_bytes += state->end + 1 -
+ state->start;
+ spin_unlock(&BTRFS_I(inode)->lock);
+ }
}
/*
@@ -1722,7 +1814,7 @@ static void btrfs_clear_bit_hook(struct btrfs_inode *inode,
if (*bits & EXTENT_FIRST_DELALLOC) {
*bits &= ~EXTENT_FIRST_DELALLOC;
- } else if (!(*bits & EXTENT_DO_ACCOUNTING)) {
+ } else if (!(*bits & EXTENT_CLEAR_META_RESV)) {
spin_lock(&inode->lock);
inode->outstanding_extents -= num_extents;
spin_unlock(&inode->lock);
@@ -1733,7 +1825,7 @@ static void btrfs_clear_bit_hook(struct btrfs_inode *inode,
* don't need to call dellalloc_release_metadata if there is an
* error.
*/
- if (*bits & EXTENT_DO_ACCOUNTING &&
+ if (*bits & EXTENT_CLEAR_META_RESV &&
root != fs_info->tree_root)
btrfs_delalloc_release_metadata(inode, len);
@@ -1741,10 +1833,9 @@ static void btrfs_clear_bit_hook(struct btrfs_inode *inode,
if (btrfs_is_testing(fs_info))
return;
- if (root->root_key.objectid != BTRFS_DATA_RELOC_TREE_OBJECTID
- && do_list && !(state->state & EXTENT_NORESERVE)
- && (*bits & (EXTENT_DO_ACCOUNTING |
- EXTENT_CLEAR_DATA_RESV)))
+ if (root->root_key.objectid != BTRFS_DATA_RELOC_TREE_OBJECTID &&
+ do_list && !(state->state & EXTENT_NORESERVE) &&
+ (*bits & EXTENT_CLEAR_DATA_RESV))
btrfs_free_reserved_data_space_noquota(
&inode->vfs_inode,
state->start, len);
@@ -1759,6 +1850,14 @@ static void btrfs_clear_bit_hook(struct btrfs_inode *inode,
btrfs_del_delalloc_inode(root, inode);
spin_unlock(&inode->lock);
}
+
+ if ((state->state & EXTENT_DELALLOC_NEW) &&
+ (*bits & EXTENT_DELALLOC_NEW)) {
+ spin_lock(&inode->lock);
+ ASSERT(inode->new_delalloc_bytes >= len);
+ inode->new_delalloc_bytes -= len;
+ spin_unlock(&inode->lock);
+ }
}
/*
@@ -2791,6 +2890,13 @@ static int btrfs_finish_ordered_io(struct btrfs_ordered_extent *ordered_extent)
u64 logical_len = ordered_extent->len;
bool nolock;
bool truncated = false;
+ bool range_locked = false;
+ bool clear_new_delalloc_bytes = false;
+
+ if (!test_bit(BTRFS_ORDERED_NOCOW, &ordered_extent->flags) &&
+ !test_bit(BTRFS_ORDERED_PREALLOC, &ordered_extent->flags) &&
+ !test_bit(BTRFS_ORDERED_DIRECT, &ordered_extent->flags))
+ clear_new_delalloc_bytes = true;
nolock = btrfs_is_free_space_inode(BTRFS_I(inode));
@@ -2839,6 +2945,7 @@ static int btrfs_finish_ordered_io(struct btrfs_ordered_extent *ordered_extent)
goto out;
}
+ range_locked = true;
lock_extent_bits(io_tree, ordered_extent->file_offset,
ordered_extent->file_offset + ordered_extent->len - 1,
&cached_state);
@@ -2864,7 +2971,7 @@ static int btrfs_finish_ordered_io(struct btrfs_ordered_extent *ordered_extent)
if (IS_ERR(trans)) {
ret = PTR_ERR(trans);
trans = NULL;
- goto out_unlock;
+ goto out;
}
trans->block_rsv = &fs_info->delalloc_block_rsv;
@@ -2896,7 +3003,7 @@ static int btrfs_finish_ordered_io(struct btrfs_ordered_extent *ordered_extent)
trans->transid);
if (ret < 0) {
btrfs_abort_transaction(trans, ret);
- goto out_unlock;
+ goto out;
}
add_pending_csums(trans, inode, &ordered_extent->list);
@@ -2905,14 +3012,26 @@ static int btrfs_finish_ordered_io(struct btrfs_ordered_extent *ordered_extent)
ret = btrfs_update_inode_fallback(trans, root, inode);
if (ret) { /* -ENOMEM or corruption */
btrfs_abort_transaction(trans, ret);
- goto out_unlock;
+ goto out;
}
ret = 0;
-out_unlock:
- unlock_extent_cached(io_tree, ordered_extent->file_offset,
- ordered_extent->file_offset +
- ordered_extent->len - 1, &cached_state, GFP_NOFS);
out:
+ if (range_locked || clear_new_delalloc_bytes) {
+ unsigned int clear_bits = 0;
+
+ if (range_locked)
+ clear_bits |= EXTENT_LOCKED;
+ if (clear_new_delalloc_bytes)
+ clear_bits |= EXTENT_DELALLOC_NEW;
+ clear_extent_bit(&BTRFS_I(inode)->io_tree,
+ ordered_extent->file_offset,
+ ordered_extent->file_offset +
+ ordered_extent->len - 1,
+ clear_bits,
+ (clear_bits & EXTENT_LOCKED) ? 1 : 0,
+ 0, &cached_state, GFP_NOFS);
+ }
+
if (root != fs_info->tree_root)
btrfs_delalloc_release_metadata(BTRFS_I(inode),
ordered_extent->len);
@@ -4401,9 +4520,17 @@ search_again:
if (extent_type != BTRFS_FILE_EXTENT_INLINE) {
item_end +=
btrfs_file_extent_num_bytes(leaf, fi);
+
+ trace_btrfs_truncate_show_fi_regular(
+ BTRFS_I(inode), leaf, fi,
+ found_key.offset);
} else if (extent_type == BTRFS_FILE_EXTENT_INLINE) {
item_end += btrfs_file_extent_inline_len(leaf,
path->slots[0], fi);
+
+ trace_btrfs_truncate_show_fi_inline(
+ BTRFS_I(inode), leaf, fi, path->slots[0],
+ found_key.offset);
}
item_end--;
}
@@ -4603,13 +4730,6 @@ error:
btrfs_free_path(path);
- if (err == 0) {
- /* only inline file may have last_size != new_size */
- if (new_size >= fs_info->sectorsize ||
- new_size > fs_info->max_inline)
- ASSERT(last_size == new_size);
- }
-
if (be_nice && bytes_deleted > SZ_32M) {
unsigned long updates = trans->delayed_ref_updates;
if (updates) {
@@ -6735,7 +6855,6 @@ static noinline int uncompress_inline(struct btrfs_path *path,
*
* This also copies inline extents directly into the page.
*/
-
struct extent_map *btrfs_get_extent(struct btrfs_inode *inode,
struct page *page,
size_t pg_offset, u64 start, u64 len,
@@ -6835,11 +6954,18 @@ again:
found_type == BTRFS_FILE_EXTENT_PREALLOC) {
extent_end = extent_start +
btrfs_file_extent_num_bytes(leaf, item);
+
+ trace_btrfs_get_extent_show_fi_regular(inode, leaf, item,
+ extent_start);
} else if (found_type == BTRFS_FILE_EXTENT_INLINE) {
size_t size;
size = btrfs_file_extent_inline_len(leaf, path->slots[0], item);
extent_end = ALIGN(extent_start + size,
fs_info->sectorsize);
+
+ trace_btrfs_get_extent_show_fi_inline(inode, leaf, item,
+ path->slots[0],
+ extent_start);
}
next:
if (start >= extent_end) {
@@ -7037,19 +7163,17 @@ struct extent_map *btrfs_get_extent_fiemap(struct btrfs_inode *inode,
em = btrfs_get_extent(inode, page, pg_offset, start, len, create);
if (IS_ERR(em))
return em;
- if (em) {
- /*
- * if our em maps to
- * - a hole or
- * - a pre-alloc extent,
- * there might actually be delalloc bytes behind it.
- */
- if (em->block_start != EXTENT_MAP_HOLE &&
- !test_bit(EXTENT_FLAG_PREALLOC, &em->flags))
- return em;
- else
- hole_em = em;
- }
+ /*
+ * If our em maps to:
+ * - a hole or
+ * - a pre-alloc extent,
+ * there might actually be delalloc bytes behind it.
+ */
+ if (em->block_start != EXTENT_MAP_HOLE &&
+ !test_bit(EXTENT_FLAG_PREALLOC, &em->flags))
+ return em;
+ else
+ hole_em = em;
/* check to see if we've wrapped (len == -1 or similar) */
end = start + len;
@@ -8127,17 +8251,26 @@ static void btrfs_endio_direct_read(struct bio *bio)
bio_put(bio);
}
-static void btrfs_endio_direct_write_update_ordered(struct inode *inode,
- const u64 offset,
- const u64 bytes,
- const int uptodate)
+static void __endio_write_update_ordered(struct inode *inode,
+ const u64 offset, const u64 bytes,
+ const bool uptodate)
{
struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb);
struct btrfs_ordered_extent *ordered = NULL;
+ struct btrfs_workqueue *wq;
+ btrfs_work_func_t func;
u64 ordered_offset = offset;
u64 ordered_bytes = bytes;
int ret;
+ if (btrfs_is_free_space_inode(BTRFS_I(inode))) {
+ wq = fs_info->endio_freespace_worker;
+ func = btrfs_freespace_write_helper;
+ } else {
+ wq = fs_info->endio_write_workers;
+ func = btrfs_endio_write_helper;
+ }
+
again:
ret = btrfs_dec_test_first_ordered_pending(inode, &ordered,
&ordered_offset,
@@ -8146,9 +8279,8 @@ again:
if (!ret)
goto out_test;
- btrfs_init_work(&ordered->work, btrfs_endio_write_helper,
- finish_ordered_fn, NULL, NULL);
- btrfs_queue_work(fs_info->endio_write_workers, &ordered->work);
+ btrfs_init_work(&ordered->work, func, finish_ordered_fn, NULL, NULL);
+ btrfs_queue_work(wq, &ordered->work);
out_test:
/*
* our bio might span multiple ordered extents. If we haven't
@@ -8166,10 +8298,8 @@ static void btrfs_endio_direct_write(struct bio *bio)
struct btrfs_dio_private *dip = bio->bi_private;
struct bio *dio_bio = dip->dio_bio;
- btrfs_endio_direct_write_update_ordered(dip->inode,
- dip->logical_offset,
- dip->bytes,
- !bio->bi_error);
+ __endio_write_update_ordered(dip->inode, dip->logical_offset,
+ dip->bytes, !bio->bi_error);
kfree(dip);
@@ -8530,10 +8660,10 @@ free_ordered:
io_bio = NULL;
} else {
if (write)
- btrfs_endio_direct_write_update_ordered(inode,
+ __endio_write_update_ordered(inode,
file_offset,
dio_bio->bi_iter.bi_size,
- 0);
+ false);
else
unlock_extent(&BTRFS_I(inode)->io_tree, file_offset,
file_offset + dio_bio->bi_iter.bi_size - 1);
@@ -8668,11 +8798,11 @@ static ssize_t btrfs_direct_IO(struct kiocb *iocb, struct iov_iter *iter)
*/
if (dio_data.unsubmitted_oe_range_start <
dio_data.unsubmitted_oe_range_end)
- btrfs_endio_direct_write_update_ordered(inode,
+ __endio_write_update_ordered(inode,
dio_data.unsubmitted_oe_range_start,
dio_data.unsubmitted_oe_range_end -
dio_data.unsubmitted_oe_range_start,
- 0);
+ false);
} else if (ret >= 0 && (size_t)ret < count)
btrfs_delalloc_release_space(inode, offset,
count - (size_t)ret);
@@ -8819,6 +8949,7 @@ again:
if (!inode_evicting)
clear_extent_bit(tree, start, end,
EXTENT_DIRTY | EXTENT_DELALLOC |
+ EXTENT_DELALLOC_NEW |
EXTENT_LOCKED | EXTENT_DO_ACCOUNTING |
EXTENT_DEFRAG, 1, 0, &cached_state,
GFP_NOFS);
@@ -8876,8 +9007,8 @@ again:
if (!inode_evicting) {
clear_extent_bit(tree, page_start, page_end,
EXTENT_LOCKED | EXTENT_DIRTY |
- EXTENT_DELALLOC | EXTENT_DO_ACCOUNTING |
- EXTENT_DEFRAG, 1, 1,
+ EXTENT_DELALLOC | EXTENT_DELALLOC_NEW |
+ EXTENT_DO_ACCOUNTING | EXTENT_DEFRAG, 1, 1,
&cached_state, GFP_NOFS);
__btrfs_releasepage(page, GFP_NOFS);
@@ -9248,6 +9379,7 @@ struct inode *btrfs_alloc_inode(struct super_block *sb)
ei->last_sub_trans = 0;
ei->logged_trans = 0;
ei->delalloc_bytes = 0;
+ ei->new_delalloc_bytes = 0;
ei->defrag_bytes = 0;
ei->disk_i_size = 0;
ei->flags = 0;
@@ -9313,6 +9445,7 @@ void btrfs_destroy_inode(struct inode *inode)
WARN_ON(BTRFS_I(inode)->outstanding_extents);
WARN_ON(BTRFS_I(inode)->reserved_extents);
WARN_ON(BTRFS_I(inode)->delalloc_bytes);
+ WARN_ON(BTRFS_I(inode)->new_delalloc_bytes);
WARN_ON(BTRFS_I(inode)->csum_bytes);
WARN_ON(BTRFS_I(inode)->defrag_bytes);
@@ -9436,7 +9569,7 @@ static int btrfs_getattr(const struct path *path, struct kstat *stat,
stat->dev = BTRFS_I(inode)->root->anon_dev;
spin_lock(&BTRFS_I(inode)->lock);
- delalloc_bytes = BTRFS_I(inode)->delalloc_bytes;
+ delalloc_bytes = BTRFS_I(inode)->new_delalloc_bytes;
spin_unlock(&BTRFS_I(inode)->lock);
stat->blocks = (ALIGN(inode_get_bytes(inode), blocksize) +
ALIGN(delalloc_bytes, blocksize)) >> 9;
diff --git a/fs/btrfs/ioctl.c b/fs/btrfs/ioctl.c
index 922a66fce401..e176375f374f 100644
--- a/fs/btrfs/ioctl.c
+++ b/fs/btrfs/ioctl.c
@@ -1504,7 +1504,7 @@ static noinline int btrfs_ioctl_resize(struct file *file,
if (ret)
return ret;
- if (atomic_xchg(&fs_info->mutually_exclusive_operation_running, 1)) {
+ if (test_and_set_bit(BTRFS_FS_EXCL_OP, &fs_info->flags)) {
mnt_drop_write_file(file);
return BTRFS_ERROR_DEV_EXCL_RUN_IN_PROGRESS;
}
@@ -1619,7 +1619,7 @@ out_free:
kfree(vol_args);
out:
mutex_unlock(&fs_info->volume_mutex);
- atomic_set(&fs_info->mutually_exclusive_operation_running, 0);
+ clear_bit(BTRFS_FS_EXCL_OP, &fs_info->flags);
mnt_drop_write_file(file);
return ret;
}
@@ -2661,7 +2661,7 @@ static long btrfs_ioctl_add_dev(struct btrfs_fs_info *fs_info, void __user *arg)
if (!capable(CAP_SYS_ADMIN))
return -EPERM;
- if (atomic_xchg(&fs_info->mutually_exclusive_operation_running, 1))
+ if (test_and_set_bit(BTRFS_FS_EXCL_OP, &fs_info->flags))
return BTRFS_ERROR_DEV_EXCL_RUN_IN_PROGRESS;
mutex_lock(&fs_info->volume_mutex);
@@ -2680,7 +2680,7 @@ static long btrfs_ioctl_add_dev(struct btrfs_fs_info *fs_info, void __user *arg)
kfree(vol_args);
out:
mutex_unlock(&fs_info->volume_mutex);
- atomic_set(&fs_info->mutually_exclusive_operation_running, 0);
+ clear_bit(BTRFS_FS_EXCL_OP, &fs_info->flags);
return ret;
}
@@ -2708,7 +2708,7 @@ static long btrfs_ioctl_rm_dev_v2(struct file *file, void __user *arg)
if (vol_args->flags & ~BTRFS_VOL_ARG_V2_FLAGS_SUPPORTED)
return -EOPNOTSUPP;
- if (atomic_xchg(&fs_info->mutually_exclusive_operation_running, 1)) {
+ if (test_and_set_bit(BTRFS_FS_EXCL_OP, &fs_info->flags)) {
ret = BTRFS_ERROR_DEV_EXCL_RUN_IN_PROGRESS;
goto out;
}
@@ -2721,7 +2721,7 @@ static long btrfs_ioctl_rm_dev_v2(struct file *file, void __user *arg)
ret = btrfs_rm_device(fs_info, vol_args->name, 0);
}
mutex_unlock(&fs_info->volume_mutex);
- atomic_set(&fs_info->mutually_exclusive_operation_running, 0);
+ clear_bit(BTRFS_FS_EXCL_OP, &fs_info->flags);
if (!ret) {
if (vol_args->flags & BTRFS_DEVICE_SPEC_BY_ID)
@@ -2752,7 +2752,7 @@ static long btrfs_ioctl_rm_dev(struct file *file, void __user *arg)
if (ret)
return ret;
- if (atomic_xchg(&fs_info->mutually_exclusive_operation_running, 1)) {
+ if (test_and_set_bit(BTRFS_FS_EXCL_OP, &fs_info->flags)) {
ret = BTRFS_ERROR_DEV_EXCL_RUN_IN_PROGRESS;
goto out_drop_write;
}
@@ -2772,7 +2772,7 @@ static long btrfs_ioctl_rm_dev(struct file *file, void __user *arg)
btrfs_info(fs_info, "disk deleted %s", vol_args->name);
kfree(vol_args);
out:
- atomic_set(&fs_info->mutually_exclusive_operation_running, 0);
+ clear_bit(BTRFS_FS_EXCL_OP, &fs_info->flags);
out_drop_write:
mnt_drop_write_file(file);
@@ -4439,13 +4439,11 @@ static long btrfs_ioctl_dev_replace(struct btrfs_fs_info *fs_info,
ret = -EROFS;
goto out;
}
- if (atomic_xchg(
- &fs_info->mutually_exclusive_operation_running, 1)) {
+ if (test_and_set_bit(BTRFS_FS_EXCL_OP, &fs_info->flags)) {
ret = BTRFS_ERROR_DEV_EXCL_RUN_IN_PROGRESS;
} else {
ret = btrfs_dev_replace_by_ioctl(fs_info, p);
- atomic_set(
- &fs_info->mutually_exclusive_operation_running, 0);
+ clear_bit(BTRFS_FS_EXCL_OP, &fs_info->flags);
}
break;
case BTRFS_IOCTL_DEV_REPLACE_CMD_STATUS:
@@ -4640,7 +4638,7 @@ static long btrfs_ioctl_balance(struct file *file, void __user *arg)
return ret;
again:
- if (!atomic_xchg(&fs_info->mutually_exclusive_operation_running, 1)) {
+ if (!test_and_set_bit(BTRFS_FS_EXCL_OP, &fs_info->flags)) {
mutex_lock(&fs_info->volume_mutex);
mutex_lock(&fs_info->balance_mutex);
need_unlock = true;
@@ -4686,7 +4684,7 @@ again:
}
locked:
- BUG_ON(!atomic_read(&fs_info->mutually_exclusive_operation_running));
+ BUG_ON(!test_bit(BTRFS_FS_EXCL_OP, &fs_info->flags));
if (arg) {
bargs = memdup_user(arg, sizeof(*bargs));
@@ -4742,11 +4740,10 @@ locked:
do_balance:
/*
- * Ownership of bctl and mutually_exclusive_operation_running
+ * Ownership of bctl and filesystem flag BTRFS_FS_EXCL_OP
* goes to to btrfs_balance. bctl is freed in __cancel_balance,
* or, if restriper was paused all the way until unmount, in
- * free_fs_info. mutually_exclusive_operation_running is
- * cleared in __cancel_balance.
+ * free_fs_info. The flag is cleared in __cancel_balance.
*/
need_unlock = false;
@@ -4766,7 +4763,7 @@ out_unlock:
mutex_unlock(&fs_info->balance_mutex);
mutex_unlock(&fs_info->volume_mutex);
if (need_unlock)
- atomic_set(&fs_info->mutually_exclusive_operation_running, 0);
+ clear_bit(BTRFS_FS_EXCL_OP, &fs_info->flags);
out:
mnt_drop_write_file(file);
return ret;
diff --git a/fs/btrfs/ordered-data.c b/fs/btrfs/ordered-data.c
index 9a46878ba60f..7b40e2e7292a 100644
--- a/fs/btrfs/ordered-data.c
+++ b/fs/btrfs/ordered-data.c
@@ -212,7 +212,7 @@ static int __btrfs_add_ordered_extent(struct inode *inode, u64 file_offset,
set_bit(BTRFS_ORDERED_DIRECT, &entry->flags);
/* one ref for the tree */
- atomic_set(&entry->refs, 1);
+ refcount_set(&entry->refs, 1);
init_waitqueue_head(&entry->wait);
INIT_LIST_HEAD(&entry->list);
INIT_LIST_HEAD(&entry->root_extent_list);
@@ -358,7 +358,7 @@ int btrfs_dec_test_first_ordered_pending(struct inode *inode,
out:
if (!ret && cached && entry) {
*cached = entry;
- atomic_inc(&entry->refs);
+ refcount_inc(&entry->refs);
}
spin_unlock_irqrestore(&tree->lock, flags);
return ret == 0;
@@ -425,7 +425,7 @@ have_entry:
out:
if (!ret && cached && entry) {
*cached = entry;
- atomic_inc(&entry->refs);
+ refcount_inc(&entry->refs);
}
spin_unlock_irqrestore(&tree->lock, flags);
return ret == 0;
@@ -456,7 +456,7 @@ void btrfs_get_logged_extents(struct btrfs_inode *inode,
if (test_and_set_bit(BTRFS_ORDERED_LOGGED, &ordered->flags))
continue;
list_add(&ordered->log_list, logged_list);
- atomic_inc(&ordered->refs);
+ refcount_inc(&ordered->refs);
}
spin_unlock_irq(&tree->lock);
}
@@ -565,7 +565,7 @@ void btrfs_put_ordered_extent(struct btrfs_ordered_extent *entry)
trace_btrfs_ordered_extent_put(entry->inode, entry);
- if (atomic_dec_and_test(&entry->refs)) {
+ if (refcount_dec_and_test(&entry->refs)) {
ASSERT(list_empty(&entry->log_list));
ASSERT(list_empty(&entry->trans_list));
ASSERT(list_empty(&entry->root_extent_list));
@@ -623,7 +623,7 @@ void btrfs_remove_ordered_extent(struct inode *inode,
spin_lock(&fs_info->trans_lock);
trans = fs_info->running_transaction;
if (trans)
- atomic_inc(&trans->use_count);
+ refcount_inc(&trans->use_count);
spin_unlock(&fs_info->trans_lock);
ASSERT(trans);
@@ -690,7 +690,7 @@ int btrfs_wait_ordered_extents(struct btrfs_root *root, int nr,
list_move_tail(&ordered->root_extent_list,
&root->ordered_extents);
- atomic_inc(&ordered->refs);
+ refcount_inc(&ordered->refs);
spin_unlock(&root->ordered_extent_lock);
btrfs_init_work(&ordered->flush_work,
@@ -870,7 +870,7 @@ struct btrfs_ordered_extent *btrfs_lookup_ordered_extent(struct inode *inode,
if (!offset_in_entry(entry, file_offset))
entry = NULL;
if (entry)
- atomic_inc(&entry->refs);
+ refcount_inc(&entry->refs);
out:
spin_unlock_irq(&tree->lock);
return entry;
@@ -911,7 +911,7 @@ struct btrfs_ordered_extent *btrfs_lookup_ordered_range(
}
out:
if (entry)
- atomic_inc(&entry->refs);
+ refcount_inc(&entry->refs);
spin_unlock_irq(&tree->lock);
return entry;
}
@@ -948,7 +948,7 @@ btrfs_lookup_first_ordered_extent(struct inode *inode, u64 file_offset)
goto out;
entry = rb_entry(node, struct btrfs_ordered_extent, rb_node);
- atomic_inc(&entry->refs);
+ refcount_inc(&entry->refs);
out:
spin_unlock_irq(&tree->lock);
return entry;
diff --git a/fs/btrfs/ordered-data.h b/fs/btrfs/ordered-data.h
index 195c93b67fe0..e0c1d5b8d859 100644
--- a/fs/btrfs/ordered-data.h
+++ b/fs/btrfs/ordered-data.h
@@ -113,7 +113,7 @@ struct btrfs_ordered_extent {
int compress_type;
/* reference count */
- atomic_t refs;
+ refcount_t refs;
/* the inode we belong to */
struct inode *inode;
diff --git a/fs/btrfs/qgroup.c b/fs/btrfs/qgroup.c
index afbea61d957e..deffbeb74a0b 100644
--- a/fs/btrfs/qgroup.c
+++ b/fs/btrfs/qgroup.c
@@ -47,50 +47,6 @@
* - check all ioctl parameters
*/
-/*
- * one struct for each qgroup, organized in fs_info->qgroup_tree.
- */
-struct btrfs_qgroup {
- u64 qgroupid;
-
- /*
- * state
- */
- u64 rfer; /* referenced */
- u64 rfer_cmpr; /* referenced compressed */
- u64 excl; /* exclusive */
- u64 excl_cmpr; /* exclusive compressed */
-
- /*
- * limits
- */
- u64 lim_flags; /* which limits are set */
- u64 max_rfer;
- u64 max_excl;
- u64 rsv_rfer;
- u64 rsv_excl;
-
- /*
- * reservation tracking
- */
- u64 reserved;
-
- /*
- * lists
- */
- struct list_head groups; /* groups this group is member of */
- struct list_head members; /* groups that are members of this group */
- struct list_head dirty; /* dirty groups */
- struct rb_node node; /* tree of qgroups */
-
- /*
- * temp variables for accounting operations
- * Refer to qgroup_shared_accounting() for details.
- */
- u64 old_refcnt;
- u64 new_refcnt;
-};
-
static void btrfs_qgroup_update_old_refcnt(struct btrfs_qgroup *qg, u64 seq,
int mod)
{
@@ -1078,6 +1034,7 @@ static int __qgroup_excl_accounting(struct btrfs_fs_info *fs_info,
qgroup->excl += sign * num_bytes;
qgroup->excl_cmpr += sign * num_bytes;
if (sign > 0) {
+ trace_qgroup_update_reserve(fs_info, qgroup, -(s64)num_bytes);
if (qgroup->reserved < num_bytes)
report_reserved_underflow(fs_info, qgroup, num_bytes);
else
@@ -1103,6 +1060,8 @@ static int __qgroup_excl_accounting(struct btrfs_fs_info *fs_info,
WARN_ON(sign < 0 && qgroup->excl < num_bytes);
qgroup->excl += sign * num_bytes;
if (sign > 0) {
+ trace_qgroup_update_reserve(fs_info, qgroup,
+ -(s64)num_bytes);
if (qgroup->reserved < num_bytes)
report_reserved_underflow(fs_info, qgroup,
num_bytes);
@@ -2058,12 +2017,12 @@ int btrfs_qgroup_account_extents(struct btrfs_trans_handle *trans,
if (!ret) {
/*
- * Use (u64)-1 as time_seq to do special search, which
+ * Use SEQ_LAST as time_seq to do special search, which
* doesn't lock tree or delayed_refs and search current
* root. It's safe inside commit_transaction().
*/
ret = btrfs_find_all_roots(trans, fs_info,
- record->bytenr, (u64)-1, &new_roots);
+ record->bytenr, SEQ_LAST, &new_roots);
if (ret < 0)
goto cleanup;
if (qgroup_to_skip)
@@ -2370,6 +2329,7 @@ static int qgroup_reserve(struct btrfs_root *root, u64 num_bytes, bool enforce)
struct btrfs_fs_info *fs_info = root->fs_info;
u64 ref_root = root->root_key.objectid;
int ret = 0;
+ int retried = 0;
struct ulist_node *unode;
struct ulist_iterator uiter;
@@ -2378,7 +2338,7 @@ static int qgroup_reserve(struct btrfs_root *root, u64 num_bytes, bool enforce)
if (num_bytes == 0)
return 0;
-
+retry:
spin_lock(&fs_info->qgroup_lock);
quota_root = fs_info->quota_root;
if (!quota_root)
@@ -2405,6 +2365,27 @@ static int qgroup_reserve(struct btrfs_root *root, u64 num_bytes, bool enforce)
qg = unode_aux_to_qgroup(unode);
if (enforce && !qgroup_check_limits(qg, num_bytes)) {
+ /*
+ * Commit the tree and retry, since we may have
+ * deletions which would free up space.
+ */
+ if (!retried && qg->reserved > 0) {
+ struct btrfs_trans_handle *trans;
+
+ spin_unlock(&fs_info->qgroup_lock);
+ ret = btrfs_start_delalloc_inodes(root, 0);
+ if (ret)
+ return ret;
+ btrfs_wait_ordered_extents(root, -1, 0, (u64)-1);
+ trans = btrfs_join_transaction(root);
+ if (IS_ERR(trans))
+ return PTR_ERR(trans);
+ ret = btrfs_commit_transaction(trans);
+ if (ret)
+ return ret;
+ retried++;
+ goto retry;
+ }
ret = -EDQUOT;
goto out;
}
@@ -2427,6 +2408,7 @@ static int qgroup_reserve(struct btrfs_root *root, u64 num_bytes, bool enforce)
qg = unode_aux_to_qgroup(unode);
+ trace_qgroup_update_reserve(fs_info, qg, num_bytes);
qg->reserved += num_bytes;
}
@@ -2472,6 +2454,7 @@ void btrfs_qgroup_free_refroot(struct btrfs_fs_info *fs_info,
qg = unode_aux_to_qgroup(unode);
+ trace_qgroup_update_reserve(fs_info, qg, -(s64)num_bytes);
if (qg->reserved < num_bytes)
report_reserved_underflow(fs_info, qg, num_bytes);
else
@@ -2490,18 +2473,6 @@ out:
spin_unlock(&fs_info->qgroup_lock);
}
-void assert_qgroups_uptodate(struct btrfs_trans_handle *trans)
-{
- if (list_empty(&trans->qgroup_ref_list) && !trans->delayed_ref_elem.seq)
- return;
- btrfs_err(trans->fs_info,
- "qgroups not uptodate in trans handle %p: list is%s empty, seq is %#x.%x",
- trans, list_empty(&trans->qgroup_ref_list) ? "" : " not",
- (u32)(trans->delayed_ref_elem.seq >> 32),
- (u32)trans->delayed_ref_elem.seq);
- BUG();
-}
-
/*
* returns < 0 on error, 0 when more leafs are to be scanned.
* returns 1 when done.
@@ -2889,14 +2860,14 @@ static int __btrfs_qgroup_release_data(struct inode *inode, u64 start, u64 len,
if (ret < 0)
goto out;
- if (free) {
- btrfs_qgroup_free_refroot(BTRFS_I(inode)->root->fs_info,
- BTRFS_I(inode)->root->objectid,
- changeset.bytes_changed);
+ if (free)
trace_op = QGROUP_FREE;
- }
trace_btrfs_qgroup_release_data(inode, start, len,
changeset.bytes_changed, trace_op);
+ if (free)
+ btrfs_qgroup_free_refroot(BTRFS_I(inode)->root->fs_info,
+ BTRFS_I(inode)->root->objectid,
+ changeset.bytes_changed);
out:
ulist_release(&changeset.range_changed);
return ret;
@@ -2948,6 +2919,7 @@ int btrfs_qgroup_reserve_meta(struct btrfs_root *root, int num_bytes,
return 0;
BUG_ON(num_bytes != round_down(num_bytes, fs_info->nodesize));
+ trace_qgroup_meta_reserve(root, (s64)num_bytes);
ret = qgroup_reserve(root, num_bytes, enforce);
if (ret < 0)
return ret;
@@ -2967,6 +2939,7 @@ void btrfs_qgroup_free_meta_all(struct btrfs_root *root)
reserved = atomic64_xchg(&root->qgroup_meta_rsv, 0);
if (reserved == 0)
return;
+ trace_qgroup_meta_reserve(root, -(s64)reserved);
btrfs_qgroup_free_refroot(fs_info, root->objectid, reserved);
}
@@ -2981,6 +2954,7 @@ void btrfs_qgroup_free_meta(struct btrfs_root *root, int num_bytes)
BUG_ON(num_bytes != round_down(num_bytes, fs_info->nodesize));
WARN_ON(atomic64_read(&root->qgroup_meta_rsv) < num_bytes);
atomic64_sub(num_bytes, &root->qgroup_meta_rsv);
+ trace_qgroup_meta_reserve(root, -(s64)num_bytes);
btrfs_qgroup_free_refroot(fs_info, root->objectid, num_bytes);
}
diff --git a/fs/btrfs/qgroup.h b/fs/btrfs/qgroup.h
index 26932a8a1993..fe04d3f295c6 100644
--- a/fs/btrfs/qgroup.h
+++ b/fs/btrfs/qgroup.h
@@ -62,6 +62,50 @@ struct btrfs_qgroup_extent_record {
};
/*
+ * one struct for each qgroup, organized in fs_info->qgroup_tree.
+ */
+struct btrfs_qgroup {
+ u64 qgroupid;
+
+ /*
+ * state
+ */
+ u64 rfer; /* referenced */
+ u64 rfer_cmpr; /* referenced compressed */
+ u64 excl; /* exclusive */
+ u64 excl_cmpr; /* exclusive compressed */
+
+ /*
+ * limits
+ */
+ u64 lim_flags; /* which limits are set */
+ u64 max_rfer;
+ u64 max_excl;
+ u64 rsv_rfer;
+ u64 rsv_excl;
+
+ /*
+ * reservation tracking
+ */
+ u64 reserved;
+
+ /*
+ * lists
+ */
+ struct list_head groups; /* groups this group is member of */
+ struct list_head members; /* groups that are members of this group */
+ struct list_head dirty; /* dirty groups */
+ struct rb_node node; /* tree of qgroups */
+
+ /*
+ * temp variables for accounting operations
+ * Refer to qgroup_shared_accounting() for details.
+ */
+ u64 old_refcnt;
+ u64 new_refcnt;
+};
+
+/*
* For qgroup event trace points only
*/
#define QGROUP_RESERVE (1<<0)
@@ -186,17 +230,12 @@ int btrfs_qgroup_inherit(struct btrfs_trans_handle *trans,
struct btrfs_qgroup_inherit *inherit);
void btrfs_qgroup_free_refroot(struct btrfs_fs_info *fs_info,
u64 ref_root, u64 num_bytes);
-/*
- * TODO: Add proper trace point for it, as btrfs_qgroup_free() is
- * called by everywhere, can't provide good trace for delayed ref case.
- */
static inline void btrfs_qgroup_free_delayed_ref(struct btrfs_fs_info *fs_info,
u64 ref_root, u64 num_bytes)
{
- btrfs_qgroup_free_refroot(fs_info, ref_root, num_bytes);
trace_btrfs_qgroup_free_delayed_ref(fs_info, ref_root, num_bytes);
+ btrfs_qgroup_free_refroot(fs_info, ref_root, num_bytes);
}
-void assert_qgroups_uptodate(struct btrfs_trans_handle *trans);
#ifdef CONFIG_BTRFS_FS_RUN_SANITY_TESTS
int btrfs_verify_qgroup_counts(struct btrfs_fs_info *fs_info, u64 qgroupid,
diff --git a/fs/btrfs/raid56.c b/fs/btrfs/raid56.c
index 1571bf26dc07..d8ea0eb76325 100644
--- a/fs/btrfs/raid56.c
+++ b/fs/btrfs/raid56.c
@@ -149,7 +149,7 @@ struct btrfs_raid_bio {
int generic_bio_cnt;
- atomic_t refs;
+ refcount_t refs;
atomic_t stripes_pending;
@@ -389,7 +389,7 @@ static void __remove_rbio_from_cache(struct btrfs_raid_bio *rbio)
if (bio_list_empty(&rbio->bio_list)) {
if (!list_empty(&rbio->hash_list)) {
list_del_init(&rbio->hash_list);
- atomic_dec(&rbio->refs);
+ refcount_dec(&rbio->refs);
BUG_ON(!list_empty(&rbio->plug_list));
}
}
@@ -480,7 +480,7 @@ static void cache_rbio(struct btrfs_raid_bio *rbio)
/* bump our ref if we were not in the list before */
if (!test_and_set_bit(RBIO_CACHE_BIT, &rbio->flags))
- atomic_inc(&rbio->refs);
+ refcount_inc(&rbio->refs);
if (!list_empty(&rbio->stripe_cache)){
list_move(&rbio->stripe_cache, &table->stripe_cache);
@@ -689,7 +689,7 @@ static noinline int lock_stripe_add(struct btrfs_raid_bio *rbio)
test_bit(RBIO_CACHE_BIT, &cur->flags) &&
!test_bit(RBIO_RMW_LOCKED_BIT, &cur->flags)) {
list_del_init(&cur->hash_list);
- atomic_dec(&cur->refs);
+ refcount_dec(&cur->refs);
steal_rbio(cur, rbio);
cache_drop = cur;
@@ -738,7 +738,7 @@ static noinline int lock_stripe_add(struct btrfs_raid_bio *rbio)
}
}
lockit:
- atomic_inc(&rbio->refs);
+ refcount_inc(&rbio->refs);
list_add(&rbio->hash_list, &h->hash_list);
out:
spin_unlock_irqrestore(&h->lock, flags);
@@ -784,7 +784,7 @@ static noinline void unlock_stripe(struct btrfs_raid_bio *rbio)
}
list_del_init(&rbio->hash_list);
- atomic_dec(&rbio->refs);
+ refcount_dec(&rbio->refs);
/*
* we use the plug list to hold all the rbios
@@ -801,7 +801,7 @@ static noinline void unlock_stripe(struct btrfs_raid_bio *rbio)
list_del_init(&rbio->plug_list);
list_add(&next->hash_list, &h->hash_list);
- atomic_inc(&next->refs);
+ refcount_inc(&next->refs);
spin_unlock(&rbio->bio_list_lock);
spin_unlock_irqrestore(&h->lock, flags);
@@ -843,8 +843,7 @@ static void __free_raid_bio(struct btrfs_raid_bio *rbio)
{
int i;
- WARN_ON(atomic_read(&rbio->refs) < 0);
- if (!atomic_dec_and_test(&rbio->refs))
+ if (!refcount_dec_and_test(&rbio->refs))
return;
WARN_ON(!list_empty(&rbio->stripe_cache));
@@ -997,7 +996,7 @@ static struct btrfs_raid_bio *alloc_rbio(struct btrfs_fs_info *fs_info,
rbio->stripe_npages = stripe_npages;
rbio->faila = -1;
rbio->failb = -1;
- atomic_set(&rbio->refs, 1);
+ refcount_set(&rbio->refs, 1);
atomic_set(&rbio->error, 0);
atomic_set(&rbio->stripes_pending, 0);
@@ -2118,6 +2117,11 @@ int raid56_parity_recover(struct btrfs_fs_info *fs_info, struct bio *bio,
struct btrfs_raid_bio *rbio;
int ret;
+ if (generic_io) {
+ ASSERT(bbio->mirror_num == mirror_num);
+ btrfs_io_bio(bio)->mirror_num = mirror_num;
+ }
+
rbio = alloc_rbio(fs_info, bbio, stripe_len);
if (IS_ERR(rbio)) {
if (generic_io)
@@ -2194,6 +2198,8 @@ static void read_rebuild_work(struct btrfs_work *work)
/*
* The following code is used to scrub/replace the parity stripe
*
+ * Caller must have already increased bio_counter for getting @bbio.
+ *
* Note: We need make sure all the pages that add into the scrub/replace
* raid bio are correct and not be changed during the scrub/replace. That
* is those pages just hold metadata or file data with checksum.
@@ -2231,6 +2237,12 @@ raid56_parity_alloc_scrub_rbio(struct btrfs_fs_info *fs_info, struct bio *bio,
ASSERT(rbio->stripe_npages == stripe_nsectors);
bitmap_copy(rbio->dbitmap, dbitmap, stripe_nsectors);
+ /*
+ * We have already increased bio_counter when getting bbio, record it
+ * so we can free it at rbio_orig_end_io().
+ */
+ rbio->generic_bio_cnt = 1;
+
return rbio;
}
@@ -2673,6 +2685,12 @@ raid56_alloc_missing_rbio(struct btrfs_fs_info *fs_info, struct bio *bio,
return NULL;
}
+ /*
+ * When we get bbio, we have already increased bio_counter, record it
+ * so we can free it at rbio_orig_end_io()
+ */
+ rbio->generic_bio_cnt = 1;
+
return rbio;
}
diff --git a/fs/btrfs/reada.c b/fs/btrfs/reada.c
index e88bca87f5d2..a17e775a4a89 100644
--- a/fs/btrfs/reada.c
+++ b/fs/btrfs/reada.c
@@ -209,9 +209,9 @@ cleanup:
return;
}
-int btree_readahead_hook(struct btrfs_fs_info *fs_info,
- struct extent_buffer *eb, int err)
+int btree_readahead_hook(struct extent_buffer *eb, int err)
{
+ struct btrfs_fs_info *fs_info = eb->fs_info;
int ret = 0;
struct reada_extent *re;
@@ -235,10 +235,10 @@ start_machine:
return ret;
}
-static struct reada_zone *reada_find_zone(struct btrfs_fs_info *fs_info,
- struct btrfs_device *dev, u64 logical,
+static struct reada_zone *reada_find_zone(struct btrfs_device *dev, u64 logical,
struct btrfs_bio *bbio)
{
+ struct btrfs_fs_info *fs_info = dev->fs_info;
int ret;
struct reada_zone *zone;
struct btrfs_block_group_cache *cache = NULL;
@@ -270,6 +270,12 @@ static struct reada_zone *reada_find_zone(struct btrfs_fs_info *fs_info,
if (!zone)
return NULL;
+ ret = radix_tree_preload(GFP_KERNEL);
+ if (ret) {
+ kfree(zone);
+ return NULL;
+ }
+
zone->start = start;
zone->end = end;
INIT_LIST_HEAD(&zone->list);
@@ -299,6 +305,7 @@ static struct reada_zone *reada_find_zone(struct btrfs_fs_info *fs_info,
zone = NULL;
}
spin_unlock(&fs_info->reada_lock);
+ radix_tree_preload_end();
return zone;
}
@@ -313,7 +320,6 @@ static struct reada_extent *reada_find_extent(struct btrfs_fs_info *fs_info,
struct btrfs_bio *bbio = NULL;
struct btrfs_device *dev;
struct btrfs_device *prev_dev;
- u32 blocksize;
u64 length;
int real_stripes;
int nzones = 0;
@@ -334,7 +340,6 @@ static struct reada_extent *reada_find_extent(struct btrfs_fs_info *fs_info,
if (!re)
return NULL;
- blocksize = fs_info->nodesize;
re->logical = logical;
re->top = *top;
INIT_LIST_HEAD(&re->extctl);
@@ -344,10 +349,10 @@ static struct reada_extent *reada_find_extent(struct btrfs_fs_info *fs_info,
/*
* map block
*/
- length = blocksize;
+ length = fs_info->nodesize;
ret = btrfs_map_block(fs_info, BTRFS_MAP_GET_READ_MIRRORS, logical,
&length, &bbio, 0);
- if (ret || !bbio || length < blocksize)
+ if (ret || !bbio || length < fs_info->nodesize)
goto error;
if (bbio->num_stripes > BTRFS_MAX_MIRRORS) {
@@ -367,7 +372,7 @@ static struct reada_extent *reada_find_extent(struct btrfs_fs_info *fs_info,
if (!dev->bdev)
continue;
- zone = reada_find_zone(fs_info, dev, logical, bbio);
+ zone = reada_find_zone(dev, logical, bbio);
if (!zone)
continue;
@@ -386,6 +391,10 @@ static struct reada_extent *reada_find_extent(struct btrfs_fs_info *fs_info,
goto error;
}
+ ret = radix_tree_preload(GFP_KERNEL);
+ if (ret)
+ goto error;
+
/* insert extent in reada_tree + all per-device trees, all or nothing */
btrfs_dev_replace_lock(&fs_info->dev_replace, 0);
spin_lock(&fs_info->reada_lock);
@@ -395,13 +404,16 @@ static struct reada_extent *reada_find_extent(struct btrfs_fs_info *fs_info,
re_exist->refcnt++;
spin_unlock(&fs_info->reada_lock);
btrfs_dev_replace_unlock(&fs_info->dev_replace, 0);
+ radix_tree_preload_end();
goto error;
}
if (ret) {
spin_unlock(&fs_info->reada_lock);
btrfs_dev_replace_unlock(&fs_info->dev_replace, 0);
+ radix_tree_preload_end();
goto error;
}
+ radix_tree_preload_end();
prev_dev = NULL;
dev_replace_is_ongoing = btrfs_dev_replace_is_ongoing(
&fs_info->dev_replace);
@@ -639,9 +651,9 @@ static int reada_pick_zone(struct btrfs_device *dev)
return 1;
}
-static int reada_start_machine_dev(struct btrfs_fs_info *fs_info,
- struct btrfs_device *dev)
+static int reada_start_machine_dev(struct btrfs_device *dev)
{
+ struct btrfs_fs_info *fs_info = dev->fs_info;
struct reada_extent *re = NULL;
int mirror_num = 0;
struct extent_buffer *eb = NULL;
@@ -754,8 +766,7 @@ static void __reada_start_machine(struct btrfs_fs_info *fs_info)
list_for_each_entry(device, &fs_devices->devices, dev_list) {
if (atomic_read(&device->reada_in_flight) <
MAX_IN_FLIGHT)
- enqueued += reada_start_machine_dev(fs_info,
- device);
+ enqueued += reada_start_machine_dev(device);
}
mutex_unlock(&fs_devices->device_list_mutex);
total += enqueued;
diff --git a/fs/btrfs/root-tree.c b/fs/btrfs/root-tree.c
index a08224eab8b4..7d6bc308bf43 100644
--- a/fs/btrfs/root-tree.c
+++ b/fs/btrfs/root-tree.c
@@ -501,8 +501,9 @@ void btrfs_update_root_times(struct btrfs_trans_handle *trans,
struct btrfs_root *root)
{
struct btrfs_root_item *item = &root->root_item;
- struct timespec ct = current_fs_time(root->fs_info->sb);
+ struct timespec ct;
+ ktime_get_real_ts(&ct);
spin_lock(&root->root_item_lock);
btrfs_set_root_ctransid(item, trans->transid);
btrfs_set_stack_timespec_sec(&item->ctime, ct.tv_sec);
diff --git a/fs/btrfs/scrub.c b/fs/btrfs/scrub.c
index b0251eb1239f..c7b45eb2403d 100644
--- a/fs/btrfs/scrub.c
+++ b/fs/btrfs/scrub.c
@@ -64,7 +64,7 @@ struct scrub_ctx;
#define SCRUB_MAX_PAGES_PER_BLOCK 16 /* 64k per node/leaf/sector */
struct scrub_recover {
- atomic_t refs;
+ refcount_t refs;
struct btrfs_bio *bbio;
u64 map_length;
};
@@ -112,7 +112,7 @@ struct scrub_block {
struct scrub_page *pagev[SCRUB_MAX_PAGES_PER_BLOCK];
int page_count;
atomic_t outstanding_pages;
- atomic_t refs; /* free mem on transition to zero */
+ refcount_t refs; /* free mem on transition to zero */
struct scrub_ctx *sctx;
struct scrub_parity *sparity;
struct {
@@ -140,9 +140,9 @@ struct scrub_parity {
int nsectors;
- int stripe_len;
+ u64 stripe_len;
- atomic_t refs;
+ refcount_t refs;
struct list_head spages;
@@ -202,7 +202,7 @@ struct scrub_ctx {
* doesn't free the scrub context before or while the workers are
* doing the wakeup() call.
*/
- atomic_t refs;
+ refcount_t refs;
};
struct scrub_fixup_nodatasum {
@@ -240,6 +240,13 @@ struct scrub_warning {
struct btrfs_device *dev;
};
+struct full_stripe_lock {
+ struct rb_node node;
+ u64 logical;
+ u64 refs;
+ struct mutex mutex;
+};
+
static void scrub_pending_bio_inc(struct scrub_ctx *sctx);
static void scrub_pending_bio_dec(struct scrub_ctx *sctx);
static void scrub_pending_trans_workers_inc(struct scrub_ctx *sctx);
@@ -305,7 +312,7 @@ static void scrub_put_ctx(struct scrub_ctx *sctx);
static void scrub_pending_bio_inc(struct scrub_ctx *sctx)
{
- atomic_inc(&sctx->refs);
+ refcount_inc(&sctx->refs);
atomic_inc(&sctx->bios_in_flight);
}
@@ -349,6 +356,222 @@ static void scrub_blocked_if_needed(struct btrfs_fs_info *fs_info)
}
/*
+ * Insert new full stripe lock into full stripe locks tree
+ *
+ * Return pointer to existing or newly inserted full_stripe_lock structure if
+ * everything works well.
+ * Return ERR_PTR(-ENOMEM) if we failed to allocate memory
+ *
+ * NOTE: caller must hold full_stripe_locks_root->lock before calling this
+ * function
+ */
+static struct full_stripe_lock *insert_full_stripe_lock(
+ struct btrfs_full_stripe_locks_tree *locks_root,
+ u64 fstripe_logical)
+{
+ struct rb_node **p;
+ struct rb_node *parent = NULL;
+ struct full_stripe_lock *entry;
+ struct full_stripe_lock *ret;
+
+ WARN_ON(!mutex_is_locked(&locks_root->lock));
+
+ p = &locks_root->root.rb_node;
+ while (*p) {
+ parent = *p;
+ entry = rb_entry(parent, struct full_stripe_lock, node);
+ if (fstripe_logical < entry->logical) {
+ p = &(*p)->rb_left;
+ } else if (fstripe_logical > entry->logical) {
+ p = &(*p)->rb_right;
+ } else {
+ entry->refs++;
+ return entry;
+ }
+ }
+
+ /* Insert new lock */
+ ret = kmalloc(sizeof(*ret), GFP_KERNEL);
+ if (!ret)
+ return ERR_PTR(-ENOMEM);
+ ret->logical = fstripe_logical;
+ ret->refs = 1;
+ mutex_init(&ret->mutex);
+
+ rb_link_node(&ret->node, parent, p);
+ rb_insert_color(&ret->node, &locks_root->root);
+ return ret;
+}
+
+/*
+ * Search for a full stripe lock of a block group
+ *
+ * Return pointer to existing full stripe lock if found
+ * Return NULL if not found
+ */
+static struct full_stripe_lock *search_full_stripe_lock(
+ struct btrfs_full_stripe_locks_tree *locks_root,
+ u64 fstripe_logical)
+{
+ struct rb_node *node;
+ struct full_stripe_lock *entry;
+
+ WARN_ON(!mutex_is_locked(&locks_root->lock));
+
+ node = locks_root->root.rb_node;
+ while (node) {
+ entry = rb_entry(node, struct full_stripe_lock, node);
+ if (fstripe_logical < entry->logical)
+ node = node->rb_left;
+ else if (fstripe_logical > entry->logical)
+ node = node->rb_right;
+ else
+ return entry;
+ }
+ return NULL;
+}
+
+/*
+ * Helper to get full stripe logical from a normal bytenr.
+ *
+ * Caller must ensure @cache is a RAID56 block group.
+ */
+static u64 get_full_stripe_logical(struct btrfs_block_group_cache *cache,
+ u64 bytenr)
+{
+ u64 ret;
+
+ /*
+ * Due to chunk item size limit, full stripe length should not be
+ * larger than U32_MAX. Just a sanity check here.
+ */
+ WARN_ON_ONCE(cache->full_stripe_len >= U32_MAX);
+
+ /*
+ * round_down() can only handle power of 2, while RAID56 full
+ * stripe length can be 64KiB * n, so we need to manually round down.
+ */
+ ret = div64_u64(bytenr - cache->key.objectid, cache->full_stripe_len) *
+ cache->full_stripe_len + cache->key.objectid;
+ return ret;
+}
+
+/*
+ * Lock a full stripe to avoid concurrency of recovery and read
+ *
+ * It's only used for profiles with parities (RAID5/6), for other profiles it
+ * does nothing.
+ *
+ * Return 0 if we locked full stripe covering @bytenr, with a mutex held.
+ * So caller must call unlock_full_stripe() at the same context.
+ *
+ * Return <0 if encounters error.
+ */
+static int lock_full_stripe(struct btrfs_fs_info *fs_info, u64 bytenr,
+ bool *locked_ret)
+{
+ struct btrfs_block_group_cache *bg_cache;
+ struct btrfs_full_stripe_locks_tree *locks_root;
+ struct full_stripe_lock *existing;
+ u64 fstripe_start;
+ int ret = 0;
+
+ *locked_ret = false;
+ bg_cache = btrfs_lookup_block_group(fs_info, bytenr);
+ if (!bg_cache) {
+ ASSERT(0);
+ return -ENOENT;
+ }
+
+ /* Profiles not based on parity don't need full stripe lock */
+ if (!(bg_cache->flags & BTRFS_BLOCK_GROUP_RAID56_MASK))
+ goto out;
+ locks_root = &bg_cache->full_stripe_locks_root;
+
+ fstripe_start = get_full_stripe_logical(bg_cache, bytenr);
+
+ /* Now insert the full stripe lock */
+ mutex_lock(&locks_root->lock);
+ existing = insert_full_stripe_lock(locks_root, fstripe_start);
+ mutex_unlock(&locks_root->lock);
+ if (IS_ERR(existing)) {
+ ret = PTR_ERR(existing);
+ goto out;
+ }
+ mutex_lock(&existing->mutex);
+ *locked_ret = true;
+out:
+ btrfs_put_block_group(bg_cache);
+ return ret;
+}
+
+/*
+ * Unlock a full stripe.
+ *
+ * NOTE: Caller must ensure it's the same context calling corresponding
+ * lock_full_stripe().
+ *
+ * Return 0 if we unlock full stripe without problem.
+ * Return <0 for error
+ */
+static int unlock_full_stripe(struct btrfs_fs_info *fs_info, u64 bytenr,
+ bool locked)
+{
+ struct btrfs_block_group_cache *bg_cache;
+ struct btrfs_full_stripe_locks_tree *locks_root;
+ struct full_stripe_lock *fstripe_lock;
+ u64 fstripe_start;
+ bool freeit = false;
+ int ret = 0;
+
+ /* If we didn't acquire full stripe lock, no need to continue */
+ if (!locked)
+ return 0;
+
+ bg_cache = btrfs_lookup_block_group(fs_info, bytenr);
+ if (!bg_cache) {
+ ASSERT(0);
+ return -ENOENT;
+ }
+ if (!(bg_cache->flags & BTRFS_BLOCK_GROUP_RAID56_MASK))
+ goto out;
+
+ locks_root = &bg_cache->full_stripe_locks_root;
+ fstripe_start = get_full_stripe_logical(bg_cache, bytenr);
+
+ mutex_lock(&locks_root->lock);
+ fstripe_lock = search_full_stripe_lock(locks_root, fstripe_start);
+ /* Unpaired unlock_full_stripe() detected */
+ if (!fstripe_lock) {
+ WARN_ON(1);
+ ret = -ENOENT;
+ mutex_unlock(&locks_root->lock);
+ goto out;
+ }
+
+ if (fstripe_lock->refs == 0) {
+ WARN_ON(1);
+ btrfs_warn(fs_info, "full stripe lock at %llu refcount underflow",
+ fstripe_lock->logical);
+ } else {
+ fstripe_lock->refs--;
+ }
+
+ if (fstripe_lock->refs == 0) {
+ rb_erase(&fstripe_lock->node, &locks_root->root);
+ freeit = true;
+ }
+ mutex_unlock(&locks_root->lock);
+
+ mutex_unlock(&fstripe_lock->mutex);
+ if (freeit)
+ kfree(fstripe_lock);
+out:
+ btrfs_put_block_group(bg_cache);
+ return ret;
+}
+
+/*
* used for workers that require transaction commits (i.e., for the
* NOCOW case)
*/
@@ -356,7 +579,7 @@ static void scrub_pending_trans_workers_inc(struct scrub_ctx *sctx)
{
struct btrfs_fs_info *fs_info = sctx->fs_info;
- atomic_inc(&sctx->refs);
+ refcount_inc(&sctx->refs);
/*
* increment scrubs_running to prevent cancel requests from
* completing as long as a worker is running. we must also
@@ -447,7 +670,7 @@ static noinline_for_stack void scrub_free_ctx(struct scrub_ctx *sctx)
static void scrub_put_ctx(struct scrub_ctx *sctx)
{
- if (atomic_dec_and_test(&sctx->refs))
+ if (refcount_dec_and_test(&sctx->refs))
scrub_free_ctx(sctx);
}
@@ -462,7 +685,7 @@ struct scrub_ctx *scrub_setup_ctx(struct btrfs_device *dev, int is_dev_replace)
sctx = kzalloc(sizeof(*sctx), GFP_KERNEL);
if (!sctx)
goto nomem;
- atomic_set(&sctx->refs, 1);
+ refcount_set(&sctx->refs, 1);
sctx->is_dev_replace = is_dev_replace;
sctx->pages_per_rd_bio = SCRUB_PAGES_PER_RD_BIO;
sctx->curr = -1;
@@ -857,12 +1080,14 @@ out:
static inline void scrub_get_recover(struct scrub_recover *recover)
{
- atomic_inc(&recover->refs);
+ refcount_inc(&recover->refs);
}
-static inline void scrub_put_recover(struct scrub_recover *recover)
+static inline void scrub_put_recover(struct btrfs_fs_info *fs_info,
+ struct scrub_recover *recover)
{
- if (atomic_dec_and_test(&recover->refs)) {
+ if (refcount_dec_and_test(&recover->refs)) {
+ btrfs_bio_counter_dec(fs_info);
btrfs_put_bbio(recover->bbio);
kfree(recover);
}
@@ -892,6 +1117,7 @@ static int scrub_handle_errored_block(struct scrub_block *sblock_to_check)
int mirror_index;
int page_num;
int success;
+ bool full_stripe_locked;
static DEFINE_RATELIMIT_STATE(_rs, DEFAULT_RATELIMIT_INTERVAL,
DEFAULT_RATELIMIT_BURST);
@@ -917,6 +1143,24 @@ static int scrub_handle_errored_block(struct scrub_block *sblock_to_check)
have_csum = sblock_to_check->pagev[0]->have_csum;
dev = sblock_to_check->pagev[0]->dev;
+ /*
+ * For RAID5/6, race can happen for a different device scrub thread.
+ * For data corruption, Parity and Data threads will both try
+ * to recovery the data.
+ * Race can lead to doubly added csum error, or even unrecoverable
+ * error.
+ */
+ ret = lock_full_stripe(fs_info, logical, &full_stripe_locked);
+ if (ret < 0) {
+ spin_lock(&sctx->stat_lock);
+ if (ret == -ENOMEM)
+ sctx->stat.malloc_errors++;
+ sctx->stat.read_errors++;
+ sctx->stat.uncorrectable_errors++;
+ spin_unlock(&sctx->stat_lock);
+ return ret;
+ }
+
if (sctx->is_dev_replace && !is_metadata && !have_csum) {
sblocks_for_recheck = NULL;
goto nodatasum_case;
@@ -1241,7 +1485,7 @@ out:
sblock->pagev[page_index]->sblock = NULL;
recover = sblock->pagev[page_index]->recover;
if (recover) {
- scrub_put_recover(recover);
+ scrub_put_recover(fs_info, recover);
sblock->pagev[page_index]->recover =
NULL;
}
@@ -1251,6 +1495,9 @@ out:
kfree(sblocks_for_recheck);
}
+ ret = unlock_full_stripe(fs_info, logical, full_stripe_locked);
+ if (ret < 0)
+ return ret;
return 0;
}
@@ -1330,20 +1577,23 @@ static int scrub_setup_recheck_block(struct scrub_block *original_sblock,
* with a length of PAGE_SIZE, each returned stripe
* represents one mirror
*/
+ btrfs_bio_counter_inc_blocked(fs_info);
ret = btrfs_map_sblock(fs_info, BTRFS_MAP_GET_READ_MIRRORS,
- logical, &mapped_length, &bbio, 0, 1);
+ logical, &mapped_length, &bbio);
if (ret || !bbio || mapped_length < sublen) {
btrfs_put_bbio(bbio);
+ btrfs_bio_counter_dec(fs_info);
return -EIO;
}
recover = kzalloc(sizeof(struct scrub_recover), GFP_NOFS);
if (!recover) {
btrfs_put_bbio(bbio);
+ btrfs_bio_counter_dec(fs_info);
return -ENOMEM;
}
- atomic_set(&recover->refs, 1);
+ refcount_set(&recover->refs, 1);
recover->bbio = bbio;
recover->map_length = mapped_length;
@@ -1365,7 +1615,7 @@ leave_nomem:
spin_lock(&sctx->stat_lock);
sctx->stat.malloc_errors++;
spin_unlock(&sctx->stat_lock);
- scrub_put_recover(recover);
+ scrub_put_recover(fs_info, recover);
return -ENOMEM;
}
scrub_page_get(page);
@@ -1407,7 +1657,7 @@ leave_nomem:
scrub_get_recover(recover);
page->recover = recover;
}
- scrub_put_recover(recover);
+ scrub_put_recover(fs_info, recover);
length -= sublen;
logical += sublen;
page_index++;
@@ -1497,14 +1747,18 @@ static void scrub_recheck_block(struct btrfs_fs_info *fs_info,
bio_add_page(bio, page->page, PAGE_SIZE, 0);
if (!retry_failed_mirror && scrub_is_page_on_raid56(page)) {
- if (scrub_submit_raid56_bio_wait(fs_info, bio, page))
+ if (scrub_submit_raid56_bio_wait(fs_info, bio, page)) {
+ page->io_error = 1;
sblock->no_io_error_seen = 0;
+ }
} else {
bio->bi_iter.bi_sector = page->physical >> 9;
bio_set_op_attrs(bio, REQ_OP_READ, 0);
- if (btrfsic_submit_bio_wait(bio))
+ if (btrfsic_submit_bio_wait(bio)) {
+ page->io_error = 1;
sblock->no_io_error_seen = 0;
+ }
}
bio_put(bio);
@@ -1634,7 +1888,7 @@ static int scrub_write_page_to_dev_replace(struct scrub_block *sblock,
if (spage->io_error) {
void *mapped_buffer = kmap_atomic(spage->page);
- memset(mapped_buffer, 0, PAGE_SIZE);
+ clear_page(mapped_buffer);
flush_dcache_page(spage->page);
kunmap_atomic(mapped_buffer);
}
@@ -1998,12 +2252,12 @@ static int scrub_checksum_super(struct scrub_block *sblock)
static void scrub_block_get(struct scrub_block *sblock)
{
- atomic_inc(&sblock->refs);
+ refcount_inc(&sblock->refs);
}
static void scrub_block_put(struct scrub_block *sblock)
{
- if (atomic_dec_and_test(&sblock->refs)) {
+ if (refcount_dec_and_test(&sblock->refs)) {
int i;
if (sblock->sparity)
@@ -2187,8 +2441,9 @@ static void scrub_missing_raid56_pages(struct scrub_block *sblock)
int ret;
int i;
+ btrfs_bio_counter_inc_blocked(fs_info);
ret = btrfs_map_sblock(fs_info, BTRFS_MAP_GET_READ_MIRRORS, logical,
- &length, &bbio, 0, 1);
+ &length, &bbio);
if (ret || !bbio || !bbio->raid_map)
goto bbio_out;
@@ -2231,6 +2486,7 @@ static void scrub_missing_raid56_pages(struct scrub_block *sblock)
rbio_out:
bio_put(bio);
bbio_out:
+ btrfs_bio_counter_dec(fs_info);
btrfs_put_bbio(bbio);
spin_lock(&sctx->stat_lock);
sctx->stat.malloc_errors++;
@@ -2255,7 +2511,7 @@ static int scrub_pages(struct scrub_ctx *sctx, u64 logical, u64 len,
/* one ref inside this function, plus one for each page added to
* a bio later on */
- atomic_set(&sblock->refs, 1);
+ refcount_set(&sblock->refs, 1);
sblock->sctx = sctx;
sblock->no_io_error_seen = 1;
@@ -2385,7 +2641,7 @@ static inline void __scrub_mark_bitmap(struct scrub_parity *sparity,
unsigned long *bitmap,
u64 start, u64 len)
{
- u32 offset;
+ u64 offset;
int nsectors;
int sectorsize = sparity->sctx->fs_info->sectorsize;
@@ -2395,8 +2651,8 @@ static inline void __scrub_mark_bitmap(struct scrub_parity *sparity,
}
start -= sparity->logic_start;
- start = div_u64_rem(start, sparity->stripe_len, &offset);
- offset /= sectorsize;
+ start = div64_u64_rem(start, sparity->stripe_len, &offset);
+ offset = div_u64(offset, sectorsize);
nsectors = (int)len / sectorsize;
if (offset + nsectors <= sparity->nsectors) {
@@ -2555,7 +2811,7 @@ static int scrub_pages_for_parity(struct scrub_parity *sparity,
/* one ref inside this function, plus one for each page added to
* a bio later on */
- atomic_set(&sblock->refs, 1);
+ refcount_set(&sblock->refs, 1);
sblock->sctx = sctx;
sblock->no_io_error_seen = 1;
sblock->sparity = sparity;
@@ -2694,7 +2950,7 @@ static int get_raid56_logic_offset(u64 physical, int num,
for (i = 0; i < nr_data_stripes(map); i++) {
*offset = last_offset + i * map->stripe_len;
- stripe_nr = div_u64(*offset, map->stripe_len);
+ stripe_nr = div64_u64(*offset, map->stripe_len);
stripe_nr = div_u64(stripe_nr, nr_data_stripes(map));
/* Work out the disk rotation on this stripe-set */
@@ -2765,7 +3021,6 @@ static void scrub_parity_check_and_repair(struct scrub_parity *sparity)
struct btrfs_fs_info *fs_info = sctx->fs_info;
struct bio *bio;
struct btrfs_raid_bio *rbio;
- struct scrub_page *spage;
struct btrfs_bio *bbio = NULL;
u64 length;
int ret;
@@ -2775,8 +3030,10 @@ static void scrub_parity_check_and_repair(struct scrub_parity *sparity)
goto out;
length = sparity->logic_end - sparity->logic_start;
+
+ btrfs_bio_counter_inc_blocked(fs_info);
ret = btrfs_map_sblock(fs_info, BTRFS_MAP_WRITE, sparity->logic_start,
- &length, &bbio, 0, 1);
+ &length, &bbio);
if (ret || !bbio || !bbio->raid_map)
goto bbio_out;
@@ -2795,9 +3052,6 @@ static void scrub_parity_check_and_repair(struct scrub_parity *sparity)
if (!rbio)
goto rbio_out;
- list_for_each_entry(spage, &sparity->spages, list)
- raid56_add_scrub_pages(rbio, spage->page, spage->logical);
-
scrub_pending_bio_inc(sctx);
raid56_parity_submit_scrub_rbio(rbio);
return;
@@ -2805,6 +3059,7 @@ static void scrub_parity_check_and_repair(struct scrub_parity *sparity)
rbio_out:
bio_put(bio);
bbio_out:
+ btrfs_bio_counter_dec(fs_info);
btrfs_put_bbio(bbio);
bitmap_or(sparity->ebitmap, sparity->ebitmap, sparity->dbitmap,
sparity->nsectors);
@@ -2822,12 +3077,12 @@ static inline int scrub_calc_parity_bitmap_len(int nsectors)
static void scrub_parity_get(struct scrub_parity *sparity)
{
- atomic_inc(&sparity->refs);
+ refcount_inc(&sparity->refs);
}
static void scrub_parity_put(struct scrub_parity *sparity)
{
- if (!atomic_dec_and_test(&sparity->refs))
+ if (!refcount_dec_and_test(&sparity->refs))
return;
scrub_parity_check_and_repair(sparity);
@@ -2879,7 +3134,7 @@ static noinline_for_stack int scrub_raid56_parity(struct scrub_ctx *sctx,
sparity->scrub_dev = sdev;
sparity->logic_start = logic_start;
sparity->logic_end = logic_end;
- atomic_set(&sparity->refs, 1);
+ refcount_set(&sparity->refs, 1);
INIT_LIST_HEAD(&sparity->spages);
sparity->dbitmap = sparity->bitmap;
sparity->ebitmap = (void *)sparity->bitmap + bitmap_len;
@@ -3098,7 +3353,7 @@ static noinline_for_stack int scrub_stripe(struct scrub_ctx *sctx,
physical = map->stripes[num].physical;
offset = 0;
- nstripes = div_u64(length, map->stripe_len);
+ nstripes = div64_u64(length, map->stripe_len);
if (map->type & BTRFS_BLOCK_GROUP_RAID0) {
offset = map->stripe_len * num;
increment = map->stripe_len * map->num_stripes;
diff --git a/fs/btrfs/send.c b/fs/btrfs/send.c
index 3f645cd67b54..fc496a6f842a 100644
--- a/fs/btrfs/send.c
+++ b/fs/btrfs/send.c
@@ -5184,13 +5184,19 @@ static int is_extent_unchanged(struct send_ctx *sctx,
while (key.offset < ekey->offset + left_len) {
ei = btrfs_item_ptr(eb, slot, struct btrfs_file_extent_item);
right_type = btrfs_file_extent_type(eb, ei);
- if (right_type != BTRFS_FILE_EXTENT_REG) {
+ if (right_type != BTRFS_FILE_EXTENT_REG &&
+ right_type != BTRFS_FILE_EXTENT_INLINE) {
ret = 0;
goto out;
}
right_disknr = btrfs_file_extent_disk_bytenr(eb, ei);
- right_len = btrfs_file_extent_num_bytes(eb, ei);
+ if (right_type == BTRFS_FILE_EXTENT_INLINE) {
+ right_len = btrfs_file_extent_inline_len(eb, slot, ei);
+ right_len = PAGE_ALIGN(right_len);
+ } else {
+ right_len = btrfs_file_extent_num_bytes(eb, ei);
+ }
right_offset = btrfs_file_extent_offset(eb, ei);
right_gen = btrfs_file_extent_generation(eb, ei);
@@ -5204,6 +5210,19 @@ static int is_extent_unchanged(struct send_ctx *sctx,
goto out;
}
+ /*
+ * We just wanted to see if when we have an inline extent, what
+ * follows it is a regular extent (wanted to check the above
+ * condition for inline extents too). This should normally not
+ * happen but it's possible for example when we have an inline
+ * compressed extent representing data with a size matching
+ * the page size (currently the same as sector size).
+ */
+ if (right_type == BTRFS_FILE_EXTENT_INLINE) {
+ ret = 0;
+ goto out;
+ }
+
left_offset_fixed = left_offset;
if (key.offset < ekey->offset) {
/* Fix the right offset for 2a and 7. */
diff --git a/fs/btrfs/super.c b/fs/btrfs/super.c
index 72a053c9a7f0..4f1cdd5058f1 100644
--- a/fs/btrfs/super.c
+++ b/fs/btrfs/super.c
@@ -1795,8 +1795,7 @@ static int btrfs_remount(struct super_block *sb, int *flags, char *data)
}
if (fs_info->fs_devices->missing_devices >
- fs_info->num_tolerated_disk_barrier_failures &&
- !(*flags & MS_RDONLY)) {
+ fs_info->num_tolerated_disk_barrier_failures) {
btrfs_warn(fs_info,
"too many missing devices, writeable remount is not allowed");
ret = -EACCES;
diff --git a/fs/btrfs/tests/btrfs-tests.c b/fs/btrfs/tests/btrfs-tests.c
index ea272432c930..b18ab8f327a5 100644
--- a/fs/btrfs/tests/btrfs-tests.c
+++ b/fs/btrfs/tests/btrfs-tests.c
@@ -237,7 +237,6 @@ void btrfs_init_dummy_trans(struct btrfs_trans_handle *trans)
{
memset(trans, 0, sizeof(*trans));
trans->transid = 1;
- INIT_LIST_HEAD(&trans->qgroup_ref_list);
trans->type = __TRANS_DUMMY;
}
diff --git a/fs/btrfs/transaction.c b/fs/btrfs/transaction.c
index 61b807de3e16..2168654c90a1 100644
--- a/fs/btrfs/transaction.c
+++ b/fs/btrfs/transaction.c
@@ -60,8 +60,8 @@ static const unsigned int btrfs_blocked_trans_types[TRANS_STATE_MAX] = {
void btrfs_put_transaction(struct btrfs_transaction *transaction)
{
- WARN_ON(atomic_read(&transaction->use_count) == 0);
- if (atomic_dec_and_test(&transaction->use_count)) {
+ WARN_ON(refcount_read(&transaction->use_count) == 0);
+ if (refcount_dec_and_test(&transaction->use_count)) {
BUG_ON(!list_empty(&transaction->list));
WARN_ON(!RB_EMPTY_ROOT(&transaction->delayed_refs.href_root));
if (transaction->delayed_refs.pending_csums)
@@ -207,7 +207,7 @@ loop:
spin_unlock(&fs_info->trans_lock);
return -EBUSY;
}
- atomic_inc(&cur_trans->use_count);
+ refcount_inc(&cur_trans->use_count);
atomic_inc(&cur_trans->num_writers);
extwriter_counter_inc(cur_trans, type);
spin_unlock(&fs_info->trans_lock);
@@ -257,7 +257,7 @@ loop:
* One for this trans handle, one so it will live on until we
* commit the transaction.
*/
- atomic_set(&cur_trans->use_count, 2);
+ refcount_set(&cur_trans->use_count, 2);
atomic_set(&cur_trans->pending_ordered, 0);
cur_trans->flags = 0;
cur_trans->start_time = get_seconds();
@@ -432,7 +432,7 @@ static void wait_current_trans(struct btrfs_fs_info *fs_info)
spin_lock(&fs_info->trans_lock);
cur_trans = fs_info->running_transaction;
if (cur_trans && is_transaction_blocked(cur_trans)) {
- atomic_inc(&cur_trans->use_count);
+ refcount_inc(&cur_trans->use_count);
spin_unlock(&fs_info->trans_lock);
wait_event(fs_info->transaction_wait,
@@ -572,7 +572,6 @@ again:
h->type = type;
h->can_flush_pending_bgs = true;
- INIT_LIST_HEAD(&h->qgroup_ref_list);
INIT_LIST_HEAD(&h->new_bgs);
smp_mb();
@@ -744,7 +743,7 @@ int btrfs_wait_for_commit(struct btrfs_fs_info *fs_info, u64 transid)
list_for_each_entry(t, &fs_info->trans_list, list) {
if (t->transid == transid) {
cur_trans = t;
- atomic_inc(&cur_trans->use_count);
+ refcount_inc(&cur_trans->use_count);
ret = 0;
break;
}
@@ -773,7 +772,7 @@ int btrfs_wait_for_commit(struct btrfs_fs_info *fs_info, u64 transid)
if (t->state == TRANS_STATE_COMPLETED)
break;
cur_trans = t;
- atomic_inc(&cur_trans->use_count);
+ refcount_inc(&cur_trans->use_count);
break;
}
}
@@ -917,7 +916,6 @@ static int __btrfs_end_transaction(struct btrfs_trans_handle *trans,
wake_up_process(info->transaction_kthread);
err = -EIO;
}
- assert_qgroups_uptodate(trans);
kmem_cache_free(btrfs_trans_handle_cachep, trans);
if (must_run_delayed_refs) {
@@ -1839,7 +1837,7 @@ int btrfs_commit_transaction_async(struct btrfs_trans_handle *trans,
/* take transaction reference */
cur_trans = trans->transaction;
- atomic_inc(&cur_trans->use_count);
+ refcount_inc(&cur_trans->use_count);
btrfs_end_transaction(trans);
@@ -2015,7 +2013,7 @@ int btrfs_commit_transaction(struct btrfs_trans_handle *trans)
spin_lock(&fs_info->trans_lock);
if (cur_trans->state >= TRANS_STATE_COMMIT_START) {
spin_unlock(&fs_info->trans_lock);
- atomic_inc(&cur_trans->use_count);
+ refcount_inc(&cur_trans->use_count);
ret = btrfs_end_transaction(trans);
wait_for_commit(cur_trans);
@@ -2035,7 +2033,7 @@ int btrfs_commit_transaction(struct btrfs_trans_handle *trans)
prev_trans = list_entry(cur_trans->list.prev,
struct btrfs_transaction, list);
if (prev_trans->state != TRANS_STATE_COMPLETED) {
- atomic_inc(&prev_trans->use_count);
+ refcount_inc(&prev_trans->use_count);
spin_unlock(&fs_info->trans_lock);
wait_for_commit(prev_trans);
@@ -2130,13 +2128,6 @@ int btrfs_commit_transaction(struct btrfs_trans_handle *trans)
goto scrub_continue;
}
- /* Reocrd old roots for later qgroup accounting */
- ret = btrfs_qgroup_prepare_account_extents(trans, fs_info);
- if (ret) {
- mutex_unlock(&fs_info->reloc_mutex);
- goto scrub_continue;
- }
-
/*
* make sure none of the code above managed to slip in a
* delayed item
@@ -2179,6 +2170,24 @@ int btrfs_commit_transaction(struct btrfs_trans_handle *trans)
btrfs_free_log_root_tree(trans, fs_info);
/*
+ * commit_fs_roots() can call btrfs_save_ino_cache(), which generates
+ * new delayed refs. Must handle them or qgroup can be wrong.
+ */
+ ret = btrfs_run_delayed_refs(trans, fs_info, (unsigned long)-1);
+ if (ret) {
+ mutex_unlock(&fs_info->tree_log_mutex);
+ mutex_unlock(&fs_info->reloc_mutex);
+ goto scrub_continue;
+ }
+
+ ret = btrfs_qgroup_prepare_account_extents(trans, fs_info);
+ if (ret) {
+ mutex_unlock(&fs_info->tree_log_mutex);
+ mutex_unlock(&fs_info->reloc_mutex);
+ goto scrub_continue;
+ }
+
+ /*
* Since fs roots are all committed, we can get a quite accurate
* new_roots. So let's do quota accounting.
*/
@@ -2223,7 +2232,6 @@ int btrfs_commit_transaction(struct btrfs_trans_handle *trans)
switch_commit_roots(cur_trans, fs_info);
- assert_qgroups_uptodate(trans);
ASSERT(list_empty(&cur_trans->dirty_bgs));
ASSERT(list_empty(&cur_trans->io_bgs));
update_super_roots(fs_info);
diff --git a/fs/btrfs/transaction.h b/fs/btrfs/transaction.h
index 5dfb5590fff6..c55e44560103 100644
--- a/fs/btrfs/transaction.h
+++ b/fs/btrfs/transaction.h
@@ -18,6 +18,8 @@
#ifndef __BTRFS_TRANSACTION__
#define __BTRFS_TRANSACTION__
+
+#include <linux/refcount.h>
#include "btrfs_inode.h"
#include "delayed-ref.h"
#include "ctree.h"
@@ -49,7 +51,7 @@ struct btrfs_transaction {
* transaction can end
*/
atomic_t num_writers;
- atomic_t use_count;
+ refcount_t use_count;
atomic_t pending_ordered;
unsigned long flags;
@@ -125,8 +127,6 @@ struct btrfs_trans_handle {
unsigned int type;
struct btrfs_root *root;
struct btrfs_fs_info *fs_info;
- struct seq_list delayed_ref_elem;
- struct list_head qgroup_ref_list;
struct list_head new_bgs;
};
diff --git a/fs/btrfs/tree-log.c b/fs/btrfs/tree-log.c
index a59674c3e69e..ccfe9fe7754a 100644
--- a/fs/btrfs/tree-log.c
+++ b/fs/btrfs/tree-log.c
@@ -4196,7 +4196,7 @@ static int btrfs_log_changed_extents(struct btrfs_trans_handle *trans,
if (em->generation <= test_gen)
continue;
/* Need a ref to keep it from getting evicted from cache */
- atomic_inc(&em->refs);
+ refcount_inc(&em->refs);
set_bit(EXTENT_FLAG_LOGGING, &em->flags);
list_add_tail(&em->list, &extents);
num++;
diff --git a/fs/btrfs/volumes.c b/fs/btrfs/volumes.c
index ab8a66d852f9..017b67daa3bb 100644
--- a/fs/btrfs/volumes.c
+++ b/fs/btrfs/volumes.c
@@ -139,6 +139,11 @@ static int btrfs_relocate_sys_chunks(struct btrfs_fs_info *fs_info);
static void __btrfs_reset_dev_stats(struct btrfs_device *dev);
static void btrfs_dev_stat_print_on_error(struct btrfs_device *dev);
static void btrfs_dev_stat_print_on_load(struct btrfs_device *device);
+static int __btrfs_map_block(struct btrfs_fs_info *fs_info,
+ enum btrfs_map_op op,
+ u64 logical, u64 *length,
+ struct btrfs_bio **bbio_ret,
+ int mirror_num, int need_raid_map);
DEFINE_MUTEX(uuid_mutex);
static LIST_HEAD(fs_uuids);
@@ -1008,14 +1013,13 @@ static int __btrfs_open_devices(struct btrfs_fs_devices *fs_devices,
q = bdev_get_queue(bdev);
if (blk_queue_discard(q))
device->can_discard = 1;
+ if (!blk_queue_nonrot(q))
+ fs_devices->rotating = 1;
device->bdev = bdev;
device->in_fs_metadata = 0;
device->mode = flags;
- if (!blk_queue_nonrot(bdev_get_queue(bdev)))
- fs_devices->rotating = 1;
-
fs_devices->open_devices++;
if (device->writeable &&
device->devid != BTRFS_DEV_REPLACE_DEVID) {
@@ -2417,7 +2421,7 @@ int btrfs_init_new_device(struct btrfs_fs_info *fs_info, const char *device_path
fs_info->free_chunk_space += device->total_bytes;
spin_unlock(&fs_info->free_chunk_lock);
- if (!blk_queue_nonrot(bdev_get_queue(bdev)))
+ if (!blk_queue_nonrot(q))
fs_info->fs_devices->rotating = 1;
tmp = btrfs_super_total_bytes(fs_info->super_copy);
@@ -2795,10 +2799,38 @@ static int btrfs_del_sys_chunk(struct btrfs_fs_info *fs_info,
return ret;
}
+static struct extent_map *get_chunk_map(struct btrfs_fs_info *fs_info,
+ u64 logical, u64 length)
+{
+ struct extent_map_tree *em_tree;
+ struct extent_map *em;
+
+ em_tree = &fs_info->mapping_tree.map_tree;
+ read_lock(&em_tree->lock);
+ em = lookup_extent_mapping(em_tree, logical, length);
+ read_unlock(&em_tree->lock);
+
+ if (!em) {
+ btrfs_crit(fs_info, "unable to find logical %llu length %llu",
+ logical, length);
+ return ERR_PTR(-EINVAL);
+ }
+
+ if (em->start > logical || em->start + em->len < logical) {
+ btrfs_crit(fs_info,
+ "found a bad mapping, wanted %llu-%llu, found %llu-%llu",
+ logical, length, em->start, em->start + em->len);
+ free_extent_map(em);
+ return ERR_PTR(-EINVAL);
+ }
+
+ /* callers are responsible for dropping em's ref. */
+ return em;
+}
+
int btrfs_remove_chunk(struct btrfs_trans_handle *trans,
struct btrfs_fs_info *fs_info, u64 chunk_offset)
{
- struct extent_map_tree *em_tree;
struct extent_map *em;
struct map_lookup *map;
u64 dev_extent_len = 0;
@@ -2806,23 +2838,15 @@ int btrfs_remove_chunk(struct btrfs_trans_handle *trans,
int i, ret = 0;
struct btrfs_fs_devices *fs_devices = fs_info->fs_devices;
- em_tree = &fs_info->mapping_tree.map_tree;
-
- read_lock(&em_tree->lock);
- em = lookup_extent_mapping(em_tree, chunk_offset, 1);
- read_unlock(&em_tree->lock);
-
- if (!em || em->start > chunk_offset ||
- em->start + em->len < chunk_offset) {
+ em = get_chunk_map(fs_info, chunk_offset, 1);
+ if (IS_ERR(em)) {
/*
* This is a logic error, but we don't want to just rely on the
* user having built with ASSERT enabled, so if ASSERT doesn't
* do anything we still error out.
*/
ASSERT(0);
- if (em)
- free_extent_map(em);
- return -EINVAL;
+ return PTR_ERR(em);
}
map = em->map_lookup;
mutex_lock(&fs_info->chunk_mutex);
@@ -3736,7 +3760,7 @@ static void __cancel_balance(struct btrfs_fs_info *fs_info)
if (ret)
btrfs_handle_fs_error(fs_info, ret, NULL);
- atomic_set(&fs_info->mutually_exclusive_operation_running, 0);
+ clear_bit(BTRFS_FS_EXCL_OP, &fs_info->flags);
}
/* Non-zero return value signifies invalidity */
@@ -3755,6 +3779,7 @@ int btrfs_balance(struct btrfs_balance_control *bctl,
struct btrfs_ioctl_balance_args *bargs)
{
struct btrfs_fs_info *fs_info = bctl->fs_info;
+ u64 meta_target, data_target;
u64 allowed;
int mixed = 0;
int ret;
@@ -3851,11 +3876,16 @@ int btrfs_balance(struct btrfs_balance_control *bctl,
}
} while (read_seqretry(&fs_info->profiles_lock, seq));
- if (btrfs_get_num_tolerated_disk_barrier_failures(bctl->meta.target) <
- btrfs_get_num_tolerated_disk_barrier_failures(bctl->data.target)) {
+ /* if we're not converting, the target field is uninitialized */
+ meta_target = (bctl->meta.flags & BTRFS_BALANCE_ARGS_CONVERT) ?
+ bctl->meta.target : fs_info->avail_metadata_alloc_bits;
+ data_target = (bctl->data.flags & BTRFS_BALANCE_ARGS_CONVERT) ?
+ bctl->data.target : fs_info->avail_data_alloc_bits;
+ if (btrfs_get_num_tolerated_disk_barrier_failures(meta_target) <
+ btrfs_get_num_tolerated_disk_barrier_failures(data_target)) {
btrfs_warn(fs_info,
"metadata profile 0x%llx has lower redundancy than data profile 0x%llx",
- bctl->meta.target, bctl->data.target);
+ meta_target, data_target);
}
if (bctl->sys.flags & BTRFS_BALANCE_ARGS_CONVERT) {
@@ -3910,7 +3940,7 @@ out:
__cancel_balance(fs_info);
else {
kfree(bctl);
- atomic_set(&fs_info->mutually_exclusive_operation_running, 0);
+ clear_bit(BTRFS_FS_EXCL_OP, &fs_info->flags);
}
return ret;
}
@@ -4000,7 +4030,7 @@ int btrfs_recover_balance(struct btrfs_fs_info *fs_info)
btrfs_balance_sys(leaf, item, &disk_bargs);
btrfs_disk_balance_args_to_cpu(&bctl->sys, &disk_bargs);
- WARN_ON(atomic_xchg(&fs_info->mutually_exclusive_operation_running, 1));
+ WARN_ON(test_and_set_bit(BTRFS_FS_EXCL_OP, &fs_info->flags));
mutex_lock(&fs_info->volume_mutex);
mutex_lock(&fs_info->balance_mutex);
@@ -4785,7 +4815,7 @@ static int __btrfs_alloc_chunk(struct btrfs_trans_handle *trans,
stripe_size = div_u64(stripe_size, dev_stripes);
/* align to BTRFS_STRIPE_LEN */
- stripe_size = div_u64(stripe_size, raid_stripe_len);
+ stripe_size = div64_u64(stripe_size, raid_stripe_len);
stripe_size *= raid_stripe_len;
map = kmalloc(map_lookup_size(num_stripes), GFP_NOFS);
@@ -4833,7 +4863,7 @@ static int __btrfs_alloc_chunk(struct btrfs_trans_handle *trans,
ret = add_extent_mapping(em_tree, em, 0);
if (!ret) {
list_add_tail(&em->list, &trans->transaction->pending_chunks);
- atomic_inc(&em->refs);
+ refcount_inc(&em->refs);
}
write_unlock(&em_tree->lock);
if (ret) {
@@ -4888,7 +4918,6 @@ int btrfs_finish_chunk_alloc(struct btrfs_trans_handle *trans,
struct btrfs_device *device;
struct btrfs_chunk *chunk;
struct btrfs_stripe *stripe;
- struct extent_map_tree *em_tree;
struct extent_map *em;
struct map_lookup *map;
size_t item_size;
@@ -4897,24 +4926,9 @@ int btrfs_finish_chunk_alloc(struct btrfs_trans_handle *trans,
int i = 0;
int ret = 0;
- em_tree = &fs_info->mapping_tree.map_tree;
- read_lock(&em_tree->lock);
- em = lookup_extent_mapping(em_tree, chunk_offset, chunk_size);
- read_unlock(&em_tree->lock);
-
- if (!em) {
- btrfs_crit(fs_info, "unable to find logical %Lu len %Lu",
- chunk_offset, chunk_size);
- return -EINVAL;
- }
-
- if (em->start != chunk_offset || em->len != chunk_size) {
- btrfs_crit(fs_info,
- "found a bad mapping, wanted %Lu-%Lu, found %Lu-%Lu",
- chunk_offset, chunk_size, em->start, em->len);
- free_extent_map(em);
- return -EINVAL;
- }
+ em = get_chunk_map(fs_info, chunk_offset, chunk_size);
+ if (IS_ERR(em))
+ return PTR_ERR(em);
map = em->map_lookup;
item_size = btrfs_chunk_item_size(map->num_stripes);
@@ -5055,15 +5069,12 @@ int btrfs_chunk_readonly(struct btrfs_fs_info *fs_info, u64 chunk_offset)
{
struct extent_map *em;
struct map_lookup *map;
- struct btrfs_mapping_tree *map_tree = &fs_info->mapping_tree;
int readonly = 0;
int miss_ndevs = 0;
int i;
- read_lock(&map_tree->map_tree.lock);
- em = lookup_extent_mapping(&map_tree->map_tree, chunk_offset, 1);
- read_unlock(&map_tree->map_tree.lock);
- if (!em)
+ em = get_chunk_map(fs_info, chunk_offset, 1);
+ if (IS_ERR(em))
return 1;
map = em->map_lookup;
@@ -5117,34 +5128,19 @@ void btrfs_mapping_tree_free(struct btrfs_mapping_tree *tree)
int btrfs_num_copies(struct btrfs_fs_info *fs_info, u64 logical, u64 len)
{
- struct btrfs_mapping_tree *map_tree = &fs_info->mapping_tree;
struct extent_map *em;
struct map_lookup *map;
- struct extent_map_tree *em_tree = &map_tree->map_tree;
int ret;
- read_lock(&em_tree->lock);
- em = lookup_extent_mapping(em_tree, logical, len);
- read_unlock(&em_tree->lock);
-
- /*
- * We could return errors for these cases, but that could get ugly and
- * we'd probably do the same thing which is just not do anything else
- * and exit, so return 1 so the callers don't try to use other copies.
- */
- if (!em) {
- btrfs_crit(fs_info, "No mapping for %Lu-%Lu", logical,
- logical+len);
- return 1;
- }
-
- if (em->start > logical || em->start + em->len < logical) {
- btrfs_crit(fs_info, "Invalid mapping for %Lu-%Lu, got %Lu-%Lu",
- logical, logical+len, em->start,
- em->start + em->len);
- free_extent_map(em);
+ em = get_chunk_map(fs_info, logical, len);
+ if (IS_ERR(em))
+ /*
+ * We could return errors for these cases, but that could get
+ * ugly and we'd probably do the same thing which is just not do
+ * anything else and exit, so return 1 so the callers don't try
+ * to use other copies.
+ */
return 1;
- }
map = em->map_lookup;
if (map->type & (BTRFS_BLOCK_GROUP_DUP | BTRFS_BLOCK_GROUP_RAID1))
@@ -5160,7 +5156,8 @@ int btrfs_num_copies(struct btrfs_fs_info *fs_info, u64 logical, u64 len)
free_extent_map(em);
btrfs_dev_replace_lock(&fs_info->dev_replace, 0);
- if (btrfs_dev_replace_is_ongoing(&fs_info->dev_replace))
+ if (btrfs_dev_replace_is_ongoing(&fs_info->dev_replace) &&
+ fs_info->dev_replace.tgtdev)
ret++;
btrfs_dev_replace_unlock(&fs_info->dev_replace, 0);
@@ -5173,15 +5170,11 @@ unsigned long btrfs_full_stripe_len(struct btrfs_fs_info *fs_info,
{
struct extent_map *em;
struct map_lookup *map;
- struct extent_map_tree *em_tree = &map_tree->map_tree;
unsigned long len = fs_info->sectorsize;
- read_lock(&em_tree->lock);
- em = lookup_extent_mapping(em_tree, logical, len);
- read_unlock(&em_tree->lock);
- BUG_ON(!em);
+ em = get_chunk_map(fs_info, logical, len);
+ WARN_ON(IS_ERR(em));
- BUG_ON(em->start > logical || em->start + em->len < logical);
map = em->map_lookup;
if (map->type & BTRFS_BLOCK_GROUP_RAID56_MASK)
len = map->stripe_len * nr_data_stripes(map);
@@ -5189,20 +5182,16 @@ unsigned long btrfs_full_stripe_len(struct btrfs_fs_info *fs_info,
return len;
}
-int btrfs_is_parity_mirror(struct btrfs_mapping_tree *map_tree,
+int btrfs_is_parity_mirror(struct btrfs_fs_info *fs_info,
u64 logical, u64 len, int mirror_num)
{
struct extent_map *em;
struct map_lookup *map;
- struct extent_map_tree *em_tree = &map_tree->map_tree;
int ret = 0;
- read_lock(&em_tree->lock);
- em = lookup_extent_mapping(em_tree, logical, len);
- read_unlock(&em_tree->lock);
- BUG_ON(!em);
+ em = get_chunk_map(fs_info, logical, len);
+ WARN_ON(IS_ERR(em));
- BUG_ON(em->start > logical || em->start + em->len < logical);
map = em->map_lookup;
if (map->type & BTRFS_BLOCK_GROUP_RAID56_MASK)
ret = 1;
@@ -5295,25 +5284,353 @@ static struct btrfs_bio *alloc_btrfs_bio(int total_stripes, int real_stripes)
GFP_NOFS|__GFP_NOFAIL);
atomic_set(&bbio->error, 0);
- atomic_set(&bbio->refs, 1);
+ refcount_set(&bbio->refs, 1);
return bbio;
}
void btrfs_get_bbio(struct btrfs_bio *bbio)
{
- WARN_ON(!atomic_read(&bbio->refs));
- atomic_inc(&bbio->refs);
+ WARN_ON(!refcount_read(&bbio->refs));
+ refcount_inc(&bbio->refs);
}
void btrfs_put_bbio(struct btrfs_bio *bbio)
{
if (!bbio)
return;
- if (atomic_dec_and_test(&bbio->refs))
+ if (refcount_dec_and_test(&bbio->refs))
kfree(bbio);
}
+/* can REQ_OP_DISCARD be sent with other REQ like REQ_OP_WRITE? */
+/*
+ * Please note that, discard won't be sent to target device of device
+ * replace.
+ */
+static int __btrfs_map_block_for_discard(struct btrfs_fs_info *fs_info,
+ u64 logical, u64 length,
+ struct btrfs_bio **bbio_ret)
+{
+ struct extent_map *em;
+ struct map_lookup *map;
+ struct btrfs_bio *bbio;
+ u64 offset;
+ u64 stripe_nr;
+ u64 stripe_nr_end;
+ u64 stripe_end_offset;
+ u64 stripe_cnt;
+ u64 stripe_len;
+ u64 stripe_offset;
+ u64 num_stripes;
+ u32 stripe_index;
+ u32 factor = 0;
+ u32 sub_stripes = 0;
+ u64 stripes_per_dev = 0;
+ u32 remaining_stripes = 0;
+ u32 last_stripe = 0;
+ int ret = 0;
+ int i;
+
+ /* discard always return a bbio */
+ ASSERT(bbio_ret);
+
+ em = get_chunk_map(fs_info, logical, length);
+ if (IS_ERR(em))
+ return PTR_ERR(em);
+
+ map = em->map_lookup;
+ /* we don't discard raid56 yet */
+ if (map->type & BTRFS_BLOCK_GROUP_RAID56_MASK) {
+ ret = -EOPNOTSUPP;
+ goto out;
+ }
+
+ offset = logical - em->start;
+ length = min_t(u64, em->len - offset, length);
+
+ stripe_len = map->stripe_len;
+ /*
+ * stripe_nr counts the total number of stripes we have to stride
+ * to get to this block
+ */
+ stripe_nr = div64_u64(offset, stripe_len);
+
+ /* stripe_offset is the offset of this block in its stripe */
+ stripe_offset = offset - stripe_nr * stripe_len;
+
+ stripe_nr_end = round_up(offset + length, map->stripe_len);
+ stripe_nr_end = div64_u64(stripe_nr_end, map->stripe_len);
+ stripe_cnt = stripe_nr_end - stripe_nr;
+ stripe_end_offset = stripe_nr_end * map->stripe_len -
+ (offset + length);
+ /*
+ * after this, stripe_nr is the number of stripes on this
+ * device we have to walk to find the data, and stripe_index is
+ * the number of our device in the stripe array
+ */
+ num_stripes = 1;
+ stripe_index = 0;
+ if (map->type & (BTRFS_BLOCK_GROUP_RAID0 |
+ BTRFS_BLOCK_GROUP_RAID10)) {
+ if (map->type & BTRFS_BLOCK_GROUP_RAID0)
+ sub_stripes = 1;
+ else
+ sub_stripes = map->sub_stripes;
+
+ factor = map->num_stripes / sub_stripes;
+ num_stripes = min_t(u64, map->num_stripes,
+ sub_stripes * stripe_cnt);
+ stripe_nr = div_u64_rem(stripe_nr, factor, &stripe_index);
+ stripe_index *= sub_stripes;
+ stripes_per_dev = div_u64_rem(stripe_cnt, factor,
+ &remaining_stripes);
+ div_u64_rem(stripe_nr_end - 1, factor, &last_stripe);
+ last_stripe *= sub_stripes;
+ } else if (map->type & (BTRFS_BLOCK_GROUP_RAID1 |
+ BTRFS_BLOCK_GROUP_DUP)) {
+ num_stripes = map->num_stripes;
+ } else {
+ stripe_nr = div_u64_rem(stripe_nr, map->num_stripes,
+ &stripe_index);
+ }
+
+ bbio = alloc_btrfs_bio(num_stripes, 0);
+ if (!bbio) {
+ ret = -ENOMEM;
+ goto out;
+ }
+
+ for (i = 0; i < num_stripes; i++) {
+ bbio->stripes[i].physical =
+ map->stripes[stripe_index].physical +
+ stripe_offset + stripe_nr * map->stripe_len;
+ bbio->stripes[i].dev = map->stripes[stripe_index].dev;
+
+ if (map->type & (BTRFS_BLOCK_GROUP_RAID0 |
+ BTRFS_BLOCK_GROUP_RAID10)) {
+ bbio->stripes[i].length = stripes_per_dev *
+ map->stripe_len;
+
+ if (i / sub_stripes < remaining_stripes)
+ bbio->stripes[i].length +=
+ map->stripe_len;
+
+ /*
+ * Special for the first stripe and
+ * the last stripe:
+ *
+ * |-------|...|-------|
+ * |----------|
+ * off end_off
+ */
+ if (i < sub_stripes)
+ bbio->stripes[i].length -=
+ stripe_offset;
+
+ if (stripe_index >= last_stripe &&
+ stripe_index <= (last_stripe +
+ sub_stripes - 1))
+ bbio->stripes[i].length -=
+ stripe_end_offset;
+
+ if (i == sub_stripes - 1)
+ stripe_offset = 0;
+ } else {
+ bbio->stripes[i].length = length;
+ }
+
+ stripe_index++;
+ if (stripe_index == map->num_stripes) {
+ stripe_index = 0;
+ stripe_nr++;
+ }
+ }
+
+ *bbio_ret = bbio;
+ bbio->map_type = map->type;
+ bbio->num_stripes = num_stripes;
+out:
+ free_extent_map(em);
+ return ret;
+}
+
+/*
+ * In dev-replace case, for repair case (that's the only case where the mirror
+ * is selected explicitly when calling btrfs_map_block), blocks left of the
+ * left cursor can also be read from the target drive.
+ *
+ * For REQ_GET_READ_MIRRORS, the target drive is added as the last one to the
+ * array of stripes.
+ * For READ, it also needs to be supported using the same mirror number.
+ *
+ * If the requested block is not left of the left cursor, EIO is returned. This
+ * can happen because btrfs_num_copies() returns one more in the dev-replace
+ * case.
+ */
+static int get_extra_mirror_from_replace(struct btrfs_fs_info *fs_info,
+ u64 logical, u64 length,
+ u64 srcdev_devid, int *mirror_num,
+ u64 *physical)
+{
+ struct btrfs_bio *bbio = NULL;
+ int num_stripes;
+ int index_srcdev = 0;
+ int found = 0;
+ u64 physical_of_found = 0;
+ int i;
+ int ret = 0;
+
+ ret = __btrfs_map_block(fs_info, BTRFS_MAP_GET_READ_MIRRORS,
+ logical, &length, &bbio, 0, 0);
+ if (ret) {
+ ASSERT(bbio == NULL);
+ return ret;
+ }
+
+ num_stripes = bbio->num_stripes;
+ if (*mirror_num > num_stripes) {
+ /*
+ * BTRFS_MAP_GET_READ_MIRRORS does not contain this mirror,
+ * that means that the requested area is not left of the left
+ * cursor
+ */
+ btrfs_put_bbio(bbio);
+ return -EIO;
+ }
+
+ /*
+ * process the rest of the function using the mirror_num of the source
+ * drive. Therefore look it up first. At the end, patch the device
+ * pointer to the one of the target drive.
+ */
+ for (i = 0; i < num_stripes; i++) {
+ if (bbio->stripes[i].dev->devid != srcdev_devid)
+ continue;
+
+ /*
+ * In case of DUP, in order to keep it simple, only add the
+ * mirror with the lowest physical address
+ */
+ if (found &&
+ physical_of_found <= bbio->stripes[i].physical)
+ continue;
+
+ index_srcdev = i;
+ found = 1;
+ physical_of_found = bbio->stripes[i].physical;
+ }
+
+ btrfs_put_bbio(bbio);
+
+ ASSERT(found);
+ if (!found)
+ return -EIO;
+
+ *mirror_num = index_srcdev + 1;
+ *physical = physical_of_found;
+ return ret;
+}
+
+static void handle_ops_on_dev_replace(enum btrfs_map_op op,
+ struct btrfs_bio **bbio_ret,
+ struct btrfs_dev_replace *dev_replace,
+ int *num_stripes_ret, int *max_errors_ret)
+{
+ struct btrfs_bio *bbio = *bbio_ret;
+ u64 srcdev_devid = dev_replace->srcdev->devid;
+ int tgtdev_indexes = 0;
+ int num_stripes = *num_stripes_ret;
+ int max_errors = *max_errors_ret;
+ int i;
+
+ if (op == BTRFS_MAP_WRITE) {
+ int index_where_to_add;
+
+ /*
+ * duplicate the write operations while the dev replace
+ * procedure is running. Since the copying of the old disk to
+ * the new disk takes place at run time while the filesystem is
+ * mounted writable, the regular write operations to the old
+ * disk have to be duplicated to go to the new disk as well.
+ *
+ * Note that device->missing is handled by the caller, and that
+ * the write to the old disk is already set up in the stripes
+ * array.
+ */
+ index_where_to_add = num_stripes;
+ for (i = 0; i < num_stripes; i++) {
+ if (bbio->stripes[i].dev->devid == srcdev_devid) {
+ /* write to new disk, too */
+ struct btrfs_bio_stripe *new =
+ bbio->stripes + index_where_to_add;
+ struct btrfs_bio_stripe *old =
+ bbio->stripes + i;
+
+ new->physical = old->physical;
+ new->length = old->length;
+ new->dev = dev_replace->tgtdev;
+ bbio->tgtdev_map[i] = index_where_to_add;
+ index_where_to_add++;
+ max_errors++;
+ tgtdev_indexes++;
+ }
+ }
+ num_stripes = index_where_to_add;
+ } else if (op == BTRFS_MAP_GET_READ_MIRRORS) {
+ int index_srcdev = 0;
+ int found = 0;
+ u64 physical_of_found = 0;
+
+ /*
+ * During the dev-replace procedure, the target drive can also
+ * be used to read data in case it is needed to repair a corrupt
+ * block elsewhere. This is possible if the requested area is
+ * left of the left cursor. In this area, the target drive is a
+ * full copy of the source drive.
+ */
+ for (i = 0; i < num_stripes; i++) {
+ if (bbio->stripes[i].dev->devid == srcdev_devid) {
+ /*
+ * In case of DUP, in order to keep it simple,
+ * only add the mirror with the lowest physical
+ * address
+ */
+ if (found &&
+ physical_of_found <=
+ bbio->stripes[i].physical)
+ continue;
+ index_srcdev = i;
+ found = 1;
+ physical_of_found = bbio->stripes[i].physical;
+ }
+ }
+ if (found) {
+ struct btrfs_bio_stripe *tgtdev_stripe =
+ bbio->stripes + num_stripes;
+
+ tgtdev_stripe->physical = physical_of_found;
+ tgtdev_stripe->length =
+ bbio->stripes[index_srcdev].length;
+ tgtdev_stripe->dev = dev_replace->tgtdev;
+ bbio->tgtdev_map[index_srcdev] = num_stripes;
+
+ tgtdev_indexes++;
+ num_stripes++;
+ }
+ }
+
+ *num_stripes_ret = num_stripes;
+ *max_errors_ret = max_errors;
+ bbio->num_tgtdevs = tgtdev_indexes;
+ *bbio_ret = bbio;
+}
+
+static bool need_full_stripe(enum btrfs_map_op op)
+{
+ return (op == BTRFS_MAP_WRITE || op == BTRFS_MAP_GET_READ_MIRRORS);
+}
+
static int __btrfs_map_block(struct btrfs_fs_info *fs_info,
enum btrfs_map_op op,
u64 logical, u64 *length,
@@ -5322,14 +5639,9 @@ static int __btrfs_map_block(struct btrfs_fs_info *fs_info,
{
struct extent_map *em;
struct map_lookup *map;
- struct btrfs_mapping_tree *map_tree = &fs_info->mapping_tree;
- struct extent_map_tree *em_tree = &map_tree->map_tree;
u64 offset;
u64 stripe_offset;
- u64 stripe_end_offset;
u64 stripe_nr;
- u64 stripe_nr_orig;
- u64 stripe_nr_end;
u64 stripe_len;
u32 stripe_index;
int i;
@@ -5345,23 +5657,13 @@ static int __btrfs_map_block(struct btrfs_fs_info *fs_info,
u64 physical_to_patch_in_first_stripe = 0;
u64 raid56_full_stripe_start = (u64)-1;
- read_lock(&em_tree->lock);
- em = lookup_extent_mapping(em_tree, logical, *length);
- read_unlock(&em_tree->lock);
-
- if (!em) {
- btrfs_crit(fs_info, "unable to find logical %llu len %llu",
- logical, *length);
- return -EINVAL;
- }
+ if (op == BTRFS_MAP_DISCARD)
+ return __btrfs_map_block_for_discard(fs_info, logical,
+ *length, bbio_ret);
- if (em->start > logical || em->start + em->len < logical) {
- btrfs_crit(fs_info,
- "found a bad mapping, wanted %Lu, found %Lu-%Lu",
- logical, em->start, em->start + em->len);
- free_extent_map(em);
- return -EINVAL;
- }
+ em = get_chunk_map(fs_info, logical, *length);
+ if (IS_ERR(em))
+ return PTR_ERR(em);
map = em->map_lookup;
offset = logical - em->start;
@@ -5400,14 +5702,7 @@ static int __btrfs_map_block(struct btrfs_fs_info *fs_info,
raid56_full_stripe_start *= full_stripe_len;
}
- if (op == BTRFS_MAP_DISCARD) {
- /* we don't discard raid56 yet */
- if (map->type & BTRFS_BLOCK_GROUP_RAID56_MASK) {
- ret = -EOPNOTSUPP;
- goto out;
- }
- *length = min_t(u64, em->len - offset, *length);
- } else if (map->type & BTRFS_BLOCK_GROUP_PROFILE_MASK) {
+ if (map->type & BTRFS_BLOCK_GROUP_PROFILE_MASK) {
u64 max_len;
/* For writes to RAID[56], allow a full stripeset across all disks.
For other RAID types and for RAID[56] reads, just allow a single
@@ -5438,105 +5733,28 @@ static int __btrfs_map_block(struct btrfs_fs_info *fs_info,
btrfs_dev_replace_set_lock_blocking(dev_replace);
if (dev_replace_is_ongoing && mirror_num == map->num_stripes + 1 &&
- op != BTRFS_MAP_WRITE && op != BTRFS_MAP_DISCARD &&
- op != BTRFS_MAP_GET_READ_MIRRORS && dev_replace->tgtdev != NULL) {
- /*
- * in dev-replace case, for repair case (that's the only
- * case where the mirror is selected explicitly when
- * calling btrfs_map_block), blocks left of the left cursor
- * can also be read from the target drive.
- * For REQ_GET_READ_MIRRORS, the target drive is added as
- * the last one to the array of stripes. For READ, it also
- * needs to be supported using the same mirror number.
- * If the requested block is not left of the left cursor,
- * EIO is returned. This can happen because btrfs_num_copies()
- * returns one more in the dev-replace case.
- */
- u64 tmp_length = *length;
- struct btrfs_bio *tmp_bbio = NULL;
- int tmp_num_stripes;
- u64 srcdev_devid = dev_replace->srcdev->devid;
- int index_srcdev = 0;
- int found = 0;
- u64 physical_of_found = 0;
-
- ret = __btrfs_map_block(fs_info, BTRFS_MAP_GET_READ_MIRRORS,
- logical, &tmp_length, &tmp_bbio, 0, 0);
- if (ret) {
- WARN_ON(tmp_bbio != NULL);
- goto out;
- }
-
- tmp_num_stripes = tmp_bbio->num_stripes;
- if (mirror_num > tmp_num_stripes) {
- /*
- * BTRFS_MAP_GET_READ_MIRRORS does not contain this
- * mirror, that means that the requested area
- * is not left of the left cursor
- */
- ret = -EIO;
- btrfs_put_bbio(tmp_bbio);
- goto out;
- }
-
- /*
- * process the rest of the function using the mirror_num
- * of the source drive. Therefore look it up first.
- * At the end, patch the device pointer to the one of the
- * target drive.
- */
- for (i = 0; i < tmp_num_stripes; i++) {
- if (tmp_bbio->stripes[i].dev->devid != srcdev_devid)
- continue;
-
- /*
- * In case of DUP, in order to keep it simple, only add
- * the mirror with the lowest physical address
- */
- if (found &&
- physical_of_found <= tmp_bbio->stripes[i].physical)
- continue;
-
- index_srcdev = i;
- found = 1;
- physical_of_found = tmp_bbio->stripes[i].physical;
- }
-
- btrfs_put_bbio(tmp_bbio);
-
- if (!found) {
- WARN_ON(1);
- ret = -EIO;
+ !need_full_stripe(op) && dev_replace->tgtdev != NULL) {
+ ret = get_extra_mirror_from_replace(fs_info, logical, *length,
+ dev_replace->srcdev->devid,
+ &mirror_num,
+ &physical_to_patch_in_first_stripe);
+ if (ret)
goto out;
- }
-
- mirror_num = index_srcdev + 1;
- patch_the_first_stripe_for_dev_replace = 1;
- physical_to_patch_in_first_stripe = physical_of_found;
+ else
+ patch_the_first_stripe_for_dev_replace = 1;
} else if (mirror_num > map->num_stripes) {
mirror_num = 0;
}
num_stripes = 1;
stripe_index = 0;
- stripe_nr_orig = stripe_nr;
- stripe_nr_end = ALIGN(offset + *length, map->stripe_len);
- stripe_nr_end = div_u64(stripe_nr_end, map->stripe_len);
- stripe_end_offset = stripe_nr_end * map->stripe_len -
- (offset + *length);
-
if (map->type & BTRFS_BLOCK_GROUP_RAID0) {
- if (op == BTRFS_MAP_DISCARD)
- num_stripes = min_t(u64, map->num_stripes,
- stripe_nr_end - stripe_nr_orig);
stripe_nr = div_u64_rem(stripe_nr, map->num_stripes,
&stripe_index);
- if (op != BTRFS_MAP_WRITE && op != BTRFS_MAP_DISCARD &&
- op != BTRFS_MAP_GET_READ_MIRRORS)
+ if (op != BTRFS_MAP_WRITE && op != BTRFS_MAP_GET_READ_MIRRORS)
mirror_num = 1;
} else if (map->type & BTRFS_BLOCK_GROUP_RAID1) {
- if (op == BTRFS_MAP_WRITE || op == BTRFS_MAP_DISCARD ||
- op == BTRFS_MAP_GET_READ_MIRRORS)
+ if (op == BTRFS_MAP_WRITE || op == BTRFS_MAP_GET_READ_MIRRORS)
num_stripes = map->num_stripes;
else if (mirror_num)
stripe_index = mirror_num - 1;
@@ -5549,8 +5767,7 @@ static int __btrfs_map_block(struct btrfs_fs_info *fs_info,
}
} else if (map->type & BTRFS_BLOCK_GROUP_DUP) {
- if (op == BTRFS_MAP_WRITE || op == BTRFS_MAP_DISCARD ||
- op == BTRFS_MAP_GET_READ_MIRRORS) {
+ if (op == BTRFS_MAP_WRITE || op == BTRFS_MAP_GET_READ_MIRRORS) {
num_stripes = map->num_stripes;
} else if (mirror_num) {
stripe_index = mirror_num - 1;
@@ -5566,10 +5783,6 @@ static int __btrfs_map_block(struct btrfs_fs_info *fs_info,
if (op == BTRFS_MAP_WRITE || op == BTRFS_MAP_GET_READ_MIRRORS)
num_stripes = map->sub_stripes;
- else if (op == BTRFS_MAP_DISCARD)
- num_stripes = min_t(u64, map->sub_stripes *
- (stripe_nr_end - stripe_nr_orig),
- map->num_stripes);
else if (mirror_num)
stripe_index += mirror_num - 1;
else {
@@ -5587,7 +5800,7 @@ static int __btrfs_map_block(struct btrfs_fs_info *fs_info,
(op == BTRFS_MAP_WRITE || op == BTRFS_MAP_GET_READ_MIRRORS ||
mirror_num > 1)) {
/* push stripe_nr back to the start of the full stripe */
- stripe_nr = div_u64(raid56_full_stripe_start,
+ stripe_nr = div64_u64(raid56_full_stripe_start,
stripe_len * nr_data_stripes(map));
/* RAID[56] write or recovery. Return all stripes */
@@ -5612,8 +5825,9 @@ static int __btrfs_map_block(struct btrfs_fs_info *fs_info,
/* We distribute the parity blocks across stripes */
div_u64_rem(stripe_nr + stripe_index, map->num_stripes,
&stripe_index);
- if ((op != BTRFS_MAP_WRITE && op != BTRFS_MAP_DISCARD &&
- op != BTRFS_MAP_GET_READ_MIRRORS) && mirror_num <= 1)
+ if ((op != BTRFS_MAP_WRITE &&
+ op != BTRFS_MAP_GET_READ_MIRRORS) &&
+ mirror_num <= 1)
mirror_num = 1;
}
} else {
@@ -5635,8 +5849,8 @@ static int __btrfs_map_block(struct btrfs_fs_info *fs_info,
}
num_alloc_stripes = num_stripes;
- if (dev_replace_is_ongoing) {
- if (op == BTRFS_MAP_WRITE || op == BTRFS_MAP_DISCARD)
+ if (dev_replace_is_ongoing && dev_replace->tgtdev != NULL) {
+ if (op == BTRFS_MAP_WRITE)
num_alloc_stripes <<= 1;
if (op == BTRFS_MAP_GET_READ_MIRRORS)
num_alloc_stripes++;
@@ -5648,14 +5862,12 @@ static int __btrfs_map_block(struct btrfs_fs_info *fs_info,
ret = -ENOMEM;
goto out;
}
- if (dev_replace_is_ongoing)
+ if (dev_replace_is_ongoing && dev_replace->tgtdev != NULL)
bbio->tgtdev_map = (int *)(bbio->stripes + num_alloc_stripes);
/* build raid_map */
- if (map->type & BTRFS_BLOCK_GROUP_RAID56_MASK &&
- need_raid_map &&
- ((op == BTRFS_MAP_WRITE || op == BTRFS_MAP_GET_READ_MIRRORS) ||
- mirror_num > 1)) {
+ if (map->type & BTRFS_BLOCK_GROUP_RAID56_MASK && need_raid_map &&
+ (need_full_stripe(op) || mirror_num > 1)) {
u64 tmp;
unsigned rot;
@@ -5679,173 +5891,27 @@ static int __btrfs_map_block(struct btrfs_fs_info *fs_info,
RAID6_Q_STRIPE;
}
- if (op == BTRFS_MAP_DISCARD) {
- u32 factor = 0;
- u32 sub_stripes = 0;
- u64 stripes_per_dev = 0;
- u32 remaining_stripes = 0;
- u32 last_stripe = 0;
- if (map->type &
- (BTRFS_BLOCK_GROUP_RAID0 | BTRFS_BLOCK_GROUP_RAID10)) {
- if (map->type & BTRFS_BLOCK_GROUP_RAID0)
- sub_stripes = 1;
- else
- sub_stripes = map->sub_stripes;
-
- factor = map->num_stripes / sub_stripes;
- stripes_per_dev = div_u64_rem(stripe_nr_end -
- stripe_nr_orig,
- factor,
- &remaining_stripes);
- div_u64_rem(stripe_nr_end - 1, factor, &last_stripe);
- last_stripe *= sub_stripes;
- }
-
- for (i = 0; i < num_stripes; i++) {
- bbio->stripes[i].physical =
- map->stripes[stripe_index].physical +
- stripe_offset + stripe_nr * map->stripe_len;
- bbio->stripes[i].dev = map->stripes[stripe_index].dev;
-
- if (map->type & (BTRFS_BLOCK_GROUP_RAID0 |
- BTRFS_BLOCK_GROUP_RAID10)) {
- bbio->stripes[i].length = stripes_per_dev *
- map->stripe_len;
-
- if (i / sub_stripes < remaining_stripes)
- bbio->stripes[i].length +=
- map->stripe_len;
-
- /*
- * Special for the first stripe and
- * the last stripe:
- *
- * |-------|...|-------|
- * |----------|
- * off end_off
- */
- if (i < sub_stripes)
- bbio->stripes[i].length -=
- stripe_offset;
-
- if (stripe_index >= last_stripe &&
- stripe_index <= (last_stripe +
- sub_stripes - 1))
- bbio->stripes[i].length -=
- stripe_end_offset;
-
- if (i == sub_stripes - 1)
- stripe_offset = 0;
- } else
- bbio->stripes[i].length = *length;
-
- stripe_index++;
- if (stripe_index == map->num_stripes) {
- /* This could only happen for RAID0/10 */
- stripe_index = 0;
- stripe_nr++;
- }
- }
- } else {
- for (i = 0; i < num_stripes; i++) {
- bbio->stripes[i].physical =
- map->stripes[stripe_index].physical +
- stripe_offset +
- stripe_nr * map->stripe_len;
- bbio->stripes[i].dev =
- map->stripes[stripe_index].dev;
- stripe_index++;
- }
+ for (i = 0; i < num_stripes; i++) {
+ bbio->stripes[i].physical =
+ map->stripes[stripe_index].physical +
+ stripe_offset +
+ stripe_nr * map->stripe_len;
+ bbio->stripes[i].dev =
+ map->stripes[stripe_index].dev;
+ stripe_index++;
}
- if (op == BTRFS_MAP_WRITE || op == BTRFS_MAP_GET_READ_MIRRORS)
+ if (need_full_stripe(op))
max_errors = btrfs_chunk_max_errors(map);
if (bbio->raid_map)
sort_parity_stripes(bbio, num_stripes);
- tgtdev_indexes = 0;
- if (dev_replace_is_ongoing &&
- (op == BTRFS_MAP_WRITE || op == BTRFS_MAP_DISCARD) &&
- dev_replace->tgtdev != NULL) {
- int index_where_to_add;
- u64 srcdev_devid = dev_replace->srcdev->devid;
-
- /*
- * duplicate the write operations while the dev replace
- * procedure is running. Since the copying of the old disk
- * to the new disk takes place at run time while the
- * filesystem is mounted writable, the regular write
- * operations to the old disk have to be duplicated to go
- * to the new disk as well.
- * Note that device->missing is handled by the caller, and
- * that the write to the old disk is already set up in the
- * stripes array.
- */
- index_where_to_add = num_stripes;
- for (i = 0; i < num_stripes; i++) {
- if (bbio->stripes[i].dev->devid == srcdev_devid) {
- /* write to new disk, too */
- struct btrfs_bio_stripe *new =
- bbio->stripes + index_where_to_add;
- struct btrfs_bio_stripe *old =
- bbio->stripes + i;
-
- new->physical = old->physical;
- new->length = old->length;
- new->dev = dev_replace->tgtdev;
- bbio->tgtdev_map[i] = index_where_to_add;
- index_where_to_add++;
- max_errors++;
- tgtdev_indexes++;
- }
- }
- num_stripes = index_where_to_add;
- } else if (dev_replace_is_ongoing &&
- op == BTRFS_MAP_GET_READ_MIRRORS &&
- dev_replace->tgtdev != NULL) {
- u64 srcdev_devid = dev_replace->srcdev->devid;
- int index_srcdev = 0;
- int found = 0;
- u64 physical_of_found = 0;
-
- /*
- * During the dev-replace procedure, the target drive can
- * also be used to read data in case it is needed to repair
- * a corrupt block elsewhere. This is possible if the
- * requested area is left of the left cursor. In this area,
- * the target drive is a full copy of the source drive.
- */
- for (i = 0; i < num_stripes; i++) {
- if (bbio->stripes[i].dev->devid == srcdev_devid) {
- /*
- * In case of DUP, in order to keep it
- * simple, only add the mirror with the
- * lowest physical address
- */
- if (found &&
- physical_of_found <=
- bbio->stripes[i].physical)
- continue;
- index_srcdev = i;
- found = 1;
- physical_of_found = bbio->stripes[i].physical;
- }
- }
- if (found) {
- struct btrfs_bio_stripe *tgtdev_stripe =
- bbio->stripes + num_stripes;
-
- tgtdev_stripe->physical = physical_of_found;
- tgtdev_stripe->length =
- bbio->stripes[index_srcdev].length;
- tgtdev_stripe->dev = dev_replace->tgtdev;
- bbio->tgtdev_map[index_srcdev] = num_stripes;
-
- tgtdev_indexes++;
- num_stripes++;
- }
+ if (dev_replace_is_ongoing && dev_replace->tgtdev != NULL &&
+ need_full_stripe(op)) {
+ handle_ops_on_dev_replace(op, &bbio, dev_replace, &num_stripes,
+ &max_errors);
}
*bbio_ret = bbio;
@@ -5853,7 +5919,6 @@ static int __btrfs_map_block(struct btrfs_fs_info *fs_info,
bbio->num_stripes = num_stripes;
bbio->max_errors = max_errors;
bbio->mirror_num = mirror_num;
- bbio->num_tgtdevs = tgtdev_indexes;
/*
* this is the case that REQ_READ && dev_replace_is_ongoing &&
@@ -5886,19 +5951,15 @@ int btrfs_map_block(struct btrfs_fs_info *fs_info, enum btrfs_map_op op,
/* For Scrub/replace */
int btrfs_map_sblock(struct btrfs_fs_info *fs_info, enum btrfs_map_op op,
u64 logical, u64 *length,
- struct btrfs_bio **bbio_ret, int mirror_num,
- int need_raid_map)
+ struct btrfs_bio **bbio_ret)
{
- return __btrfs_map_block(fs_info, op, logical, length, bbio_ret,
- mirror_num, need_raid_map);
+ return __btrfs_map_block(fs_info, op, logical, length, bbio_ret, 0, 1);
}
int btrfs_rmap_block(struct btrfs_fs_info *fs_info,
u64 chunk_start, u64 physical, u64 devid,
u64 **logical, int *naddrs, int *stripe_len)
{
- struct btrfs_mapping_tree *map_tree = &fs_info->mapping_tree;
- struct extent_map_tree *em_tree = &map_tree->map_tree;
struct extent_map *em;
struct map_lookup *map;
u64 *buf;
@@ -5908,24 +5969,11 @@ int btrfs_rmap_block(struct btrfs_fs_info *fs_info,
u64 rmap_len;
int i, j, nr = 0;
- read_lock(&em_tree->lock);
- em = lookup_extent_mapping(em_tree, chunk_start, 1);
- read_unlock(&em_tree->lock);
-
- if (!em) {
- btrfs_err(fs_info, "couldn't find em for chunk %Lu",
- chunk_start);
+ em = get_chunk_map(fs_info, chunk_start, 1);
+ if (IS_ERR(em))
return -EIO;
- }
- if (em->start != chunk_start) {
- btrfs_err(fs_info, "bad chunk start, em=%Lu, wanted=%Lu",
- em->start, chunk_start);
- free_extent_map(em);
- return -EIO;
- }
map = em->map_lookup;
-
length = em->len;
rmap_len = map->stripe_len;
@@ -5949,7 +5997,7 @@ int btrfs_rmap_block(struct btrfs_fs_info *fs_info,
continue;
stripe_nr = physical - map->stripes[i].physical;
- stripe_nr = div_u64(stripe_nr, map->stripe_len);
+ stripe_nr = div64_u64(stripe_nr, map->stripe_len);
if (map->type & BTRFS_BLOCK_GROUP_RAID10) {
stripe_nr = stripe_nr * map->num_stripes + i;
diff --git a/fs/btrfs/volumes.h b/fs/btrfs/volumes.h
index 59be81206dd7..c7d0fbc915ca 100644
--- a/fs/btrfs/volumes.h
+++ b/fs/btrfs/volumes.h
@@ -123,7 +123,6 @@ struct btrfs_device {
struct list_head resized_list;
/* for sending down flush barriers */
- int nobarriers;
struct bio *flush_bio;
struct completion flush_wait;
@@ -298,7 +297,7 @@ struct btrfs_bio;
typedef void (btrfs_bio_end_io_t) (struct btrfs_bio *bio, int err);
struct btrfs_bio {
- atomic_t refs;
+ refcount_t refs;
atomic_t stripes_pending;
struct btrfs_fs_info *fs_info;
u64 map_type; /* get from map_lookup->type */
@@ -400,8 +399,7 @@ int btrfs_map_block(struct btrfs_fs_info *fs_info, enum btrfs_map_op op,
struct btrfs_bio **bbio_ret, int mirror_num);
int btrfs_map_sblock(struct btrfs_fs_info *fs_info, enum btrfs_map_op op,
u64 logical, u64 *length,
- struct btrfs_bio **bbio_ret, int mirror_num,
- int need_raid_map);
+ struct btrfs_bio **bbio_ret);
int btrfs_rmap_block(struct btrfs_fs_info *fs_info,
u64 chunk_start, u64 physical, u64 devid,
u64 **logical, int *naddrs, int *stripe_len);
@@ -475,7 +473,7 @@ void btrfs_destroy_dev_replace_tgtdev(struct btrfs_fs_info *fs_info,
void btrfs_init_dev_replace_tgtdev_for_resume(struct btrfs_fs_info *fs_info,
struct btrfs_device *tgtdev);
void btrfs_scratch_superblocks(struct block_device *bdev, const char *device_path);
-int btrfs_is_parity_mirror(struct btrfs_mapping_tree *map_tree,
+int btrfs_is_parity_mirror(struct btrfs_fs_info *fs_info,
u64 logical, u64 len, int mirror_num);
unsigned long btrfs_full_stripe_len(struct btrfs_fs_info *fs_info,
struct btrfs_mapping_tree *map_tree,
diff --git a/fs/ceph/addr.c b/fs/ceph/addr.c
index 9ecb2fd348cb..1e71e6ca5ddf 100644
--- a/fs/ceph/addr.c
+++ b/fs/ceph/addr.c
@@ -670,8 +670,12 @@ static void writepages_finish(struct ceph_osd_request *req)
bool remove_page;
dout("writepages_finish %p rc %d\n", inode, rc);
- if (rc < 0)
+ if (rc < 0) {
mapping_set_error(mapping, rc);
+ ceph_set_error_write(ci);
+ } else {
+ ceph_clear_error_write(ci);
+ }
/*
* We lost the cache cap, need to truncate the page before
@@ -703,9 +707,6 @@ static void writepages_finish(struct ceph_osd_request *req)
clear_bdi_congested(inode_to_bdi(inode),
BLK_RW_ASYNC);
- if (rc < 0)
- SetPageError(page);
-
ceph_put_snap_context(page_snap_context(page));
page->private = 0;
ClearPagePrivate(page);
@@ -1892,6 +1893,7 @@ static int __ceph_pool_perm_get(struct ceph_inode_info *ci,
err = ceph_osdc_start_request(&fsc->client->osdc, rd_req, false);
wr_req->r_mtime = ci->vfs_inode.i_mtime;
+ wr_req->r_abort_on_full = true;
err2 = ceph_osdc_start_request(&fsc->client->osdc, wr_req, false);
if (!err)
diff --git a/fs/ceph/caps.c b/fs/ceph/caps.c
index 68c78be19d5b..a3ebb632294e 100644
--- a/fs/ceph/caps.c
+++ b/fs/ceph/caps.c
@@ -1015,6 +1015,7 @@ static int send_cap_msg(struct cap_msg_args *arg)
void *p;
size_t extra_len;
struct timespec zerotime = {0};
+ struct ceph_osd_client *osdc = &arg->session->s_mdsc->fsc->client->osdc;
dout("send_cap_msg %s %llx %llx caps %s wanted %s dirty %s"
" seq %u/%u tid %llu/%llu mseq %u follows %lld size %llu/%llu"
@@ -1076,8 +1077,12 @@ static int send_cap_msg(struct cap_msg_args *arg)
ceph_encode_64(&p, arg->inline_data ? 0 : CEPH_INLINE_NONE);
/* inline data size */
ceph_encode_32(&p, 0);
- /* osd_epoch_barrier (version 5) */
- ceph_encode_32(&p, 0);
+ /*
+ * osd_epoch_barrier (version 5)
+ * The epoch_barrier is protected osdc->lock, so READ_ONCE here in
+ * case it was recently changed
+ */
+ ceph_encode_32(&p, READ_ONCE(osdc->epoch_barrier));
/* oldest_flush_tid (version 6) */
ceph_encode_64(&p, arg->oldest_flush_tid);
@@ -1389,7 +1394,7 @@ static void __ceph_flush_snaps(struct ceph_inode_info *ci,
first_tid = cf->tid + 1;
capsnap = container_of(cf, struct ceph_cap_snap, cap_flush);
- atomic_inc(&capsnap->nref);
+ refcount_inc(&capsnap->nref);
spin_unlock(&ci->i_ceph_lock);
dout("__flush_snaps %p capsnap %p tid %llu %s\n",
@@ -2202,7 +2207,7 @@ static void __kick_flushing_caps(struct ceph_mds_client *mdsc,
inode, capsnap, cf->tid,
ceph_cap_string(capsnap->dirty));
- atomic_inc(&capsnap->nref);
+ refcount_inc(&capsnap->nref);
spin_unlock(&ci->i_ceph_lock);
ret = __send_flush_snap(inode, session, capsnap, cap->mseq,
@@ -3633,13 +3638,19 @@ void ceph_handle_caps(struct ceph_mds_session *session,
p += inline_len;
}
+ if (le16_to_cpu(msg->hdr.version) >= 5) {
+ struct ceph_osd_client *osdc = &mdsc->fsc->client->osdc;
+ u32 epoch_barrier;
+
+ ceph_decode_32_safe(&p, end, epoch_barrier, bad);
+ ceph_osdc_update_epoch_barrier(osdc, epoch_barrier);
+ }
+
if (le16_to_cpu(msg->hdr.version) >= 8) {
u64 flush_tid;
u32 caller_uid, caller_gid;
- u32 osd_epoch_barrier;
u32 pool_ns_len;
- /* version >= 5 */
- ceph_decode_32_safe(&p, end, osd_epoch_barrier, bad);
+
/* version >= 6 */
ceph_decode_64_safe(&p, end, flush_tid, bad);
/* version >= 7 */
diff --git a/fs/ceph/debugfs.c b/fs/ceph/debugfs.c
index 3ef11bc8d728..4e2d112c982f 100644
--- a/fs/ceph/debugfs.c
+++ b/fs/ceph/debugfs.c
@@ -22,20 +22,19 @@ static int mdsmap_show(struct seq_file *s, void *p)
{
int i;
struct ceph_fs_client *fsc = s->private;
+ struct ceph_mdsmap *mdsmap;
if (fsc->mdsc == NULL || fsc->mdsc->mdsmap == NULL)
return 0;
- seq_printf(s, "epoch %d\n", fsc->mdsc->mdsmap->m_epoch);
- seq_printf(s, "root %d\n", fsc->mdsc->mdsmap->m_root);
- seq_printf(s, "session_timeout %d\n",
- fsc->mdsc->mdsmap->m_session_timeout);
- seq_printf(s, "session_autoclose %d\n",
- fsc->mdsc->mdsmap->m_session_autoclose);
- for (i = 0; i < fsc->mdsc->mdsmap->m_max_mds; i++) {
- struct ceph_entity_addr *addr =
- &fsc->mdsc->mdsmap->m_info[i].addr;
- int state = fsc->mdsc->mdsmap->m_info[i].state;
-
+ mdsmap = fsc->mdsc->mdsmap;
+ seq_printf(s, "epoch %d\n", mdsmap->m_epoch);
+ seq_printf(s, "root %d\n", mdsmap->m_root);
+ seq_printf(s, "max_mds %d\n", mdsmap->m_max_mds);
+ seq_printf(s, "session_timeout %d\n", mdsmap->m_session_timeout);
+ seq_printf(s, "session_autoclose %d\n", mdsmap->m_session_autoclose);
+ for (i = 0; i < mdsmap->m_num_mds; i++) {
+ struct ceph_entity_addr *addr = &mdsmap->m_info[i].addr;
+ int state = mdsmap->m_info[i].state;
seq_printf(s, "\tmds%d\t%s\t(%s)\n", i,
ceph_pr_addr(&addr->in_addr),
ceph_mds_state_name(state));
diff --git a/fs/ceph/dir.c b/fs/ceph/dir.c
index 3e9ad501addf..e071d23f6148 100644
--- a/fs/ceph/dir.c
+++ b/fs/ceph/dir.c
@@ -294,7 +294,7 @@ static int ceph_readdir(struct file *file, struct dir_context *ctx)
struct ceph_mds_client *mdsc = fsc->mdsc;
int i;
int err;
- u32 ftype;
+ unsigned frag = -1;
struct ceph_mds_reply_info_parsed *rinfo;
dout("readdir %p file %p pos %llx\n", inode, file, ctx->pos);
@@ -341,7 +341,6 @@ more:
/* do we have the correct frag content buffered? */
if (need_send_readdir(fi, ctx->pos)) {
struct ceph_mds_request *req;
- unsigned frag;
int op = ceph_snap(inode) == CEPH_SNAPDIR ?
CEPH_MDS_OP_LSSNAP : CEPH_MDS_OP_READDIR;
@@ -352,8 +351,11 @@ more:
}
if (is_hash_order(ctx->pos)) {
- frag = ceph_choose_frag(ci, fpos_hash(ctx->pos),
- NULL, NULL);
+ /* fragtree isn't always accurate. choose frag
+ * based on previous reply when possible. */
+ if (frag == (unsigned)-1)
+ frag = ceph_choose_frag(ci, fpos_hash(ctx->pos),
+ NULL, NULL);
} else {
frag = fpos_frag(ctx->pos);
}
@@ -378,7 +380,11 @@ more:
ceph_mdsc_put_request(req);
return -ENOMEM;
}
+ } else if (is_hash_order(ctx->pos)) {
+ req->r_args.readdir.offset_hash =
+ cpu_to_le32(fpos_hash(ctx->pos));
}
+
req->r_dir_release_cnt = fi->dir_release_count;
req->r_dir_ordered_cnt = fi->dir_ordered_count;
req->r_readdir_cache_idx = fi->readdir_cache_idx;
@@ -476,6 +482,7 @@ more:
struct ceph_mds_reply_dir_entry *rde = rinfo->dir_entries + i;
struct ceph_vino vino;
ino_t ino;
+ u32 ftype;
BUG_ON(rde->offset < ctx->pos);
@@ -498,15 +505,17 @@ more:
ctx->pos++;
}
+ ceph_mdsc_put_request(fi->last_readdir);
+ fi->last_readdir = NULL;
+
if (fi->next_offset > 2) {
- ceph_mdsc_put_request(fi->last_readdir);
- fi->last_readdir = NULL;
+ frag = fi->frag;
goto more;
}
/* more frags? */
if (!ceph_frag_is_rightmost(fi->frag)) {
- unsigned frag = ceph_frag_next(fi->frag);
+ frag = ceph_frag_next(fi->frag);
if (is_hash_order(ctx->pos)) {
loff_t new_pos = ceph_make_fpos(ceph_frag_value(frag),
fi->next_offset, true);
diff --git a/fs/ceph/file.c b/fs/ceph/file.c
index 18c045e2ead6..3fdde0b283c9 100644
--- a/fs/ceph/file.c
+++ b/fs/ceph/file.c
@@ -13,6 +13,38 @@
#include "mds_client.h"
#include "cache.h"
+static __le32 ceph_flags_sys2wire(u32 flags)
+{
+ u32 wire_flags = 0;
+
+ switch (flags & O_ACCMODE) {
+ case O_RDONLY:
+ wire_flags |= CEPH_O_RDONLY;
+ break;
+ case O_WRONLY:
+ wire_flags |= CEPH_O_WRONLY;
+ break;
+ case O_RDWR:
+ wire_flags |= CEPH_O_RDWR;
+ break;
+ }
+
+#define ceph_sys2wire(a) if (flags & a) { wire_flags |= CEPH_##a; flags &= ~a; }
+
+ ceph_sys2wire(O_CREAT);
+ ceph_sys2wire(O_EXCL);
+ ceph_sys2wire(O_TRUNC);
+ ceph_sys2wire(O_DIRECTORY);
+ ceph_sys2wire(O_NOFOLLOW);
+
+#undef ceph_sys2wire
+
+ if (flags)
+ dout("unused open flags: %x", flags);
+
+ return cpu_to_le32(wire_flags);
+}
+
/*
* Ceph file operations
*
@@ -120,7 +152,7 @@ prepare_open_request(struct super_block *sb, int flags, int create_mode)
if (IS_ERR(req))
goto out;
req->r_fmode = ceph_flags_to_mode(flags);
- req->r_args.open.flags = cpu_to_le32(flags);
+ req->r_args.open.flags = ceph_flags_sys2wire(flags);
req->r_args.open.mode = cpu_to_le32(create_mode);
out:
return req;
@@ -189,7 +221,7 @@ int ceph_renew_caps(struct inode *inode)
spin_lock(&ci->i_ceph_lock);
wanted = __ceph_caps_file_wanted(ci);
if (__ceph_is_any_real_caps(ci) &&
- (!(wanted & CEPH_CAP_ANY_WR) == 0 || ci->i_auth_cap)) {
+ (!(wanted & CEPH_CAP_ANY_WR) || ci->i_auth_cap)) {
int issued = __ceph_caps_issued(ci, NULL);
spin_unlock(&ci->i_ceph_lock);
dout("renew caps %p want %s issued %s updating mds_wanted\n",
@@ -778,6 +810,7 @@ static void ceph_aio_retry_work(struct work_struct *work)
req->r_callback = ceph_aio_complete_req;
req->r_inode = inode;
req->r_priv = aio_req;
+ req->r_abort_on_full = true;
ret = ceph_osdc_start_request(req->r_osdc, req, false);
out:
@@ -1085,19 +1118,22 @@ ceph_sync_write(struct kiocb *iocb, struct iov_iter *from, loff_t pos,
out:
ceph_osdc_put_request(req);
- if (ret == 0) {
- pos += len;
- written += len;
-
- if (pos > i_size_read(inode)) {
- check_caps = ceph_inode_set_size(inode, pos);
- if (check_caps)
- ceph_check_caps(ceph_inode(inode),
- CHECK_CAPS_AUTHONLY,
- NULL);
- }
- } else
+ if (ret != 0) {
+ ceph_set_error_write(ci);
break;
+ }
+
+ ceph_clear_error_write(ci);
+ pos += len;
+ written += len;
+ if (pos > i_size_read(inode)) {
+ check_caps = ceph_inode_set_size(inode, pos);
+ if (check_caps)
+ ceph_check_caps(ceph_inode(inode),
+ CHECK_CAPS_AUTHONLY,
+ NULL);
+ }
+
}
if (ret != -EOLDSNAPC && written > 0) {
@@ -1303,6 +1339,7 @@ static ssize_t ceph_write_iter(struct kiocb *iocb, struct iov_iter *from)
}
retry_snap:
+ /* FIXME: not complete since it doesn't account for being at quota */
if (ceph_osdmap_flag(osdc, CEPH_OSDMAP_FULL)) {
err = -ENOSPC;
goto out;
@@ -1324,7 +1361,8 @@ retry_snap:
inode, ceph_vinop(inode), pos, count, ceph_cap_string(got));
if ((got & (CEPH_CAP_FILE_BUFFER|CEPH_CAP_FILE_LAZYIO)) == 0 ||
- (iocb->ki_flags & IOCB_DIRECT) || (fi->flags & CEPH_F_SYNC)) {
+ (iocb->ki_flags & IOCB_DIRECT) || (fi->flags & CEPH_F_SYNC) ||
+ (ci->i_ceph_flags & CEPH_I_ERROR_WRITE)) {
struct ceph_snap_context *snapc;
struct iov_iter data;
inode_unlock(inode);
diff --git a/fs/ceph/inode.c b/fs/ceph/inode.c
index d3119fe3ab45..dcce79b84406 100644
--- a/fs/ceph/inode.c
+++ b/fs/ceph/inode.c
@@ -1482,10 +1482,17 @@ int ceph_readdir_prepopulate(struct ceph_mds_request *req,
if (test_bit(CEPH_MDS_R_ABORTED, &req->r_req_flags))
return readdir_prepopulate_inodes_only(req, session);
- if (rinfo->hash_order && req->r_path2) {
- last_hash = ceph_str_hash(ci->i_dir_layout.dl_dir_hash,
- req->r_path2, strlen(req->r_path2));
- last_hash = ceph_frag_value(last_hash);
+ if (rinfo->hash_order) {
+ if (req->r_path2) {
+ last_hash = ceph_str_hash(ci->i_dir_layout.dl_dir_hash,
+ req->r_path2,
+ strlen(req->r_path2));
+ last_hash = ceph_frag_value(last_hash);
+ } else if (rinfo->offset_hash) {
+ /* mds understands offset_hash */
+ WARN_ON_ONCE(req->r_readdir_offset != 2);
+ last_hash = le32_to_cpu(rhead->args.readdir.offset_hash);
+ }
}
if (rinfo->dir_dir &&
@@ -1510,7 +1517,7 @@ int ceph_readdir_prepopulate(struct ceph_mds_request *req,
}
if (ceph_frag_is_leftmost(frag) && req->r_readdir_offset == 2 &&
- !(rinfo->hash_order && req->r_path2)) {
+ !(rinfo->hash_order && last_hash)) {
/* note dir version at start of readdir so we can tell
* if any dentries get dropped */
req->r_dir_release_cnt = atomic64_read(&ci->i_release_count);
diff --git a/fs/ceph/mds_client.c b/fs/ceph/mds_client.c
index 1d3fa90d40b9..f38e56fa9712 100644
--- a/fs/ceph/mds_client.c
+++ b/fs/ceph/mds_client.c
@@ -189,6 +189,7 @@ static int parse_reply_info_dir(void **p, void *end,
info->dir_end = !!(flags & CEPH_READDIR_FRAG_END);
info->dir_complete = !!(flags & CEPH_READDIR_FRAG_COMPLETE);
info->hash_order = !!(flags & CEPH_READDIR_HASH_ORDER);
+ info->offset_hash = !!(flags & CEPH_READDIR_OFFSET_HASH);
}
if (num == 0)
goto done;
@@ -378,9 +379,9 @@ const char *ceph_session_state_name(int s)
static struct ceph_mds_session *get_session(struct ceph_mds_session *s)
{
- if (atomic_inc_not_zero(&s->s_ref)) {
+ if (refcount_inc_not_zero(&s->s_ref)) {
dout("mdsc get_session %p %d -> %d\n", s,
- atomic_read(&s->s_ref)-1, atomic_read(&s->s_ref));
+ refcount_read(&s->s_ref)-1, refcount_read(&s->s_ref));
return s;
} else {
dout("mdsc get_session %p 0 -- FAIL", s);
@@ -391,8 +392,8 @@ static struct ceph_mds_session *get_session(struct ceph_mds_session *s)
void ceph_put_mds_session(struct ceph_mds_session *s)
{
dout("mdsc put_session %p %d -> %d\n", s,
- atomic_read(&s->s_ref), atomic_read(&s->s_ref)-1);
- if (atomic_dec_and_test(&s->s_ref)) {
+ refcount_read(&s->s_ref), refcount_read(&s->s_ref)-1);
+ if (refcount_dec_and_test(&s->s_ref)) {
if (s->s_auth.authorizer)
ceph_auth_destroy_authorizer(s->s_auth.authorizer);
kfree(s);
@@ -411,7 +412,7 @@ struct ceph_mds_session *__ceph_lookup_mds_session(struct ceph_mds_client *mdsc,
return NULL;
session = mdsc->sessions[mds];
dout("lookup_mds_session %p %d\n", session,
- atomic_read(&session->s_ref));
+ refcount_read(&session->s_ref));
get_session(session);
return session;
}
@@ -441,7 +442,7 @@ static struct ceph_mds_session *register_session(struct ceph_mds_client *mdsc,
{
struct ceph_mds_session *s;
- if (mds >= mdsc->mdsmap->m_max_mds)
+ if (mds >= mdsc->mdsmap->m_num_mds)
return ERR_PTR(-EINVAL);
s = kzalloc(sizeof(*s), GFP_NOFS);
@@ -466,7 +467,7 @@ static struct ceph_mds_session *register_session(struct ceph_mds_client *mdsc,
INIT_LIST_HEAD(&s->s_caps);
s->s_nr_caps = 0;
s->s_trim_caps = 0;
- atomic_set(&s->s_ref, 1);
+ refcount_set(&s->s_ref, 1);
INIT_LIST_HEAD(&s->s_waiting);
INIT_LIST_HEAD(&s->s_unsafe);
s->s_num_cap_releases = 0;
@@ -494,7 +495,7 @@ static struct ceph_mds_session *register_session(struct ceph_mds_client *mdsc,
}
mdsc->sessions[mds] = s;
atomic_inc(&mdsc->num_sessions);
- atomic_inc(&s->s_ref); /* one ref to sessions[], one to caller */
+ refcount_inc(&s->s_ref); /* one ref to sessions[], one to caller */
ceph_con_open(&s->s_con, CEPH_ENTITY_TYPE_MDS, mds,
ceph_mdsmap_get_addr(mdsc->mdsmap, mds));
@@ -1004,7 +1005,7 @@ static void __open_export_target_sessions(struct ceph_mds_client *mdsc,
struct ceph_mds_session *ts;
int i, mds = session->s_mds;
- if (mds >= mdsc->mdsmap->m_max_mds)
+ if (mds >= mdsc->mdsmap->m_num_mds)
return;
mi = &mdsc->mdsmap->m_info[mds];
@@ -1551,9 +1552,15 @@ void ceph_send_cap_releases(struct ceph_mds_client *mdsc,
struct ceph_msg *msg = NULL;
struct ceph_mds_cap_release *head;
struct ceph_mds_cap_item *item;
+ struct ceph_osd_client *osdc = &mdsc->fsc->client->osdc;
struct ceph_cap *cap;
LIST_HEAD(tmp_list);
int num_cap_releases;
+ __le32 barrier, *cap_barrier;
+
+ down_read(&osdc->lock);
+ barrier = cpu_to_le32(osdc->epoch_barrier);
+ up_read(&osdc->lock);
spin_lock(&session->s_cap_lock);
again:
@@ -1571,7 +1578,11 @@ again:
head = msg->front.iov_base;
head->num = cpu_to_le32(0);
msg->front.iov_len = sizeof(*head);
+
+ msg->hdr.version = cpu_to_le16(2);
+ msg->hdr.compat_version = cpu_to_le16(1);
}
+
cap = list_first_entry(&tmp_list, struct ceph_cap,
session_caps);
list_del(&cap->session_caps);
@@ -1589,6 +1600,11 @@ again:
ceph_put_cap(mdsc, cap);
if (le32_to_cpu(head->num) == CEPH_CAPS_PER_RELEASE) {
+ // Append cap_barrier field
+ cap_barrier = msg->front.iov_base + msg->front.iov_len;
+ *cap_barrier = barrier;
+ msg->front.iov_len += sizeof(*cap_barrier);
+
msg->hdr.front_len = cpu_to_le32(msg->front.iov_len);
dout("send_cap_releases mds%d %p\n", session->s_mds, msg);
ceph_con_send(&session->s_con, msg);
@@ -1604,6 +1620,11 @@ again:
spin_unlock(&session->s_cap_lock);
if (msg) {
+ // Append cap_barrier field
+ cap_barrier = msg->front.iov_base + msg->front.iov_len;
+ *cap_barrier = barrier;
+ msg->front.iov_len += sizeof(*cap_barrier);
+
msg->hdr.front_len = cpu_to_le32(msg->front.iov_len);
dout("send_cap_releases mds%d %p\n", session->s_mds, msg);
ceph_con_send(&session->s_con, msg);
@@ -1993,7 +2014,7 @@ static struct ceph_msg *create_request_message(struct ceph_mds_client *mdsc,
if (req->r_pagelist) {
struct ceph_pagelist *pagelist = req->r_pagelist;
- atomic_inc(&pagelist->refcnt);
+ refcount_inc(&pagelist->refcnt);
ceph_msg_data_add_pagelist(msg, pagelist);
msg->hdr.data_len = cpu_to_le32(pagelist->length);
} else {
@@ -2640,8 +2661,10 @@ static void handle_session(struct ceph_mds_session *session,
seq = le64_to_cpu(h->seq);
mutex_lock(&mdsc->mutex);
- if (op == CEPH_SESSION_CLOSE)
+ if (op == CEPH_SESSION_CLOSE) {
+ get_session(session);
__unregister_session(mdsc, session);
+ }
/* FIXME: this ttl calculation is generous */
session->s_ttl = jiffies + HZ*mdsc->mdsmap->m_session_autoclose;
mutex_unlock(&mdsc->mutex);
@@ -2730,6 +2753,8 @@ static void handle_session(struct ceph_mds_session *session,
kick_requests(mdsc, mds);
mutex_unlock(&mdsc->mutex);
}
+ if (op == CEPH_SESSION_CLOSE)
+ ceph_put_mds_session(session);
return;
bad:
@@ -3109,7 +3134,7 @@ static void check_new_map(struct ceph_mds_client *mdsc,
dout("check_new_map new %u old %u\n",
newmap->m_epoch, oldmap->m_epoch);
- for (i = 0; i < oldmap->m_max_mds && i < mdsc->max_sessions; i++) {
+ for (i = 0; i < oldmap->m_num_mds && i < mdsc->max_sessions; i++) {
if (mdsc->sessions[i] == NULL)
continue;
s = mdsc->sessions[i];
@@ -3123,15 +3148,33 @@ static void check_new_map(struct ceph_mds_client *mdsc,
ceph_mdsmap_is_laggy(newmap, i) ? " (laggy)" : "",
ceph_session_state_name(s->s_state));
- if (i >= newmap->m_max_mds ||
+ if (i >= newmap->m_num_mds ||
memcmp(ceph_mdsmap_get_addr(oldmap, i),
ceph_mdsmap_get_addr(newmap, i),
sizeof(struct ceph_entity_addr))) {
if (s->s_state == CEPH_MDS_SESSION_OPENING) {
/* the session never opened, just close it
* out now */
+ get_session(s);
+ __unregister_session(mdsc, s);
__wake_requests(mdsc, &s->s_waiting);
+ ceph_put_mds_session(s);
+ } else if (i >= newmap->m_num_mds) {
+ /* force close session for stopped mds */
+ get_session(s);
__unregister_session(mdsc, s);
+ __wake_requests(mdsc, &s->s_waiting);
+ kick_requests(mdsc, i);
+ mutex_unlock(&mdsc->mutex);
+
+ mutex_lock(&s->s_mutex);
+ cleanup_session_requests(mdsc, s);
+ remove_session_caps(s);
+ mutex_unlock(&s->s_mutex);
+
+ ceph_put_mds_session(s);
+
+ mutex_lock(&mdsc->mutex);
} else {
/* just close it */
mutex_unlock(&mdsc->mutex);
@@ -3169,7 +3212,7 @@ static void check_new_map(struct ceph_mds_client *mdsc,
}
}
- for (i = 0; i < newmap->m_max_mds && i < mdsc->max_sessions; i++) {
+ for (i = 0; i < newmap->m_num_mds && i < mdsc->max_sessions; i++) {
s = mdsc->sessions[i];
if (!s)
continue;
@@ -3883,7 +3926,7 @@ static struct ceph_connection *con_get(struct ceph_connection *con)
struct ceph_mds_session *s = con->private;
if (get_session(s)) {
- dout("mdsc con_get %p ok (%d)\n", s, atomic_read(&s->s_ref));
+ dout("mdsc con_get %p ok (%d)\n", s, refcount_read(&s->s_ref));
return con;
}
dout("mdsc con_get %p FAIL\n", s);
@@ -3894,7 +3937,7 @@ static void con_put(struct ceph_connection *con)
{
struct ceph_mds_session *s = con->private;
- dout("mdsc con_put %p (%d)\n", s, atomic_read(&s->s_ref) - 1);
+ dout("mdsc con_put %p (%d)\n", s, refcount_read(&s->s_ref) - 1);
ceph_put_mds_session(s);
}
diff --git a/fs/ceph/mds_client.h b/fs/ceph/mds_client.h
index ac0475a2daa7..db57ae98ed34 100644
--- a/fs/ceph/mds_client.h
+++ b/fs/ceph/mds_client.h
@@ -7,6 +7,7 @@
#include <linux/mutex.h>
#include <linux/rbtree.h>
#include <linux/spinlock.h>
+#include <linux/refcount.h>
#include <linux/ceph/types.h>
#include <linux/ceph/messenger.h>
@@ -82,9 +83,10 @@ struct ceph_mds_reply_info_parsed {
struct ceph_mds_reply_dirfrag *dir_dir;
size_t dir_buf_size;
int dir_nr;
- bool dir_complete;
bool dir_end;
+ bool dir_complete;
bool hash_order;
+ bool offset_hash;
struct ceph_mds_reply_dir_entry *dir_entries;
};
@@ -104,10 +106,13 @@ struct ceph_mds_reply_info_parsed {
/*
* cap releases are batched and sent to the MDS en masse.
+ *
+ * Account for per-message overhead of mds_cap_release header
+ * and __le32 for osd epoch barrier trailing field.
*/
-#define CEPH_CAPS_PER_RELEASE ((PAGE_SIZE - \
+#define CEPH_CAPS_PER_RELEASE ((PAGE_SIZE - sizeof(u32) - \
sizeof(struct ceph_mds_cap_release)) / \
- sizeof(struct ceph_mds_cap_item))
+ sizeof(struct ceph_mds_cap_item))
/*
@@ -156,7 +161,7 @@ struct ceph_mds_session {
unsigned long s_renew_requested; /* last time we sent a renew req */
u64 s_renew_seq;
- atomic_t s_ref;
+ refcount_t s_ref;
struct list_head s_waiting; /* waiting requests */
struct list_head s_unsafe; /* unsafe requests */
};
@@ -373,7 +378,7 @@ __ceph_lookup_mds_session(struct ceph_mds_client *, int mds);
static inline struct ceph_mds_session *
ceph_get_mds_session(struct ceph_mds_session *s)
{
- atomic_inc(&s->s_ref);
+ refcount_inc(&s->s_ref);
return s;
}
diff --git a/fs/ceph/mdsmap.c b/fs/ceph/mdsmap.c
index 5454e2327a5f..1a748cf88535 100644
--- a/fs/ceph/mdsmap.c
+++ b/fs/ceph/mdsmap.c
@@ -22,11 +22,11 @@ int ceph_mdsmap_get_random_mds(struct ceph_mdsmap *m)
int i;
/* special case for one mds */
- if (1 == m->m_max_mds && m->m_info[0].state > 0)
+ if (1 == m->m_num_mds && m->m_info[0].state > 0)
return 0;
/* count */
- for (i = 0; i < m->m_max_mds; i++)
+ for (i = 0; i < m->m_num_mds; i++)
if (m->m_info[i].state > 0)
n++;
if (n == 0)
@@ -135,8 +135,9 @@ struct ceph_mdsmap *ceph_mdsmap_decode(void **p, void *end)
m->m_session_autoclose = ceph_decode_32(p);
m->m_max_file_size = ceph_decode_64(p);
m->m_max_mds = ceph_decode_32(p);
+ m->m_num_mds = m->m_max_mds;
- m->m_info = kcalloc(m->m_max_mds, sizeof(*m->m_info), GFP_NOFS);
+ m->m_info = kcalloc(m->m_num_mds, sizeof(*m->m_info), GFP_NOFS);
if (m->m_info == NULL)
goto nomem;
@@ -207,9 +208,20 @@ struct ceph_mdsmap *ceph_mdsmap_decode(void **p, void *end)
ceph_pr_addr(&addr.in_addr),
ceph_mds_state_name(state));
- if (mds < 0 || mds >= m->m_max_mds || state <= 0)
+ if (mds < 0 || state <= 0)
continue;
+ if (mds >= m->m_num_mds) {
+ int new_num = max(mds + 1, m->m_num_mds * 2);
+ void *new_m_info = krealloc(m->m_info,
+ new_num * sizeof(*m->m_info),
+ GFP_NOFS | __GFP_ZERO);
+ if (!new_m_info)
+ goto nomem;
+ m->m_info = new_m_info;
+ m->m_num_mds = new_num;
+ }
+
info = &m->m_info[mds];
info->global_id = global_id;
info->state = state;
@@ -229,6 +241,14 @@ struct ceph_mdsmap *ceph_mdsmap_decode(void **p, void *end)
info->export_targets = NULL;
}
}
+ if (m->m_num_mds > m->m_max_mds) {
+ /* find max up mds */
+ for (i = m->m_num_mds; i >= m->m_max_mds; i--) {
+ if (i == 0 || m->m_info[i-1].state > 0)
+ break;
+ }
+ m->m_num_mds = i;
+ }
/* pg_pools */
ceph_decode_32_safe(p, end, n, bad);
@@ -270,12 +290,22 @@ struct ceph_mdsmap *ceph_mdsmap_decode(void **p, void *end)
for (i = 0; i < n; i++) {
s32 mds = ceph_decode_32(p);
- if (mds >= 0 && mds < m->m_max_mds) {
+ if (mds >= 0 && mds < m->m_num_mds) {
if (m->m_info[mds].laggy)
num_laggy++;
}
}
m->m_num_laggy = num_laggy;
+
+ if (n > m->m_num_mds) {
+ void *new_m_info = krealloc(m->m_info,
+ n * sizeof(*m->m_info),
+ GFP_NOFS | __GFP_ZERO);
+ if (!new_m_info)
+ goto nomem;
+ m->m_info = new_m_info;
+ }
+ m->m_num_mds = n;
}
/* inc */
@@ -341,7 +371,7 @@ void ceph_mdsmap_destroy(struct ceph_mdsmap *m)
{
int i;
- for (i = 0; i < m->m_max_mds; i++)
+ for (i = 0; i < m->m_num_mds; i++)
kfree(m->m_info[i].export_targets);
kfree(m->m_info);
kfree(m->m_data_pg_pools);
@@ -357,7 +387,7 @@ bool ceph_mdsmap_is_cluster_available(struct ceph_mdsmap *m)
return false;
if (m->m_num_laggy > 0)
return false;
- for (i = 0; i < m->m_max_mds; i++) {
+ for (i = 0; i < m->m_num_mds; i++) {
if (m->m_info[i].state == CEPH_MDS_STATE_ACTIVE)
nr_active++;
}
diff --git a/fs/ceph/snap.c b/fs/ceph/snap.c
index 8f8b41c2ef0f..dab5d6732345 100644
--- a/fs/ceph/snap.c
+++ b/fs/ceph/snap.c
@@ -519,7 +519,7 @@ void ceph_queue_cap_snap(struct ceph_inode_info *ci)
capsnap->need_flush ? "" : "no_flush");
ihold(inode);
- atomic_set(&capsnap->nref, 1);
+ refcount_set(&capsnap->nref, 1);
INIT_LIST_HEAD(&capsnap->ci_item);
capsnap->follows = old_snapc->seq;
diff --git a/fs/ceph/super.c b/fs/ceph/super.c
index a8c81b2052ca..8d7918ce694a 100644
--- a/fs/ceph/super.c
+++ b/fs/ceph/super.c
@@ -544,10 +544,6 @@ static struct ceph_fs_client *create_fs_client(struct ceph_mount_options *fsopt,
struct ceph_options *opt)
{
struct ceph_fs_client *fsc;
- const u64 supported_features =
- CEPH_FEATURE_FLOCK | CEPH_FEATURE_DIRLAYOUTHASH |
- CEPH_FEATURE_MDSENC | CEPH_FEATURE_MDS_INLINE_DATA;
- const u64 required_features = 0;
int page_count;
size_t size;
int err = -ENOMEM;
@@ -556,8 +552,7 @@ static struct ceph_fs_client *create_fs_client(struct ceph_mount_options *fsopt,
if (!fsc)
return ERR_PTR(-ENOMEM);
- fsc->client = ceph_create_client(opt, fsc, supported_features,
- required_features);
+ fsc->client = ceph_create_client(opt, fsc);
if (IS_ERR(fsc->client)) {
err = PTR_ERR(fsc->client);
goto fail;
diff --git a/fs/ceph/super.h b/fs/ceph/super.h
index 176186b12457..a973acd8beaf 100644
--- a/fs/ceph/super.h
+++ b/fs/ceph/super.h
@@ -14,6 +14,7 @@
#include <linux/writeback.h>
#include <linux/slab.h>
#include <linux/posix_acl.h>
+#include <linux/refcount.h>
#include <linux/ceph/libceph.h>
@@ -160,7 +161,7 @@ struct ceph_cap_flush {
* data before flushing the snapped state (tracked here) back to the MDS.
*/
struct ceph_cap_snap {
- atomic_t nref;
+ refcount_t nref;
struct list_head ci_item;
struct ceph_cap_flush cap_flush;
@@ -189,7 +190,7 @@ struct ceph_cap_snap {
static inline void ceph_put_cap_snap(struct ceph_cap_snap *capsnap)
{
- if (atomic_dec_and_test(&capsnap->nref)) {
+ if (refcount_dec_and_test(&capsnap->nref)) {
if (capsnap->xattr_blob)
ceph_buffer_put(capsnap->xattr_blob);
kfree(capsnap);
@@ -471,6 +472,32 @@ static inline struct inode *ceph_find_inode(struct super_block *sb,
#define CEPH_I_CAP_DROPPED (1 << 8) /* caps were forcibly dropped */
#define CEPH_I_KICK_FLUSH (1 << 9) /* kick flushing caps */
#define CEPH_I_FLUSH_SNAPS (1 << 10) /* need flush snapss */
+#define CEPH_I_ERROR_WRITE (1 << 11) /* have seen write errors */
+
+/*
+ * We set the ERROR_WRITE bit when we start seeing write errors on an inode
+ * and then clear it when they start succeeding. Note that we do a lockless
+ * check first, and only take the lock if it looks like it needs to be changed.
+ * The write submission code just takes this as a hint, so we're not too
+ * worried if a few slip through in either direction.
+ */
+static inline void ceph_set_error_write(struct ceph_inode_info *ci)
+{
+ if (!(READ_ONCE(ci->i_ceph_flags) & CEPH_I_ERROR_WRITE)) {
+ spin_lock(&ci->i_ceph_lock);
+ ci->i_ceph_flags |= CEPH_I_ERROR_WRITE;
+ spin_unlock(&ci->i_ceph_lock);
+ }
+}
+
+static inline void ceph_clear_error_write(struct ceph_inode_info *ci)
+{
+ if (READ_ONCE(ci->i_ceph_flags) & CEPH_I_ERROR_WRITE) {
+ spin_lock(&ci->i_ceph_lock);
+ ci->i_ceph_flags &= ~CEPH_I_ERROR_WRITE;
+ spin_unlock(&ci->i_ceph_lock);
+ }
+}
static inline void __ceph_dir_set_complete(struct ceph_inode_info *ci,
long long release_count,
diff --git a/fs/ceph/xattr.c b/fs/ceph/xattr.c
index febc28f9e2c2..75267cdd5dfd 100644
--- a/fs/ceph/xattr.c
+++ b/fs/ceph/xattr.c
@@ -392,6 +392,7 @@ static int __set_xattr(struct ceph_inode_info *ci,
if (update_xattr) {
int err = 0;
+
if (xattr && (flags & XATTR_CREATE))
err = -EEXIST;
else if (!xattr && (flags & XATTR_REPLACE))
@@ -399,12 +400,14 @@ static int __set_xattr(struct ceph_inode_info *ci,
if (err) {
kfree(name);
kfree(val);
+ kfree(*newxattr);
return err;
}
if (update_xattr < 0) {
if (xattr)
__remove_xattr(ci, xattr);
kfree(name);
+ kfree(*newxattr);
return 0;
}
}
diff --git a/fs/dax.c b/fs/dax.c
index 66d79067eedf..c22eaf162f95 100644
--- a/fs/dax.c
+++ b/fs/dax.c
@@ -461,35 +461,6 @@ int dax_delete_mapping_entry(struct address_space *mapping, pgoff_t index)
}
/*
- * Invalidate exceptional DAX entry if easily possible. This handles DAX
- * entries for invalidate_inode_pages() so we evict the entry only if we can
- * do so without blocking.
- */
-int dax_invalidate_mapping_entry(struct address_space *mapping, pgoff_t index)
-{
- int ret = 0;
- void *entry, **slot;
- struct radix_tree_root *page_tree = &mapping->page_tree;
-
- spin_lock_irq(&mapping->tree_lock);
- entry = __radix_tree_lookup(page_tree, index, NULL, &slot);
- if (!entry || !radix_tree_exceptional_entry(entry) ||
- slot_locked(mapping, slot))
- goto out;
- if (radix_tree_tag_get(page_tree, index, PAGECACHE_TAG_DIRTY) ||
- radix_tree_tag_get(page_tree, index, PAGECACHE_TAG_TOWRITE))
- goto out;
- radix_tree_delete(page_tree, index);
- mapping->nrexceptional--;
- ret = 1;
-out:
- spin_unlock_irq(&mapping->tree_lock);
- if (ret)
- dax_wake_mapping_entry_waiter(mapping, index, entry, true);
- return ret;
-}
-
-/*
* Invalidate exceptional DAX entry if it is clean.
*/
int dax_invalidate_mapping_entry_sync(struct address_space *mapping,
@@ -993,12 +964,12 @@ int __dax_zero_page_range(struct block_device *bdev,
void *kaddr;
pfn_t pfn;
- rc = bdev_dax_pgoff(bdev, sector, size, &pgoff);
+ rc = bdev_dax_pgoff(bdev, sector, PAGE_SIZE, &pgoff);
if (rc)
return rc;
id = dax_read_lock();
- rc = dax_direct_access(dax_dev, pgoff, PHYS_PFN(size), &kaddr,
+ rc = dax_direct_access(dax_dev, pgoff, 1, &kaddr,
&pfn);
if (rc < 0) {
dax_read_unlock(id);
@@ -1044,7 +1015,7 @@ dax_iomap_actor(struct inode *inode, loff_t pos, loff_t length, void *data,
* into page tables. We have to tear down these mappings so that data
* written by write(2) is visible in mmap.
*/
- if ((iomap->flags & IOMAP_F_NEW) && inode->i_mapping->nrpages) {
+ if (iomap->flags & IOMAP_F_NEW) {
invalidate_inode_pages2_range(inode->i_mapping,
pos >> PAGE_SHIFT,
(end - 1) >> PAGE_SHIFT);
@@ -1177,6 +1148,12 @@ static int dax_iomap_pte_fault(struct vm_fault *vmf,
if ((vmf->flags & FAULT_FLAG_WRITE) && !vmf->cow_page)
flags |= IOMAP_WRITE;
+ entry = grab_mapping_entry(mapping, vmf->pgoff, 0);
+ if (IS_ERR(entry)) {
+ vmf_ret = dax_fault_return(PTR_ERR(entry));
+ goto out;
+ }
+
/*
* Note that we don't bother to use iomap_apply here: DAX required
* the file system block size to be equal the page size, which means
@@ -1185,17 +1162,11 @@ static int dax_iomap_pte_fault(struct vm_fault *vmf,
error = ops->iomap_begin(inode, pos, PAGE_SIZE, flags, &iomap);
if (error) {
vmf_ret = dax_fault_return(error);
- goto out;
+ goto unlock_entry;
}
if (WARN_ON_ONCE(iomap.offset + iomap.length < pos + PAGE_SIZE)) {
- vmf_ret = dax_fault_return(-EIO); /* fs corruption? */
- goto finish_iomap;
- }
-
- entry = grab_mapping_entry(mapping, vmf->pgoff, 0);
- if (IS_ERR(entry)) {
- vmf_ret = dax_fault_return(PTR_ERR(entry));
- goto finish_iomap;
+ error = -EIO; /* fs corruption? */
+ goto error_finish_iomap;
}
sector = dax_iomap_sector(&iomap, pos);
@@ -1217,13 +1188,13 @@ static int dax_iomap_pte_fault(struct vm_fault *vmf,
}
if (error)
- goto error_unlock_entry;
+ goto error_finish_iomap;
__SetPageUptodate(vmf->cow_page);
vmf_ret = finish_fault(vmf);
if (!vmf_ret)
vmf_ret = VM_FAULT_DONE_COW;
- goto unlock_entry;
+ goto finish_iomap;
}
switch (iomap.type) {
@@ -1243,7 +1214,7 @@ static int dax_iomap_pte_fault(struct vm_fault *vmf,
case IOMAP_HOLE:
if (!(vmf->flags & FAULT_FLAG_WRITE)) {
vmf_ret = dax_load_hole(mapping, &entry, vmf);
- goto unlock_entry;
+ goto finish_iomap;
}
/*FALLTHRU*/
default:
@@ -1252,10 +1223,8 @@ static int dax_iomap_pte_fault(struct vm_fault *vmf,
break;
}
- error_unlock_entry:
+ error_finish_iomap:
vmf_ret = dax_fault_return(error) | major;
- unlock_entry:
- put_locked_mapping_entry(mapping, vmf->pgoff, entry);
finish_iomap:
if (ops->iomap_end) {
int copied = PAGE_SIZE;
@@ -1270,7 +1239,9 @@ static int dax_iomap_pte_fault(struct vm_fault *vmf,
*/
ops->iomap_end(inode, pos, PAGE_SIZE, copied, flags, &iomap);
}
-out:
+ unlock_entry:
+ put_locked_mapping_entry(mapping, vmf->pgoff, entry);
+ out:
trace_dax_pte_fault_done(inode, vmf, vmf_ret);
return vmf_ret;
}
@@ -1417,6 +1388,16 @@ static int dax_iomap_pmd_fault(struct vm_fault *vmf,
goto fallback;
/*
+ * grab_mapping_entry() will make sure we get a 2M empty entry, a DAX
+ * PMD or a HZP entry. If it can't (because a 4k page is already in
+ * the tree, for instance), it will return -EEXIST and we just fall
+ * back to 4k entries.
+ */
+ entry = grab_mapping_entry(mapping, pgoff, RADIX_DAX_PMD);
+ if (IS_ERR(entry))
+ goto fallback;
+
+ /*
* Note that we don't use iomap_apply here. We aren't doing I/O, only
* setting up a mapping, so really we're using iomap_begin() as a way
* to look up our filesystem block.
@@ -1424,21 +1405,11 @@ static int dax_iomap_pmd_fault(struct vm_fault *vmf,
pos = (loff_t)pgoff << PAGE_SHIFT;
error = ops->iomap_begin(inode, pos, PMD_SIZE, iomap_flags, &iomap);
if (error)
- goto fallback;
+ goto unlock_entry;
if (iomap.offset + iomap.length < pos + PMD_SIZE)
goto finish_iomap;
- /*
- * grab_mapping_entry() will make sure we get a 2M empty entry, a DAX
- * PMD or a HZP entry. If it can't (because a 4k page is already in
- * the tree, for instance), it will return -EEXIST and we just fall
- * back to 4k entries.
- */
- entry = grab_mapping_entry(mapping, pgoff, RADIX_DAX_PMD);
- if (IS_ERR(entry))
- goto finish_iomap;
-
switch (iomap.type) {
case IOMAP_MAPPED:
result = dax_pmd_insert_mapping(vmf, &iomap, pos, &entry);
@@ -1446,7 +1417,7 @@ static int dax_iomap_pmd_fault(struct vm_fault *vmf,
case IOMAP_UNWRITTEN:
case IOMAP_HOLE:
if (WARN_ON_ONCE(write))
- goto unlock_entry;
+ break;
result = dax_pmd_load_hole(vmf, &iomap, &entry);
break;
default:
@@ -1454,8 +1425,6 @@ static int dax_iomap_pmd_fault(struct vm_fault *vmf,
break;
}
- unlock_entry:
- put_locked_mapping_entry(mapping, pgoff, entry);
finish_iomap:
if (ops->iomap_end) {
int copied = PMD_SIZE;
@@ -1471,6 +1440,8 @@ static int dax_iomap_pmd_fault(struct vm_fault *vmf,
ops->iomap_end(inode, pos, PMD_SIZE, copied, iomap_flags,
&iomap);
}
+ unlock_entry:
+ put_locked_mapping_entry(mapping, pgoff, entry);
fallback:
if (result == VM_FAULT_FALLBACK) {
split_huge_pmd(vma, vmf->pmd, vmf->address);
diff --git a/fs/ext2/super.c b/fs/ext2/super.c
index 8ac673c71a36..9c2028b50e5c 100644
--- a/fs/ext2/super.c
+++ b/fs/ext2/super.c
@@ -32,6 +32,7 @@
#include <linux/log2.h>
#include <linux/quotaops.h>
#include <linux/uaccess.h>
+#include <linux/dax.h>
#include "ext2.h"
#include "xattr.h"
#include "acl.h"
diff --git a/fs/ext4/file.c b/fs/ext4/file.c
index cefa9835f275..831fd6beebf0 100644
--- a/fs/ext4/file.c
+++ b/fs/ext4/file.c
@@ -257,6 +257,7 @@ static int ext4_dax_huge_fault(struct vm_fault *vmf,
enum page_entry_size pe_size)
{
int result;
+ handle_t *handle = NULL;
struct inode *inode = file_inode(vmf->vma->vm_file);
struct super_block *sb = inode->i_sb;
bool write = vmf->flags & FAULT_FLAG_WRITE;
@@ -264,12 +265,24 @@ static int ext4_dax_huge_fault(struct vm_fault *vmf,
if (write) {
sb_start_pagefault(sb);
file_update_time(vmf->vma->vm_file);
+ down_read(&EXT4_I(inode)->i_mmap_sem);
+ handle = ext4_journal_start_sb(sb, EXT4_HT_WRITE_PAGE,
+ EXT4_DATA_TRANS_BLOCKS(sb));
+ } else {
+ down_read(&EXT4_I(inode)->i_mmap_sem);
}
- down_read(&EXT4_I(inode)->i_mmap_sem);
- result = dax_iomap_fault(vmf, pe_size, &ext4_iomap_ops);
- up_read(&EXT4_I(inode)->i_mmap_sem);
- if (write)
+ if (!IS_ERR(handle))
+ result = dax_iomap_fault(vmf, pe_size, &ext4_iomap_ops);
+ else
+ result = VM_FAULT_SIGBUS;
+ if (write) {
+ if (!IS_ERR(handle))
+ ext4_journal_stop(handle);
+ up_read(&EXT4_I(inode)->i_mmap_sem);
sb_end_pagefault(sb);
+ } else {
+ up_read(&EXT4_I(inode)->i_mmap_sem);
+ }
return result;
}
diff --git a/fs/ext4/super.c b/fs/ext4/super.c
index c90edf09b0c3..0b177da9ea82 100644
--- a/fs/ext4/super.c
+++ b/fs/ext4/super.c
@@ -37,6 +37,7 @@
#include <linux/ctype.h>
#include <linux/log2.h>
#include <linux/crc16.h>
+#include <linux/dax.h>
#include <linux/cleancache.h>
#include <linux/uaccess.h>
diff --git a/fs/fuse/dev.c b/fs/fuse/dev.c
index c2d7f3a92679..c16d00e53264 100644
--- a/fs/fuse/dev.c
+++ b/fs/fuse/dev.c
@@ -20,6 +20,7 @@
#include <linux/pipe_fs_i.h>
#include <linux/swap.h>
#include <linux/splice.h>
+#include <linux/sched.h>
MODULE_ALIAS_MISCDEV(FUSE_MINOR);
MODULE_ALIAS("devname:fuse");
@@ -45,7 +46,7 @@ static void fuse_request_init(struct fuse_req *req, struct page **pages,
INIT_LIST_HEAD(&req->list);
INIT_LIST_HEAD(&req->intr_entry);
init_waitqueue_head(&req->waitq);
- atomic_set(&req->count, 1);
+ refcount_set(&req->count, 1);
req->pages = pages;
req->page_descs = page_descs;
req->max_pages = npages;
@@ -102,21 +103,20 @@ void fuse_request_free(struct fuse_req *req)
void __fuse_get_request(struct fuse_req *req)
{
- atomic_inc(&req->count);
+ refcount_inc(&req->count);
}
/* Must be called with > 1 refcount */
static void __fuse_put_request(struct fuse_req *req)
{
- BUG_ON(atomic_read(&req->count) < 2);
- atomic_dec(&req->count);
+ refcount_dec(&req->count);
}
-static void fuse_req_init_context(struct fuse_req *req)
+static void fuse_req_init_context(struct fuse_conn *fc, struct fuse_req *req)
{
req->in.h.uid = from_kuid_munged(&init_user_ns, current_fsuid());
req->in.h.gid = from_kgid_munged(&init_user_ns, current_fsgid());
- req->in.h.pid = current->pid;
+ req->in.h.pid = pid_nr_ns(task_pid(current), fc->pid_ns);
}
void fuse_set_initialized(struct fuse_conn *fc)
@@ -163,7 +163,7 @@ static struct fuse_req *__fuse_get_req(struct fuse_conn *fc, unsigned npages,
goto out;
}
- fuse_req_init_context(req);
+ fuse_req_init_context(fc, req);
__set_bit(FR_WAITING, &req->flags);
if (for_background)
__set_bit(FR_BACKGROUND, &req->flags);
@@ -256,7 +256,7 @@ struct fuse_req *fuse_get_req_nofail_nopages(struct fuse_conn *fc,
if (!req)
req = get_reserved_req(fc, file);
- fuse_req_init_context(req);
+ fuse_req_init_context(fc, req);
__set_bit(FR_WAITING, &req->flags);
__clear_bit(FR_BACKGROUND, &req->flags);
return req;
@@ -264,7 +264,7 @@ struct fuse_req *fuse_get_req_nofail_nopages(struct fuse_conn *fc,
void fuse_put_request(struct fuse_conn *fc, struct fuse_req *req)
{
- if (atomic_dec_and_test(&req->count)) {
+ if (refcount_dec_and_test(&req->count)) {
if (test_bit(FR_BACKGROUND, &req->flags)) {
/*
* We get here in the unlikely case that a background
@@ -1222,6 +1222,9 @@ static ssize_t fuse_dev_do_read(struct fuse_dev *fud, struct file *file,
struct fuse_in *in;
unsigned reqsize;
+ if (task_active_pid_ns(current) != fc->pid_ns)
+ return -EIO;
+
restart:
spin_lock(&fiq->waitq.lock);
err = -EAGAIN;
@@ -1820,6 +1823,9 @@ static ssize_t fuse_dev_do_write(struct fuse_dev *fud,
struct fuse_req *req;
struct fuse_out_header oh;
+ if (task_active_pid_ns(current) != fc->pid_ns)
+ return -EIO;
+
if (nbytes < sizeof(struct fuse_out_header))
return -EINVAL;
diff --git a/fs/fuse/file.c b/fs/fuse/file.c
index ec238fb5a584..3ee4fdc3da9e 100644
--- a/fs/fuse/file.c
+++ b/fs/fuse/file.c
@@ -58,7 +58,7 @@ struct fuse_file *fuse_file_alloc(struct fuse_conn *fc)
}
INIT_LIST_HEAD(&ff->write_entry);
- atomic_set(&ff->count, 1);
+ refcount_set(&ff->count, 1);
RB_CLEAR_NODE(&ff->polled_node);
init_waitqueue_head(&ff->poll_wait);
@@ -77,7 +77,7 @@ void fuse_file_free(struct fuse_file *ff)
static struct fuse_file *fuse_file_get(struct fuse_file *ff)
{
- atomic_inc(&ff->count);
+ refcount_inc(&ff->count);
return ff;
}
@@ -88,7 +88,7 @@ static void fuse_release_end(struct fuse_conn *fc, struct fuse_req *req)
static void fuse_file_put(struct fuse_file *ff, bool sync)
{
- if (atomic_dec_and_test(&ff->count)) {
+ if (refcount_dec_and_test(&ff->count)) {
struct fuse_req *req = ff->reserved_req;
if (ff->fc->no_open) {
@@ -293,7 +293,7 @@ static int fuse_release(struct inode *inode, struct file *file)
void fuse_sync_release(struct fuse_file *ff, int flags)
{
- WARN_ON(atomic_read(&ff->count) != 1);
+ WARN_ON(refcount_read(&ff->count) > 1);
fuse_prepare_release(ff, flags, FUSE_RELEASE);
/*
* iput(NULL) is a no-op and since the refcount is 1 and everything's
@@ -2083,7 +2083,8 @@ static int fuse_direct_mmap(struct file *file, struct vm_area_struct *vma)
return generic_file_mmap(file, vma);
}
-static int convert_fuse_file_lock(const struct fuse_file_lock *ffl,
+static int convert_fuse_file_lock(struct fuse_conn *fc,
+ const struct fuse_file_lock *ffl,
struct file_lock *fl)
{
switch (ffl->type) {
@@ -2098,7 +2099,14 @@ static int convert_fuse_file_lock(const struct fuse_file_lock *ffl,
fl->fl_start = ffl->start;
fl->fl_end = ffl->end;
- fl->fl_pid = ffl->pid;
+
+ /*
+ * Convert pid into the caller's pid namespace. If the pid
+ * does not map into the namespace fl_pid will get set to 0.
+ */
+ rcu_read_lock();
+ fl->fl_pid = pid_vnr(find_pid_ns(ffl->pid, fc->pid_ns));
+ rcu_read_unlock();
break;
default:
@@ -2147,7 +2155,7 @@ static int fuse_getlk(struct file *file, struct file_lock *fl)
args.out.args[0].value = &outarg;
err = fuse_simple_request(fc, &args);
if (!err)
- err = convert_fuse_file_lock(&outarg.lk, fl);
+ err = convert_fuse_file_lock(fc, &outarg.lk, fl);
return err;
}
@@ -2159,7 +2167,8 @@ static int fuse_setlk(struct file *file, struct file_lock *fl, int flock)
FUSE_ARGS(args);
struct fuse_lk_in inarg;
int opcode = (fl->fl_flags & FL_SLEEP) ? FUSE_SETLKW : FUSE_SETLK;
- pid_t pid = fl->fl_type != F_UNLCK ? current->tgid : 0;
+ struct pid *pid = fl->fl_type != F_UNLCK ? task_tgid(current) : NULL;
+ pid_t pid_nr = pid_nr_ns(pid, fc->pid_ns);
int err;
if (fl->fl_lmops && fl->fl_lmops->lm_grant) {
@@ -2168,10 +2177,13 @@ static int fuse_setlk(struct file *file, struct file_lock *fl, int flock)
}
/* Unlock on close is handled by the flush method */
- if (fl->fl_flags & FL_CLOSE)
+ if ((fl->fl_flags & FL_CLOSE_POSIX) == FL_CLOSE_POSIX)
return 0;
- fuse_lk_fill(&args, file, fl, opcode, pid, flock, &inarg);
+ if (pid && pid_nr == 0)
+ return -EOVERFLOW;
+
+ fuse_lk_fill(&args, file, fl, opcode, pid_nr, flock, &inarg);
err = fuse_simple_request(fc, &args);
/* locking is restartable */
diff --git a/fs/fuse/fuse_i.h b/fs/fuse/fuse_i.h
index f33341d9501a..1bd7ffdad593 100644
--- a/fs/fuse/fuse_i.h
+++ b/fs/fuse/fuse_i.h
@@ -24,6 +24,8 @@
#include <linux/workqueue.h>
#include <linux/kref.h>
#include <linux/xattr.h>
+#include <linux/pid_namespace.h>
+#include <linux/refcount.h>
/** Max number of pages that can be used in a single read request */
#define FUSE_MAX_PAGES_PER_REQ 32
@@ -137,7 +139,7 @@ struct fuse_file {
u64 nodeid;
/** Refcount */
- atomic_t count;
+ refcount_t count;
/** FOPEN_* flags returned by open */
u32 open_flags;
@@ -306,7 +308,7 @@ struct fuse_req {
struct list_head intr_entry;
/** refcount */
- atomic_t count;
+ refcount_t count;
/** Unique ID for the interrupt request */
u64 intr_unique;
@@ -448,7 +450,7 @@ struct fuse_conn {
spinlock_t lock;
/** Refcount */
- atomic_t count;
+ refcount_t count;
/** Number of fuse_dev's */
atomic_t dev_count;
@@ -461,6 +463,9 @@ struct fuse_conn {
/** The group id for this mount */
kgid_t group_id;
+ /** The pid namespace for this mount */
+ struct pid_namespace *pid_ns;
+
/** Maximum read size */
unsigned max_read;
diff --git a/fs/fuse/inode.c b/fs/fuse/inode.c
index 73cf05135252..5a1b58f8fef4 100644
--- a/fs/fuse/inode.c
+++ b/fs/fuse/inode.c
@@ -21,6 +21,7 @@
#include <linux/sched.h>
#include <linux/exportfs.h>
#include <linux/posix_acl.h>
+#include <linux/pid_namespace.h>
MODULE_AUTHOR("Miklos Szeredi <miklos@szeredi.hu>");
MODULE_DESCRIPTION("Filesystem in Userspace");
@@ -601,7 +602,7 @@ void fuse_conn_init(struct fuse_conn *fc)
memset(fc, 0, sizeof(*fc));
spin_lock_init(&fc->lock);
init_rwsem(&fc->killsb);
- atomic_set(&fc->count, 1);
+ refcount_set(&fc->count, 1);
atomic_set(&fc->dev_count, 1);
init_waitqueue_head(&fc->blocked_waitq);
init_waitqueue_head(&fc->reserved_req_waitq);
@@ -619,14 +620,16 @@ void fuse_conn_init(struct fuse_conn *fc)
fc->connected = 1;
fc->attr_version = 1;
get_random_bytes(&fc->scramble_key, sizeof(fc->scramble_key));
+ fc->pid_ns = get_pid_ns(task_active_pid_ns(current));
}
EXPORT_SYMBOL_GPL(fuse_conn_init);
void fuse_conn_put(struct fuse_conn *fc)
{
- if (atomic_dec_and_test(&fc->count)) {
+ if (refcount_dec_and_test(&fc->count)) {
if (fc->destroy_req)
fuse_request_free(fc->destroy_req);
+ put_pid_ns(fc->pid_ns);
fc->release(fc);
}
}
@@ -634,7 +637,7 @@ EXPORT_SYMBOL_GPL(fuse_conn_put);
struct fuse_conn *fuse_conn_get(struct fuse_conn *fc)
{
- atomic_inc(&fc->count);
+ refcount_inc(&fc->count);
return fc;
}
EXPORT_SYMBOL_GPL(fuse_conn_get);
diff --git a/fs/jbd2/journal.c b/fs/jbd2/journal.c
index 5a0245e36240..ebad34266bcf 100644
--- a/fs/jbd2/journal.c
+++ b/fs/jbd2/journal.c
@@ -2363,7 +2363,7 @@ static int jbd2_journal_init_journal_head_cache(void)
jbd2_journal_head_cache = kmem_cache_create("jbd2_journal_head",
sizeof(struct journal_head),
0, /* offset */
- SLAB_TEMPORARY | SLAB_DESTROY_BY_RCU,
+ SLAB_TEMPORARY | SLAB_TYPESAFE_BY_RCU,
NULL); /* ctor */
retval = 0;
if (!jbd2_journal_head_cache) {
diff --git a/fs/jffs2/readinode.c b/fs/jffs2/readinode.c
index 06a71dbd4833..389ea53ea487 100644
--- a/fs/jffs2/readinode.c
+++ b/fs/jffs2/readinode.c
@@ -1366,7 +1366,7 @@ int jffs2_do_read_inode(struct jffs2_sb_info *c, struct jffs2_inode_info *f,
jffs2_add_ino_cache(c, f->inocache);
}
if (!f->inocache) {
- JFFS2_ERROR("requestied to read an nonexistent ino %u\n", ino);
+ JFFS2_ERROR("requested to read a nonexistent ino %u\n", ino);
return -ENOENT;
}
diff --git a/fs/lockd/clntlock.c b/fs/lockd/clntlock.c
index 41e491b8e5d7..27d577dbe51a 100644
--- a/fs/lockd/clntlock.c
+++ b/fs/lockd/clntlock.c
@@ -69,6 +69,7 @@ struct nlm_host *nlmclnt_init(const struct nlmclnt_initdata *nlm_init)
if (host->h_rpcclnt == NULL && nlm_bind_host(host) == NULL)
goto out_nobind;
+ host->h_nlmclnt_ops = nlm_init->nlmclnt_ops;
return host;
out_nobind:
nlmclnt_release_host(host);
diff --git a/fs/lockd/clntproc.c b/fs/lockd/clntproc.c
index 112952037933..066ac313ae5c 100644
--- a/fs/lockd/clntproc.c
+++ b/fs/lockd/clntproc.c
@@ -150,17 +150,22 @@ static void nlmclnt_release_lockargs(struct nlm_rqst *req)
* @host: address of a valid nlm_host context representing the NLM server
* @cmd: fcntl-style file lock operation to perform
* @fl: address of arguments for the lock operation
+ * @data: address of data to be sent to callback operations
*
*/
-int nlmclnt_proc(struct nlm_host *host, int cmd, struct file_lock *fl)
+int nlmclnt_proc(struct nlm_host *host, int cmd, struct file_lock *fl, void *data)
{
struct nlm_rqst *call;
int status;
+ const struct nlmclnt_operations *nlmclnt_ops = host->h_nlmclnt_ops;
call = nlm_alloc_call(host);
if (call == NULL)
return -ENOMEM;
+ if (nlmclnt_ops && nlmclnt_ops->nlmclnt_alloc_call)
+ nlmclnt_ops->nlmclnt_alloc_call(data);
+
nlmclnt_locks_init_private(fl, host);
if (!fl->fl_u.nfs_fl.owner) {
/* lockowner allocation has failed */
@@ -169,6 +174,7 @@ int nlmclnt_proc(struct nlm_host *host, int cmd, struct file_lock *fl)
}
/* Set up the argument struct */
nlmclnt_setlockargs(call, fl);
+ call->a_callback_data = data;
if (IS_SETLK(cmd) || IS_SETLKW(cmd)) {
if (fl->fl_type != F_UNLCK) {
@@ -214,8 +220,12 @@ struct nlm_rqst *nlm_alloc_call(struct nlm_host *host)
void nlmclnt_release_call(struct nlm_rqst *call)
{
+ const struct nlmclnt_operations *nlmclnt_ops = call->a_host->h_nlmclnt_ops;
+
if (!atomic_dec_and_test(&call->a_count))
return;
+ if (nlmclnt_ops && nlmclnt_ops->nlmclnt_release_call)
+ nlmclnt_ops->nlmclnt_release_call(call->a_callback_data);
nlmclnt_release_host(call->a_host);
nlmclnt_release_lockargs(call);
kfree(call);
@@ -687,6 +697,19 @@ out:
return status;
}
+static void nlmclnt_unlock_prepare(struct rpc_task *task, void *data)
+{
+ struct nlm_rqst *req = data;
+ const struct nlmclnt_operations *nlmclnt_ops = req->a_host->h_nlmclnt_ops;
+ bool defer_call = false;
+
+ if (nlmclnt_ops && nlmclnt_ops->nlmclnt_unlock_prepare)
+ defer_call = nlmclnt_ops->nlmclnt_unlock_prepare(task, req->a_callback_data);
+
+ if (!defer_call)
+ rpc_call_start(task);
+}
+
static void nlmclnt_unlock_callback(struct rpc_task *task, void *data)
{
struct nlm_rqst *req = data;
@@ -720,6 +743,7 @@ die:
}
static const struct rpc_call_ops nlmclnt_unlock_ops = {
+ .rpc_call_prepare = nlmclnt_unlock_prepare,
.rpc_call_done = nlmclnt_unlock_callback,
.rpc_release = nlmclnt_rpc_release,
};
diff --git a/fs/lockd/svc.c b/fs/lockd/svc.c
index e7c8b9c76e48..5d481e8a1b5d 100644
--- a/fs/lockd/svc.c
+++ b/fs/lockd/svc.c
@@ -132,6 +132,8 @@ lockd(void *vrqstp)
{
int err = 0;
struct svc_rqst *rqstp = vrqstp;
+ struct net *net = &init_net;
+ struct lockd_net *ln = net_generic(net, lockd_net_id);
/* try_to_freeze() is called from svc_recv() */
set_freezable();
@@ -176,6 +178,8 @@ lockd(void *vrqstp)
if (nlmsvc_ops)
nlmsvc_invalidate_all();
nlm_shutdown_hosts();
+ cancel_delayed_work_sync(&ln->grace_period_end);
+ locks_end_grace(&ln->lockd_manager);
return 0;
}
@@ -270,8 +274,6 @@ static void lockd_down_net(struct svc_serv *serv, struct net *net)
if (ln->nlmsvc_users) {
if (--ln->nlmsvc_users == 0) {
nlm_shutdown_hosts_net(net);
- cancel_delayed_work_sync(&ln->grace_period_end);
- locks_end_grace(&ln->lockd_manager);
svc_shutdown_net(serv, net);
dprintk("lockd_down_net: per-net data destroyed; net=%p\n", net);
}
diff --git a/fs/lockd/svclock.c b/fs/lockd/svclock.c
index 5581e020644b..3507c80d1d4b 100644
--- a/fs/lockd/svclock.c
+++ b/fs/lockd/svclock.c
@@ -870,15 +870,15 @@ nlmsvc_grant_reply(struct nlm_cookie *cookie, __be32 status)
if (!(block = nlmsvc_find_block(cookie)))
return;
- if (block) {
- if (status == nlm_lck_denied_grace_period) {
- /* Try again in a couple of seconds */
- nlmsvc_insert_block(block, 10 * HZ);
- } else {
- /* Lock is now held by client, or has been rejected.
- * In both cases, the block should be removed. */
- nlmsvc_unlink_block(block);
- }
+ if (status == nlm_lck_denied_grace_period) {
+ /* Try again in a couple of seconds */
+ nlmsvc_insert_block(block, 10 * HZ);
+ } else {
+ /*
+ * Lock is now held by client, or has been rejected.
+ * In both cases, the block should be removed.
+ */
+ nlmsvc_unlink_block(block);
}
nlmsvc_release_block(block);
}
diff --git a/fs/locks.c b/fs/locks.c
index 26811321d39b..af2031a1fcff 100644
--- a/fs/locks.c
+++ b/fs/locks.c
@@ -2504,7 +2504,7 @@ locks_remove_flock(struct file *filp, struct file_lock_context *flctx)
.fl_owner = filp,
.fl_pid = current->tgid,
.fl_file = filp,
- .fl_flags = FL_FLOCK,
+ .fl_flags = FL_FLOCK | FL_CLOSE,
.fl_type = F_UNLCK,
.fl_end = OFFSET_MAX,
};
diff --git a/fs/namei.c b/fs/namei.c
index 7286f87ce863..6571a5f5112e 100644
--- a/fs/namei.c
+++ b/fs/namei.c
@@ -2142,7 +2142,6 @@ OK:
static const char *path_init(struct nameidata *nd, unsigned flags)
{
- int retval = 0;
const char *s = nd->name->name;
if (!*s)
@@ -2154,13 +2153,8 @@ static const char *path_init(struct nameidata *nd, unsigned flags)
if (flags & LOOKUP_ROOT) {
struct dentry *root = nd->root.dentry;
struct inode *inode = root->d_inode;
- if (*s) {
- if (!d_can_lookup(root))
- return ERR_PTR(-ENOTDIR);
- retval = inode_permission(inode, MAY_EXEC);
- if (retval)
- return ERR_PTR(retval);
- }
+ if (*s && unlikely(!d_can_lookup(root)))
+ return ERR_PTR(-ENOTDIR);
nd->path = nd->root;
nd->inode = inode;
if (flags & LOOKUP_RCU) {
@@ -2258,6 +2252,35 @@ static inline int lookup_last(struct nameidata *nd)
return walk_component(nd, 0);
}
+static int handle_lookup_down(struct nameidata *nd)
+{
+ struct path path = nd->path;
+ struct inode *inode = nd->inode;
+ unsigned seq = nd->seq;
+ int err;
+
+ if (nd->flags & LOOKUP_RCU) {
+ /*
+ * don't bother with unlazy_walk on failure - we are
+ * at the very beginning of walk, so we lose nothing
+ * if we simply redo everything in non-RCU mode
+ */
+ if (unlikely(!__follow_mount_rcu(nd, &path, &inode, &seq)))
+ return -ECHILD;
+ } else {
+ dget(path.dentry);
+ err = follow_managed(&path, nd);
+ if (unlikely(err < 0))
+ return err;
+ inode = d_backing_inode(path.dentry);
+ seq = 0;
+ }
+ path_to_nameidata(&path, nd);
+ nd->inode = inode;
+ nd->seq = seq;
+ return 0;
+}
+
/* Returns 0 and nd will be valid on success; Retuns error, otherwise. */
static int path_lookupat(struct nameidata *nd, unsigned flags, struct path *path)
{
@@ -2266,6 +2289,15 @@ static int path_lookupat(struct nameidata *nd, unsigned flags, struct path *path
if (IS_ERR(s))
return PTR_ERR(s);
+
+ if (unlikely(flags & LOOKUP_DOWN)) {
+ err = handle_lookup_down(nd);
+ if (unlikely(err < 0)) {
+ terminate_walk(nd);
+ return err;
+ }
+ }
+
while (!(err = link_path_walk(s, nd))
&& ((err = lookup_last(nd)) > 0)) {
s = trailing_symlink(nd);
diff --git a/fs/namespace.c b/fs/namespace.c
index b3b115bd4e1e..8bd3e4d448b9 100644
--- a/fs/namespace.c
+++ b/fs/namespace.c
@@ -3462,8 +3462,9 @@ static void mntns_put(struct ns_common *ns)
static int mntns_install(struct nsproxy *nsproxy, struct ns_common *ns)
{
struct fs_struct *fs = current->fs;
- struct mnt_namespace *mnt_ns = to_mnt_ns(ns);
+ struct mnt_namespace *mnt_ns = to_mnt_ns(ns), *old_mnt_ns;
struct path root;
+ int err;
if (!ns_capable(mnt_ns->user_ns, CAP_SYS_ADMIN) ||
!ns_capable(current_user_ns(), CAP_SYS_CHROOT) ||
@@ -3474,15 +3475,18 @@ static int mntns_install(struct nsproxy *nsproxy, struct ns_common *ns)
return -EINVAL;
get_mnt_ns(mnt_ns);
- put_mnt_ns(nsproxy->mnt_ns);
+ old_mnt_ns = nsproxy->mnt_ns;
nsproxy->mnt_ns = mnt_ns;
/* Find the root */
- root.mnt = &mnt_ns->root->mnt;
- root.dentry = mnt_ns->root->mnt.mnt_root;
- path_get(&root);
- while(d_mountpoint(root.dentry) && follow_down_one(&root))
- ;
+ err = vfs_path_lookup(mnt_ns->root->mnt.mnt_root, &mnt_ns->root->mnt,
+ "/", LOOKUP_DOWN, &root);
+ if (err) {
+ /* revert to old namespace */
+ nsproxy->mnt_ns = old_mnt_ns;
+ put_mnt_ns(mnt_ns);
+ return err;
+ }
/* Update the pwd and root */
set_fs_pwd(fs, &root);
diff --git a/fs/nfs/Kconfig b/fs/nfs/Kconfig
index f31fd0dd92c6..69d02cf8cf37 100644
--- a/fs/nfs/Kconfig
+++ b/fs/nfs/Kconfig
@@ -123,11 +123,6 @@ config PNFS_BLOCK
depends on NFS_V4_1 && BLK_DEV_DM
default NFS_V4
-config PNFS_OBJLAYOUT
- tristate
- depends on NFS_V4_1 && SCSI_OSD_ULD
- default NFS_V4
-
config PNFS_FLEXFILE_LAYOUT
tristate
depends on NFS_V4_1 && NFS_V3
diff --git a/fs/nfs/Makefile b/fs/nfs/Makefile
index 6abdda209642..98f4e5728a67 100644
--- a/fs/nfs/Makefile
+++ b/fs/nfs/Makefile
@@ -31,6 +31,5 @@ nfsv4-$(CONFIG_NFS_V4_1) += pnfs.o pnfs_dev.o pnfs_nfs.o
nfsv4-$(CONFIG_NFS_V4_2) += nfs42proc.o
obj-$(CONFIG_PNFS_FILE_LAYOUT) += filelayout/
-obj-$(CONFIG_PNFS_OBJLAYOUT) += objlayout/
obj-$(CONFIG_PNFS_BLOCK) += blocklayout/
obj-$(CONFIG_PNFS_FLEXFILE_LAYOUT) += flexfilelayout/
diff --git a/fs/nfs/callback.c b/fs/nfs/callback.c
index 773774531aff..73a1f928226c 100644
--- a/fs/nfs/callback.c
+++ b/fs/nfs/callback.c
@@ -76,7 +76,10 @@ nfs4_callback_svc(void *vrqstp)
set_freezable();
- while (!kthread_should_stop()) {
+ while (!kthread_freezable_should_stop(NULL)) {
+
+ if (signal_pending(current))
+ flush_signals(current);
/*
* Listen for a request on the socket
*/
@@ -85,6 +88,8 @@ nfs4_callback_svc(void *vrqstp)
continue;
svc_process(rqstp);
}
+ svc_exit_thread(rqstp);
+ module_put_and_exit(0);
return 0;
}
@@ -103,9 +108,10 @@ nfs41_callback_svc(void *vrqstp)
set_freezable();
- while (!kthread_should_stop()) {
- if (try_to_freeze())
- continue;
+ while (!kthread_freezable_should_stop(NULL)) {
+
+ if (signal_pending(current))
+ flush_signals(current);
prepare_to_wait(&serv->sv_cb_waitq, &wq, TASK_INTERRUPTIBLE);
spin_lock_bh(&serv->sv_cb_lock);
@@ -121,11 +127,13 @@ nfs41_callback_svc(void *vrqstp)
error);
} else {
spin_unlock_bh(&serv->sv_cb_lock);
- schedule();
+ if (!kthread_should_stop())
+ schedule();
finish_wait(&serv->sv_cb_waitq, &wq);
}
- flush_signals(current);
}
+ svc_exit_thread(rqstp);
+ module_put_and_exit(0);
return 0;
}
@@ -221,14 +229,14 @@ err_bind:
static struct svc_serv_ops nfs40_cb_sv_ops = {
.svo_function = nfs4_callback_svc,
.svo_enqueue_xprt = svc_xprt_do_enqueue,
- .svo_setup = svc_set_num_threads,
+ .svo_setup = svc_set_num_threads_sync,
.svo_module = THIS_MODULE,
};
#if defined(CONFIG_NFS_V4_1)
static struct svc_serv_ops nfs41_cb_sv_ops = {
.svo_function = nfs41_callback_svc,
.svo_enqueue_xprt = svc_xprt_do_enqueue,
- .svo_setup = svc_set_num_threads,
+ .svo_setup = svc_set_num_threads_sync,
.svo_module = THIS_MODULE,
};
@@ -280,7 +288,7 @@ static struct svc_serv *nfs_callback_create_svc(int minorversion)
printk(KERN_WARNING "nfs_callback_create_svc: no kthread, %d users??\n",
cb_info->users);
- serv = svc_create(&nfs4_callback_program, NFS4_CALLBACK_BUFSIZE, sv_ops);
+ serv = svc_create_pooled(&nfs4_callback_program, NFS4_CALLBACK_BUFSIZE, sv_ops);
if (!serv) {
printk(KERN_ERR "nfs_callback_create_svc: create service failed\n");
return ERR_PTR(-ENOMEM);
diff --git a/fs/nfs/callback_proc.c b/fs/nfs/callback_proc.c
index f073a6d2c6a5..52479f180ea1 100644
--- a/fs/nfs/callback_proc.c
+++ b/fs/nfs/callback_proc.c
@@ -131,10 +131,11 @@ restart:
if (!inode)
continue;
if (!nfs_sb_active(inode->i_sb)) {
- rcu_read_lock();
+ rcu_read_unlock();
spin_unlock(&clp->cl_lock);
iput(inode);
spin_lock(&clp->cl_lock);
+ rcu_read_lock();
goto restart;
}
return inode;
@@ -170,10 +171,11 @@ restart:
if (!inode)
continue;
if (!nfs_sb_active(inode->i_sb)) {
- rcu_read_lock();
+ rcu_read_unlock();
spin_unlock(&clp->cl_lock);
iput(inode);
spin_lock(&clp->cl_lock);
+ rcu_read_lock();
goto restart;
}
return inode;
@@ -317,31 +319,18 @@ static u32 initiate_bulk_draining(struct nfs_client *clp,
static u32 do_callback_layoutrecall(struct nfs_client *clp,
struct cb_layoutrecallargs *args)
{
- u32 res;
-
- dprintk("%s enter, type=%i\n", __func__, args->cbl_recall_type);
if (args->cbl_recall_type == RETURN_FILE)
- res = initiate_file_draining(clp, args);
- else
- res = initiate_bulk_draining(clp, args);
- dprintk("%s returning %i\n", __func__, res);
- return res;
-
+ return initiate_file_draining(clp, args);
+ return initiate_bulk_draining(clp, args);
}
__be32 nfs4_callback_layoutrecall(struct cb_layoutrecallargs *args,
void *dummy, struct cb_process_state *cps)
{
- u32 res;
-
- dprintk("%s: -->\n", __func__);
+ u32 res = NFS4ERR_OP_NOT_IN_SESSION;
if (cps->clp)
res = do_callback_layoutrecall(cps->clp, args);
- else
- res = NFS4ERR_OP_NOT_IN_SESSION;
-
- dprintk("%s: exit with status = %d\n", __func__, res);
return cpu_to_be32(res);
}
@@ -364,8 +353,6 @@ __be32 nfs4_callback_devicenotify(struct cb_devicenotifyargs *args,
struct nfs_client *clp = cps->clp;
struct nfs_server *server = NULL;
- dprintk("%s: -->\n", __func__);
-
if (!clp) {
res = cpu_to_be32(NFS4ERR_OP_NOT_IN_SESSION);
goto out;
@@ -384,8 +371,6 @@ __be32 nfs4_callback_devicenotify(struct cb_devicenotifyargs *args,
goto found;
}
rcu_read_unlock();
- dprintk("%s: layout type %u not found\n",
- __func__, dev->cbd_layout_type);
continue;
}
@@ -395,8 +380,6 @@ __be32 nfs4_callback_devicenotify(struct cb_devicenotifyargs *args,
out:
kfree(args->devs);
- dprintk("%s: exit with status = %u\n",
- __func__, be32_to_cpu(res));
return res;
}
@@ -417,16 +400,11 @@ static __be32
validate_seqid(const struct nfs4_slot_table *tbl, const struct nfs4_slot *slot,
const struct cb_sequenceargs * args)
{
- dprintk("%s enter. slotid %u seqid %u, slot table seqid: %u\n",
- __func__, args->csa_slotid, args->csa_sequenceid, slot->seq_nr);
-
if (args->csa_slotid > tbl->server_highest_slotid)
return htonl(NFS4ERR_BADSLOT);
/* Replay */
if (args->csa_sequenceid == slot->seq_nr) {
- dprintk("%s seqid %u is a replay\n",
- __func__, args->csa_sequenceid);
if (nfs4_test_locked_slot(tbl, slot->slot_nr))
return htonl(NFS4ERR_DELAY);
/* Signal process_op to set this error on next op */
@@ -480,15 +458,6 @@ static bool referring_call_exists(struct nfs_client *clp,
for (j = 0; j < rclist->rcl_nrefcalls; j++) {
ref = &rclist->rcl_refcalls[j];
-
- dprintk("%s: sessionid %x:%x:%x:%x sequenceid %u "
- "slotid %u\n", __func__,
- ((u32 *)&rclist->rcl_sessionid.data)[0],
- ((u32 *)&rclist->rcl_sessionid.data)[1],
- ((u32 *)&rclist->rcl_sessionid.data)[2],
- ((u32 *)&rclist->rcl_sessionid.data)[3],
- ref->rc_sequenceid, ref->rc_slotid);
-
status = nfs4_slot_wait_on_seqid(tbl, ref->rc_slotid,
ref->rc_sequenceid, HZ >> 1) < 0;
if (status)
@@ -593,8 +562,6 @@ out:
res->csr_status = status;
trace_nfs4_cb_sequence(args, res, status);
- dprintk("%s: exit with status = %d res->csr_status %d\n", __func__,
- ntohl(status), ntohl(res->csr_status));
return status;
}
diff --git a/fs/nfs/callback_xdr.c b/fs/nfs/callback_xdr.c
index d051fc3583a9..c14758e08d73 100644
--- a/fs/nfs/callback_xdr.c
+++ b/fs/nfs/callback_xdr.c
@@ -171,8 +171,6 @@ static __be32 decode_compound_hdr_arg(struct xdr_stream *xdr, struct cb_compound
return htonl(NFS4ERR_MINOR_VERS_MISMATCH);
}
hdr->nops = ntohl(*p);
- dprintk("%s: minorversion %d nops %d\n", __func__,
- hdr->minorversion, hdr->nops);
return 0;
}
@@ -192,11 +190,8 @@ static __be32 decode_getattr_args(struct svc_rqst *rqstp, struct xdr_stream *xdr
status = decode_fh(xdr, &args->fh);
if (unlikely(status != 0))
- goto out;
- status = decode_bitmap(xdr, args->bitmap);
-out:
- dprintk("%s: exit with status = %d\n", __func__, ntohl(status));
- return status;
+ return status;
+ return decode_bitmap(xdr, args->bitmap);
}
static __be32 decode_recall_args(struct svc_rqst *rqstp, struct xdr_stream *xdr, struct cb_recallargs *args)
@@ -206,17 +201,12 @@ static __be32 decode_recall_args(struct svc_rqst *rqstp, struct xdr_stream *xdr,
status = decode_delegation_stateid(xdr, &args->stateid);
if (unlikely(status != 0))
- goto out;
+ return status;
p = read_buf(xdr, 4);
- if (unlikely(p == NULL)) {
- status = htonl(NFS4ERR_RESOURCE);
- goto out;
- }
+ if (unlikely(p == NULL))
+ return htonl(NFS4ERR_RESOURCE);
args->truncate = ntohl(*p);
- status = decode_fh(xdr, &args->fh);
-out:
- dprintk("%s: exit with status = %d\n", __func__, ntohl(status));
- return status;
+ return decode_fh(xdr, &args->fh);
}
#if defined(CONFIG_NFS_V4_1)
@@ -235,10 +225,8 @@ static __be32 decode_layoutrecall_args(struct svc_rqst *rqstp,
uint32_t iomode;
p = read_buf(xdr, 4 * sizeof(uint32_t));
- if (unlikely(p == NULL)) {
- status = htonl(NFS4ERR_BADXDR);
- goto out;
- }
+ if (unlikely(p == NULL))
+ return htonl(NFS4ERR_BADXDR);
args->cbl_layout_type = ntohl(*p++);
/* Depite the spec's xdr, iomode really belongs in the FILE switch,
@@ -252,37 +240,23 @@ static __be32 decode_layoutrecall_args(struct svc_rqst *rqstp,
args->cbl_range.iomode = iomode;
status = decode_fh(xdr, &args->cbl_fh);
if (unlikely(status != 0))
- goto out;
+ return status;
p = read_buf(xdr, 2 * sizeof(uint64_t));
- if (unlikely(p == NULL)) {
- status = htonl(NFS4ERR_BADXDR);
- goto out;
- }
+ if (unlikely(p == NULL))
+ return htonl(NFS4ERR_BADXDR);
p = xdr_decode_hyper(p, &args->cbl_range.offset);
p = xdr_decode_hyper(p, &args->cbl_range.length);
- status = decode_layout_stateid(xdr, &args->cbl_stateid);
- if (unlikely(status != 0))
- goto out;
+ return decode_layout_stateid(xdr, &args->cbl_stateid);
} else if (args->cbl_recall_type == RETURN_FSID) {
p = read_buf(xdr, 2 * sizeof(uint64_t));
- if (unlikely(p == NULL)) {
- status = htonl(NFS4ERR_BADXDR);
- goto out;
- }
+ if (unlikely(p == NULL))
+ return htonl(NFS4ERR_BADXDR);
p = xdr_decode_hyper(p, &args->cbl_fsid.major);
p = xdr_decode_hyper(p, &args->cbl_fsid.minor);
- } else if (args->cbl_recall_type != RETURN_ALL) {
- status = htonl(NFS4ERR_BADXDR);
- goto out;
- }
- dprintk("%s: ltype 0x%x iomode %d changed %d recall_type %d\n",
- __func__,
- args->cbl_layout_type, iomode,
- args->cbl_layoutchanged, args->cbl_recall_type);
-out:
- dprintk("%s: exit with status = %d\n", __func__, ntohl(status));
- return status;
+ } else if (args->cbl_recall_type != RETURN_ALL)
+ return htonl(NFS4ERR_BADXDR);
+ return 0;
}
static
@@ -437,12 +411,11 @@ static __be32 decode_cb_sequence_args(struct svc_rqst *rqstp,
status = decode_sessionid(xdr, &args->csa_sessionid);
if (status)
- goto out;
+ return status;
- status = htonl(NFS4ERR_RESOURCE);
p = read_buf(xdr, 5 * sizeof(uint32_t));
if (unlikely(p == NULL))
- goto out;
+ return htonl(NFS4ERR_RESOURCE);
args->csa_addr = svc_addr(rqstp);
args->csa_sequenceid = ntohl(*p++);
@@ -456,7 +429,7 @@ static __be32 decode_cb_sequence_args(struct svc_rqst *rqstp,
sizeof(*args->csa_rclists),
GFP_KERNEL);
if (unlikely(args->csa_rclists == NULL))
- goto out;
+ return htonl(NFS4ERR_RESOURCE);
for (i = 0; i < args->csa_nrclists; i++) {
status = decode_rc_list(xdr, &args->csa_rclists[i]);
@@ -466,27 +439,13 @@ static __be32 decode_cb_sequence_args(struct svc_rqst *rqstp,
}
}
}
- status = 0;
-
- dprintk("%s: sessionid %x:%x:%x:%x sequenceid %u slotid %u "
- "highestslotid %u cachethis %d nrclists %u\n",
- __func__,
- ((u32 *)&args->csa_sessionid)[0],
- ((u32 *)&args->csa_sessionid)[1],
- ((u32 *)&args->csa_sessionid)[2],
- ((u32 *)&args->csa_sessionid)[3],
- args->csa_sequenceid, args->csa_slotid,
- args->csa_highestslotid, args->csa_cachethis,
- args->csa_nrclists);
-out:
- dprintk("%s: exit with status = %d\n", __func__, ntohl(status));
- return status;
+ return 0;
out_free:
for (i = 0; i < args->csa_nrclists; i++)
kfree(args->csa_rclists[i].rcl_refcalls);
kfree(args->csa_rclists);
- goto out;
+ return status;
}
static __be32 decode_recallany_args(struct svc_rqst *rqstp,
@@ -557,11 +516,8 @@ static __be32 decode_notify_lock_args(struct svc_rqst *rqstp, struct xdr_stream
status = decode_fh(xdr, &args->cbnl_fh);
if (unlikely(status != 0))
- goto out;
- status = decode_lockowner(xdr, args);
-out:
- dprintk("%s: exit with status = %d\n", __func__, ntohl(status));
- return status;
+ return status;
+ return decode_lockowner(xdr, args);
}
#endif /* CONFIG_NFS_V4_1 */
@@ -707,7 +663,6 @@ static __be32 encode_getattr_res(struct svc_rqst *rqstp, struct xdr_stream *xdr,
status = encode_attr_mtime(xdr, res->bitmap, &res->mtime);
*savep = htonl((unsigned int)((char *)xdr->p - (char *)(savep+1)));
out:
- dprintk("%s: exit with status = %d\n", __func__, ntohl(status));
return status;
}
@@ -734,11 +689,11 @@ static __be32 encode_cb_sequence_res(struct svc_rqst *rqstp,
__be32 status = res->csr_status;
if (unlikely(status != 0))
- goto out;
+ return status;
status = encode_sessionid(xdr, &res->csr_sessionid);
if (status)
- goto out;
+ return status;
p = xdr_reserve_space(xdr, 4 * sizeof(uint32_t));
if (unlikely(p == NULL))
@@ -748,9 +703,7 @@ static __be32 encode_cb_sequence_res(struct svc_rqst *rqstp,
*p++ = htonl(res->csr_slotid);
*p++ = htonl(res->csr_highestslotid);
*p++ = htonl(res->csr_target_highestslotid);
-out:
- dprintk("%s: exit with status = %d\n", __func__, ntohl(status));
- return status;
+ return 0;
}
static __be32
@@ -871,14 +824,10 @@ static __be32 process_op(int nop, struct svc_rqst *rqstp,
long maxlen;
__be32 res;
- dprintk("%s: start\n", __func__);
status = decode_op_hdr(xdr_in, &op_nr);
if (unlikely(status))
return status;
- dprintk("%s: minorversion=%d nop=%d op_nr=%u\n",
- __func__, cps->minorversion, nop, op_nr);
-
switch (cps->minorversion) {
case 0:
status = preprocess_nfs4_op(op_nr, &op);
@@ -917,7 +866,6 @@ encode_hdr:
return res;
if (op->encode_res != NULL && status == 0)
status = op->encode_res(rqstp, xdr_out, resp);
- dprintk("%s: done, status = %d\n", __func__, ntohl(status));
return status;
}
@@ -937,8 +885,6 @@ static __be32 nfs4_callback_compound(struct svc_rqst *rqstp, void *argp, void *r
};
unsigned int nops = 0;
- dprintk("%s: start\n", __func__);
-
xdr_init_decode(&xdr_in, &rqstp->rq_arg, rqstp->rq_arg.head[0].iov_base);
p = (__be32*)((char *)rqstp->rq_res.head[0].iov_base + rqstp->rq_res.head[0].iov_len);
@@ -977,7 +923,6 @@ static __be32 nfs4_callback_compound(struct svc_rqst *rqstp, void *argp, void *r
*hdr_res.nops = htonl(nops);
nfs4_cb_free_slot(&cps);
nfs_put_client(cps.clp);
- dprintk("%s: done, status = %u\n", __func__, ntohl(status));
return rpc_success;
out_invalidcred:
diff --git a/fs/nfs/client.c b/fs/nfs/client.c
index 04d15a0045e3..ee5ddbd36088 100644
--- a/fs/nfs/client.c
+++ b/fs/nfs/client.c
@@ -218,6 +218,7 @@ static void nfs_cb_idr_remove_locked(struct nfs_client *clp)
static void pnfs_init_server(struct nfs_server *server)
{
rpc_init_wait_queue(&server->roc_rpcwaitq, "pNFS ROC");
+ rpc_init_wait_queue(&server->uoc_rpcwaitq, "NFS UOC");
}
#else
@@ -240,8 +241,6 @@ static void pnfs_init_server(struct nfs_server *server)
*/
void nfs_free_client(struct nfs_client *clp)
{
- dprintk("--> nfs_free_client(%u)\n", clp->rpc_ops->version);
-
nfs_fscache_release_client_cookie(clp);
/* -EIO all pending I/O */
@@ -256,8 +255,6 @@ void nfs_free_client(struct nfs_client *clp)
kfree(clp->cl_hostname);
kfree(clp->cl_acceptor);
kfree(clp);
-
- dprintk("<-- nfs_free_client()\n");
}
EXPORT_SYMBOL_GPL(nfs_free_client);
@@ -271,7 +268,6 @@ void nfs_put_client(struct nfs_client *clp)
if (!clp)
return;
- dprintk("--> nfs_put_client({%d})\n", atomic_read(&clp->cl_count));
nn = net_generic(clp->cl_net, nfs_net_id);
if (atomic_dec_and_lock(&clp->cl_count, &nn->nfs_client_lock)) {
@@ -382,9 +378,6 @@ nfs_found_client(const struct nfs_client_initdata *cl_init,
}
smp_rmb();
-
- dprintk("<-- %s found nfs_client %p for %s\n",
- __func__, clp, cl_init->hostname ?: "");
return clp;
}
@@ -403,9 +396,6 @@ struct nfs_client *nfs_get_client(const struct nfs_client_initdata *cl_init)
return NULL;
}
- dprintk("--> nfs_get_client(%s,v%u)\n",
- cl_init->hostname, rpc_ops->version);
-
/* see if the client already exists */
do {
spin_lock(&nn->nfs_client_lock);
@@ -430,8 +420,6 @@ struct nfs_client *nfs_get_client(const struct nfs_client_initdata *cl_init)
new = rpc_ops->alloc_client(cl_init);
} while (!IS_ERR(new));
- dprintk("<-- nfs_get_client() Failed to find %s (%ld)\n",
- cl_init->hostname, PTR_ERR(new));
return new;
}
EXPORT_SYMBOL_GPL(nfs_get_client);
@@ -558,6 +546,7 @@ static int nfs_start_lockd(struct nfs_server *server)
.noresvport = server->flags & NFS_MOUNT_NORESVPORT ?
1 : 0,
.net = clp->cl_net,
+ .nlmclnt_ops = clp->cl_nfs_mod->rpc_ops->nlmclnt_ops,
};
if (nlm_init.nfs_version > 3)
@@ -624,27 +613,21 @@ struct nfs_client *nfs_init_client(struct nfs_client *clp,
{
int error;
- if (clp->cl_cons_state == NFS_CS_READY) {
- /* the client is already initialised */
- dprintk("<-- nfs_init_client() = 0 [already %p]\n", clp);
+ /* the client is already initialised */
+ if (clp->cl_cons_state == NFS_CS_READY)
return clp;
- }
/*
* Create a client RPC handle for doing FSSTAT with UNIX auth only
* - RFC 2623, sec 2.3.2
*/
error = nfs_create_rpc_client(clp, cl_init, RPC_AUTH_UNIX);
- if (error < 0)
- goto error;
- nfs_mark_client_ready(clp, NFS_CS_READY);
+ nfs_mark_client_ready(clp, error == 0 ? NFS_CS_READY : error);
+ if (error < 0) {
+ nfs_put_client(clp);
+ clp = ERR_PTR(error);
+ }
return clp;
-
-error:
- nfs_mark_client_ready(clp, error);
- nfs_put_client(clp);
- dprintk("<-- nfs_init_client() = xerror %d\n", error);
- return ERR_PTR(error);
}
EXPORT_SYMBOL_GPL(nfs_init_client);
@@ -668,8 +651,6 @@ static int nfs_init_server(struct nfs_server *server,
struct nfs_client *clp;
int error;
- dprintk("--> nfs_init_server()\n");
-
nfs_init_timeout_values(&timeparms, data->nfs_server.protocol,
data->timeo, data->retrans);
if (data->flags & NFS_MOUNT_NORESVPORT)
@@ -677,10 +658,8 @@ static int nfs_init_server(struct nfs_server *server,
/* Allocate or find a client reference we can use */
clp = nfs_get_client(&cl_init);
- if (IS_ERR(clp)) {
- dprintk("<-- nfs_init_server() = error %ld\n", PTR_ERR(clp));
+ if (IS_ERR(clp))
return PTR_ERR(clp);
- }
server->nfs_client = clp;
@@ -725,13 +704,11 @@ static int nfs_init_server(struct nfs_server *server,
server->mountd_protocol = data->mount_server.protocol;
server->namelen = data->namlen;
- dprintk("<-- nfs_init_server() = 0 [new %p]\n", clp);
return 0;
error:
server->nfs_client = NULL;
nfs_put_client(clp);
- dprintk("<-- nfs_init_server() = xerror %d\n", error);
return error;
}
@@ -798,12 +775,10 @@ int nfs_probe_fsinfo(struct nfs_server *server, struct nfs_fh *mntfh, struct nfs
struct nfs_client *clp = server->nfs_client;
int error;
- dprintk("--> nfs_probe_fsinfo()\n");
-
if (clp->rpc_ops->set_capabilities != NULL) {
error = clp->rpc_ops->set_capabilities(server, mntfh);
if (error < 0)
- goto out_error;
+ return error;
}
fsinfo.fattr = fattr;
@@ -811,7 +786,7 @@ int nfs_probe_fsinfo(struct nfs_server *server, struct nfs_fh *mntfh, struct nfs
memset(fsinfo.layouttype, 0, sizeof(fsinfo.layouttype));
error = clp->rpc_ops->fsinfo(server, mntfh, &fsinfo);
if (error < 0)
- goto out_error;
+ return error;
nfs_server_set_fsinfo(server, &fsinfo);
@@ -826,12 +801,7 @@ int nfs_probe_fsinfo(struct nfs_server *server, struct nfs_fh *mntfh, struct nfs
server->namelen = pathinfo.max_namelen;
}
- dprintk("<-- nfs_probe_fsinfo() = 0\n");
return 0;
-
-out_error:
- dprintk("nfs_probe_fsinfo: error = %d\n", -error);
- return error;
}
EXPORT_SYMBOL_GPL(nfs_probe_fsinfo);
@@ -927,8 +897,6 @@ EXPORT_SYMBOL_GPL(nfs_alloc_server);
*/
void nfs_free_server(struct nfs_server *server)
{
- dprintk("--> nfs_free_server()\n");
-
nfs_server_remove_lists(server);
if (server->destroy != NULL)
@@ -946,7 +914,6 @@ void nfs_free_server(struct nfs_server *server)
nfs_free_iostats(server->io_stats);
kfree(server);
nfs_release_automount_timer();
- dprintk("<-- nfs_free_server()\n");
}
EXPORT_SYMBOL_GPL(nfs_free_server);
@@ -1026,10 +993,6 @@ struct nfs_server *nfs_clone_server(struct nfs_server *source,
struct nfs_fattr *fattr_fsinfo;
int error;
- dprintk("--> nfs_clone_server(,%llx:%llx,)\n",
- (unsigned long long) fattr->fsid.major,
- (unsigned long long) fattr->fsid.minor);
-
server = nfs_alloc_server();
if (!server)
return ERR_PTR(-ENOMEM);
@@ -1061,10 +1024,6 @@ struct nfs_server *nfs_clone_server(struct nfs_server *source,
if (server->namelen == 0 || server->namelen > NFS4_MAXNAMLEN)
server->namelen = NFS4_MAXNAMLEN;
- dprintk("Cloned FSID: %llx:%llx\n",
- (unsigned long long) server->fsid.major,
- (unsigned long long) server->fsid.minor);
-
error = nfs_start_lockd(server);
if (error < 0)
goto out_free_server;
@@ -1073,13 +1032,11 @@ struct nfs_server *nfs_clone_server(struct nfs_server *source,
server->mount_time = jiffies;
nfs_free_fattr(fattr_fsinfo);
- dprintk("<-- nfs_clone_server() = %p\n", server);
return server;
out_free_server:
nfs_free_fattr(fattr_fsinfo);
nfs_free_server(server);
- dprintk("<-- nfs_clone_server() = error %d\n", error);
return ERR_PTR(error);
}
EXPORT_SYMBOL_GPL(nfs_clone_server);
diff --git a/fs/nfs/dir.c b/fs/nfs/dir.c
index f92ba8d6c556..32ccd7754f8a 100644
--- a/fs/nfs/dir.c
+++ b/fs/nfs/dir.c
@@ -57,7 +57,7 @@ static void nfs_readdir_clear_array(struct page*);
const struct file_operations nfs_dir_operations = {
.llseek = nfs_llseek_dir,
.read = generic_read_dir,
- .iterate_shared = nfs_readdir,
+ .iterate = nfs_readdir,
.open = nfs_opendir,
.release = nfs_closedir,
.fsync = nfs_fsync_dir,
@@ -145,7 +145,6 @@ struct nfs_cache_array_entry {
};
struct nfs_cache_array {
- atomic_t refcount;
int size;
int eof_index;
u64 last_cookie;
@@ -171,27 +170,6 @@ typedef struct {
} nfs_readdir_descriptor_t;
/*
- * The caller is responsible for calling nfs_readdir_release_array(page)
- */
-static
-struct nfs_cache_array *nfs_readdir_get_array(struct page *page)
-{
- void *ptr;
- if (page == NULL)
- return ERR_PTR(-EIO);
- ptr = kmap(page);
- if (ptr == NULL)
- return ERR_PTR(-ENOMEM);
- return ptr;
-}
-
-static
-void nfs_readdir_release_array(struct page *page)
-{
- kunmap(page);
-}
-
-/*
* we are freeing strings created by nfs_add_to_readdir_array()
*/
static
@@ -201,18 +179,9 @@ void nfs_readdir_clear_array(struct page *page)
int i;
array = kmap_atomic(page);
- if (atomic_dec_and_test(&array->refcount))
- for (i = 0; i < array->size; i++)
- kfree(array->array[i].string.name);
- kunmap_atomic(array);
-}
-
-static bool grab_page(struct page *page)
-{
- struct nfs_cache_array *array = kmap_atomic(page);
- bool res = atomic_inc_not_zero(&array->refcount);
+ for (i = 0; i < array->size; i++)
+ kfree(array->array[i].string.name);
kunmap_atomic(array);
- return res;
}
/*
@@ -239,13 +208,10 @@ int nfs_readdir_make_qstr(struct qstr *string, const char *name, unsigned int le
static
int nfs_readdir_add_to_array(struct nfs_entry *entry, struct page *page)
{
- struct nfs_cache_array *array = nfs_readdir_get_array(page);
+ struct nfs_cache_array *array = kmap(page);
struct nfs_cache_array_entry *cache_entry;
int ret;
- if (IS_ERR(array))
- return PTR_ERR(array);
-
cache_entry = &array->array[array->size];
/* Check that this entry lies within the page bounds */
@@ -264,7 +230,7 @@ int nfs_readdir_add_to_array(struct nfs_entry *entry, struct page *page)
if (entry->eof != 0)
array->eof_index = array->size;
out:
- nfs_readdir_release_array(page);
+ kunmap(page);
return ret;
}
@@ -353,11 +319,7 @@ int nfs_readdir_search_array(nfs_readdir_descriptor_t *desc)
struct nfs_cache_array *array;
int status;
- array = nfs_readdir_get_array(desc->page);
- if (IS_ERR(array)) {
- status = PTR_ERR(array);
- goto out;
- }
+ array = kmap(desc->page);
if (*desc->dir_cookie == 0)
status = nfs_readdir_search_for_pos(array, desc);
@@ -369,8 +331,7 @@ int nfs_readdir_search_array(nfs_readdir_descriptor_t *desc)
desc->current_index += array->size;
desc->page_index++;
}
- nfs_readdir_release_array(desc->page);
-out:
+ kunmap(desc->page);
return status;
}
@@ -606,13 +567,10 @@ int nfs_readdir_page_filler(nfs_readdir_descriptor_t *desc, struct nfs_entry *en
out_nopages:
if (count == 0 || (status == -EBADCOOKIE && entry->eof != 0)) {
- array = nfs_readdir_get_array(page);
- if (!IS_ERR(array)) {
- array->eof_index = array->size;
- status = 0;
- nfs_readdir_release_array(page);
- } else
- status = PTR_ERR(array);
+ array = kmap(page);
+ array->eof_index = array->size;
+ status = 0;
+ kunmap(page);
}
put_page(scratch);
@@ -674,13 +632,8 @@ int nfs_readdir_xdr_to_array(nfs_readdir_descriptor_t *desc, struct page *page,
goto out;
}
- array = nfs_readdir_get_array(page);
- if (IS_ERR(array)) {
- status = PTR_ERR(array);
- goto out_label_free;
- }
+ array = kmap(page);
memset(array, 0, sizeof(struct nfs_cache_array));
- atomic_set(&array->refcount, 1);
array->eof_index = -1;
status = nfs_readdir_alloc_pages(pages, array_size);
@@ -703,8 +656,7 @@ int nfs_readdir_xdr_to_array(nfs_readdir_descriptor_t *desc, struct page *page,
nfs_readdir_free_pages(pages, array_size);
out_release_array:
- nfs_readdir_release_array(page);
-out_label_free:
+ kunmap(page);
nfs4_label_free(entry.label);
out:
nfs_free_fattr(entry.fattr);
@@ -743,7 +695,8 @@ int nfs_readdir_filler(nfs_readdir_descriptor_t *desc, struct page* page)
static
void cache_page_release(nfs_readdir_descriptor_t *desc)
{
- nfs_readdir_clear_array(desc->page);
+ if (!desc->page->mapping)
+ nfs_readdir_clear_array(desc->page);
put_page(desc->page);
desc->page = NULL;
}
@@ -751,16 +704,8 @@ void cache_page_release(nfs_readdir_descriptor_t *desc)
static
struct page *get_cache_page(nfs_readdir_descriptor_t *desc)
{
- struct page *page;
-
- for (;;) {
- page = read_cache_page(desc->file->f_mapping,
+ return read_cache_page(desc->file->f_mapping,
desc->page_index, (filler_t *)nfs_readdir_filler, desc);
- if (IS_ERR(page) || grab_page(page))
- break;
- put_page(page);
- }
- return page;
}
/*
@@ -809,12 +754,7 @@ int nfs_do_filldir(nfs_readdir_descriptor_t *desc)
struct nfs_cache_array *array = NULL;
struct nfs_open_dir_context *ctx = file->private_data;
- array = nfs_readdir_get_array(desc->page);
- if (IS_ERR(array)) {
- res = PTR_ERR(array);
- goto out;
- }
-
+ array = kmap(desc->page);
for (i = desc->cache_entry_index; i < array->size; i++) {
struct nfs_cache_array_entry *ent;
@@ -835,8 +775,7 @@ int nfs_do_filldir(nfs_readdir_descriptor_t *desc)
if (array->eof_index >= 0)
desc->eof = 1;
- nfs_readdir_release_array(desc->page);
-out:
+ kunmap(desc->page);
cache_page_release(desc);
dfprintk(DIRCACHE, "NFS: nfs_do_filldir() filling ended @ cookie %Lu; returning = %d\n",
(unsigned long long)*desc->dir_cookie, res);
@@ -966,11 +905,13 @@ out:
static loff_t nfs_llseek_dir(struct file *filp, loff_t offset, int whence)
{
+ struct inode *inode = file_inode(filp);
struct nfs_open_dir_context *dir_ctx = filp->private_data;
dfprintk(FILE, "NFS: llseek dir(%pD2, %lld, %d)\n",
filp, offset, whence);
+ inode_lock(inode);
switch (whence) {
case 1:
offset += filp->f_pos;
@@ -978,13 +919,16 @@ static loff_t nfs_llseek_dir(struct file *filp, loff_t offset, int whence)
if (offset >= 0)
break;
default:
- return -EINVAL;
+ offset = -EINVAL;
+ goto out;
}
if (offset != filp->f_pos) {
filp->f_pos = offset;
dir_ctx->dir_cookie = 0;
dir_ctx->duped = 0;
}
+out:
+ inode_unlock(inode);
return offset;
}
diff --git a/fs/nfs/direct.c b/fs/nfs/direct.c
index c1b5fed7c863..6fb9fad2d1e6 100644
--- a/fs/nfs/direct.c
+++ b/fs/nfs/direct.c
@@ -392,16 +392,6 @@ static void nfs_direct_complete(struct nfs_direct_req *dreq)
nfs_direct_req_release(dreq);
}
-static void nfs_direct_readpage_release(struct nfs_page *req)
-{
- dprintk("NFS: direct read done (%s/%llu %d@%lld)\n",
- req->wb_context->dentry->d_sb->s_id,
- (unsigned long long)NFS_FILEID(d_inode(req->wb_context->dentry)),
- req->wb_bytes,
- (long long)req_offset(req));
- nfs_release_request(req);
-}
-
static void nfs_direct_read_completion(struct nfs_pgio_header *hdr)
{
unsigned long bytes = 0;
@@ -426,7 +416,7 @@ static void nfs_direct_read_completion(struct nfs_pgio_header *hdr)
set_page_dirty(page);
bytes += req->wb_bytes;
nfs_list_remove_request(req);
- nfs_direct_readpage_release(req);
+ nfs_release_request(req);
}
out_put:
if (put_dreq(dreq))
@@ -700,16 +690,9 @@ static void nfs_direct_commit_complete(struct nfs_commit_data *data)
int status = data->task.tk_status;
nfs_init_cinfo_from_dreq(&cinfo, dreq);
- if (status < 0) {
- dprintk("NFS: %5u commit failed with error %d.\n",
- data->task.tk_pid, status);
- dreq->flags = NFS_ODIRECT_RESCHED_WRITES;
- } else if (nfs_direct_cmp_commit_data_verf(dreq, data)) {
- dprintk("NFS: %5u commit verify failed\n", data->task.tk_pid);
+ if (status < 0 || nfs_direct_cmp_commit_data_verf(dreq, data))
dreq->flags = NFS_ODIRECT_RESCHED_WRITES;
- }
- dprintk("NFS: %5u commit returned %d\n", data->task.tk_pid, status);
while (!list_empty(&data->pages)) {
req = nfs_list_entry(data->pages.next);
nfs_list_remove_request(req);
diff --git a/fs/nfs/file.c b/fs/nfs/file.c
index 668213984d68..5713eb32a45e 100644
--- a/fs/nfs/file.c
+++ b/fs/nfs/file.c
@@ -482,7 +482,7 @@ static int nfs_launder_page(struct page *page)
inode->i_ino, (long long)page_offset(page));
nfs_fscache_wait_on_page_write(nfsi, page);
- return nfs_wb_launder_page(inode, page);
+ return nfs_wb_page(inode, page);
}
static int nfs_swap_activate(struct swap_info_struct *sis, struct file *file,
@@ -697,14 +697,14 @@ do_unlk(struct file *filp, int cmd, struct file_lock *fl, int is_local)
if (!IS_ERR(l_ctx)) {
status = nfs_iocounter_wait(l_ctx);
nfs_put_lock_context(l_ctx);
- if (status < 0)
+ /* NOTE: special case
+ * If we're signalled while cleaning up locks on process exit, we
+ * still need to complete the unlock.
+ */
+ if (status < 0 && !(fl->fl_flags & FL_CLOSE))
return status;
}
- /* NOTE: special case
- * If we're signalled while cleaning up locks on process exit, we
- * still need to complete the unlock.
- */
/*
* Use local locking if mounted with "-onolock" or with appropriate
* "-olocal_lock="
@@ -820,9 +820,23 @@ int nfs_flock(struct file *filp, int cmd, struct file_lock *fl)
if (NFS_SERVER(inode)->flags & NFS_MOUNT_LOCAL_FLOCK)
is_local = 1;
- /* We're simulating flock() locks using posix locks on the server */
- if (fl->fl_type == F_UNLCK)
+ /*
+ * VFS doesn't require the open mode to match a flock() lock's type.
+ * NFS, however, may simulate flock() locking with posix locking which
+ * requires the open mode to match the lock type.
+ */
+ switch (fl->fl_type) {
+ case F_UNLCK:
return do_unlk(filp, cmd, fl, is_local);
+ case F_RDLCK:
+ if (!(filp->f_mode & FMODE_READ))
+ return -EBADF;
+ break;
+ case F_WRLCK:
+ if (!(filp->f_mode & FMODE_WRITE))
+ return -EBADF;
+ }
+
return do_setlk(filp, cmd, fl, is_local);
}
EXPORT_SYMBOL_GPL(nfs_flock);
diff --git a/fs/nfs/filelayout/filelayout.c b/fs/nfs/filelayout/filelayout.c
index acd30baca461..1cf85d65b748 100644
--- a/fs/nfs/filelayout/filelayout.c
+++ b/fs/nfs/filelayout/filelayout.c
@@ -921,11 +921,11 @@ fl_pnfs_update_layout(struct inode *ino,
fl = FILELAYOUT_LSEG(lseg);
status = filelayout_check_deviceid(lo, fl, gfp_flags);
- if (status)
+ if (status) {
+ pnfs_put_lseg(lseg);
lseg = ERR_PTR(status);
+ }
out:
- if (IS_ERR(lseg))
- pnfs_put_lseg(lseg);
return lseg;
}
@@ -933,6 +933,7 @@ static void
filelayout_pg_init_read(struct nfs_pageio_descriptor *pgio,
struct nfs_page *req)
{
+ pnfs_generic_pg_check_layout(pgio);
if (!pgio->pg_lseg) {
pgio->pg_lseg = fl_pnfs_update_layout(pgio->pg_inode,
req->wb_context,
@@ -959,6 +960,7 @@ filelayout_pg_init_write(struct nfs_pageio_descriptor *pgio,
struct nfs_commit_info cinfo;
int status;
+ pnfs_generic_pg_check_layout(pgio);
if (!pgio->pg_lseg) {
pgio->pg_lseg = fl_pnfs_update_layout(pgio->pg_inode,
req->wb_context,
diff --git a/fs/nfs/flexfilelayout/flexfilelayout.c b/fs/nfs/flexfilelayout/flexfilelayout.c
index 42dedf2d625f..f5714ee01000 100644
--- a/fs/nfs/flexfilelayout/flexfilelayout.c
+++ b/fs/nfs/flexfilelayout/flexfilelayout.c
@@ -846,6 +846,7 @@ ff_layout_pg_init_read(struct nfs_pageio_descriptor *pgio,
int ds_idx;
retry:
+ pnfs_generic_pg_check_layout(pgio);
/* Use full layout for now */
if (!pgio->pg_lseg)
ff_layout_pg_get_read(pgio, req, false);
@@ -894,6 +895,7 @@ ff_layout_pg_init_write(struct nfs_pageio_descriptor *pgio,
int status;
retry:
+ pnfs_generic_pg_check_layout(pgio);
if (!pgio->pg_lseg) {
pgio->pg_lseg = pnfs_update_layout(pgio->pg_inode,
req->wb_context,
@@ -1800,16 +1802,16 @@ ff_layout_write_pagelist(struct nfs_pgio_header *hdr, int sync)
ds = nfs4_ff_layout_prepare_ds(lseg, idx, true);
if (!ds)
- return PNFS_NOT_ATTEMPTED;
+ goto out_failed;
ds_clnt = nfs4_ff_find_or_create_ds_client(lseg, idx, ds->ds_clp,
hdr->inode);
if (IS_ERR(ds_clnt))
- return PNFS_NOT_ATTEMPTED;
+ goto out_failed;
ds_cred = ff_layout_get_ds_cred(lseg, idx, hdr->cred);
if (!ds_cred)
- return PNFS_NOT_ATTEMPTED;
+ goto out_failed;
vers = nfs4_ff_layout_ds_version(lseg, idx);
@@ -1839,6 +1841,11 @@ ff_layout_write_pagelist(struct nfs_pgio_header *hdr, int sync)
sync, RPC_TASK_SOFTCONN);
put_rpccred(ds_cred);
return PNFS_ATTEMPTED;
+
+out_failed:
+ if (ff_layout_avoid_mds_available_ds(lseg))
+ return PNFS_TRY_AGAIN;
+ return PNFS_NOT_ATTEMPTED;
}
static u32 calc_ds_index_from_commit(struct pnfs_layout_segment *lseg, u32 i)
@@ -2354,10 +2361,21 @@ ff_layout_prepare_layoutstats(struct nfs42_layoutstat_args *args)
return 0;
}
+static int
+ff_layout_set_layoutdriver(struct nfs_server *server,
+ const struct nfs_fh *dummy)
+{
+#if IS_ENABLED(CONFIG_NFS_V4_2)
+ server->caps |= NFS_CAP_LAYOUTSTATS;
+#endif
+ return 0;
+}
+
static struct pnfs_layoutdriver_type flexfilelayout_type = {
.id = LAYOUT_FLEX_FILES,
.name = "LAYOUT_FLEX_FILES",
.owner = THIS_MODULE,
+ .set_layoutdriver = ff_layout_set_layoutdriver,
.alloc_layout_hdr = ff_layout_alloc_layout_hdr,
.free_layout_hdr = ff_layout_free_layout_hdr,
.alloc_lseg = ff_layout_alloc_lseg,
diff --git a/fs/nfs/flexfilelayout/flexfilelayoutdev.c b/fs/nfs/flexfilelayout/flexfilelayoutdev.c
index 457cfeb1d5c1..6df7a0cf5660 100644
--- a/fs/nfs/flexfilelayout/flexfilelayoutdev.c
+++ b/fs/nfs/flexfilelayout/flexfilelayoutdev.c
@@ -119,7 +119,13 @@ nfs4_ff_alloc_deviceid_node(struct nfs_server *server, struct pnfs_device *pdev,
if (ds_versions[i].wsize > NFS_MAX_FILE_IO_SIZE)
ds_versions[i].wsize = NFS_MAX_FILE_IO_SIZE;
- if (ds_versions[i].version != 3 || ds_versions[i].minor_version != 0) {
+ /*
+ * check for valid major/minor combination.
+ * currently we support dataserver which talk:
+ * v3, v4.0, v4.1, v4.2
+ */
+ if (!((ds_versions[i].version == 3 && ds_versions[i].minor_version == 0) ||
+ (ds_versions[i].version == 4 && ds_versions[i].minor_version < 3))) {
dprintk("%s: [%d] unsupported ds version %d-%d\n", __func__,
i, ds_versions[i].version,
ds_versions[i].minor_version);
@@ -415,7 +421,7 @@ nfs4_ff_layout_prepare_ds(struct pnfs_layout_segment *lseg, u32 ds_idx,
mirror->mirror_ds->ds_versions[0].minor_version);
/* connect success, check rsize/wsize limit */
- if (ds->ds_clp) {
+ if (!status) {
max_payload =
nfs_block_size(rpc_max_payload(ds->ds_clp->cl_rpcclient),
NULL);
diff --git a/fs/nfs/inode.c b/fs/nfs/inode.c
index f489a5a71bd5..1de93ba78dc9 100644
--- a/fs/nfs/inode.c
+++ b/fs/nfs/inode.c
@@ -734,7 +734,10 @@ int nfs_getattr(const struct path *path, struct kstat *stat,
if (need_atime || nfs_need_revalidate_inode(inode)) {
struct nfs_server *server = NFS_SERVER(inode);
- nfs_readdirplus_parent_cache_miss(path->dentry);
+ if (!(server->flags & NFS_MOUNT_NOAC))
+ nfs_readdirplus_parent_cache_miss(path->dentry);
+ else
+ nfs_readdirplus_parent_cache_hit(path->dentry);
err = __nfs_revalidate_inode(server, inode);
} else
nfs_readdirplus_parent_cache_hit(path->dentry);
diff --git a/fs/nfs/internal.h b/fs/nfs/internal.h
index 7b38fedb7e03..e9b4c3320e37 100644
--- a/fs/nfs/internal.h
+++ b/fs/nfs/internal.h
@@ -495,7 +495,6 @@ void nfs_mark_request_commit(struct nfs_page *req,
u32 ds_commit_idx);
int nfs_write_need_commit(struct nfs_pgio_header *);
void nfs_writeback_update_inode(struct nfs_pgio_header *hdr);
-int nfs_commit_file(struct file *file, struct nfs_write_verifier *verf);
int nfs_generic_commit_list(struct inode *inode, struct list_head *head,
int how, struct nfs_commit_info *cinfo);
void nfs_retry_commit(struct list_head *page_list,
@@ -756,9 +755,13 @@ static inline bool nfs_error_is_fatal(int err)
{
switch (err) {
case -ERESTARTSYS:
+ case -EACCES:
+ case -EDQUOT:
+ case -EFBIG:
case -EIO:
case -ENOSPC:
case -EROFS:
+ case -ESTALE:
case -E2BIG:
return true;
default:
diff --git a/fs/nfs/namespace.c b/fs/nfs/namespace.c
index 786f17580582..1a224a33a6c2 100644
--- a/fs/nfs/namespace.c
+++ b/fs/nfs/namespace.c
@@ -143,11 +143,8 @@ struct vfsmount *nfs_d_automount(struct path *path)
struct nfs_fh *fh = NULL;
struct nfs_fattr *fattr = NULL;
- dprintk("--> nfs_d_automount()\n");
-
- mnt = ERR_PTR(-ESTALE);
if (IS_ROOT(path->dentry))
- goto out_nofree;
+ return ERR_PTR(-ESTALE);
mnt = ERR_PTR(-ENOMEM);
fh = nfs_alloc_fhandle();
@@ -155,13 +152,10 @@ struct vfsmount *nfs_d_automount(struct path *path)
if (fh == NULL || fattr == NULL)
goto out;
- dprintk("%s: enter\n", __func__);
-
mnt = server->nfs_client->rpc_ops->submount(server, path->dentry, fh, fattr);
if (IS_ERR(mnt))
goto out;
- dprintk("%s: done, success\n", __func__);
mntget(mnt); /* prevent immediate expiration */
mnt_set_expiry(mnt, &nfs_automount_list);
schedule_delayed_work(&nfs_automount_task, nfs_mountpoint_expiry_timeout);
@@ -169,11 +163,6 @@ struct vfsmount *nfs_d_automount(struct path *path)
out:
nfs_free_fattr(fattr);
nfs_free_fhandle(fh);
-out_nofree:
- if (IS_ERR(mnt))
- dprintk("<-- %s(): error %ld\n", __func__, PTR_ERR(mnt));
- else
- dprintk("<-- %s() = %p\n", __func__, mnt);
return mnt;
}
@@ -248,27 +237,20 @@ struct vfsmount *nfs_do_submount(struct dentry *dentry, struct nfs_fh *fh,
.fattr = fattr,
.authflavor = authflavor,
};
- struct vfsmount *mnt = ERR_PTR(-ENOMEM);
+ struct vfsmount *mnt;
char *page = (char *) __get_free_page(GFP_USER);
char *devname;
- dprintk("--> nfs_do_submount()\n");
-
- dprintk("%s: submounting on %pd2\n", __func__,
- dentry);
if (page == NULL)
- goto out;
+ return ERR_PTR(-ENOMEM);
+
devname = nfs_devname(dentry, page, PAGE_SIZE);
- mnt = (struct vfsmount *)devname;
if (IS_ERR(devname))
- goto free_page;
- mnt = nfs_do_clone_mount(NFS_SB(dentry->d_sb), devname, &mountdata);
-free_page:
- free_page((unsigned long)page);
-out:
- dprintk("%s: done\n", __func__);
+ mnt = (struct vfsmount *)devname;
+ else
+ mnt = nfs_do_clone_mount(NFS_SB(dentry->d_sb), devname, &mountdata);
- dprintk("<-- nfs_do_submount() = %p\n", mnt);
+ free_page((unsigned long)page);
return mnt;
}
EXPORT_SYMBOL_GPL(nfs_do_submount);
diff --git a/fs/nfs/nfs3proc.c b/fs/nfs/nfs3proc.c
index dc925b531f32..0c07b567118d 100644
--- a/fs/nfs/nfs3proc.c
+++ b/fs/nfs/nfs3proc.c
@@ -865,12 +865,63 @@ static void nfs3_proc_commit_setup(struct nfs_commit_data *data, struct rpc_mess
msg->rpc_proc = &nfs3_procedures[NFS3PROC_COMMIT];
}
+static void nfs3_nlm_alloc_call(void *data)
+{
+ struct nfs_lock_context *l_ctx = data;
+ if (l_ctx && test_bit(NFS_CONTEXT_UNLOCK, &l_ctx->open_context->flags)) {
+ get_nfs_open_context(l_ctx->open_context);
+ nfs_get_lock_context(l_ctx->open_context);
+ }
+}
+
+static bool nfs3_nlm_unlock_prepare(struct rpc_task *task, void *data)
+{
+ struct nfs_lock_context *l_ctx = data;
+ if (l_ctx && test_bit(NFS_CONTEXT_UNLOCK, &l_ctx->open_context->flags))
+ return nfs_async_iocounter_wait(task, l_ctx);
+ return false;
+
+}
+
+static void nfs3_nlm_release_call(void *data)
+{
+ struct nfs_lock_context *l_ctx = data;
+ struct nfs_open_context *ctx;
+ if (l_ctx && test_bit(NFS_CONTEXT_UNLOCK, &l_ctx->open_context->flags)) {
+ ctx = l_ctx->open_context;
+ nfs_put_lock_context(l_ctx);
+ put_nfs_open_context(ctx);
+ }
+}
+
+const struct nlmclnt_operations nlmclnt_fl_close_lock_ops = {
+ .nlmclnt_alloc_call = nfs3_nlm_alloc_call,
+ .nlmclnt_unlock_prepare = nfs3_nlm_unlock_prepare,
+ .nlmclnt_release_call = nfs3_nlm_release_call,
+};
+
static int
nfs3_proc_lock(struct file *filp, int cmd, struct file_lock *fl)
{
struct inode *inode = file_inode(filp);
+ struct nfs_lock_context *l_ctx = NULL;
+ struct nfs_open_context *ctx = nfs_file_open_context(filp);
+ int status;
- return nlmclnt_proc(NFS_SERVER(inode)->nlm_host, cmd, fl);
+ if (fl->fl_flags & FL_CLOSE) {
+ l_ctx = nfs_get_lock_context(ctx);
+ if (IS_ERR(l_ctx))
+ l_ctx = NULL;
+ else
+ set_bit(NFS_CONTEXT_UNLOCK, &ctx->flags);
+ }
+
+ status = nlmclnt_proc(NFS_SERVER(inode)->nlm_host, cmd, fl, l_ctx);
+
+ if (l_ctx)
+ nfs_put_lock_context(l_ctx);
+
+ return status;
}
static int nfs3_have_delegation(struct inode *inode, fmode_t flags)
@@ -921,6 +972,7 @@ const struct nfs_rpc_ops nfs_v3_clientops = {
.dir_inode_ops = &nfs3_dir_inode_operations,
.file_inode_ops = &nfs3_file_inode_operations,
.file_ops = &nfs_file_operations,
+ .nlmclnt_ops = &nlmclnt_fl_close_lock_ops,
.getroot = nfs3_proc_get_root,
.submount = nfs_submount,
.try_mount = nfs_try_mount,
diff --git a/fs/nfs/nfs42proc.c b/fs/nfs/nfs42proc.c
index 1e486c73ec94..929d09a5310a 100644
--- a/fs/nfs/nfs42proc.c
+++ b/fs/nfs/nfs42proc.c
@@ -167,23 +167,29 @@ static ssize_t _nfs42_proc_copy(struct file *src,
if (status)
return status;
+ res->commit_res.verf = kzalloc(sizeof(struct nfs_writeverf), GFP_NOFS);
+ if (!res->commit_res.verf)
+ return -ENOMEM;
status = nfs4_call_sync(server->client, server, &msg,
&args->seq_args, &res->seq_res, 0);
if (status == -ENOTSUPP)
server->caps &= ~NFS_CAP_COPY;
if (status)
- return status;
+ goto out;
- if (res->write_res.verifier.committed != NFS_FILE_SYNC) {
- status = nfs_commit_file(dst, &res->write_res.verifier.verifier);
- if (status)
- return status;
+ if (!nfs_write_verifier_cmp(&res->write_res.verifier.verifier,
+ &res->commit_res.verf->verifier)) {
+ status = -EAGAIN;
+ goto out;
}
truncate_pagecache_range(dst_inode, pos_dst,
pos_dst + res->write_res.count);
- return res->write_res.count;
+ status = res->write_res.count;
+out:
+ kfree(res->commit_res.verf);
+ return status;
}
ssize_t nfs42_proc_copy(struct file *src, loff_t pos_src,
@@ -240,6 +246,9 @@ ssize_t nfs42_proc_copy(struct file *src, loff_t pos_src,
if (err == -ENOTSUPP) {
err = -EOPNOTSUPP;
break;
+ } if (err == -EAGAIN) {
+ dst_exception.retry = 1;
+ continue;
}
err2 = nfs4_handle_exception(server, err, &src_exception);
@@ -379,6 +388,7 @@ nfs42_layoutstat_done(struct rpc_task *task, void *calldata)
pnfs_mark_layout_stateid_invalid(lo, &head);
spin_unlock(&inode->i_lock);
pnfs_free_lseg_list(&head);
+ nfs_commit_inode(inode, 0);
} else
spin_unlock(&inode->i_lock);
break;
@@ -400,8 +410,6 @@ nfs42_layoutstat_done(struct rpc_task *task, void *calldata)
case -EOPNOTSUPP:
NFS_SERVER(inode)->caps &= ~NFS_CAP_LAYOUTSTATS;
}
-
- dprintk("%s server returns %d\n", __func__, task->tk_status);
}
static void
diff --git a/fs/nfs/nfs42xdr.c b/fs/nfs/nfs42xdr.c
index 6c7296454bbc..528362f69cc1 100644
--- a/fs/nfs/nfs42xdr.c
+++ b/fs/nfs/nfs42xdr.c
@@ -66,12 +66,14 @@
encode_putfh_maxsz + \
encode_savefh_maxsz + \
encode_putfh_maxsz + \
- encode_copy_maxsz)
+ encode_copy_maxsz + \
+ encode_commit_maxsz)
#define NFS4_dec_copy_sz (compound_decode_hdr_maxsz + \
decode_putfh_maxsz + \
decode_savefh_maxsz + \
decode_putfh_maxsz + \
- decode_copy_maxsz)
+ decode_copy_maxsz + \
+ decode_commit_maxsz)
#define NFS4_enc_deallocate_sz (compound_encode_hdr_maxsz + \
encode_putfh_maxsz + \
encode_deallocate_maxsz + \
@@ -222,6 +224,18 @@ static void nfs4_xdr_enc_allocate(struct rpc_rqst *req,
encode_nops(&hdr);
}
+static void encode_copy_commit(struct xdr_stream *xdr,
+ struct nfs42_copy_args *args,
+ struct compound_hdr *hdr)
+{
+ __be32 *p;
+
+ encode_op_hdr(xdr, OP_COMMIT, decode_commit_maxsz, hdr);
+ p = reserve_space(xdr, 12);
+ p = xdr_encode_hyper(p, args->dst_pos);
+ *p = cpu_to_be32(args->count);
+}
+
/*
* Encode COPY request
*/
@@ -239,6 +253,7 @@ static void nfs4_xdr_enc_copy(struct rpc_rqst *req,
encode_savefh(xdr, &hdr);
encode_putfh(xdr, args->dst_fh, &hdr);
encode_copy(xdr, args, &hdr);
+ encode_copy_commit(xdr, args, &hdr);
encode_nops(&hdr);
}
@@ -481,6 +496,9 @@ static int nfs4_xdr_dec_copy(struct rpc_rqst *rqstp,
if (status)
goto out;
status = decode_copy(xdr, res);
+ if (status)
+ goto out;
+ status = decode_commit(xdr, &res->commit_res);
out:
return status;
}
diff --git a/fs/nfs/nfs4client.c b/fs/nfs/nfs4client.c
index 8346ccbf2d52..692a7a8bfc7a 100644
--- a/fs/nfs/nfs4client.c
+++ b/fs/nfs/nfs4client.c
@@ -359,11 +359,9 @@ struct nfs_client *nfs4_init_client(struct nfs_client *clp,
struct nfs_client *old;
int error;
- if (clp->cl_cons_state == NFS_CS_READY) {
+ if (clp->cl_cons_state == NFS_CS_READY)
/* the client is initialised already */
- dprintk("<-- nfs4_init_client() = 0 [already %p]\n", clp);
return clp;
- }
/* Check NFS protocol revision and initialize RPC op vector */
clp->rpc_ops = &nfs_v4_clientops;
@@ -421,7 +419,6 @@ struct nfs_client *nfs4_init_client(struct nfs_client *clp,
error:
nfs_mark_client_ready(clp, error);
nfs_put_client(clp);
- dprintk("<-- nfs4_init_client() = xerror %d\n", error);
return ERR_PTR(error);
}
@@ -469,6 +466,50 @@ static bool nfs4_same_verifier(nfs4_verifier *v1, nfs4_verifier *v2)
return memcmp(v1->data, v2->data, sizeof(v1->data)) == 0;
}
+static int nfs4_match_client(struct nfs_client *pos, struct nfs_client *new,
+ struct nfs_client **prev, struct nfs_net *nn)
+{
+ int status;
+
+ if (pos->rpc_ops != new->rpc_ops)
+ return 1;
+
+ if (pos->cl_minorversion != new->cl_minorversion)
+ return 1;
+
+ /* If "pos" isn't marked ready, we can't trust the
+ * remaining fields in "pos", especially the client
+ * ID and serverowner fields. Wait for CREATE_SESSION
+ * to finish. */
+ if (pos->cl_cons_state > NFS_CS_READY) {
+ atomic_inc(&pos->cl_count);
+ spin_unlock(&nn->nfs_client_lock);
+
+ nfs_put_client(*prev);
+ *prev = pos;
+
+ status = nfs_wait_client_init_complete(pos);
+ spin_lock(&nn->nfs_client_lock);
+
+ if (status < 0)
+ return status;
+ }
+
+ if (pos->cl_cons_state != NFS_CS_READY)
+ return 1;
+
+ if (pos->cl_clientid != new->cl_clientid)
+ return 1;
+
+ /* NFSv4.1 always uses the uniform string, however someone
+ * might switch the uniquifier string on us.
+ */
+ if (!nfs4_match_client_owner_id(pos, new))
+ return 1;
+
+ return 0;
+}
+
/**
* nfs40_walk_client_list - Find server that recognizes a client ID
*
@@ -497,34 +538,10 @@ int nfs40_walk_client_list(struct nfs_client *new,
spin_lock(&nn->nfs_client_lock);
list_for_each_entry(pos, &nn->nfs_client_list, cl_share_link) {
- if (pos->rpc_ops != new->rpc_ops)
- continue;
-
- if (pos->cl_minorversion != new->cl_minorversion)
- continue;
-
- /* If "pos" isn't marked ready, we can't trust the
- * remaining fields in "pos" */
- if (pos->cl_cons_state > NFS_CS_READY) {
- atomic_inc(&pos->cl_count);
- spin_unlock(&nn->nfs_client_lock);
-
- nfs_put_client(prev);
- prev = pos;
-
- status = nfs_wait_client_init_complete(pos);
- if (status < 0)
- goto out;
- status = -NFS4ERR_STALE_CLIENTID;
- spin_lock(&nn->nfs_client_lock);
- }
- if (pos->cl_cons_state != NFS_CS_READY)
- continue;
-
- if (pos->cl_clientid != new->cl_clientid)
- continue;
-
- if (!nfs4_match_client_owner_id(pos, new))
+ status = nfs4_match_client(pos, new, &prev, nn);
+ if (status < 0)
+ goto out_unlock;
+ if (status != 0)
continue;
/*
* We just sent a new SETCLIENTID, which should have
@@ -557,8 +574,6 @@ int nfs40_walk_client_list(struct nfs_client *new,
prev = NULL;
*result = pos;
- dprintk("NFS: <-- %s using nfs_client = %p ({%d})\n",
- __func__, pos, atomic_read(&pos->cl_count));
goto out;
case -ERESTARTSYS:
case -ETIMEDOUT:
@@ -567,37 +582,23 @@ int nfs40_walk_client_list(struct nfs_client *new,
*/
nfs4_schedule_path_down_recovery(pos);
default:
+ spin_lock(&nn->nfs_client_lock);
goto out;
}
spin_lock(&nn->nfs_client_lock);
}
+out_unlock:
spin_unlock(&nn->nfs_client_lock);
/* No match found. The server lost our clientid */
out:
nfs_put_client(prev);
- dprintk("NFS: <-- %s status = %d\n", __func__, status);
return status;
}
#ifdef CONFIG_NFS_V4_1
/*
- * Returns true if the client IDs match
- */
-static bool nfs4_match_clientids(u64 a, u64 b)
-{
- if (a != b) {
- dprintk("NFS: --> %s client ID %llx does not match %llx\n",
- __func__, a, b);
- return false;
- }
- dprintk("NFS: --> %s client ID %llx matches %llx\n",
- __func__, a, b);
- return true;
-}
-
-/*
* Returns true if the server major ids match
*/
static bool
@@ -605,36 +606,8 @@ nfs4_check_serverowner_major_id(struct nfs41_server_owner *o1,
struct nfs41_server_owner *o2)
{
if (o1->major_id_sz != o2->major_id_sz)
- goto out_major_mismatch;
- if (memcmp(o1->major_id, o2->major_id, o1->major_id_sz) != 0)
- goto out_major_mismatch;
-
- dprintk("NFS: --> %s server owner major IDs match\n", __func__);
- return true;
-
-out_major_mismatch:
- dprintk("NFS: --> %s server owner major IDs do not match\n",
- __func__);
- return false;
-}
-
-/*
- * Returns true if server minor ids match
- */
-static bool
-nfs4_check_serverowner_minor_id(struct nfs41_server_owner *o1,
- struct nfs41_server_owner *o2)
-{
- /* Check eir_server_owner so_minor_id */
- if (o1->minor_id != o2->minor_id)
- goto out_minor_mismatch;
-
- dprintk("NFS: --> %s server owner minor IDs match\n", __func__);
- return true;
-
-out_minor_mismatch:
- dprintk("NFS: --> %s server owner minor IDs do not match\n", __func__);
- return false;
+ return false;
+ return memcmp(o1->major_id, o2->major_id, o1->major_id_sz) == 0;
}
/*
@@ -645,18 +618,9 @@ nfs4_check_server_scope(struct nfs41_server_scope *s1,
struct nfs41_server_scope *s2)
{
if (s1->server_scope_sz != s2->server_scope_sz)
- goto out_scope_mismatch;
- if (memcmp(s1->server_scope, s2->server_scope,
- s1->server_scope_sz) != 0)
- goto out_scope_mismatch;
-
- dprintk("NFS: --> %s server scopes match\n", __func__);
- return true;
-
-out_scope_mismatch:
- dprintk("NFS: --> %s server scopes do not match\n",
- __func__);
- return false;
+ return false;
+ return memcmp(s1->server_scope, s2->server_scope,
+ s1->server_scope_sz) == 0;
}
/**
@@ -680,7 +644,7 @@ int nfs4_detect_session_trunking(struct nfs_client *clp,
struct rpc_xprt *xprt)
{
/* Check eir_clientid */
- if (!nfs4_match_clientids(clp->cl_clientid, res->clientid))
+ if (clp->cl_clientid != res->clientid)
goto out_err;
/* Check eir_server_owner so_major_id */
@@ -689,8 +653,7 @@ int nfs4_detect_session_trunking(struct nfs_client *clp,
goto out_err;
/* Check eir_server_owner so_minor_id */
- if (!nfs4_check_serverowner_minor_id(clp->cl_serverowner,
- res->server_owner))
+ if (clp->cl_serverowner->minor_id != res->server_owner->minor_id)
goto out_err;
/* Check eir_server_scope */
@@ -739,33 +702,10 @@ int nfs41_walk_client_list(struct nfs_client *new,
if (pos == new)
goto found;
- if (pos->rpc_ops != new->rpc_ops)
- continue;
-
- if (pos->cl_minorversion != new->cl_minorversion)
- continue;
-
- /* If "pos" isn't marked ready, we can't trust the
- * remaining fields in "pos", especially the client
- * ID and serverowner fields. Wait for CREATE_SESSION
- * to finish. */
- if (pos->cl_cons_state > NFS_CS_READY) {
- atomic_inc(&pos->cl_count);
- spin_unlock(&nn->nfs_client_lock);
-
- nfs_put_client(prev);
- prev = pos;
-
- status = nfs_wait_client_init_complete(pos);
- spin_lock(&nn->nfs_client_lock);
- if (status < 0)
- break;
- status = -NFS4ERR_STALE_CLIENTID;
- }
- if (pos->cl_cons_state != NFS_CS_READY)
- continue;
-
- if (!nfs4_match_clientids(pos->cl_clientid, new->cl_clientid))
+ status = nfs4_match_client(pos, new, &prev, nn);
+ if (status < 0)
+ goto out;
+ if (status != 0)
continue;
/*
@@ -777,23 +717,15 @@ int nfs41_walk_client_list(struct nfs_client *new,
new->cl_serverowner))
continue;
- /* Unlike NFSv4.0, we know that NFSv4.1 always uses the
- * uniform string, however someone might switch the
- * uniquifier string on us.
- */
- if (!nfs4_match_client_owner_id(pos, new))
- continue;
found:
atomic_inc(&pos->cl_count);
*result = pos;
status = 0;
- dprintk("NFS: <-- %s using nfs_client = %p ({%d})\n",
- __func__, pos, atomic_read(&pos->cl_count));
break;
}
+out:
spin_unlock(&nn->nfs_client_lock);
- dprintk("NFS: <-- %s status = %d\n", __func__, status);
nfs_put_client(prev);
return status;
}
@@ -916,9 +848,6 @@ static int nfs4_set_client(struct nfs_server *server,
.timeparms = timeparms,
};
struct nfs_client *clp;
- int error;
-
- dprintk("--> nfs4_set_client()\n");
if (server->flags & NFS_MOUNT_NORESVPORT)
set_bit(NFS_CS_NORESVPORT, &cl_init.init_flags);
@@ -927,15 +856,11 @@ static int nfs4_set_client(struct nfs_server *server,
/* Allocate or find a client reference we can use */
clp = nfs_get_client(&cl_init);
- if (IS_ERR(clp)) {
- error = PTR_ERR(clp);
- goto error;
- }
+ if (IS_ERR(clp))
+ return PTR_ERR(clp);
- if (server->nfs_client == clp) {
- error = -ELOOP;
- goto error;
- }
+ if (server->nfs_client == clp)
+ return -ELOOP;
/*
* Query for the lease time on clientid setup or renewal
@@ -947,11 +872,7 @@ static int nfs4_set_client(struct nfs_server *server,
set_bit(NFS_CS_CHECK_LEASE_TIME, &clp->cl_res_state);
server->nfs_client = clp;
- dprintk("<-- nfs4_set_client() = 0 [new %p]\n", clp);
return 0;
-error:
- dprintk("<-- nfs4_set_client() = xerror %d\n", error);
- return error;
}
/*
@@ -982,7 +903,6 @@ struct nfs_client *nfs4_set_ds_client(struct nfs_server *mds_srv,
.net = mds_clp->cl_net,
.timeparms = &ds_timeout,
};
- struct nfs_client *clp;
char buf[INET6_ADDRSTRLEN + 1];
if (rpc_ntop(ds_addr, buf, sizeof(buf)) <= 0)
@@ -998,10 +918,7 @@ struct nfs_client *nfs4_set_ds_client(struct nfs_server *mds_srv,
* (section 13.1 RFC 5661).
*/
nfs_init_timeout_values(&ds_timeout, ds_proto, ds_timeo, ds_retrans);
- clp = nfs_get_client(&cl_init);
-
- dprintk("<-- %s %p\n", __func__, clp);
- return clp;
+ return nfs_get_client(&cl_init);
}
EXPORT_SYMBOL_GPL(nfs4_set_ds_client);
@@ -1098,8 +1015,6 @@ static int nfs4_init_server(struct nfs_server *server,
struct rpc_timeout timeparms;
int error;
- dprintk("--> nfs4_init_server()\n");
-
nfs_init_timeout_values(&timeparms, data->nfs_server.protocol,
data->timeo, data->retrans);
@@ -1127,7 +1042,7 @@ static int nfs4_init_server(struct nfs_server *server,
data->minorversion,
data->net);
if (error < 0)
- goto error;
+ return error;
if (data->rsize)
server->rsize = nfs_block_size(data->rsize, NULL);
@@ -1138,16 +1053,10 @@ static int nfs4_init_server(struct nfs_server *server,
server->acregmax = data->acregmax * HZ;
server->acdirmin = data->acdirmin * HZ;
server->acdirmax = data->acdirmax * HZ;
+ server->port = data->nfs_server.port;
- server->port = data->nfs_server.port;
-
- error = nfs_init_server_rpcclient(server, &timeparms,
- data->selected_flavor);
-
-error:
- /* Done */
- dprintk("<-- nfs4_init_server() = %d\n", error);
- return error;
+ return nfs_init_server_rpcclient(server, &timeparms,
+ data->selected_flavor);
}
/*
@@ -1163,8 +1072,6 @@ struct nfs_server *nfs4_create_server(struct nfs_mount_info *mount_info,
bool auth_probe;
int error;
- dprintk("--> nfs4_create_server()\n");
-
server = nfs_alloc_server();
if (!server)
return ERR_PTR(-ENOMEM);
@@ -1180,12 +1087,10 @@ struct nfs_server *nfs4_create_server(struct nfs_mount_info *mount_info,
if (error < 0)
goto error;
- dprintk("<-- nfs4_create_server() = %p\n", server);
return server;
error:
nfs_free_server(server);
- dprintk("<-- nfs4_create_server() = error %d\n", error);
return ERR_PTR(error);
}
@@ -1200,8 +1105,6 @@ struct nfs_server *nfs4_create_referral_server(struct nfs_clone_mount *data,
bool auth_probe;
int error;
- dprintk("--> nfs4_create_referral_server()\n");
-
server = nfs_alloc_server();
if (!server)
return ERR_PTR(-ENOMEM);
@@ -1235,12 +1138,10 @@ struct nfs_server *nfs4_create_referral_server(struct nfs_clone_mount *data,
if (error < 0)
goto error;
- dprintk("<-- nfs_create_referral_server() = %p\n", server);
return server;
error:
nfs_free_server(server);
- dprintk("<-- nfs4_create_referral_server() = error %d\n", error);
return ERR_PTR(error);
}
@@ -1300,31 +1201,16 @@ int nfs4_update_server(struct nfs_server *server, const char *hostname,
struct sockaddr *localaddr = (struct sockaddr *)&address;
int error;
- dprintk("--> %s: move FSID %llx:%llx to \"%s\")\n", __func__,
- (unsigned long long)server->fsid.major,
- (unsigned long long)server->fsid.minor,
- hostname);
-
error = rpc_switch_client_transport(clnt, &xargs, clnt->cl_timeout);
- if (error != 0) {
- dprintk("<-- %s(): rpc_switch_client_transport returned %d\n",
- __func__, error);
- goto out;
- }
+ if (error != 0)
+ return error;
error = rpc_localaddr(clnt, localaddr, sizeof(address));
- if (error != 0) {
- dprintk("<-- %s(): rpc_localaddr returned %d\n",
- __func__, error);
- goto out;
- }
+ if (error != 0)
+ return error;
- error = -EAFNOSUPPORT;
- if (rpc_ntop(localaddr, buf, sizeof(buf)) == 0) {
- dprintk("<-- %s(): rpc_ntop returned %d\n",
- __func__, error);
- goto out;
- }
+ if (rpc_ntop(localaddr, buf, sizeof(buf)) == 0)
+ return -EAFNOSUPPORT;
nfs_server_remove_lists(server);
error = nfs4_set_client(server, hostname, sap, salen, buf,
@@ -1333,21 +1219,12 @@ int nfs4_update_server(struct nfs_server *server, const char *hostname,
nfs_put_client(clp);
if (error != 0) {
nfs_server_insert_lists(server);
- dprintk("<-- %s(): nfs4_set_client returned %d\n",
- __func__, error);
- goto out;
+ return error;
}
if (server->nfs_client->cl_hostname == NULL)
server->nfs_client->cl_hostname = kstrdup(hostname, GFP_KERNEL);
nfs_server_insert_lists(server);
- error = nfs_probe_destination(server);
- if (error < 0)
- goto out;
-
- dprintk("<-- %s() succeeded\n", __func__);
-
-out:
- return error;
+ return nfs_probe_destination(server);
}
diff --git a/fs/nfs/nfs4getroot.c b/fs/nfs/nfs4getroot.c
index 039b3eb6d834..ac8406018962 100644
--- a/fs/nfs/nfs4getroot.c
+++ b/fs/nfs/nfs4getroot.c
@@ -14,8 +14,6 @@ int nfs4_get_rootfh(struct nfs_server *server, struct nfs_fh *mntfh, bool auth_p
struct nfs_fsinfo fsinfo;
int ret = -ENOMEM;
- dprintk("--> nfs4_get_rootfh()\n");
-
fsinfo.fattr = nfs_alloc_fattr();
if (fsinfo.fattr == NULL)
goto out;
@@ -38,6 +36,5 @@ int nfs4_get_rootfh(struct nfs_server *server, struct nfs_fh *mntfh, bool auth_p
memcpy(&server->fsid, &fsinfo.fattr->fsid, sizeof(server->fsid));
out:
nfs_free_fattr(fsinfo.fattr);
- dprintk("<-- nfs4_get_rootfh() = %d\n", ret);
return ret;
}
diff --git a/fs/nfs/nfs4namespace.c b/fs/nfs/nfs4namespace.c
index d8b040bd9814..7d531da1bae3 100644
--- a/fs/nfs/nfs4namespace.c
+++ b/fs/nfs/nfs4namespace.c
@@ -340,7 +340,6 @@ static struct vfsmount *nfs_follow_referral(struct dentry *dentry,
out:
free_page((unsigned long) page);
free_page((unsigned long) page2);
- dprintk("%s: done\n", __func__);
return mnt;
}
@@ -358,11 +357,9 @@ static struct vfsmount *nfs_do_refmount(struct rpc_clnt *client, struct dentry *
int err;
/* BUG_ON(IS_ROOT(dentry)); */
- dprintk("%s: enter\n", __func__);
-
page = alloc_page(GFP_KERNEL);
if (page == NULL)
- goto out;
+ return mnt;
fs_locations = kmalloc(sizeof(struct nfs4_fs_locations), GFP_KERNEL);
if (fs_locations == NULL)
@@ -386,8 +383,6 @@ static struct vfsmount *nfs_do_refmount(struct rpc_clnt *client, struct dentry *
out_free:
__free_page(page);
kfree(fs_locations);
-out:
- dprintk("%s: done\n", __func__);
return mnt;
}
diff --git a/fs/nfs/nfs4proc.c b/fs/nfs/nfs4proc.c
index 201ca3f2c4ba..c08c46a3b8cd 100644
--- a/fs/nfs/nfs4proc.c
+++ b/fs/nfs/nfs4proc.c
@@ -698,7 +698,8 @@ static int nfs41_sequence_process(struct rpc_task *task,
session = slot->table->session;
if (slot->interrupted) {
- slot->interrupted = 0;
+ if (res->sr_status != -NFS4ERR_DELAY)
+ slot->interrupted = 0;
interrupted = true;
}
@@ -2300,8 +2301,10 @@ static int _nfs4_proc_open(struct nfs4_opendata *data)
if (status != 0)
return status;
}
- if (!(o_res->f_attr->valid & NFS_ATTR_FATTR))
+ if (!(o_res->f_attr->valid & NFS_ATTR_FATTR)) {
+ nfs4_sequence_free_slot(&o_res->seq_res);
nfs4_proc_getattr(server, &o_res->fh, o_res->f_attr, o_res->f_label);
+ }
return 0;
}
@@ -3265,6 +3268,7 @@ static int _nfs4_server_capabilities(struct nfs_server *server, struct nfs_fh *f
.rpc_resp = &res,
};
int status;
+ int i;
bitmask[0] = FATTR4_WORD0_SUPPORTED_ATTRS |
FATTR4_WORD0_FH_EXPIRE_TYPE |
@@ -3330,8 +3334,13 @@ static int _nfs4_server_capabilities(struct nfs_server *server, struct nfs_fh *f
server->cache_consistency_bitmask[0] &= FATTR4_WORD0_CHANGE|FATTR4_WORD0_SIZE;
server->cache_consistency_bitmask[1] &= FATTR4_WORD1_TIME_METADATA|FATTR4_WORD1_TIME_MODIFY;
server->cache_consistency_bitmask[2] = 0;
+
+ /* Avoid a regression due to buggy server */
+ for (i = 0; i < ARRAY_SIZE(res.exclcreat_bitmask); i++)
+ res.exclcreat_bitmask[i] &= res.attr_bitmask[i];
memcpy(server->exclcreat_bitmask, res.exclcreat_bitmask,
sizeof(server->exclcreat_bitmask));
+
server->acl_bitmask = res.acl_bitmask;
server->fh_expire_type = res.fh_expire_type;
}
@@ -4610,7 +4619,7 @@ static int nfs4_proc_pgio_rpc_prepare(struct rpc_task *task,
return 0;
if (nfs4_set_rw_stateid(&hdr->args.stateid, hdr->args.context,
hdr->args.lock_context,
- hdr->rw_ops->rw_mode) == -EIO)
+ hdr->rw_mode) == -EIO)
return -EIO;
if (unlikely(test_bit(NFS_CONTEXT_BAD, &hdr->args.context->flags)))
return -EIO;
@@ -4804,8 +4813,10 @@ static int nfs4_proc_async_renew(struct nfs_client *clp, struct rpc_cred *cred,
if (!atomic_inc_not_zero(&clp->cl_count))
return -EIO;
data = kmalloc(sizeof(*data), GFP_NOFS);
- if (data == NULL)
+ if (data == NULL) {
+ nfs_put_client(clp);
return -ENOMEM;
+ }
data->client = clp;
data->timestamp = jiffies;
return rpc_call_async(clp->cl_rpcclient, &msg, RPC_TASK_TIMEOUT,
@@ -5782,6 +5793,7 @@ struct nfs4_unlockdata {
struct nfs_locku_res res;
struct nfs4_lock_state *lsp;
struct nfs_open_context *ctx;
+ struct nfs_lock_context *l_ctx;
struct file_lock fl;
struct nfs_server *server;
unsigned long timestamp;
@@ -5806,6 +5818,7 @@ static struct nfs4_unlockdata *nfs4_alloc_unlockdata(struct file_lock *fl,
atomic_inc(&lsp->ls_count);
/* Ensure we don't close file until we're done freeing locks! */
p->ctx = get_nfs_open_context(ctx);
+ p->l_ctx = nfs_get_lock_context(ctx);
memcpy(&p->fl, fl, sizeof(p->fl));
p->server = NFS_SERVER(inode);
return p;
@@ -5816,6 +5829,7 @@ static void nfs4_locku_release_calldata(void *data)
struct nfs4_unlockdata *calldata = data;
nfs_free_seqid(calldata->arg.seqid);
nfs4_put_lock_state(calldata->lsp);
+ nfs_put_lock_context(calldata->l_ctx);
put_nfs_open_context(calldata->ctx);
kfree(calldata);
}
@@ -5857,6 +5871,10 @@ static void nfs4_locku_prepare(struct rpc_task *task, void *data)
{
struct nfs4_unlockdata *calldata = data;
+ if (test_bit(NFS_CONTEXT_UNLOCK, &calldata->l_ctx->open_context->flags) &&
+ nfs_async_iocounter_wait(task, calldata->l_ctx))
+ return;
+
if (nfs_wait_on_sequence(calldata->arg.seqid, task) != 0)
goto out_wait;
nfs4_stateid_copy(&calldata->arg.stateid, &calldata->lsp->ls_stateid);
@@ -5908,6 +5926,8 @@ static struct rpc_task *nfs4_do_unlck(struct file_lock *fl,
* canceled lock is passed in, and it won't be an unlock.
*/
fl->fl_type = F_UNLCK;
+ if (fl->fl_flags & FL_CLOSE)
+ set_bit(NFS_CONTEXT_UNLOCK, &ctx->flags);
data = nfs4_alloc_unlockdata(fl, ctx, lsp, seqid);
if (data == NULL) {
@@ -6445,9 +6465,6 @@ nfs4_proc_lock(struct file *filp, int cmd, struct file_lock *request)
ctx = nfs_file_open_context(filp);
state = ctx->state;
- if (request->fl_start < 0 || request->fl_end < 0)
- return -EINVAL;
-
if (IS_GETLK(cmd)) {
if (state != NULL)
return nfs4_proc_getlk(state, F_GETLK, request);
@@ -6470,20 +6487,6 @@ nfs4_proc_lock(struct file *filp, int cmd, struct file_lock *request)
!test_bit(NFS_STATE_POSIX_LOCKS, &state->flags))
return -ENOLCK;
- /*
- * Don't rely on the VFS having checked the file open mode,
- * since it won't do this for flock() locks.
- */
- switch (request->fl_type) {
- case F_RDLCK:
- if (!(filp->f_mode & FMODE_READ))
- return -EBADF;
- break;
- case F_WRLCK:
- if (!(filp->f_mode & FMODE_WRITE))
- return -EBADF;
- }
-
status = nfs4_set_lock_state(state, request);
if (status != 0)
return status;
@@ -7155,8 +7158,6 @@ int nfs4_proc_bind_one_conn_to_session(struct rpc_clnt *clnt,
};
struct rpc_task *task;
- dprintk("--> %s\n", __func__);
-
nfs4_copy_sessionid(&args.sessionid, &clp->cl_session->sess_id);
if (!(clp->cl_session->flags & SESSION4_BACK_CHAN))
args.dir = NFS4_CDFC4_FORE;
@@ -7176,24 +7177,20 @@ int nfs4_proc_bind_one_conn_to_session(struct rpc_clnt *clnt,
if (memcmp(res.sessionid.data,
clp->cl_session->sess_id.data, NFS4_MAX_SESSIONID_LEN)) {
dprintk("NFS: %s: Session ID mismatch\n", __func__);
- status = -EIO;
- goto out;
+ return -EIO;
}
if ((res.dir & args.dir) != res.dir || res.dir == 0) {
dprintk("NFS: %s: Unexpected direction from server\n",
__func__);
- status = -EIO;
- goto out;
+ return -EIO;
}
if (res.use_conn_in_rdma_mode != args.use_conn_in_rdma_mode) {
dprintk("NFS: %s: Server returned RDMA mode = true\n",
__func__);
- status = -EIO;
- goto out;
+ return -EIO;
}
}
-out:
- dprintk("<-- %s status= %d\n", __func__, status);
+
return status;
}
@@ -7459,15 +7456,16 @@ static int _nfs4_proc_exchange_id(struct nfs_client *clp, struct rpc_cred *cred,
};
struct nfs41_exchange_id_data *calldata;
struct rpc_task *task;
- int status = -EIO;
+ int status;
if (!atomic_inc_not_zero(&clp->cl_count))
- goto out;
+ return -EIO;
- status = -ENOMEM;
calldata = kzalloc(sizeof(*calldata), GFP_NOFS);
- if (!calldata)
- goto out;
+ if (!calldata) {
+ nfs_put_client(clp);
+ return -ENOMEM;
+ }
if (!xprt)
nfs4_init_boot_verifier(clp, &verifier);
@@ -7476,10 +7474,6 @@ static int _nfs4_proc_exchange_id(struct nfs_client *clp, struct rpc_cred *cred,
if (status)
goto out_calldata;
- dprintk("NFS call exchange_id auth=%s, '%s'\n",
- clp->cl_rpcclient->cl_auth->au_ops->au_name,
- clp->cl_owner_id);
-
calldata->res.server_owner = kzalloc(sizeof(struct nfs41_server_owner),
GFP_NOFS);
status = -ENOMEM;
@@ -7545,13 +7539,6 @@ static int _nfs4_proc_exchange_id(struct nfs_client *clp, struct rpc_cred *cred,
rpc_put_task(task);
out:
- if (clp->cl_implid != NULL)
- dprintk("NFS reply exchange_id: Server Implementation ID: "
- "domain: %s, name: %s, date: %llu,%u\n",
- clp->cl_implid->domain, clp->cl_implid->name,
- clp->cl_implid->date.seconds,
- clp->cl_implid->date.nseconds);
- dprintk("NFS reply exchange_id: %d\n", status);
return status;
out_impl_id:
@@ -7769,17 +7756,13 @@ int nfs4_proc_get_lease_time(struct nfs_client *clp, struct nfs_fsinfo *fsinfo)
nfs4_init_sequence(&args.la_seq_args, &res.lr_seq_res, 0);
nfs4_set_sequence_privileged(&args.la_seq_args);
- dprintk("--> %s\n", __func__);
task = rpc_run_task(&task_setup);
if (IS_ERR(task))
- status = PTR_ERR(task);
- else {
- status = task->tk_status;
- rpc_put_task(task);
- }
- dprintk("<-- %s return %d\n", __func__, status);
+ return PTR_ERR(task);
+ status = task->tk_status;
+ rpc_put_task(task);
return status;
}
@@ -8180,6 +8163,12 @@ static int nfs41_reclaim_complete_handle_errors(struct rpc_task *task, struct nf
/* fall through */
case -NFS4ERR_RETRY_UNCACHED_REP:
return -EAGAIN;
+ case -NFS4ERR_BADSESSION:
+ case -NFS4ERR_DEADSESSION:
+ case -NFS4ERR_CONN_NOT_BOUND_TO_SESSION:
+ nfs4_schedule_session_recovery(clp->cl_session,
+ task->tk_status);
+ break;
default:
nfs4_schedule_lease_recovery(clp);
}
@@ -8258,7 +8247,6 @@ static int nfs41_proc_reclaim_complete(struct nfs_client *clp,
if (status == 0)
status = task->tk_status;
rpc_put_task(task);
- return 0;
out:
dprintk("<-- %s status=%d\n", __func__, status);
return status;
@@ -8357,6 +8345,7 @@ nfs4_layoutget_handle_exception(struct rpc_task *task,
*/
pnfs_mark_layout_stateid_invalid(lo, &head);
spin_unlock(&inode->i_lock);
+ nfs_commit_inode(inode, 0);
pnfs_free_lseg_list(&head);
status = -EAGAIN;
goto out;
diff --git a/fs/nfs/nfs4state.c b/fs/nfs/nfs4state.c
index 8156bad6b441..b34de036501b 100644
--- a/fs/nfs/nfs4state.c
+++ b/fs/nfs/nfs4state.c
@@ -1649,13 +1649,14 @@ static void nfs4_state_start_reclaim_reboot(struct nfs_client *clp)
nfs4_state_mark_reclaim_helper(clp, nfs4_state_mark_reclaim_reboot);
}
-static void nfs4_reclaim_complete(struct nfs_client *clp,
+static int nfs4_reclaim_complete(struct nfs_client *clp,
const struct nfs4_state_recovery_ops *ops,
struct rpc_cred *cred)
{
/* Notify the server we're done reclaiming our state */
if (ops->reclaim_complete)
- (void)ops->reclaim_complete(clp, cred);
+ return ops->reclaim_complete(clp, cred);
+ return 0;
}
static void nfs4_clear_reclaim_server(struct nfs_server *server)
@@ -1702,13 +1703,16 @@ static void nfs4_state_end_reclaim_reboot(struct nfs_client *clp)
{
const struct nfs4_state_recovery_ops *ops;
struct rpc_cred *cred;
+ int err;
if (!nfs4_state_clear_reclaim_reboot(clp))
return;
ops = clp->cl_mvops->reboot_recovery_ops;
cred = nfs4_get_clid_cred(clp);
- nfs4_reclaim_complete(clp, ops, cred);
+ err = nfs4_reclaim_complete(clp, ops, cred);
put_rpccred(cred);
+ if (err == -NFS4ERR_CONN_NOT_BOUND_TO_SESSION)
+ set_bit(NFS4CLNT_RECLAIM_REBOOT, &clp->cl_state);
}
static void nfs4_state_start_reclaim_nograce(struct nfs_client *clp)
diff --git a/fs/nfs/nfs4xdr.c b/fs/nfs/nfs4xdr.c
index 80ce289eea05..3aebfdc82b30 100644
--- a/fs/nfs/nfs4xdr.c
+++ b/fs/nfs/nfs4xdr.c
@@ -1000,8 +1000,9 @@ static void encode_nfs4_verifier(struct xdr_stream *xdr, const nfs4_verifier *ve
static void encode_attrs(struct xdr_stream *xdr, const struct iattr *iap,
const struct nfs4_label *label,
+ const umode_t *umask,
const struct nfs_server *server,
- bool excl_check, const umode_t *umask)
+ const uint32_t attrmask[])
{
char owner_name[IDMAP_NAMESZ];
char owner_group[IDMAP_NAMESZ];
@@ -1016,22 +1017,20 @@ static void encode_attrs(struct xdr_stream *xdr, const struct iattr *iap,
/*
* We reserve enough space to write the entire attribute buffer at once.
*/
- if (iap->ia_valid & ATTR_SIZE) {
+ if ((iap->ia_valid & ATTR_SIZE) && (attrmask[0] & FATTR4_WORD0_SIZE)) {
bmval[0] |= FATTR4_WORD0_SIZE;
len += 8;
}
- if (!(server->attr_bitmask[2] & FATTR4_WORD2_MODE_UMASK))
- umask = NULL;
if (iap->ia_valid & ATTR_MODE) {
- if (umask) {
+ if (umask && (attrmask[2] & FATTR4_WORD2_MODE_UMASK)) {
bmval[2] |= FATTR4_WORD2_MODE_UMASK;
len += 8;
- } else {
+ } else if (attrmask[1] & FATTR4_WORD1_MODE) {
bmval[1] |= FATTR4_WORD1_MODE;
len += 4;
}
}
- if (iap->ia_valid & ATTR_UID) {
+ if ((iap->ia_valid & ATTR_UID) && (attrmask[1] & FATTR4_WORD1_OWNER)) {
owner_namelen = nfs_map_uid_to_name(server, iap->ia_uid, owner_name, IDMAP_NAMESZ);
if (owner_namelen < 0) {
dprintk("nfs: couldn't resolve uid %d to string\n",
@@ -1044,7 +1043,8 @@ static void encode_attrs(struct xdr_stream *xdr, const struct iattr *iap,
bmval[1] |= FATTR4_WORD1_OWNER;
len += 4 + (XDR_QUADLEN(owner_namelen) << 2);
}
- if (iap->ia_valid & ATTR_GID) {
+ if ((iap->ia_valid & ATTR_GID) &&
+ (attrmask[1] & FATTR4_WORD1_OWNER_GROUP)) {
owner_grouplen = nfs_map_gid_to_group(server, iap->ia_gid, owner_group, IDMAP_NAMESZ);
if (owner_grouplen < 0) {
dprintk("nfs: couldn't resolve gid %d to string\n",
@@ -1056,32 +1056,26 @@ static void encode_attrs(struct xdr_stream *xdr, const struct iattr *iap,
bmval[1] |= FATTR4_WORD1_OWNER_GROUP;
len += 4 + (XDR_QUADLEN(owner_grouplen) << 2);
}
- if (iap->ia_valid & ATTR_ATIME_SET) {
- bmval[1] |= FATTR4_WORD1_TIME_ACCESS_SET;
- len += 16;
- } else if (iap->ia_valid & ATTR_ATIME) {
- bmval[1] |= FATTR4_WORD1_TIME_ACCESS_SET;
- len += 4;
- }
- if (iap->ia_valid & ATTR_MTIME_SET) {
- bmval[1] |= FATTR4_WORD1_TIME_MODIFY_SET;
- len += 16;
- } else if (iap->ia_valid & ATTR_MTIME) {
- bmval[1] |= FATTR4_WORD1_TIME_MODIFY_SET;
- len += 4;
+ if (attrmask[1] & FATTR4_WORD1_TIME_ACCESS_SET) {
+ if (iap->ia_valid & ATTR_ATIME_SET) {
+ bmval[1] |= FATTR4_WORD1_TIME_ACCESS_SET;
+ len += 16;
+ } else if (iap->ia_valid & ATTR_ATIME) {
+ bmval[1] |= FATTR4_WORD1_TIME_ACCESS_SET;
+ len += 4;
+ }
}
-
- if (excl_check) {
- const u32 *excl_bmval = server->exclcreat_bitmask;
- bmval[0] &= excl_bmval[0];
- bmval[1] &= excl_bmval[1];
- bmval[2] &= excl_bmval[2];
-
- if (!(excl_bmval[2] & FATTR4_WORD2_SECURITY_LABEL))
- label = NULL;
+ if (attrmask[1] & FATTR4_WORD1_TIME_MODIFY_SET) {
+ if (iap->ia_valid & ATTR_MTIME_SET) {
+ bmval[1] |= FATTR4_WORD1_TIME_MODIFY_SET;
+ len += 16;
+ } else if (iap->ia_valid & ATTR_MTIME) {
+ bmval[1] |= FATTR4_WORD1_TIME_MODIFY_SET;
+ len += 4;
+ }
}
- if (label) {
+ if (label && (attrmask[2] & FATTR4_WORD2_SECURITY_LABEL)) {
len += 4 + 4 + 4 + (XDR_QUADLEN(label->len) << 2);
bmval[2] |= FATTR4_WORD2_SECURITY_LABEL;
}
@@ -1188,8 +1182,8 @@ static void encode_create(struct xdr_stream *xdr, const struct nfs4_create_arg *
}
encode_string(xdr, create->name->len, create->name->name);
- encode_attrs(xdr, create->attrs, create->label, create->server, false,
- &create->umask);
+ encode_attrs(xdr, create->attrs, create->label, &create->umask,
+ create->server, create->server->attr_bitmask);
}
static void encode_getattr_one(struct xdr_stream *xdr, uint32_t bitmap, struct compound_hdr *hdr)
@@ -1409,13 +1403,13 @@ static inline void encode_createmode(struct xdr_stream *xdr, const struct nfs_op
switch(arg->createmode) {
case NFS4_CREATE_UNCHECKED:
*p = cpu_to_be32(NFS4_CREATE_UNCHECKED);
- encode_attrs(xdr, arg->u.attrs, arg->label, arg->server, false,
- &arg->umask);
+ encode_attrs(xdr, arg->u.attrs, arg->label, &arg->umask,
+ arg->server, arg->server->attr_bitmask);
break;
case NFS4_CREATE_GUARDED:
*p = cpu_to_be32(NFS4_CREATE_GUARDED);
- encode_attrs(xdr, arg->u.attrs, arg->label, arg->server, false,
- &arg->umask);
+ encode_attrs(xdr, arg->u.attrs, arg->label, &arg->umask,
+ arg->server, arg->server->attr_bitmask);
break;
case NFS4_CREATE_EXCLUSIVE:
*p = cpu_to_be32(NFS4_CREATE_EXCLUSIVE);
@@ -1424,8 +1418,8 @@ static inline void encode_createmode(struct xdr_stream *xdr, const struct nfs_op
case NFS4_CREATE_EXCLUSIVE4_1:
*p = cpu_to_be32(NFS4_CREATE_EXCLUSIVE4_1);
encode_nfs4_verifier(xdr, &arg->u.verifier);
- encode_attrs(xdr, arg->u.attrs, arg->label, arg->server, true,
- &arg->umask);
+ encode_attrs(xdr, arg->u.attrs, arg->label, &arg->umask,
+ arg->server, arg->server->exclcreat_bitmask);
}
}
@@ -1681,7 +1675,8 @@ static void encode_setattr(struct xdr_stream *xdr, const struct nfs_setattrargs
{
encode_op_hdr(xdr, OP_SETATTR, decode_setattr_maxsz, hdr);
encode_nfs4_stateid(xdr, &arg->stateid);
- encode_attrs(xdr, arg->iap, arg->label, server, false, NULL);
+ encode_attrs(xdr, arg->iap, arg->label, NULL, server,
+ server->attr_bitmask);
}
static void encode_setclientid(struct xdr_stream *xdr, const struct nfs4_setclientid *setclientid, struct compound_hdr *hdr)
@@ -2005,16 +2000,10 @@ encode_layoutcommit(struct xdr_stream *xdr,
*p++ = cpu_to_be32(0); /* Never send time_modify_changed */
*p++ = cpu_to_be32(NFS_SERVER(args->inode)->pnfs_curr_ld->id);/* type */
- if (NFS_SERVER(inode)->pnfs_curr_ld->encode_layoutcommit) {
- NFS_SERVER(inode)->pnfs_curr_ld->encode_layoutcommit(
- NFS_I(inode)->layout, xdr, args);
- } else {
- encode_uint32(xdr, args->layoutupdate_len);
- if (args->layoutupdate_pages) {
- xdr_write_pages(xdr, args->layoutupdate_pages, 0,
- args->layoutupdate_len);
- }
- }
+ encode_uint32(xdr, args->layoutupdate_len);
+ if (args->layoutupdate_pages)
+ xdr_write_pages(xdr, args->layoutupdate_pages, 0,
+ args->layoutupdate_len);
return 0;
}
@@ -2024,7 +2013,6 @@ encode_layoutreturn(struct xdr_stream *xdr,
const struct nfs4_layoutreturn_args *args,
struct compound_hdr *hdr)
{
- const struct pnfs_layoutdriver_type *lr_ops = NFS_SERVER(args->inode)->pnfs_curr_ld;
__be32 *p;
encode_op_hdr(xdr, OP_LAYOUTRETURN, decode_layoutreturn_maxsz, hdr);
@@ -2041,8 +2029,6 @@ encode_layoutreturn(struct xdr_stream *xdr,
spin_unlock(&args->inode->i_lock);
if (args->ld_private->ops && args->ld_private->ops->encode)
args->ld_private->ops->encode(xdr, args, args->ld_private);
- else if (lr_ops->encode_layoutreturn)
- lr_ops->encode_layoutreturn(xdr, args);
else
encode_uint32(xdr, 0);
}
@@ -5579,6 +5565,8 @@ static int decode_op_map(struct xdr_stream *xdr, struct nfs4_op_map *op_map)
unsigned int i;
p = xdr_inline_decode(xdr, 4);
+ if (!p)
+ return -EIO;
bitmap_words = be32_to_cpup(p++);
if (bitmap_words > NFS4_OP_MAP_NUM_WORDS)
return -EIO;
diff --git a/fs/nfs/objlayout/Kbuild b/fs/nfs/objlayout/Kbuild
deleted file mode 100644
index ed30ea072bb8..000000000000
--- a/fs/nfs/objlayout/Kbuild
+++ /dev/null
@@ -1,5 +0,0 @@
-#
-# Makefile for the pNFS Objects Layout Driver kernel module
-#
-objlayoutdriver-y := objio_osd.o pnfs_osd_xdr_cli.o objlayout.o
-obj-$(CONFIG_PNFS_OBJLAYOUT) += objlayoutdriver.o
diff --git a/fs/nfs/objlayout/objio_osd.c b/fs/nfs/objlayout/objio_osd.c
deleted file mode 100644
index 049c1b1f2932..000000000000
--- a/fs/nfs/objlayout/objio_osd.c
+++ /dev/null
@@ -1,675 +0,0 @@
-/*
- * pNFS Objects layout implementation over open-osd initiator library
- *
- * Copyright (C) 2009 Panasas Inc. [year of first publication]
- * All rights reserved.
- *
- * Benny Halevy <bhalevy@panasas.com>
- * Boaz Harrosh <ooo@electrozaur.com>
- *
- * This program is free software; you can redistribute it and/or modify
- * it under the terms of the GNU General Public License version 2
- * See the file COPYING included with this distribution for more details.
- *
- * Redistribution and use in source and binary forms, with or without
- * modification, are permitted provided that the following conditions
- * are met:
- *
- * 1. Redistributions of source code must retain the above copyright
- * notice, this list of conditions and the following disclaimer.
- * 2. Redistributions in binary form must reproduce the above copyright
- * notice, this list of conditions and the following disclaimer in the
- * documentation and/or other materials provided with the distribution.
- * 3. Neither the name of the Panasas company nor the names of its
- * contributors may be used to endorse or promote products derived
- * from this software without specific prior written permission.
- *
- * THIS SOFTWARE IS PROVIDED ``AS IS'' AND ANY EXPRESS OR IMPLIED
- * WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF
- * MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
- * DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
- * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
- * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
- * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR
- * BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
- * LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
- * NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
- * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
- */
-
-#include <linux/module.h>
-#include <scsi/osd_ore.h>
-
-#include "objlayout.h"
-#include "../internal.h"
-
-#define NFSDBG_FACILITY NFSDBG_PNFS_LD
-
-struct objio_dev_ent {
- struct nfs4_deviceid_node id_node;
- struct ore_dev od;
-};
-
-static void
-objio_free_deviceid_node(struct nfs4_deviceid_node *d)
-{
- struct objio_dev_ent *de = container_of(d, struct objio_dev_ent, id_node);
-
- dprintk("%s: free od=%p\n", __func__, de->od.od);
- osduld_put_device(de->od.od);
- kfree_rcu(d, rcu);
-}
-
-struct objio_segment {
- struct pnfs_layout_segment lseg;
-
- struct ore_layout layout;
- struct ore_components oc;
-};
-
-static inline struct objio_segment *
-OBJIO_LSEG(struct pnfs_layout_segment *lseg)
-{
- return container_of(lseg, struct objio_segment, lseg);
-}
-
-struct objio_state {
- /* Generic layer */
- struct objlayout_io_res oir;
-
- bool sync;
- /*FIXME: Support for extra_bytes at ore_get_rw_state() */
- struct ore_io_state *ios;
-};
-
-/* Send and wait for a get_device_info of devices in the layout,
- then look them up with the osd_initiator library */
-struct nfs4_deviceid_node *
-objio_alloc_deviceid_node(struct nfs_server *server, struct pnfs_device *pdev,
- gfp_t gfp_flags)
-{
- struct pnfs_osd_deviceaddr *deviceaddr;
- struct objio_dev_ent *ode = NULL;
- struct osd_dev *od;
- struct osd_dev_info odi;
- bool retry_flag = true;
- __be32 *p;
- int err;
-
- deviceaddr = kzalloc(sizeof(*deviceaddr), gfp_flags);
- if (!deviceaddr)
- return NULL;
-
- p = page_address(pdev->pages[0]);
- pnfs_osd_xdr_decode_deviceaddr(deviceaddr, p);
-
- odi.systemid_len = deviceaddr->oda_systemid.len;
- if (odi.systemid_len > sizeof(odi.systemid)) {
- dprintk("%s: odi.systemid_len > sizeof(systemid=%zd)\n",
- __func__, sizeof(odi.systemid));
- err = -EINVAL;
- goto out;
- } else if (odi.systemid_len)
- memcpy(odi.systemid, deviceaddr->oda_systemid.data,
- odi.systemid_len);
- odi.osdname_len = deviceaddr->oda_osdname.len;
- odi.osdname = (u8 *)deviceaddr->oda_osdname.data;
-
- if (!odi.osdname_len && !odi.systemid_len) {
- dprintk("%s: !odi.osdname_len && !odi.systemid_len\n",
- __func__);
- err = -ENODEV;
- goto out;
- }
-
-retry_lookup:
- od = osduld_info_lookup(&odi);
- if (IS_ERR(od)) {
- err = PTR_ERR(od);
- dprintk("%s: osduld_info_lookup => %d\n", __func__, err);
- if (err == -ENODEV && retry_flag) {
- err = objlayout_autologin(deviceaddr);
- if (likely(!err)) {
- retry_flag = false;
- goto retry_lookup;
- }
- }
- goto out;
- }
-
- dprintk("Adding new dev_id(%llx:%llx)\n",
- _DEVID_LO(&pdev->dev_id), _DEVID_HI(&pdev->dev_id));
-
- ode = kzalloc(sizeof(*ode), gfp_flags);
- if (!ode) {
- dprintk("%s: -ENOMEM od=%p\n", __func__, od);
- goto out;
- }
-
- nfs4_init_deviceid_node(&ode->id_node, server, &pdev->dev_id);
- kfree(deviceaddr);
-
- ode->od.od = od;
- return &ode->id_node;
-
-out:
- kfree(deviceaddr);
- return NULL;
-}
-
-static void copy_single_comp(struct ore_components *oc, unsigned c,
- struct pnfs_osd_object_cred *src_comp)
-{
- struct ore_comp *ocomp = &oc->comps[c];
-
- WARN_ON(src_comp->oc_cap_key.cred_len > 0); /* libosd is NO_SEC only */
- WARN_ON(src_comp->oc_cap.cred_len > sizeof(ocomp->cred));
-
- ocomp->obj.partition = src_comp->oc_object_id.oid_partition_id;
- ocomp->obj.id = src_comp->oc_object_id.oid_object_id;
-
- memcpy(ocomp->cred, src_comp->oc_cap.cred, sizeof(ocomp->cred));
-}
-
-static int __alloc_objio_seg(unsigned numdevs, gfp_t gfp_flags,
- struct objio_segment **pseg)
-{
-/* This is the in memory structure of the objio_segment
- *
- * struct __alloc_objio_segment {
- * struct objio_segment olseg;
- * struct ore_dev *ods[numdevs];
- * struct ore_comp comps[numdevs];
- * } *aolseg;
- * NOTE: The code as above compiles and runs perfectly. It is elegant,
- * type safe and compact. At some Past time Linus has decided he does not
- * like variable length arrays, For the sake of this principal we uglify
- * the code as below.
- */
- struct objio_segment *lseg;
- size_t lseg_size = sizeof(*lseg) +
- numdevs * sizeof(lseg->oc.ods[0]) +
- numdevs * sizeof(*lseg->oc.comps);
-
- lseg = kzalloc(lseg_size, gfp_flags);
- if (unlikely(!lseg)) {
- dprintk("%s: Failed allocation numdevs=%d size=%zd\n", __func__,
- numdevs, lseg_size);
- return -ENOMEM;
- }
-
- lseg->oc.numdevs = numdevs;
- lseg->oc.single_comp = EC_MULTPLE_COMPS;
- lseg->oc.ods = (void *)(lseg + 1);
- lseg->oc.comps = (void *)(lseg->oc.ods + numdevs);
-
- *pseg = lseg;
- return 0;
-}
-
-int objio_alloc_lseg(struct pnfs_layout_segment **outp,
- struct pnfs_layout_hdr *pnfslay,
- struct pnfs_layout_range *range,
- struct xdr_stream *xdr,
- gfp_t gfp_flags)
-{
- struct nfs_server *server = NFS_SERVER(pnfslay->plh_inode);
- struct objio_segment *objio_seg;
- struct pnfs_osd_xdr_decode_layout_iter iter;
- struct pnfs_osd_layout layout;
- struct pnfs_osd_object_cred src_comp;
- unsigned cur_comp;
- int err;
-
- err = pnfs_osd_xdr_decode_layout_map(&layout, &iter, xdr);
- if (unlikely(err))
- return err;
-
- err = __alloc_objio_seg(layout.olo_num_comps, gfp_flags, &objio_seg);
- if (unlikely(err))
- return err;
-
- objio_seg->layout.stripe_unit = layout.olo_map.odm_stripe_unit;
- objio_seg->layout.group_width = layout.olo_map.odm_group_width;
- objio_seg->layout.group_depth = layout.olo_map.odm_group_depth;
- objio_seg->layout.mirrors_p1 = layout.olo_map.odm_mirror_cnt + 1;
- objio_seg->layout.raid_algorithm = layout.olo_map.odm_raid_algorithm;
-
- err = ore_verify_layout(layout.olo_map.odm_num_comps,
- &objio_seg->layout);
- if (unlikely(err))
- goto err;
-
- objio_seg->oc.first_dev = layout.olo_comps_index;
- cur_comp = 0;
- while (pnfs_osd_xdr_decode_layout_comp(&src_comp, &iter, xdr, &err)) {
- struct nfs4_deviceid_node *d;
- struct objio_dev_ent *ode;
-
- copy_single_comp(&objio_seg->oc, cur_comp, &src_comp);
-
- d = nfs4_find_get_deviceid(server,
- &src_comp.oc_object_id.oid_device_id,
- pnfslay->plh_lc_cred, gfp_flags);
- if (!d) {
- err = -ENXIO;
- goto err;
- }
-
- ode = container_of(d, struct objio_dev_ent, id_node);
- objio_seg->oc.ods[cur_comp++] = &ode->od;
- }
- /* pnfs_osd_xdr_decode_layout_comp returns false on error */
- if (unlikely(err))
- goto err;
-
- *outp = &objio_seg->lseg;
- return 0;
-
-err:
- kfree(objio_seg);
- dprintk("%s: Error: return %d\n", __func__, err);
- *outp = NULL;
- return err;
-}
-
-void objio_free_lseg(struct pnfs_layout_segment *lseg)
-{
- int i;
- struct objio_segment *objio_seg = OBJIO_LSEG(lseg);
-
- for (i = 0; i < objio_seg->oc.numdevs; i++) {
- struct ore_dev *od = objio_seg->oc.ods[i];
- struct objio_dev_ent *ode;
-
- if (!od)
- break;
- ode = container_of(od, typeof(*ode), od);
- nfs4_put_deviceid_node(&ode->id_node);
- }
- kfree(objio_seg);
-}
-
-static int
-objio_alloc_io_state(struct pnfs_layout_hdr *pnfs_layout_type, bool is_reading,
- struct pnfs_layout_segment *lseg, struct page **pages, unsigned pgbase,
- loff_t offset, size_t count, void *rpcdata, gfp_t gfp_flags,
- struct objio_state **outp)
-{
- struct objio_segment *objio_seg = OBJIO_LSEG(lseg);
- struct ore_io_state *ios;
- int ret;
- struct __alloc_objio_state {
- struct objio_state objios;
- struct pnfs_osd_ioerr ioerrs[objio_seg->oc.numdevs];
- } *aos;
-
- aos = kzalloc(sizeof(*aos), gfp_flags);
- if (unlikely(!aos))
- return -ENOMEM;
-
- objlayout_init_ioerrs(&aos->objios.oir, objio_seg->oc.numdevs,
- aos->ioerrs, rpcdata, pnfs_layout_type);
-
- ret = ore_get_rw_state(&objio_seg->layout, &objio_seg->oc, is_reading,
- offset, count, &ios);
- if (unlikely(ret)) {
- kfree(aos);
- return ret;
- }
-
- ios->pages = pages;
- ios->pgbase = pgbase;
- ios->private = aos;
- BUG_ON(ios->nr_pages > (pgbase + count + PAGE_SIZE - 1) >> PAGE_SHIFT);
-
- aos->objios.sync = 0;
- aos->objios.ios = ios;
- *outp = &aos->objios;
- return 0;
-}
-
-void objio_free_result(struct objlayout_io_res *oir)
-{
- struct objio_state *objios = container_of(oir, struct objio_state, oir);
-
- ore_put_io_state(objios->ios);
- kfree(objios);
-}
-
-static enum pnfs_osd_errno osd_pri_2_pnfs_err(enum osd_err_priority oep)
-{
- switch (oep) {
- case OSD_ERR_PRI_NO_ERROR:
- return (enum pnfs_osd_errno)0;
-
- case OSD_ERR_PRI_CLEAR_PAGES:
- BUG_ON(1);
- return 0;
-
- case OSD_ERR_PRI_RESOURCE:
- return PNFS_OSD_ERR_RESOURCE;
- case OSD_ERR_PRI_BAD_CRED:
- return PNFS_OSD_ERR_BAD_CRED;
- case OSD_ERR_PRI_NO_ACCESS:
- return PNFS_OSD_ERR_NO_ACCESS;
- case OSD_ERR_PRI_UNREACHABLE:
- return PNFS_OSD_ERR_UNREACHABLE;
- case OSD_ERR_PRI_NOT_FOUND:
- return PNFS_OSD_ERR_NOT_FOUND;
- case OSD_ERR_PRI_NO_SPACE:
- return PNFS_OSD_ERR_NO_SPACE;
- default:
- WARN_ON(1);
- /* fallthrough */
- case OSD_ERR_PRI_EIO:
- return PNFS_OSD_ERR_EIO;
- }
-}
-
-static void __on_dev_error(struct ore_io_state *ios,
- struct ore_dev *od, unsigned dev_index, enum osd_err_priority oep,
- u64 dev_offset, u64 dev_len)
-{
- struct objio_state *objios = ios->private;
- struct pnfs_osd_objid pooid;
- struct objio_dev_ent *ode = container_of(od, typeof(*ode), od);
- /* FIXME: what to do with more-then-one-group layouts. We need to
- * translate from ore_io_state index to oc->comps index
- */
- unsigned comp = dev_index;
-
- pooid.oid_device_id = ode->id_node.deviceid;
- pooid.oid_partition_id = ios->oc->comps[comp].obj.partition;
- pooid.oid_object_id = ios->oc->comps[comp].obj.id;
-
- objlayout_io_set_result(&objios->oir, comp,
- &pooid, osd_pri_2_pnfs_err(oep),
- dev_offset, dev_len, !ios->reading);
-}
-
-/*
- * read
- */
-static void _read_done(struct ore_io_state *ios, void *private)
-{
- struct objio_state *objios = private;
- ssize_t status;
- int ret = ore_check_io(ios, &__on_dev_error);
-
- /* FIXME: _io_free(ios) can we dealocate the libosd resources; */
-
- if (likely(!ret))
- status = ios->length;
- else
- status = ret;
-
- objlayout_read_done(&objios->oir, status, objios->sync);
-}
-
-int objio_read_pagelist(struct nfs_pgio_header *hdr)
-{
- struct objio_state *objios;
- int ret;
-
- ret = objio_alloc_io_state(NFS_I(hdr->inode)->layout, true,
- hdr->lseg, hdr->args.pages, hdr->args.pgbase,
- hdr->args.offset, hdr->args.count, hdr,
- GFP_KERNEL, &objios);
- if (unlikely(ret))
- return ret;
-
- objios->ios->done = _read_done;
- dprintk("%s: offset=0x%llx length=0x%x\n", __func__,
- hdr->args.offset, hdr->args.count);
- ret = ore_read(objios->ios);
- if (unlikely(ret))
- objio_free_result(&objios->oir);
- return ret;
-}
-
-/*
- * write
- */
-static void _write_done(struct ore_io_state *ios, void *private)
-{
- struct objio_state *objios = private;
- ssize_t status;
- int ret = ore_check_io(ios, &__on_dev_error);
-
- /* FIXME: _io_free(ios) can we dealocate the libosd resources; */
-
- if (likely(!ret)) {
- /* FIXME: should be based on the OSD's persistence model
- * See OSD2r05 Section 4.13 Data persistence model */
- objios->oir.committed = NFS_FILE_SYNC;
- status = ios->length;
- } else {
- status = ret;
- }
-
- objlayout_write_done(&objios->oir, status, objios->sync);
-}
-
-static struct page *__r4w_get_page(void *priv, u64 offset, bool *uptodate)
-{
- struct objio_state *objios = priv;
- struct nfs_pgio_header *hdr = objios->oir.rpcdata;
- struct address_space *mapping = hdr->inode->i_mapping;
- pgoff_t index = offset / PAGE_SIZE;
- struct page *page;
- loff_t i_size = i_size_read(hdr->inode);
-
- if (offset >= i_size) {
- *uptodate = true;
- dprintk("%s: g_zero_page index=0x%lx\n", __func__, index);
- return ZERO_PAGE(0);
- }
-
- page = find_get_page(mapping, index);
- if (!page) {
- page = find_or_create_page(mapping, index, GFP_NOFS);
- if (unlikely(!page)) {
- dprintk("%s: grab_cache_page Failed index=0x%lx\n",
- __func__, index);
- return NULL;
- }
- unlock_page(page);
- }
- *uptodate = PageUptodate(page);
- dprintk("%s: index=0x%lx uptodate=%d\n", __func__, index, *uptodate);
- return page;
-}
-
-static void __r4w_put_page(void *priv, struct page *page)
-{
- dprintk("%s: index=0x%lx\n", __func__,
- (page == ZERO_PAGE(0)) ? -1UL : page->index);
- if (ZERO_PAGE(0) != page)
- put_page(page);
- return;
-}
-
-static const struct _ore_r4w_op _r4w_op = {
- .get_page = &__r4w_get_page,
- .put_page = &__r4w_put_page,
-};
-
-int objio_write_pagelist(struct nfs_pgio_header *hdr, int how)
-{
- struct objio_state *objios;
- int ret;
-
- ret = objio_alloc_io_state(NFS_I(hdr->inode)->layout, false,
- hdr->lseg, hdr->args.pages, hdr->args.pgbase,
- hdr->args.offset, hdr->args.count, hdr, GFP_NOFS,
- &objios);
- if (unlikely(ret))
- return ret;
-
- objios->sync = 0 != (how & FLUSH_SYNC);
- objios->ios->r4w = &_r4w_op;
-
- if (!objios->sync)
- objios->ios->done = _write_done;
-
- dprintk("%s: offset=0x%llx length=0x%x\n", __func__,
- hdr->args.offset, hdr->args.count);
- ret = ore_write(objios->ios);
- if (unlikely(ret)) {
- objio_free_result(&objios->oir);
- return ret;
- }
-
- if (objios->sync)
- _write_done(objios->ios, objios);
-
- return 0;
-}
-
-/*
- * Return 0 if @req cannot be coalesced into @pgio, otherwise return the number
- * of bytes (maximum @req->wb_bytes) that can be coalesced.
- */
-static size_t objio_pg_test(struct nfs_pageio_descriptor *pgio,
- struct nfs_page *prev, struct nfs_page *req)
-{
- struct nfs_pgio_mirror *mirror = nfs_pgio_current_mirror(pgio);
- unsigned int size;
-
- size = pnfs_generic_pg_test(pgio, prev, req);
-
- if (!size || mirror->pg_count + req->wb_bytes >
- (unsigned long)pgio->pg_layout_private)
- return 0;
-
- return min(size, req->wb_bytes);
-}
-
-static void objio_init_read(struct nfs_pageio_descriptor *pgio, struct nfs_page *req)
-{
- pnfs_generic_pg_init_read(pgio, req);
- if (unlikely(pgio->pg_lseg == NULL))
- return; /* Not pNFS */
-
- pgio->pg_layout_private = (void *)
- OBJIO_LSEG(pgio->pg_lseg)->layout.max_io_length;
-}
-
-static bool aligned_on_raid_stripe(u64 offset, struct ore_layout *layout,
- unsigned long *stripe_end)
-{
- u32 stripe_off;
- unsigned stripe_size;
-
- if (layout->raid_algorithm == PNFS_OSD_RAID_0)
- return true;
-
- stripe_size = layout->stripe_unit *
- (layout->group_width - layout->parity);
-
- div_u64_rem(offset, stripe_size, &stripe_off);
- if (!stripe_off)
- return true;
-
- *stripe_end = stripe_size - stripe_off;
- return false;
-}
-
-static void objio_init_write(struct nfs_pageio_descriptor *pgio, struct nfs_page *req)
-{
- unsigned long stripe_end = 0;
- u64 wb_size;
-
- if (pgio->pg_dreq == NULL)
- wb_size = i_size_read(pgio->pg_inode) - req_offset(req);
- else
- wb_size = nfs_dreq_bytes_left(pgio->pg_dreq);
-
- pnfs_generic_pg_init_write(pgio, req, wb_size);
- if (unlikely(pgio->pg_lseg == NULL))
- return; /* Not pNFS */
-
- if (req->wb_offset ||
- !aligned_on_raid_stripe(req->wb_index * PAGE_SIZE,
- &OBJIO_LSEG(pgio->pg_lseg)->layout,
- &stripe_end)) {
- pgio->pg_layout_private = (void *)stripe_end;
- } else {
- pgio->pg_layout_private = (void *)
- OBJIO_LSEG(pgio->pg_lseg)->layout.max_io_length;
- }
-}
-
-static const struct nfs_pageio_ops objio_pg_read_ops = {
- .pg_init = objio_init_read,
- .pg_test = objio_pg_test,
- .pg_doio = pnfs_generic_pg_readpages,
- .pg_cleanup = pnfs_generic_pg_cleanup,
-};
-
-static const struct nfs_pageio_ops objio_pg_write_ops = {
- .pg_init = objio_init_write,
- .pg_test = objio_pg_test,
- .pg_doio = pnfs_generic_pg_writepages,
- .pg_cleanup = pnfs_generic_pg_cleanup,
-};
-
-static struct pnfs_layoutdriver_type objlayout_type = {
- .id = LAYOUT_OSD2_OBJECTS,
- .name = "LAYOUT_OSD2_OBJECTS",
- .flags = PNFS_LAYOUTRET_ON_SETATTR |
- PNFS_LAYOUTRET_ON_ERROR,
-
- .max_deviceinfo_size = PAGE_SIZE,
- .owner = THIS_MODULE,
- .alloc_layout_hdr = objlayout_alloc_layout_hdr,
- .free_layout_hdr = objlayout_free_layout_hdr,
-
- .alloc_lseg = objlayout_alloc_lseg,
- .free_lseg = objlayout_free_lseg,
-
- .read_pagelist = objlayout_read_pagelist,
- .write_pagelist = objlayout_write_pagelist,
- .pg_read_ops = &objio_pg_read_ops,
- .pg_write_ops = &objio_pg_write_ops,
-
- .sync = pnfs_generic_sync,
-
- .free_deviceid_node = objio_free_deviceid_node,
-
- .encode_layoutcommit = objlayout_encode_layoutcommit,
- .encode_layoutreturn = objlayout_encode_layoutreturn,
-};
-
-MODULE_DESCRIPTION("pNFS Layout Driver for OSD2 objects");
-MODULE_AUTHOR("Benny Halevy <bhalevy@panasas.com>");
-MODULE_LICENSE("GPL");
-
-static int __init
-objlayout_init(void)
-{
- int ret = pnfs_register_layoutdriver(&objlayout_type);
-
- if (ret)
- printk(KERN_INFO
- "NFS: %s: Registering OSD pNFS Layout Driver failed: error=%d\n",
- __func__, ret);
- else
- printk(KERN_INFO "NFS: %s: Registered OSD pNFS Layout Driver\n",
- __func__);
- return ret;
-}
-
-static void __exit
-objlayout_exit(void)
-{
- pnfs_unregister_layoutdriver(&objlayout_type);
- printk(KERN_INFO "NFS: %s: Unregistered OSD pNFS Layout Driver\n",
- __func__);
-}
-
-MODULE_ALIAS("nfs-layouttype4-2");
-
-module_init(objlayout_init);
-module_exit(objlayout_exit);
diff --git a/fs/nfs/objlayout/objlayout.c b/fs/nfs/objlayout/objlayout.c
deleted file mode 100644
index 8f3d2acb81c3..000000000000
--- a/fs/nfs/objlayout/objlayout.c
+++ /dev/null
@@ -1,706 +0,0 @@
-/*
- * pNFS Objects layout driver high level definitions
- *
- * Copyright (C) 2007 Panasas Inc. [year of first publication]
- * All rights reserved.
- *
- * Benny Halevy <bhalevy@panasas.com>
- * Boaz Harrosh <ooo@electrozaur.com>
- *
- * This program is free software; you can redistribute it and/or modify
- * it under the terms of the GNU General Public License version 2
- * See the file COPYING included with this distribution for more details.
- *
- * Redistribution and use in source and binary forms, with or without
- * modification, are permitted provided that the following conditions
- * are met:
- *
- * 1. Redistributions of source code must retain the above copyright
- * notice, this list of conditions and the following disclaimer.
- * 2. Redistributions in binary form must reproduce the above copyright
- * notice, this list of conditions and the following disclaimer in the
- * documentation and/or other materials provided with the distribution.
- * 3. Neither the name of the Panasas company nor the names of its
- * contributors may be used to endorse or promote products derived
- * from this software without specific prior written permission.
- *
- * THIS SOFTWARE IS PROVIDED ``AS IS'' AND ANY EXPRESS OR IMPLIED
- * WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF
- * MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
- * DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
- * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
- * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
- * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR
- * BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
- * LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
- * NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
- * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
- */
-
-#include <linux/kmod.h>
-#include <linux/moduleparam.h>
-#include <linux/ratelimit.h>
-#include <scsi/osd_initiator.h>
-#include "objlayout.h"
-
-#define NFSDBG_FACILITY NFSDBG_PNFS_LD
-/*
- * Create a objlayout layout structure for the given inode and return it.
- */
-struct pnfs_layout_hdr *
-objlayout_alloc_layout_hdr(struct inode *inode, gfp_t gfp_flags)
-{
- struct objlayout *objlay;
-
- objlay = kzalloc(sizeof(struct objlayout), gfp_flags);
- if (!objlay)
- return NULL;
- spin_lock_init(&objlay->lock);
- INIT_LIST_HEAD(&objlay->err_list);
- dprintk("%s: Return %p\n", __func__, objlay);
- return &objlay->pnfs_layout;
-}
-
-/*
- * Free an objlayout layout structure
- */
-void
-objlayout_free_layout_hdr(struct pnfs_layout_hdr *lo)
-{
- struct objlayout *objlay = OBJLAYOUT(lo);
-
- dprintk("%s: objlay %p\n", __func__, objlay);
-
- WARN_ON(!list_empty(&objlay->err_list));
- kfree(objlay);
-}
-
-/*
- * Unmarshall layout and store it in pnfslay.
- */
-struct pnfs_layout_segment *
-objlayout_alloc_lseg(struct pnfs_layout_hdr *pnfslay,
- struct nfs4_layoutget_res *lgr,
- gfp_t gfp_flags)
-{
- int status = -ENOMEM;
- struct xdr_stream stream;
- struct xdr_buf buf = {
- .pages = lgr->layoutp->pages,
- .page_len = lgr->layoutp->len,
- .buflen = lgr->layoutp->len,
- .len = lgr->layoutp->len,
- };
- struct page *scratch;
- struct pnfs_layout_segment *lseg;
-
- dprintk("%s: Begin pnfslay %p\n", __func__, pnfslay);
-
- scratch = alloc_page(gfp_flags);
- if (!scratch)
- goto err_nofree;
-
- xdr_init_decode(&stream, &buf, NULL);
- xdr_set_scratch_buffer(&stream, page_address(scratch), PAGE_SIZE);
-
- status = objio_alloc_lseg(&lseg, pnfslay, &lgr->range, &stream, gfp_flags);
- if (unlikely(status)) {
- dprintk("%s: objio_alloc_lseg Return err %d\n", __func__,
- status);
- goto err;
- }
-
- __free_page(scratch);
-
- dprintk("%s: Return %p\n", __func__, lseg);
- return lseg;
-
-err:
- __free_page(scratch);
-err_nofree:
- dprintk("%s: Err Return=>%d\n", __func__, status);
- return ERR_PTR(status);
-}
-
-/*
- * Free a layout segement
- */
-void
-objlayout_free_lseg(struct pnfs_layout_segment *lseg)
-{
- dprintk("%s: freeing layout segment %p\n", __func__, lseg);
-
- if (unlikely(!lseg))
- return;
-
- objio_free_lseg(lseg);
-}
-
-/*
- * I/O Operations
- */
-static inline u64
-end_offset(u64 start, u64 len)
-{
- u64 end;
-
- end = start + len;
- return end >= start ? end : NFS4_MAX_UINT64;
-}
-
-static void _fix_verify_io_params(struct pnfs_layout_segment *lseg,
- struct page ***p_pages, unsigned *p_pgbase,
- u64 offset, unsigned long count)
-{
- u64 lseg_end_offset;
-
- BUG_ON(offset < lseg->pls_range.offset);
- lseg_end_offset = end_offset(lseg->pls_range.offset,
- lseg->pls_range.length);
- BUG_ON(offset >= lseg_end_offset);
- WARN_ON(offset + count > lseg_end_offset);
-
- if (*p_pgbase > PAGE_SIZE) {
- dprintk("%s: pgbase(0x%x) > PAGE_SIZE\n", __func__, *p_pgbase);
- *p_pages += *p_pgbase >> PAGE_SHIFT;
- *p_pgbase &= ~PAGE_MASK;
- }
-}
-
-/*
- * I/O done common code
- */
-static void
-objlayout_iodone(struct objlayout_io_res *oir)
-{
- if (likely(oir->status >= 0)) {
- objio_free_result(oir);
- } else {
- struct objlayout *objlay = oir->objlay;
-
- spin_lock(&objlay->lock);
- objlay->delta_space_valid = OBJ_DSU_INVALID;
- list_add(&objlay->err_list, &oir->err_list);
- spin_unlock(&objlay->lock);
- }
-}
-
-/*
- * objlayout_io_set_result - Set an osd_error code on a specific osd comp.
- *
- * The @index component IO failed (error returned from target). Register
- * the error for later reporting at layout-return.
- */
-void
-objlayout_io_set_result(struct objlayout_io_res *oir, unsigned index,
- struct pnfs_osd_objid *pooid, int osd_error,
- u64 offset, u64 length, bool is_write)
-{
- struct pnfs_osd_ioerr *ioerr = &oir->ioerrs[index];
-
- BUG_ON(index >= oir->num_comps);
- if (osd_error) {
- ioerr->oer_component = *pooid;
- ioerr->oer_comp_offset = offset;
- ioerr->oer_comp_length = length;
- ioerr->oer_iswrite = is_write;
- ioerr->oer_errno = osd_error;
-
- dprintk("%s: err[%d]: errno=%d is_write=%d dev(%llx:%llx) "
- "par=0x%llx obj=0x%llx offset=0x%llx length=0x%llx\n",
- __func__, index, ioerr->oer_errno,
- ioerr->oer_iswrite,
- _DEVID_LO(&ioerr->oer_component.oid_device_id),
- _DEVID_HI(&ioerr->oer_component.oid_device_id),
- ioerr->oer_component.oid_partition_id,
- ioerr->oer_component.oid_object_id,
- ioerr->oer_comp_offset,
- ioerr->oer_comp_length);
- } else {
- /* User need not call if no error is reported */
- ioerr->oer_errno = 0;
- }
-}
-
-/* Function scheduled on rpc workqueue to call ->nfs_readlist_complete().
- * This is because the osd completion is called with ints-off from
- * the block layer
- */
-static void _rpc_read_complete(struct work_struct *work)
-{
- struct rpc_task *task;
- struct nfs_pgio_header *hdr;
-
- dprintk("%s enter\n", __func__);
- task = container_of(work, struct rpc_task, u.tk_work);
- hdr = container_of(task, struct nfs_pgio_header, task);
-
- pnfs_ld_read_done(hdr);
-}
-
-void
-objlayout_read_done(struct objlayout_io_res *oir, ssize_t status, bool sync)
-{
- struct nfs_pgio_header *hdr = oir->rpcdata;
-
- oir->status = hdr->task.tk_status = status;
- if (status >= 0)
- hdr->res.count = status;
- else
- hdr->pnfs_error = status;
- objlayout_iodone(oir);
- /* must not use oir after this point */
-
- dprintk("%s: Return status=%zd eof=%d sync=%d\n", __func__,
- status, hdr->res.eof, sync);
-
- if (sync)
- pnfs_ld_read_done(hdr);
- else {
- INIT_WORK(&hdr->task.u.tk_work, _rpc_read_complete);
- schedule_work(&hdr->task.u.tk_work);
- }
-}
-
-/*
- * Perform sync or async reads.
- */
-enum pnfs_try_status
-objlayout_read_pagelist(struct nfs_pgio_header *hdr)
-{
- struct inode *inode = hdr->inode;
- loff_t offset = hdr->args.offset;
- size_t count = hdr->args.count;
- int err;
- loff_t eof;
-
- eof = i_size_read(inode);
- if (unlikely(offset + count > eof)) {
- if (offset >= eof) {
- err = 0;
- hdr->res.count = 0;
- hdr->res.eof = 1;
- /*FIXME: do we need to call pnfs_ld_read_done() */
- goto out;
- }
- count = eof - offset;
- }
-
- hdr->res.eof = (offset + count) >= eof;
- _fix_verify_io_params(hdr->lseg, &hdr->args.pages,
- &hdr->args.pgbase,
- hdr->args.offset, hdr->args.count);
-
- dprintk("%s: inode(%lx) offset 0x%llx count 0x%zx eof=%d\n",
- __func__, inode->i_ino, offset, count, hdr->res.eof);
-
- err = objio_read_pagelist(hdr);
- out:
- if (unlikely(err)) {
- hdr->pnfs_error = err;
- dprintk("%s: Returned Error %d\n", __func__, err);
- return PNFS_NOT_ATTEMPTED;
- }
- return PNFS_ATTEMPTED;
-}
-
-/* Function scheduled on rpc workqueue to call ->nfs_writelist_complete().
- * This is because the osd completion is called with ints-off from
- * the block layer
- */
-static void _rpc_write_complete(struct work_struct *work)
-{
- struct rpc_task *task;
- struct nfs_pgio_header *hdr;
-
- dprintk("%s enter\n", __func__);
- task = container_of(work, struct rpc_task, u.tk_work);
- hdr = container_of(task, struct nfs_pgio_header, task);
-
- pnfs_ld_write_done(hdr);
-}
-
-void
-objlayout_write_done(struct objlayout_io_res *oir, ssize_t status, bool sync)
-{
- struct nfs_pgio_header *hdr = oir->rpcdata;
-
- oir->status = hdr->task.tk_status = status;
- if (status >= 0) {
- hdr->res.count = status;
- hdr->verf.committed = oir->committed;
- } else {
- hdr->pnfs_error = status;
- }
- objlayout_iodone(oir);
- /* must not use oir after this point */
-
- dprintk("%s: Return status %zd committed %d sync=%d\n", __func__,
- status, hdr->verf.committed, sync);
-
- if (sync)
- pnfs_ld_write_done(hdr);
- else {
- INIT_WORK(&hdr->task.u.tk_work, _rpc_write_complete);
- schedule_work(&hdr->task.u.tk_work);
- }
-}
-
-/*
- * Perform sync or async writes.
- */
-enum pnfs_try_status
-objlayout_write_pagelist(struct nfs_pgio_header *hdr, int how)
-{
- int err;
-
- _fix_verify_io_params(hdr->lseg, &hdr->args.pages,
- &hdr->args.pgbase,
- hdr->args.offset, hdr->args.count);
-
- err = objio_write_pagelist(hdr, how);
- if (unlikely(err)) {
- hdr->pnfs_error = err;
- dprintk("%s: Returned Error %d\n", __func__, err);
- return PNFS_NOT_ATTEMPTED;
- }
- return PNFS_ATTEMPTED;
-}
-
-void
-objlayout_encode_layoutcommit(struct pnfs_layout_hdr *pnfslay,
- struct xdr_stream *xdr,
- const struct nfs4_layoutcommit_args *args)
-{
- struct objlayout *objlay = OBJLAYOUT(pnfslay);
- struct pnfs_osd_layoutupdate lou;
- __be32 *start;
-
- dprintk("%s: Begin\n", __func__);
-
- spin_lock(&objlay->lock);
- lou.dsu_valid = (objlay->delta_space_valid == OBJ_DSU_VALID);
- lou.dsu_delta = objlay->delta_space_used;
- objlay->delta_space_used = 0;
- objlay->delta_space_valid = OBJ_DSU_INIT;
- lou.olu_ioerr_flag = !list_empty(&objlay->err_list);
- spin_unlock(&objlay->lock);
-
- start = xdr_reserve_space(xdr, 4);
-
- BUG_ON(pnfs_osd_xdr_encode_layoutupdate(xdr, &lou));
-
- *start = cpu_to_be32((xdr->p - start - 1) * 4);
-
- dprintk("%s: Return delta_space_used %lld err %d\n", __func__,
- lou.dsu_delta, lou.olu_ioerr_flag);
-}
-
-static int
-err_prio(u32 oer_errno)
-{
- switch (oer_errno) {
- case 0:
- return 0;
-
- case PNFS_OSD_ERR_RESOURCE:
- return OSD_ERR_PRI_RESOURCE;
- case PNFS_OSD_ERR_BAD_CRED:
- return OSD_ERR_PRI_BAD_CRED;
- case PNFS_OSD_ERR_NO_ACCESS:
- return OSD_ERR_PRI_NO_ACCESS;
- case PNFS_OSD_ERR_UNREACHABLE:
- return OSD_ERR_PRI_UNREACHABLE;
- case PNFS_OSD_ERR_NOT_FOUND:
- return OSD_ERR_PRI_NOT_FOUND;
- case PNFS_OSD_ERR_NO_SPACE:
- return OSD_ERR_PRI_NO_SPACE;
- default:
- WARN_ON(1);
- /* fallthrough */
- case PNFS_OSD_ERR_EIO:
- return OSD_ERR_PRI_EIO;
- }
-}
-
-static void
-merge_ioerr(struct pnfs_osd_ioerr *dest_err,
- const struct pnfs_osd_ioerr *src_err)
-{
- u64 dest_end, src_end;
-
- if (!dest_err->oer_errno) {
- *dest_err = *src_err;
- /* accumulated device must be blank */
- memset(&dest_err->oer_component.oid_device_id, 0,
- sizeof(dest_err->oer_component.oid_device_id));
-
- return;
- }
-
- if (dest_err->oer_component.oid_partition_id !=
- src_err->oer_component.oid_partition_id)
- dest_err->oer_component.oid_partition_id = 0;
-
- if (dest_err->oer_component.oid_object_id !=
- src_err->oer_component.oid_object_id)
- dest_err->oer_component.oid_object_id = 0;
-
- if (dest_err->oer_comp_offset > src_err->oer_comp_offset)
- dest_err->oer_comp_offset = src_err->oer_comp_offset;
-
- dest_end = end_offset(dest_err->oer_comp_offset,
- dest_err->oer_comp_length);
- src_end = end_offset(src_err->oer_comp_offset,
- src_err->oer_comp_length);
- if (dest_end < src_end)
- dest_end = src_end;
-
- dest_err->oer_comp_length = dest_end - dest_err->oer_comp_offset;
-
- if ((src_err->oer_iswrite == dest_err->oer_iswrite) &&
- (err_prio(src_err->oer_errno) > err_prio(dest_err->oer_errno))) {
- dest_err->oer_errno = src_err->oer_errno;
- } else if (src_err->oer_iswrite) {
- dest_err->oer_iswrite = true;
- dest_err->oer_errno = src_err->oer_errno;
- }
-}
-
-static void
-encode_accumulated_error(struct objlayout *objlay, __be32 *p)
-{
- struct objlayout_io_res *oir, *tmp;
- struct pnfs_osd_ioerr accumulated_err = {.oer_errno = 0};
-
- list_for_each_entry_safe(oir, tmp, &objlay->err_list, err_list) {
- unsigned i;
-
- for (i = 0; i < oir->num_comps; i++) {
- struct pnfs_osd_ioerr *ioerr = &oir->ioerrs[i];
-
- if (!ioerr->oer_errno)
- continue;
-
- printk(KERN_ERR "NFS: %s: err[%d]: errno=%d "
- "is_write=%d dev(%llx:%llx) par=0x%llx "
- "obj=0x%llx offset=0x%llx length=0x%llx\n",
- __func__, i, ioerr->oer_errno,
- ioerr->oer_iswrite,
- _DEVID_LO(&ioerr->oer_component.oid_device_id),
- _DEVID_HI(&ioerr->oer_component.oid_device_id),
- ioerr->oer_component.oid_partition_id,
- ioerr->oer_component.oid_object_id,
- ioerr->oer_comp_offset,
- ioerr->oer_comp_length);
-
- merge_ioerr(&accumulated_err, ioerr);
- }
- list_del(&oir->err_list);
- objio_free_result(oir);
- }
-
- pnfs_osd_xdr_encode_ioerr(p, &accumulated_err);
-}
-
-void
-objlayout_encode_layoutreturn(struct xdr_stream *xdr,
- const struct nfs4_layoutreturn_args *args)
-{
- struct pnfs_layout_hdr *pnfslay = args->layout;
- struct objlayout *objlay = OBJLAYOUT(pnfslay);
- struct objlayout_io_res *oir, *tmp;
- __be32 *start;
-
- dprintk("%s: Begin\n", __func__);
- start = xdr_reserve_space(xdr, 4);
- BUG_ON(!start);
-
- spin_lock(&objlay->lock);
-
- list_for_each_entry_safe(oir, tmp, &objlay->err_list, err_list) {
- __be32 *last_xdr = NULL, *p;
- unsigned i;
- int res = 0;
-
- for (i = 0; i < oir->num_comps; i++) {
- struct pnfs_osd_ioerr *ioerr = &oir->ioerrs[i];
-
- if (!ioerr->oer_errno)
- continue;
-
- dprintk("%s: err[%d]: errno=%d is_write=%d "
- "dev(%llx:%llx) par=0x%llx obj=0x%llx "
- "offset=0x%llx length=0x%llx\n",
- __func__, i, ioerr->oer_errno,
- ioerr->oer_iswrite,
- _DEVID_LO(&ioerr->oer_component.oid_device_id),
- _DEVID_HI(&ioerr->oer_component.oid_device_id),
- ioerr->oer_component.oid_partition_id,
- ioerr->oer_component.oid_object_id,
- ioerr->oer_comp_offset,
- ioerr->oer_comp_length);
-
- p = pnfs_osd_xdr_ioerr_reserve_space(xdr);
- if (unlikely(!p)) {
- res = -E2BIG;
- break; /* accumulated_error */
- }
-
- last_xdr = p;
- pnfs_osd_xdr_encode_ioerr(p, &oir->ioerrs[i]);
- }
-
- /* TODO: use xdr_write_pages */
- if (unlikely(res)) {
- /* no space for even one error descriptor */
- BUG_ON(!last_xdr);
-
- /* we've encountered a situation with lots and lots of
- * errors and no space to encode them all. Use the last
- * available slot to report the union of all the
- * remaining errors.
- */
- encode_accumulated_error(objlay, last_xdr);
- goto loop_done;
- }
- list_del(&oir->err_list);
- objio_free_result(oir);
- }
-loop_done:
- spin_unlock(&objlay->lock);
-
- *start = cpu_to_be32((xdr->p - start - 1) * 4);
- dprintk("%s: Return\n", __func__);
-}
-
-enum {
- OBJLAYOUT_MAX_URI_LEN = 256, OBJLAYOUT_MAX_OSDNAME_LEN = 64,
- OBJLAYOUT_MAX_SYSID_HEX_LEN = OSD_SYSTEMID_LEN * 2 + 1,
- OSD_LOGIN_UPCALL_PATHLEN = 256
-};
-
-static char osd_login_prog[OSD_LOGIN_UPCALL_PATHLEN] = "/sbin/osd_login";
-
-module_param_string(osd_login_prog, osd_login_prog, sizeof(osd_login_prog),
- 0600);
-MODULE_PARM_DESC(osd_login_prog, "Path to the osd_login upcall program");
-
-struct __auto_login {
- char uri[OBJLAYOUT_MAX_URI_LEN];
- char osdname[OBJLAYOUT_MAX_OSDNAME_LEN];
- char systemid_hex[OBJLAYOUT_MAX_SYSID_HEX_LEN];
-};
-
-static int __objlayout_upcall(struct __auto_login *login)
-{
- static char *envp[] = { "HOME=/",
- "TERM=linux",
- "PATH=/sbin:/usr/sbin:/bin:/usr/bin",
- NULL
- };
- char *argv[8];
- int ret;
-
- if (unlikely(!osd_login_prog[0])) {
- dprintk("%s: osd_login_prog is disabled\n", __func__);
- return -EACCES;
- }
-
- dprintk("%s uri: %s\n", __func__, login->uri);
- dprintk("%s osdname %s\n", __func__, login->osdname);
- dprintk("%s systemid_hex %s\n", __func__, login->systemid_hex);
-
- argv[0] = (char *)osd_login_prog;
- argv[1] = "-u";
- argv[2] = login->uri;
- argv[3] = "-o";
- argv[4] = login->osdname;
- argv[5] = "-s";
- argv[6] = login->systemid_hex;
- argv[7] = NULL;
-
- ret = call_usermodehelper(argv[0], argv, envp, UMH_WAIT_PROC);
- /*
- * Disable the upcall mechanism if we're getting an ENOENT or
- * EACCES error. The admin can re-enable it on the fly by using
- * sysfs to set the objlayoutdriver.osd_login_prog module parameter once
- * the problem has been fixed.
- */
- if (ret == -ENOENT || ret == -EACCES) {
- printk(KERN_ERR "PNFS-OBJ: %s was not found please set "
- "objlayoutdriver.osd_login_prog kernel parameter!\n",
- osd_login_prog);
- osd_login_prog[0] = '\0';
- }
- dprintk("%s %s return value: %d\n", __func__, osd_login_prog, ret);
-
- return ret;
-}
-
-/* Assume dest is all zeros */
-static void __copy_nfsS_and_zero_terminate(struct nfs4_string s,
- char *dest, int max_len,
- const char *var_name)
-{
- if (!s.len)
- return;
-
- if (s.len >= max_len) {
- pr_warn_ratelimited(
- "objlayout_autologin: %s: s.len(%d) >= max_len(%d)",
- var_name, s.len, max_len);
- s.len = max_len - 1; /* space for null terminator */
- }
-
- memcpy(dest, s.data, s.len);
-}
-
-/* Assume sysid is all zeros */
-static void _sysid_2_hex(struct nfs4_string s,
- char sysid[OBJLAYOUT_MAX_SYSID_HEX_LEN])
-{
- int i;
- char *cur;
-
- if (!s.len)
- return;
-
- if (s.len != OSD_SYSTEMID_LEN) {
- pr_warn_ratelimited(
- "objlayout_autologin: systemid_len(%d) != OSD_SYSTEMID_LEN",
- s.len);
- if (s.len > OSD_SYSTEMID_LEN)
- s.len = OSD_SYSTEMID_LEN;
- }
-
- cur = sysid;
- for (i = 0; i < s.len; i++)
- cur = hex_byte_pack(cur, s.data[i]);
-}
-
-int objlayout_autologin(struct pnfs_osd_deviceaddr *deviceaddr)
-{
- int rc;
- struct __auto_login login;
-
- if (!deviceaddr->oda_targetaddr.ota_netaddr.r_addr.len)
- return -ENODEV;
-
- memset(&login, 0, sizeof(login));
- __copy_nfsS_and_zero_terminate(
- deviceaddr->oda_targetaddr.ota_netaddr.r_addr,
- login.uri, sizeof(login.uri), "URI");
-
- __copy_nfsS_and_zero_terminate(
- deviceaddr->oda_osdname,
- login.osdname, sizeof(login.osdname), "OSDNAME");
-
- _sysid_2_hex(deviceaddr->oda_systemid, login.systemid_hex);
-
- rc = __objlayout_upcall(&login);
- if (rc > 0) /* script returns positive values */
- rc = -ENODEV;
-
- return rc;
-}
diff --git a/fs/nfs/objlayout/objlayout.h b/fs/nfs/objlayout/objlayout.h
deleted file mode 100644
index fc94a5872ed4..000000000000
--- a/fs/nfs/objlayout/objlayout.h
+++ /dev/null
@@ -1,183 +0,0 @@
-/*
- * Data types and function declerations for interfacing with the
- * pNFS standard object layout driver.
- *
- * Copyright (C) 2007 Panasas Inc. [year of first publication]
- * All rights reserved.
- *
- * Benny Halevy <bhalevy@panasas.com>
- * Boaz Harrosh <ooo@electrozaur.com>
- *
- * This program is free software; you can redistribute it and/or modify
- * it under the terms of the GNU General Public License version 2
- * See the file COPYING included with this distribution for more details.
- *
- * Redistribution and use in source and binary forms, with or without
- * modification, are permitted provided that the following conditions
- * are met:
- *
- * 1. Redistributions of source code must retain the above copyright
- * notice, this list of conditions and the following disclaimer.
- * 2. Redistributions in binary form must reproduce the above copyright
- * notice, this list of conditions and the following disclaimer in the
- * documentation and/or other materials provided with the distribution.
- * 3. Neither the name of the Panasas company nor the names of its
- * contributors may be used to endorse or promote products derived
- * from this software without specific prior written permission.
- *
- * THIS SOFTWARE IS PROVIDED ``AS IS'' AND ANY EXPRESS OR IMPLIED
- * WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF
- * MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
- * DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
- * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
- * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
- * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR
- * BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
- * LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
- * NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
- * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
- */
-
-#ifndef _OBJLAYOUT_H
-#define _OBJLAYOUT_H
-
-#include <linux/nfs_fs.h>
-#include <linux/pnfs_osd_xdr.h>
-#include "../pnfs.h"
-
-/*
- * per-inode layout
- */
-struct objlayout {
- struct pnfs_layout_hdr pnfs_layout;
-
- /* for layout_commit */
- enum osd_delta_space_valid_enum {
- OBJ_DSU_INIT = 0,
- OBJ_DSU_VALID,
- OBJ_DSU_INVALID,
- } delta_space_valid;
- s64 delta_space_used; /* consumed by write ops */
-
- /* for layout_return */
- spinlock_t lock;
- struct list_head err_list;
-};
-
-static inline struct objlayout *
-OBJLAYOUT(struct pnfs_layout_hdr *lo)
-{
- return container_of(lo, struct objlayout, pnfs_layout);
-}
-
-/*
- * per-I/O operation state
- * embedded in objects provider io_state data structure
- */
-struct objlayout_io_res {
- struct objlayout *objlay;
-
- void *rpcdata;
- int status; /* res */
- int committed; /* res */
-
- /* Error reporting (layout_return) */
- struct list_head err_list;
- unsigned num_comps;
- /* Pointer to array of error descriptors of size num_comps.
- * It should contain as many entries as devices in the osd_layout
- * that participate in the I/O. It is up to the io_engine to allocate
- * needed space and set num_comps.
- */
- struct pnfs_osd_ioerr *ioerrs;
-};
-
-static inline
-void objlayout_init_ioerrs(struct objlayout_io_res *oir, unsigned num_comps,
- struct pnfs_osd_ioerr *ioerrs, void *rpcdata,
- struct pnfs_layout_hdr *pnfs_layout_type)
-{
- oir->objlay = OBJLAYOUT(pnfs_layout_type);
- oir->rpcdata = rpcdata;
- INIT_LIST_HEAD(&oir->err_list);
- oir->num_comps = num_comps;
- oir->ioerrs = ioerrs;
-}
-
-/*
- * Raid engine I/O API
- */
-extern int objio_alloc_lseg(struct pnfs_layout_segment **outp,
- struct pnfs_layout_hdr *pnfslay,
- struct pnfs_layout_range *range,
- struct xdr_stream *xdr,
- gfp_t gfp_flags);
-extern void objio_free_lseg(struct pnfs_layout_segment *lseg);
-
-/* objio_free_result will free these @oir structs received from
- * objlayout_{read,write}_done
- */
-extern void objio_free_result(struct objlayout_io_res *oir);
-
-extern int objio_read_pagelist(struct nfs_pgio_header *rdata);
-extern int objio_write_pagelist(struct nfs_pgio_header *wdata, int how);
-
-/*
- * callback API
- */
-extern void objlayout_io_set_result(struct objlayout_io_res *oir,
- unsigned index, struct pnfs_osd_objid *pooid,
- int osd_error, u64 offset, u64 length, bool is_write);
-
-static inline void
-objlayout_add_delta_space_used(struct objlayout *objlay, s64 space_used)
-{
- /* If one of the I/Os errored out and the delta_space_used was
- * invalid we render the complete report as invalid. Protocol mandate
- * the DSU be accurate or not reported.
- */
- spin_lock(&objlay->lock);
- if (objlay->delta_space_valid != OBJ_DSU_INVALID) {
- objlay->delta_space_valid = OBJ_DSU_VALID;
- objlay->delta_space_used += space_used;
- }
- spin_unlock(&objlay->lock);
-}
-
-extern void objlayout_read_done(struct objlayout_io_res *oir,
- ssize_t status, bool sync);
-extern void objlayout_write_done(struct objlayout_io_res *oir,
- ssize_t status, bool sync);
-
-/*
- * exported generic objects function vectors
- */
-
-extern struct pnfs_layout_hdr *objlayout_alloc_layout_hdr(struct inode *, gfp_t gfp_flags);
-extern void objlayout_free_layout_hdr(struct pnfs_layout_hdr *);
-
-extern struct pnfs_layout_segment *objlayout_alloc_lseg(
- struct pnfs_layout_hdr *,
- struct nfs4_layoutget_res *,
- gfp_t gfp_flags);
-extern void objlayout_free_lseg(struct pnfs_layout_segment *);
-
-extern enum pnfs_try_status objlayout_read_pagelist(
- struct nfs_pgio_header *);
-
-extern enum pnfs_try_status objlayout_write_pagelist(
- struct nfs_pgio_header *,
- int how);
-
-extern void objlayout_encode_layoutcommit(
- struct pnfs_layout_hdr *,
- struct xdr_stream *,
- const struct nfs4_layoutcommit_args *);
-
-extern void objlayout_encode_layoutreturn(
- struct xdr_stream *,
- const struct nfs4_layoutreturn_args *);
-
-extern int objlayout_autologin(struct pnfs_osd_deviceaddr *deviceaddr);
-
-#endif /* _OBJLAYOUT_H */
diff --git a/fs/nfs/objlayout/pnfs_osd_xdr_cli.c b/fs/nfs/objlayout/pnfs_osd_xdr_cli.c
deleted file mode 100644
index f093c7ec983b..000000000000
--- a/fs/nfs/objlayout/pnfs_osd_xdr_cli.c
+++ /dev/null
@@ -1,415 +0,0 @@
-/*
- * Object-Based pNFS Layout XDR layer
- *
- * Copyright (C) 2007 Panasas Inc. [year of first publication]
- * All rights reserved.
- *
- * Benny Halevy <bhalevy@panasas.com>
- * Boaz Harrosh <ooo@electrozaur.com>
- *
- * This program is free software; you can redistribute it and/or modify
- * it under the terms of the GNU General Public License version 2
- * See the file COPYING included with this distribution for more details.
- *
- * Redistribution and use in source and binary forms, with or without
- * modification, are permitted provided that the following conditions
- * are met:
- *
- * 1. Redistributions of source code must retain the above copyright
- * notice, this list of conditions and the following disclaimer.
- * 2. Redistributions in binary form must reproduce the above copyright
- * notice, this list of conditions and the following disclaimer in the
- * documentation and/or other materials provided with the distribution.
- * 3. Neither the name of the Panasas company nor the names of its
- * contributors may be used to endorse or promote products derived
- * from this software without specific prior written permission.
- *
- * THIS SOFTWARE IS PROVIDED ``AS IS'' AND ANY EXPRESS OR IMPLIED
- * WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF
- * MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
- * DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
- * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
- * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
- * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR
- * BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
- * LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
- * NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
- * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
- */
-
-#include <linux/pnfs_osd_xdr.h>
-
-#define NFSDBG_FACILITY NFSDBG_PNFS_LD
-
-/*
- * The following implementation is based on RFC5664
- */
-
-/*
- * struct pnfs_osd_objid {
- * struct nfs4_deviceid oid_device_id;
- * u64 oid_partition_id;
- * u64 oid_object_id;
- * }; // xdr size 32 bytes
- */
-static __be32 *
-_osd_xdr_decode_objid(__be32 *p, struct pnfs_osd_objid *objid)
-{
- p = xdr_decode_opaque_fixed(p, objid->oid_device_id.data,
- sizeof(objid->oid_device_id.data));
-
- p = xdr_decode_hyper(p, &objid->oid_partition_id);
- p = xdr_decode_hyper(p, &objid->oid_object_id);
- return p;
-}
-/*
- * struct pnfs_osd_opaque_cred {
- * u32 cred_len;
- * void *cred;
- * }; // xdr size [variable]
- * The return pointers are from the xdr buffer
- */
-static int
-_osd_xdr_decode_opaque_cred(struct pnfs_osd_opaque_cred *opaque_cred,
- struct xdr_stream *xdr)
-{
- __be32 *p = xdr_inline_decode(xdr, 1);
-
- if (!p)
- return -EINVAL;
-
- opaque_cred->cred_len = be32_to_cpu(*p++);
-
- p = xdr_inline_decode(xdr, opaque_cred->cred_len);
- if (!p)
- return -EINVAL;
-
- opaque_cred->cred = p;
- return 0;
-}
-
-/*
- * struct pnfs_osd_object_cred {
- * struct pnfs_osd_objid oc_object_id;
- * u32 oc_osd_version;
- * u32 oc_cap_key_sec;
- * struct pnfs_osd_opaque_cred oc_cap_key
- * struct pnfs_osd_opaque_cred oc_cap;
- * }; // xdr size 32 + 4 + 4 + [variable] + [variable]
- */
-static int
-_osd_xdr_decode_object_cred(struct pnfs_osd_object_cred *comp,
- struct xdr_stream *xdr)
-{
- __be32 *p = xdr_inline_decode(xdr, 32 + 4 + 4);
- int ret;
-
- if (!p)
- return -EIO;
-
- p = _osd_xdr_decode_objid(p, &comp->oc_object_id);
- comp->oc_osd_version = be32_to_cpup(p++);
- comp->oc_cap_key_sec = be32_to_cpup(p);
-
- ret = _osd_xdr_decode_opaque_cred(&comp->oc_cap_key, xdr);
- if (unlikely(ret))
- return ret;
-
- ret = _osd_xdr_decode_opaque_cred(&comp->oc_cap, xdr);
- return ret;
-}
-
-/*
- * struct pnfs_osd_data_map {
- * u32 odm_num_comps;
- * u64 odm_stripe_unit;
- * u32 odm_group_width;
- * u32 odm_group_depth;
- * u32 odm_mirror_cnt;
- * u32 odm_raid_algorithm;
- * }; // xdr size 4 + 8 + 4 + 4 + 4 + 4
- */
-static inline int
-_osd_data_map_xdr_sz(void)
-{
- return 4 + 8 + 4 + 4 + 4 + 4;
-}
-
-static __be32 *
-_osd_xdr_decode_data_map(__be32 *p, struct pnfs_osd_data_map *data_map)
-{
- data_map->odm_num_comps = be32_to_cpup(p++);
- p = xdr_decode_hyper(p, &data_map->odm_stripe_unit);
- data_map->odm_group_width = be32_to_cpup(p++);
- data_map->odm_group_depth = be32_to_cpup(p++);
- data_map->odm_mirror_cnt = be32_to_cpup(p++);
- data_map->odm_raid_algorithm = be32_to_cpup(p++);
- dprintk("%s: odm_num_comps=%u odm_stripe_unit=%llu odm_group_width=%u "
- "odm_group_depth=%u odm_mirror_cnt=%u odm_raid_algorithm=%u\n",
- __func__,
- data_map->odm_num_comps,
- (unsigned long long)data_map->odm_stripe_unit,
- data_map->odm_group_width,
- data_map->odm_group_depth,
- data_map->odm_mirror_cnt,
- data_map->odm_raid_algorithm);
- return p;
-}
-
-int pnfs_osd_xdr_decode_layout_map(struct pnfs_osd_layout *layout,
- struct pnfs_osd_xdr_decode_layout_iter *iter, struct xdr_stream *xdr)
-{
- __be32 *p;
-
- memset(iter, 0, sizeof(*iter));
-
- p = xdr_inline_decode(xdr, _osd_data_map_xdr_sz() + 4 + 4);
- if (unlikely(!p))
- return -EINVAL;
-
- p = _osd_xdr_decode_data_map(p, &layout->olo_map);
- layout->olo_comps_index = be32_to_cpup(p++);
- layout->olo_num_comps = be32_to_cpup(p++);
- dprintk("%s: olo_comps_index=%d olo_num_comps=%d\n", __func__,
- layout->olo_comps_index, layout->olo_num_comps);
-
- iter->total_comps = layout->olo_num_comps;
- return 0;
-}
-
-bool pnfs_osd_xdr_decode_layout_comp(struct pnfs_osd_object_cred *comp,
- struct pnfs_osd_xdr_decode_layout_iter *iter, struct xdr_stream *xdr,
- int *err)
-{
- BUG_ON(iter->decoded_comps > iter->total_comps);
- if (iter->decoded_comps == iter->total_comps)
- return false;
-
- *err = _osd_xdr_decode_object_cred(comp, xdr);
- if (unlikely(*err)) {
- dprintk("%s: _osd_xdr_decode_object_cred=>%d decoded_comps=%d "
- "total_comps=%d\n", __func__, *err,
- iter->decoded_comps, iter->total_comps);
- return false; /* stop the loop */
- }
- dprintk("%s: dev(%llx:%llx) par=0x%llx obj=0x%llx "
- "key_len=%u cap_len=%u\n",
- __func__,
- _DEVID_LO(&comp->oc_object_id.oid_device_id),
- _DEVID_HI(&comp->oc_object_id.oid_device_id),
- comp->oc_object_id.oid_partition_id,
- comp->oc_object_id.oid_object_id,
- comp->oc_cap_key.cred_len, comp->oc_cap.cred_len);
-
- iter->decoded_comps++;
- return true;
-}
-
-/*
- * Get Device Information Decoding
- *
- * Note: since Device Information is currently done synchronously, all
- * variable strings fields are left inside the rpc buffer and are only
- * pointed to by the pnfs_osd_deviceaddr members. So the read buffer
- * should not be freed while the returned information is in use.
- */
-/*
- *struct nfs4_string {
- * unsigned int len;
- * char *data;
- *}; // size [variable]
- * NOTE: Returned string points to inside the XDR buffer
- */
-static __be32 *
-__read_u8_opaque(__be32 *p, struct nfs4_string *str)
-{
- str->len = be32_to_cpup(p++);
- str->data = (char *)p;
-
- p += XDR_QUADLEN(str->len);
- return p;
-}
-
-/*
- * struct pnfs_osd_targetid {
- * u32 oti_type;
- * struct nfs4_string oti_scsi_device_id;
- * };// size 4 + [variable]
- */
-static __be32 *
-__read_targetid(__be32 *p, struct pnfs_osd_targetid* targetid)
-{
- u32 oti_type;
-
- oti_type = be32_to_cpup(p++);
- targetid->oti_type = oti_type;
-
- switch (oti_type) {
- case OBJ_TARGET_SCSI_NAME:
- case OBJ_TARGET_SCSI_DEVICE_ID:
- p = __read_u8_opaque(p, &targetid->oti_scsi_device_id);
- }
-
- return p;
-}
-
-/*
- * struct pnfs_osd_net_addr {
- * struct nfs4_string r_netid;
- * struct nfs4_string r_addr;
- * };
- */
-static __be32 *
-__read_net_addr(__be32 *p, struct pnfs_osd_net_addr* netaddr)
-{
- p = __read_u8_opaque(p, &netaddr->r_netid);
- p = __read_u8_opaque(p, &netaddr->r_addr);
-
- return p;
-}
-
-/*
- * struct pnfs_osd_targetaddr {
- * u32 ota_available;
- * struct pnfs_osd_net_addr ota_netaddr;
- * };
- */
-static __be32 *
-__read_targetaddr(__be32 *p, struct pnfs_osd_targetaddr *targetaddr)
-{
- u32 ota_available;
-
- ota_available = be32_to_cpup(p++);
- targetaddr->ota_available = ota_available;
-
- if (ota_available)
- p = __read_net_addr(p, &targetaddr->ota_netaddr);
-
-
- return p;
-}
-
-/*
- * struct pnfs_osd_deviceaddr {
- * struct pnfs_osd_targetid oda_targetid;
- * struct pnfs_osd_targetaddr oda_targetaddr;
- * u8 oda_lun[8];
- * struct nfs4_string oda_systemid;
- * struct pnfs_osd_object_cred oda_root_obj_cred;
- * struct nfs4_string oda_osdname;
- * };
- */
-
-/* We need this version for the pnfs_osd_xdr_decode_deviceaddr which does
- * not have an xdr_stream
- */
-static __be32 *
-__read_opaque_cred(__be32 *p,
- struct pnfs_osd_opaque_cred *opaque_cred)
-{
- opaque_cred->cred_len = be32_to_cpu(*p++);
- opaque_cred->cred = p;
- return p + XDR_QUADLEN(opaque_cred->cred_len);
-}
-
-static __be32 *
-__read_object_cred(__be32 *p, struct pnfs_osd_object_cred *comp)
-{
- p = _osd_xdr_decode_objid(p, &comp->oc_object_id);
- comp->oc_osd_version = be32_to_cpup(p++);
- comp->oc_cap_key_sec = be32_to_cpup(p++);
-
- p = __read_opaque_cred(p, &comp->oc_cap_key);
- p = __read_opaque_cred(p, &comp->oc_cap);
- return p;
-}
-
-void pnfs_osd_xdr_decode_deviceaddr(
- struct pnfs_osd_deviceaddr *deviceaddr, __be32 *p)
-{
- p = __read_targetid(p, &deviceaddr->oda_targetid);
-
- p = __read_targetaddr(p, &deviceaddr->oda_targetaddr);
-
- p = xdr_decode_opaque_fixed(p, deviceaddr->oda_lun,
- sizeof(deviceaddr->oda_lun));
-
- p = __read_u8_opaque(p, &deviceaddr->oda_systemid);
-
- p = __read_object_cred(p, &deviceaddr->oda_root_obj_cred);
-
- p = __read_u8_opaque(p, &deviceaddr->oda_osdname);
-
- /* libosd likes this terminated in dbg. It's last, so no problems */
- deviceaddr->oda_osdname.data[deviceaddr->oda_osdname.len] = 0;
-}
-
-/*
- * struct pnfs_osd_layoutupdate {
- * u32 dsu_valid;
- * s64 dsu_delta;
- * u32 olu_ioerr_flag;
- * }; xdr size 4 + 8 + 4
- */
-int
-pnfs_osd_xdr_encode_layoutupdate(struct xdr_stream *xdr,
- struct pnfs_osd_layoutupdate *lou)
-{
- __be32 *p = xdr_reserve_space(xdr, 4 + 8 + 4);
-
- if (!p)
- return -E2BIG;
-
- *p++ = cpu_to_be32(lou->dsu_valid);
- if (lou->dsu_valid)
- p = xdr_encode_hyper(p, lou->dsu_delta);
- *p++ = cpu_to_be32(lou->olu_ioerr_flag);
- return 0;
-}
-
-/*
- * struct pnfs_osd_objid {
- * struct nfs4_deviceid oid_device_id;
- * u64 oid_partition_id;
- * u64 oid_object_id;
- * }; // xdr size 32 bytes
- */
-static inline __be32 *
-pnfs_osd_xdr_encode_objid(__be32 *p, struct pnfs_osd_objid *object_id)
-{
- p = xdr_encode_opaque_fixed(p, &object_id->oid_device_id.data,
- sizeof(object_id->oid_device_id.data));
- p = xdr_encode_hyper(p, object_id->oid_partition_id);
- p = xdr_encode_hyper(p, object_id->oid_object_id);
-
- return p;
-}
-
-/*
- * struct pnfs_osd_ioerr {
- * struct pnfs_osd_objid oer_component;
- * u64 oer_comp_offset;
- * u64 oer_comp_length;
- * u32 oer_iswrite;
- * u32 oer_errno;
- * }; // xdr size 32 + 24 bytes
- */
-void pnfs_osd_xdr_encode_ioerr(__be32 *p, struct pnfs_osd_ioerr *ioerr)
-{
- p = pnfs_osd_xdr_encode_objid(p, &ioerr->oer_component);
- p = xdr_encode_hyper(p, ioerr->oer_comp_offset);
- p = xdr_encode_hyper(p, ioerr->oer_comp_length);
- *p++ = cpu_to_be32(ioerr->oer_iswrite);
- *p = cpu_to_be32(ioerr->oer_errno);
-}
-
-__be32 *pnfs_osd_xdr_ioerr_reserve_space(struct xdr_stream *xdr)
-{
- __be32 *p;
-
- p = xdr_reserve_space(xdr, 32 + 24);
- if (unlikely(!p))
- dprintk("%s: out of xdr space\n", __func__);
-
- return p;
-}
diff --git a/fs/nfs/pagelist.c b/fs/nfs/pagelist.c
index 6e629b856a00..ad92b401326c 100644
--- a/fs/nfs/pagelist.c
+++ b/fs/nfs/pagelist.c
@@ -29,19 +29,6 @@
static struct kmem_cache *nfs_page_cachep;
static const struct rpc_call_ops nfs_pgio_common_ops;
-static bool nfs_pgarray_set(struct nfs_page_array *p, unsigned int pagecount)
-{
- p->npages = pagecount;
- if (pagecount <= ARRAY_SIZE(p->page_array))
- p->pagevec = p->page_array;
- else {
- p->pagevec = kcalloc(pagecount, sizeof(struct page *), GFP_KERNEL);
- if (!p->pagevec)
- p->npages = 0;
- }
- return p->pagevec != NULL;
-}
-
struct nfs_pgio_mirror *
nfs_pgio_current_mirror(struct nfs_pageio_descriptor *desc)
{
@@ -115,6 +102,35 @@ nfs_iocounter_wait(struct nfs_lock_context *l_ctx)
TASK_KILLABLE);
}
+/**
+ * nfs_async_iocounter_wait - wait on a rpc_waitqueue for I/O
+ * to complete
+ * @task: the rpc_task that should wait
+ * @l_ctx: nfs_lock_context with io_counter to check
+ *
+ * Returns true if there is outstanding I/O to wait on and the
+ * task has been put to sleep.
+ */
+bool
+nfs_async_iocounter_wait(struct rpc_task *task, struct nfs_lock_context *l_ctx)
+{
+ struct inode *inode = d_inode(l_ctx->open_context->dentry);
+ bool ret = false;
+
+ if (atomic_read(&l_ctx->io_count) > 0) {
+ rpc_sleep_on(&NFS_SERVER(inode)->uoc_rpcwaitq, task, NULL);
+ ret = true;
+ }
+
+ if (atomic_read(&l_ctx->io_count) == 0) {
+ rpc_wake_up_queued_task(&NFS_SERVER(inode)->uoc_rpcwaitq, task);
+ ret = false;
+ }
+
+ return ret;
+}
+EXPORT_SYMBOL_GPL(nfs_async_iocounter_wait);
+
/*
* nfs_page_group_lock - lock the head of the page group
* @req - request in group that is to be locked
@@ -398,8 +414,11 @@ static void nfs_clear_request(struct nfs_page *req)
req->wb_page = NULL;
}
if (l_ctx != NULL) {
- if (atomic_dec_and_test(&l_ctx->io_count))
+ if (atomic_dec_and_test(&l_ctx->io_count)) {
wake_up_atomic_t(&l_ctx->io_count);
+ if (test_bit(NFS_CONTEXT_UNLOCK, &ctx->flags))
+ rpc_wake_up(&NFS_SERVER(d_inode(ctx->dentry))->uoc_rpcwaitq);
+ }
nfs_put_lock_context(l_ctx);
req->wb_lock_context = NULL;
}
@@ -677,7 +696,8 @@ void nfs_pageio_init(struct nfs_pageio_descriptor *desc,
const struct nfs_pgio_completion_ops *compl_ops,
const struct nfs_rw_ops *rw_ops,
size_t bsize,
- int io_flags)
+ int io_flags,
+ gfp_t gfp_flags)
{
struct nfs_pgio_mirror *new;
int i;
@@ -701,7 +721,7 @@ void nfs_pageio_init(struct nfs_pageio_descriptor *desc,
/* until we have a request, we don't have an lseg and no
* idea how many mirrors there will be */
new = kcalloc(NFS_PAGEIO_DESCRIPTOR_MIRROR_MAX,
- sizeof(struct nfs_pgio_mirror), GFP_KERNEL);
+ sizeof(struct nfs_pgio_mirror), gfp_flags);
desc->pg_mirrors_dynamic = new;
desc->pg_mirrors = new;
@@ -754,13 +774,24 @@ int nfs_generic_pgio(struct nfs_pageio_descriptor *desc,
*last_page;
struct list_head *head = &mirror->pg_list;
struct nfs_commit_info cinfo;
+ struct nfs_page_array *pg_array = &hdr->page_array;
unsigned int pagecount, pageused;
+ gfp_t gfp_flags = GFP_KERNEL;
pagecount = nfs_page_array_len(mirror->pg_base, mirror->pg_count);
- if (!nfs_pgarray_set(&hdr->page_array, pagecount)) {
- nfs_pgio_error(hdr);
- desc->pg_error = -ENOMEM;
- return desc->pg_error;
+
+ if (pagecount <= ARRAY_SIZE(pg_array->page_array))
+ pg_array->pagevec = pg_array->page_array;
+ else {
+ if (hdr->rw_mode == FMODE_WRITE)
+ gfp_flags = GFP_NOIO;
+ pg_array->pagevec = kcalloc(pagecount, sizeof(struct page *), gfp_flags);
+ if (!pg_array->pagevec) {
+ pg_array->npages = 0;
+ nfs_pgio_error(hdr);
+ desc->pg_error = -ENOMEM;
+ return desc->pg_error;
+ }
}
nfs_init_cinfo(&cinfo, desc->pg_inode, desc->pg_dreq);
@@ -1256,8 +1287,10 @@ void nfs_pageio_cond_complete(struct nfs_pageio_descriptor *desc, pgoff_t index)
mirror = &desc->pg_mirrors[midx];
if (!list_empty(&mirror->pg_list)) {
prev = nfs_list_entry(mirror->pg_list.prev);
- if (index != prev->wb_index + 1)
- nfs_pageio_complete_mirror(desc, midx);
+ if (index != prev->wb_index + 1) {
+ nfs_pageio_complete(desc);
+ break;
+ }
}
}
}
diff --git a/fs/nfs/pnfs.c b/fs/nfs/pnfs.c
index dd042498ce7c..adc6ec28d4b5 100644
--- a/fs/nfs/pnfs.c
+++ b/fs/nfs/pnfs.c
@@ -322,9 +322,15 @@ pnfs_set_plh_return_info(struct pnfs_layout_hdr *lo, enum pnfs_iomode iomode,
static void
pnfs_clear_layoutreturn_info(struct pnfs_layout_hdr *lo)
{
+ struct pnfs_layout_segment *lseg;
lo->plh_return_iomode = 0;
lo->plh_return_seq = 0;
clear_bit(NFS_LAYOUT_RETURN_REQUESTED, &lo->plh_flags);
+ list_for_each_entry(lseg, &lo->plh_segs, pls_list) {
+ if (!test_bit(NFS_LSEG_LAYOUTRETURN, &lseg->pls_flags))
+ continue;
+ pnfs_set_plh_return_info(lo, lseg->pls_range.iomode, 0);
+ }
}
static void pnfs_clear_layoutreturn_waitbit(struct pnfs_layout_hdr *lo)
@@ -367,9 +373,9 @@ pnfs_mark_layout_stateid_invalid(struct pnfs_layout_hdr *lo,
struct pnfs_layout_segment *lseg, *next;
set_bit(NFS_LAYOUT_INVALID_STID, &lo->plh_flags);
- pnfs_clear_layoutreturn_info(lo);
list_for_each_entry_safe(lseg, next, &lo->plh_segs, pls_list)
pnfs_clear_lseg_state(lseg, lseg_list);
+ pnfs_clear_layoutreturn_info(lo);
pnfs_free_returned_lsegs(lo, lseg_list, &range, 0);
if (test_bit(NFS_LAYOUT_RETURN, &lo->plh_flags) &&
!test_and_set_bit(NFS_LAYOUT_RETURN_LOCK, &lo->plh_flags))
@@ -563,7 +569,6 @@ pnfs_put_lseg_locked(struct pnfs_layout_segment *lseg)
}
}
}
-EXPORT_SYMBOL_GPL(pnfs_put_lseg_locked);
/*
* is l2 fully contained in l1?
@@ -728,6 +733,7 @@ pnfs_destroy_layout(struct nfs_inode *nfsi)
pnfs_layout_clear_fail_bit(lo, NFS_LAYOUT_RW_FAILED);
spin_unlock(&nfsi->vfs_inode.i_lock);
pnfs_free_lseg_list(&tmp_list);
+ nfs_commit_inode(&nfsi->vfs_inode, 0);
pnfs_put_layout_hdr(lo);
} else
spin_unlock(&nfsi->vfs_inode.i_lock);
@@ -1209,7 +1215,6 @@ out:
dprintk("<-- %s status: %d\n", __func__, status);
return status;
}
-EXPORT_SYMBOL_GPL(_pnfs_return_layout);
int
pnfs_commit_and_return_layout(struct inode *inode)
@@ -1991,6 +1996,8 @@ out_forget:
spin_unlock(&ino->i_lock);
lseg->pls_layout = lo;
NFS_SERVER(ino)->pnfs_curr_ld->free_lseg(lseg);
+ if (!pnfs_layout_is_valid(lo))
+ nfs_commit_inode(ino, 0);
return ERR_PTR(-EAGAIN);
}
@@ -2051,9 +2058,11 @@ void pnfs_error_mark_layout_for_return(struct inode *inode,
bool return_now = false;
spin_lock(&inode->i_lock);
+ if (!pnfs_layout_is_valid(lo)) {
+ spin_unlock(&inode->i_lock);
+ return;
+ }
pnfs_set_plh_return_info(lo, range.iomode, 0);
- /* Block LAYOUTGET */
- set_bit(NFS_LAYOUT_RETURN, &lo->plh_flags);
/*
* mark all matching lsegs so that we are sure to have no live
* segments at hand when sending layoutreturn. See pnfs_put_lseg()
@@ -2075,10 +2084,22 @@ void pnfs_error_mark_layout_for_return(struct inode *inode,
EXPORT_SYMBOL_GPL(pnfs_error_mark_layout_for_return);
void
+pnfs_generic_pg_check_layout(struct nfs_pageio_descriptor *pgio)
+{
+ if (pgio->pg_lseg == NULL ||
+ test_bit(NFS_LSEG_VALID, &pgio->pg_lseg->pls_flags))
+ return;
+ pnfs_put_lseg(pgio->pg_lseg);
+ pgio->pg_lseg = NULL;
+}
+EXPORT_SYMBOL_GPL(pnfs_generic_pg_check_layout);
+
+void
pnfs_generic_pg_init_read(struct nfs_pageio_descriptor *pgio, struct nfs_page *req)
{
u64 rd_size = req->wb_bytes;
+ pnfs_generic_pg_check_layout(pgio);
if (pgio->pg_lseg == NULL) {
if (pgio->pg_dreq == NULL)
rd_size = i_size_read(pgio->pg_inode) - req_offset(req);
@@ -2109,6 +2130,7 @@ void
pnfs_generic_pg_init_write(struct nfs_pageio_descriptor *pgio,
struct nfs_page *req, u64 wb_size)
{
+ pnfs_generic_pg_check_layout(pgio);
if (pgio->pg_lseg == NULL) {
pgio->pg_lseg = pnfs_update_layout(pgio->pg_inode,
req->wb_context,
@@ -2277,8 +2299,20 @@ pnfs_do_write(struct nfs_pageio_descriptor *desc,
enum pnfs_try_status trypnfs;
trypnfs = pnfs_try_to_write_data(hdr, call_ops, lseg, how);
- if (trypnfs == PNFS_NOT_ATTEMPTED)
+ switch (trypnfs) {
+ case PNFS_NOT_ATTEMPTED:
pnfs_write_through_mds(desc, hdr);
+ case PNFS_ATTEMPTED:
+ break;
+ case PNFS_TRY_AGAIN:
+ /* cleanup hdr and prepare to redo pnfs */
+ if (!test_and_set_bit(NFS_IOHDR_REDO, &hdr->flags)) {
+ struct nfs_pgio_mirror *mirror = nfs_pgio_current_mirror(desc);
+ list_splice_init(&hdr->pages, &mirror->pg_list);
+ mirror->pg_recoalesce = 1;
+ }
+ hdr->mds_ops->rpc_release(hdr);
+ }
}
static void pnfs_writehdr_free(struct nfs_pgio_header *hdr)
@@ -2408,10 +2442,20 @@ pnfs_do_read(struct nfs_pageio_descriptor *desc, struct nfs_pgio_header *hdr)
enum pnfs_try_status trypnfs;
trypnfs = pnfs_try_to_read_data(hdr, call_ops, lseg);
- if (trypnfs == PNFS_TRY_AGAIN)
- pnfs_read_resend_pnfs(hdr);
- if (trypnfs == PNFS_NOT_ATTEMPTED || hdr->task.tk_status)
+ switch (trypnfs) {
+ case PNFS_NOT_ATTEMPTED:
pnfs_read_through_mds(desc, hdr);
+ case PNFS_ATTEMPTED:
+ break;
+ case PNFS_TRY_AGAIN:
+ /* cleanup hdr and prepare to redo pnfs */
+ if (!test_and_set_bit(NFS_IOHDR_REDO, &hdr->flags)) {
+ struct nfs_pgio_mirror *mirror = nfs_pgio_current_mirror(desc);
+ list_splice_init(&hdr->pages, &mirror->pg_list);
+ mirror->pg_recoalesce = 1;
+ }
+ hdr->mds_ops->rpc_release(hdr);
+ }
}
static void pnfs_readhdr_free(struct nfs_pgio_header *hdr)
diff --git a/fs/nfs/pnfs.h b/fs/nfs/pnfs.h
index 590e1e35781f..2d05b756a8d6 100644
--- a/fs/nfs/pnfs.h
+++ b/fs/nfs/pnfs.h
@@ -173,14 +173,9 @@ struct pnfs_layoutdriver_type {
gfp_t gfp_flags);
int (*prepare_layoutreturn) (struct nfs4_layoutreturn_args *);
- void (*encode_layoutreturn) (struct xdr_stream *xdr,
- const struct nfs4_layoutreturn_args *args);
void (*cleanup_layoutcommit) (struct nfs4_layoutcommit_data *data);
int (*prepare_layoutcommit) (struct nfs4_layoutcommit_args *args);
- void (*encode_layoutcommit) (struct pnfs_layout_hdr *lo,
- struct xdr_stream *xdr,
- const struct nfs4_layoutcommit_args *args);
int (*prepare_layoutstats) (struct nfs42_layoutstat_args *args);
};
@@ -239,6 +234,7 @@ void pnfs_put_lseg_locked(struct pnfs_layout_segment *lseg);
void set_pnfs_layoutdriver(struct nfs_server *, const struct nfs_fh *, struct nfs_fsinfo *);
void unset_pnfs_layoutdriver(struct nfs_server *);
+void pnfs_generic_pg_check_layout(struct nfs_pageio_descriptor *pgio);
void pnfs_generic_pg_init_read(struct nfs_pageio_descriptor *, struct nfs_page *);
int pnfs_generic_pg_readpages(struct nfs_pageio_descriptor *desc);
void pnfs_generic_pg_init_write(struct nfs_pageio_descriptor *pgio,
diff --git a/fs/nfs/pnfs_nfs.c b/fs/nfs/pnfs_nfs.c
index 7250b95549ec..d40755a0984b 100644
--- a/fs/nfs/pnfs_nfs.c
+++ b/fs/nfs/pnfs_nfs.c
@@ -217,7 +217,14 @@ pnfs_generic_alloc_ds_commits(struct nfs_commit_info *cinfo,
for (i = 0; i < fl_cinfo->nbuckets; i++, bucket++) {
if (list_empty(&bucket->committing))
continue;
- data = nfs_commitdata_alloc();
+ /*
+ * If the layout segment is invalid, then let
+ * pnfs_generic_retry_commit() clean up the bucket.
+ */
+ if (bucket->clseg && !pnfs_is_valid_lseg(bucket->clseg) &&
+ !test_bit(NFS_LSEG_LAYOUTRETURN, &bucket->clseg->pls_flags))
+ break;
+ data = nfs_commitdata_alloc(false);
if (!data)
break;
data->ds_commit_index = i;
@@ -283,16 +290,10 @@ pnfs_generic_commit_pagelist(struct inode *inode, struct list_head *mds_pages,
unsigned int nreq = 0;
if (!list_empty(mds_pages)) {
- data = nfs_commitdata_alloc();
- if (data != NULL) {
- data->ds_commit_index = -1;
- list_add(&data->pages, &list);
- nreq++;
- } else {
- nfs_retry_commit(mds_pages, NULL, cinfo, 0);
- pnfs_generic_retry_commit(cinfo, 0);
- return -ENOMEM;
- }
+ data = nfs_commitdata_alloc(true);
+ data->ds_commit_index = -1;
+ list_add(&data->pages, &list);
+ nreq++;
}
nreq += pnfs_generic_alloc_ds_commits(cinfo, &list);
@@ -619,7 +620,6 @@ void nfs4_pnfs_v3_ds_connect_unload(void)
get_v3_ds_connect = NULL;
}
}
-EXPORT_SYMBOL_GPL(nfs4_pnfs_v3_ds_connect_unload);
static int _nfs4_pnfs_v3_ds_connect(struct nfs_server *mds_srv,
struct nfs4_pnfs_ds *ds,
diff --git a/fs/nfs/proc.c b/fs/nfs/proc.c
index b7bca8303989..9872cf676a50 100644
--- a/fs/nfs/proc.c
+++ b/fs/nfs/proc.c
@@ -638,7 +638,7 @@ nfs_proc_lock(struct file *filp, int cmd, struct file_lock *fl)
{
struct inode *inode = file_inode(filp);
- return nlmclnt_proc(NFS_SERVER(inode)->nlm_host, cmd, fl);
+ return nlmclnt_proc(NFS_SERVER(inode)->nlm_host, cmd, fl, NULL);
}
/* Helper functions for NFS lock bounds checking */
diff --git a/fs/nfs/read.c b/fs/nfs/read.c
index defc9233e985..a8421d9dab6a 100644
--- a/fs/nfs/read.c
+++ b/fs/nfs/read.c
@@ -35,7 +35,11 @@ static struct kmem_cache *nfs_rdata_cachep;
static struct nfs_pgio_header *nfs_readhdr_alloc(void)
{
- return kmem_cache_zalloc(nfs_rdata_cachep, GFP_KERNEL);
+ struct nfs_pgio_header *p = kmem_cache_zalloc(nfs_rdata_cachep, GFP_KERNEL);
+
+ if (p)
+ p->rw_mode = FMODE_READ;
+ return p;
}
static void nfs_readhdr_free(struct nfs_pgio_header *rhdr)
@@ -64,7 +68,7 @@ void nfs_pageio_init_read(struct nfs_pageio_descriptor *pgio,
pg_ops = server->pnfs_curr_ld->pg_read_ops;
#endif
nfs_pageio_init(pgio, inode, pg_ops, compl_ops, &nfs_rw_read_ops,
- server->rsize, 0);
+ server->rsize, 0, GFP_KERNEL);
}
EXPORT_SYMBOL_GPL(nfs_pageio_init_read);
@@ -451,7 +455,6 @@ void nfs_destroy_readpagecache(void)
}
static const struct nfs_rw_ops nfs_rw_read_ops = {
- .rw_mode = FMODE_READ,
.rw_alloc_header = nfs_readhdr_alloc,
.rw_free_header = nfs_readhdr_free,
.rw_done = nfs_readpage_done,
diff --git a/fs/nfs/write.c b/fs/nfs/write.c
index cc341fc7fd44..db7ba542559e 100644
--- a/fs/nfs/write.c
+++ b/fs/nfs/write.c
@@ -60,14 +60,28 @@ static mempool_t *nfs_wdata_mempool;
static struct kmem_cache *nfs_cdata_cachep;
static mempool_t *nfs_commit_mempool;
-struct nfs_commit_data *nfs_commitdata_alloc(void)
+struct nfs_commit_data *nfs_commitdata_alloc(bool never_fail)
{
- struct nfs_commit_data *p = mempool_alloc(nfs_commit_mempool, GFP_NOIO);
+ struct nfs_commit_data *p;
- if (p) {
- memset(p, 0, sizeof(*p));
- INIT_LIST_HEAD(&p->pages);
+ if (never_fail)
+ p = mempool_alloc(nfs_commit_mempool, GFP_NOIO);
+ else {
+ /* It is OK to do some reclaim, not no safe to wait
+ * for anything to be returned to the pool.
+ * mempool_alloc() cannot handle that particular combination,
+ * so we need two separate attempts.
+ */
+ p = mempool_alloc(nfs_commit_mempool, GFP_NOWAIT);
+ if (!p)
+ p = kmem_cache_alloc(nfs_cdata_cachep, GFP_NOIO |
+ __GFP_NOWARN | __GFP_NORETRY);
+ if (!p)
+ return NULL;
}
+
+ memset(p, 0, sizeof(*p));
+ INIT_LIST_HEAD(&p->pages);
return p;
}
EXPORT_SYMBOL_GPL(nfs_commitdata_alloc);
@@ -82,8 +96,10 @@ static struct nfs_pgio_header *nfs_writehdr_alloc(void)
{
struct nfs_pgio_header *p = mempool_alloc(nfs_wdata_mempool, GFP_NOIO);
- if (p)
+ if (p) {
memset(p, 0, sizeof(*p));
+ p->rw_mode = FMODE_WRITE;
+ }
return p;
}
@@ -547,9 +563,21 @@ static void nfs_write_error_remove_page(struct nfs_page *req)
{
nfs_unlock_request(req);
nfs_end_page_writeback(req);
- nfs_release_request(req);
generic_error_remove_page(page_file_mapping(req->wb_page),
req->wb_page);
+ nfs_release_request(req);
+}
+
+static bool
+nfs_error_is_fatal_on_server(int err)
+{
+ switch (err) {
+ case 0:
+ case -ERESTARTSYS:
+ case -EINTR:
+ return false;
+ }
+ return nfs_error_is_fatal(err);
}
/*
@@ -557,8 +585,7 @@ static void nfs_write_error_remove_page(struct nfs_page *req)
* May return an error if the user signalled nfs_wait_on_request().
*/
static int nfs_page_async_flush(struct nfs_pageio_descriptor *pgio,
- struct page *page, bool nonblock,
- bool launder)
+ struct page *page, bool nonblock)
{
struct nfs_page *req;
int ret = 0;
@@ -574,19 +601,19 @@ static int nfs_page_async_flush(struct nfs_pageio_descriptor *pgio,
WARN_ON_ONCE(test_bit(PG_CLEAN, &req->wb_flags));
ret = 0;
+ /* If there is a fatal error that covers this write, just exit */
+ if (nfs_error_is_fatal_on_server(req->wb_context->error))
+ goto out_launder;
+
if (!nfs_pageio_add_request(pgio, req)) {
ret = pgio->pg_error;
/*
- * Remove the problematic req upon fatal errors
- * in launder case, while other dirty pages can
- * still be around until they get flushed.
+ * Remove the problematic req upon fatal errors on the server
*/
if (nfs_error_is_fatal(ret)) {
nfs_context_set_write_error(req->wb_context, ret);
- if (launder) {
- nfs_write_error_remove_page(req);
- goto out;
- }
+ if (nfs_error_is_fatal_on_server(ret))
+ goto out_launder;
}
nfs_redirty_request(req);
ret = -EAGAIN;
@@ -595,16 +622,18 @@ static int nfs_page_async_flush(struct nfs_pageio_descriptor *pgio,
NFSIOS_WRITEPAGES, 1);
out:
return ret;
+out_launder:
+ nfs_write_error_remove_page(req);
+ return ret;
}
static int nfs_do_writepage(struct page *page, struct writeback_control *wbc,
- struct nfs_pageio_descriptor *pgio, bool launder)
+ struct nfs_pageio_descriptor *pgio)
{
int ret;
nfs_pageio_cond_complete(pgio, page_index(page));
- ret = nfs_page_async_flush(pgio, page, wbc->sync_mode == WB_SYNC_NONE,
- launder);
+ ret = nfs_page_async_flush(pgio, page, wbc->sync_mode == WB_SYNC_NONE);
if (ret == -EAGAIN) {
redirty_page_for_writepage(wbc, page);
ret = 0;
@@ -616,8 +645,7 @@ static int nfs_do_writepage(struct page *page, struct writeback_control *wbc,
* Write an mmapped page to the server.
*/
static int nfs_writepage_locked(struct page *page,
- struct writeback_control *wbc,
- bool launder)
+ struct writeback_control *wbc)
{
struct nfs_pageio_descriptor pgio;
struct inode *inode = page_file_mapping(page)->host;
@@ -626,7 +654,7 @@ static int nfs_writepage_locked(struct page *page,
nfs_inc_stats(inode, NFSIOS_VFSWRITEPAGE);
nfs_pageio_init_write(&pgio, inode, 0,
false, &nfs_async_write_completion_ops);
- err = nfs_do_writepage(page, wbc, &pgio, launder);
+ err = nfs_do_writepage(page, wbc, &pgio);
nfs_pageio_complete(&pgio);
if (err < 0)
return err;
@@ -639,7 +667,7 @@ int nfs_writepage(struct page *page, struct writeback_control *wbc)
{
int ret;
- ret = nfs_writepage_locked(page, wbc, false);
+ ret = nfs_writepage_locked(page, wbc);
unlock_page(page);
return ret;
}
@@ -648,7 +676,7 @@ static int nfs_writepages_callback(struct page *page, struct writeback_control *
{
int ret;
- ret = nfs_do_writepage(page, wbc, data, false);
+ ret = nfs_do_writepage(page, wbc, data);
unlock_page(page);
return ret;
}
@@ -1367,7 +1395,7 @@ void nfs_pageio_init_write(struct nfs_pageio_descriptor *pgio,
pg_ops = server->pnfs_curr_ld->pg_write_ops;
#endif
nfs_pageio_init(pgio, inode, pg_ops, compl_ops, &nfs_rw_write_ops,
- server->wsize, ioflags);
+ server->wsize, ioflags, GFP_NOIO);
}
EXPORT_SYMBOL_GPL(nfs_pageio_init_write);
@@ -1704,50 +1732,14 @@ nfs_commit_list(struct inode *inode, struct list_head *head, int how,
if (list_empty(head))
return 0;
- data = nfs_commitdata_alloc();
-
- if (!data)
- goto out_bad;
+ data = nfs_commitdata_alloc(true);
/* Set up the argument struct */
nfs_init_commit(data, head, NULL, cinfo);
atomic_inc(&cinfo->mds->rpcs_out);
return nfs_initiate_commit(NFS_CLIENT(inode), data, NFS_PROTO(inode),
data->mds_ops, how, 0);
- out_bad:
- nfs_retry_commit(head, NULL, cinfo, 0);
- return -ENOMEM;
-}
-
-int nfs_commit_file(struct file *file, struct nfs_write_verifier *verf)
-{
- struct inode *inode = file_inode(file);
- struct nfs_open_context *open;
- struct nfs_commit_info cinfo;
- struct nfs_page *req;
- int ret;
-
- open = get_nfs_open_context(nfs_file_open_context(file));
- req = nfs_create_request(open, NULL, NULL, 0, i_size_read(inode));
- if (IS_ERR(req)) {
- ret = PTR_ERR(req);
- goto out_put;
- }
-
- nfs_init_cinfo_from_inode(&cinfo, inode);
-
- memcpy(&req->wb_verf, verf, sizeof(struct nfs_write_verifier));
- nfs_request_add_commit_list(req, &cinfo);
- ret = nfs_commit_inode(inode, FLUSH_SYNC);
- if (ret > 0)
- ret = 0;
-
- nfs_free_request(req);
-out_put:
- put_nfs_open_context(open);
- return ret;
}
-EXPORT_SYMBOL_GPL(nfs_commit_file);
/*
* COMMIT call returned
@@ -1985,7 +1977,7 @@ int nfs_wb_page_cancel(struct inode *inode, struct page *page)
/*
* Write back all requests on one page - we do this before reading it.
*/
-int nfs_wb_single_page(struct inode *inode, struct page *page, bool launder)
+int nfs_wb_page(struct inode *inode, struct page *page)
{
loff_t range_start = page_file_offset(page);
loff_t range_end = range_start + (loff_t)(PAGE_SIZE - 1);
@@ -2002,7 +1994,7 @@ int nfs_wb_single_page(struct inode *inode, struct page *page, bool launder)
for (;;) {
wait_on_page_writeback(page);
if (clear_page_dirty_for_io(page)) {
- ret = nfs_writepage_locked(page, &wbc, launder);
+ ret = nfs_writepage_locked(page, &wbc);
if (ret < 0)
goto out_error;
continue;
@@ -2107,7 +2099,6 @@ void nfs_destroy_writepagecache(void)
}
static const struct nfs_rw_ops nfs_rw_write_ops = {
- .rw_mode = FMODE_WRITE,
.rw_alloc_header = nfs_writehdr_alloc,
.rw_free_header = nfs_writehdr_free,
.rw_done = nfs_writeback_done,
diff --git a/fs/nfsd/nfs3xdr.c b/fs/nfsd/nfs3xdr.c
index 452334694a5d..12feac6ee2fd 100644
--- a/fs/nfsd/nfs3xdr.c
+++ b/fs/nfsd/nfs3xdr.c
@@ -334,8 +334,11 @@ nfs3svc_decode_readargs(struct svc_rqst *rqstp, __be32 *p,
if (!p)
return 0;
p = xdr_decode_hyper(p, &args->offset);
-
args->count = ntohl(*p++);
+
+ if (!xdr_argsize_check(rqstp, p))
+ return 0;
+
len = min(args->count, max_blocksize);
/* set up the kvec */
@@ -349,7 +352,7 @@ nfs3svc_decode_readargs(struct svc_rqst *rqstp, __be32 *p,
v++;
}
args->vlen = v;
- return xdr_argsize_check(rqstp, p);
+ return 1;
}
int
@@ -541,9 +544,11 @@ nfs3svc_decode_readlinkargs(struct svc_rqst *rqstp, __be32 *p,
p = decode_fh(p, &args->fh);
if (!p)
return 0;
+ if (!xdr_argsize_check(rqstp, p))
+ return 0;
args->buffer = page_address(*(rqstp->rq_next_page++));
- return xdr_argsize_check(rqstp, p);
+ return 1;
}
int
@@ -569,10 +574,14 @@ nfs3svc_decode_readdirargs(struct svc_rqst *rqstp, __be32 *p,
args->verf = p; p += 2;
args->dircount = ~0;
args->count = ntohl(*p++);
+
+ if (!xdr_argsize_check(rqstp, p))
+ return 0;
+
args->count = min_t(u32, args->count, PAGE_SIZE);
args->buffer = page_address(*(rqstp->rq_next_page++));
- return xdr_argsize_check(rqstp, p);
+ return 1;
}
int
@@ -590,6 +599,9 @@ nfs3svc_decode_readdirplusargs(struct svc_rqst *rqstp, __be32 *p,
args->dircount = ntohl(*p++);
args->count = ntohl(*p++);
+ if (!xdr_argsize_check(rqstp, p))
+ return 0;
+
len = args->count = min(args->count, max_blocksize);
while (len > 0) {
struct page *p = *(rqstp->rq_next_page++);
@@ -597,8 +609,7 @@ nfs3svc_decode_readdirplusargs(struct svc_rqst *rqstp, __be32 *p,
args->buffer = page_address(p);
len -= PAGE_SIZE;
}
-
- return xdr_argsize_check(rqstp, p);
+ return 1;
}
int
diff --git a/fs/nfsd/nfs4proc.c b/fs/nfsd/nfs4proc.c
index d86031b6ad79..c453a1998e00 100644
--- a/fs/nfsd/nfs4proc.c
+++ b/fs/nfsd/nfs4proc.c
@@ -1259,7 +1259,8 @@ nfsd4_layout_verify(struct svc_export *exp, unsigned int layout_type)
return NULL;
}
- if (!(exp->ex_layout_types & (1 << layout_type))) {
+ if (layout_type >= LAYOUT_TYPE_MAX ||
+ !(exp->ex_layout_types & (1 << layout_type))) {
dprintk("%s: layout type %d not supported\n",
__func__, layout_type);
return NULL;
diff --git a/fs/nfsd/nfs4state.c b/fs/nfsd/nfs4state.c
index e9ef50addddb..22002fb75a18 100644
--- a/fs/nfsd/nfs4state.c
+++ b/fs/nfsd/nfs4state.c
@@ -1912,28 +1912,15 @@ static void copy_clid(struct nfs4_client *target, struct nfs4_client *source)
target->cl_clientid.cl_id = source->cl_clientid.cl_id;
}
-int strdup_if_nonnull(char **target, char *source)
-{
- if (source) {
- *target = kstrdup(source, GFP_KERNEL);
- if (!*target)
- return -ENOMEM;
- } else
- *target = NULL;
- return 0;
-}
-
static int copy_cred(struct svc_cred *target, struct svc_cred *source)
{
- int ret;
+ target->cr_principal = kstrdup(source->cr_principal, GFP_KERNEL);
+ target->cr_raw_principal = kstrdup(source->cr_raw_principal,
+ GFP_KERNEL);
+ if ((source->cr_principal && ! target->cr_principal) ||
+ (source->cr_raw_principal && ! target->cr_raw_principal))
+ return -ENOMEM;
- ret = strdup_if_nonnull(&target->cr_principal, source->cr_principal);
- if (ret)
- return ret;
- ret = strdup_if_nonnull(&target->cr_raw_principal,
- source->cr_raw_principal);
- if (ret)
- return ret;
target->cr_flavor = source->cr_flavor;
target->cr_uid = source->cr_uid;
target->cr_gid = source->cr_gid;
diff --git a/fs/nfsd/nfs4xdr.c b/fs/nfsd/nfs4xdr.c
index 33017d652b1d..26780d53a6f9 100644
--- a/fs/nfsd/nfs4xdr.c
+++ b/fs/nfsd/nfs4xdr.c
@@ -2831,9 +2831,14 @@ out_acl:
}
#endif /* CONFIG_NFSD_PNFS */
if (bmval2 & FATTR4_WORD2_SUPPATTR_EXCLCREAT) {
- status = nfsd4_encode_bitmap(xdr, NFSD_SUPPATTR_EXCLCREAT_WORD0,
- NFSD_SUPPATTR_EXCLCREAT_WORD1,
- NFSD_SUPPATTR_EXCLCREAT_WORD2);
+ u32 supp[3];
+
+ memcpy(supp, nfsd_suppattrs[minorversion], sizeof(supp));
+ supp[0] &= NFSD_SUPPATTR_EXCLCREAT_WORD0;
+ supp[1] &= NFSD_SUPPATTR_EXCLCREAT_WORD1;
+ supp[2] &= NFSD_SUPPATTR_EXCLCREAT_WORD2;
+
+ status = nfsd4_encode_bitmap(xdr, supp[0], supp[1], supp[2]);
if (status)
goto out;
}
@@ -4119,8 +4124,7 @@ nfsd4_encode_getdeviceinfo(struct nfsd4_compoundres *resp, __be32 nfserr,
struct nfsd4_getdeviceinfo *gdev)
{
struct xdr_stream *xdr = &resp->xdr;
- const struct nfsd4_layout_ops *ops =
- nfsd4_layout_ops[gdev->gd_layout_type];
+ const struct nfsd4_layout_ops *ops;
u32 starting_len = xdr->buf->len, needed_len;
__be32 *p;
@@ -4137,6 +4141,7 @@ nfsd4_encode_getdeviceinfo(struct nfsd4_compoundres *resp, __be32 nfserr,
/* If maxcount is 0 then just update notifications */
if (gdev->gd_maxcount != 0) {
+ ops = nfsd4_layout_ops[gdev->gd_layout_type];
nfserr = ops->encode_getdeviceinfo(xdr, gdev);
if (nfserr) {
/*
@@ -4189,8 +4194,7 @@ nfsd4_encode_layoutget(struct nfsd4_compoundres *resp, __be32 nfserr,
struct nfsd4_layoutget *lgp)
{
struct xdr_stream *xdr = &resp->xdr;
- const struct nfsd4_layout_ops *ops =
- nfsd4_layout_ops[lgp->lg_layout_type];
+ const struct nfsd4_layout_ops *ops;
__be32 *p;
dprintk("%s: err %d\n", __func__, nfserr);
@@ -4213,6 +4217,7 @@ nfsd4_encode_layoutget(struct nfsd4_compoundres *resp, __be32 nfserr,
*p++ = cpu_to_be32(lgp->lg_seg.iomode);
*p++ = cpu_to_be32(lgp->lg_layout_type);
+ ops = nfsd4_layout_ops[lgp->lg_layout_type];
nfserr = ops->encode_layoutget(xdr, lgp);
out:
kfree(lgp->lg_content);
diff --git a/fs/nfsd/nfsxdr.c b/fs/nfsd/nfsxdr.c
index de07ff625777..6a4947a3f4fa 100644
--- a/fs/nfsd/nfsxdr.c
+++ b/fs/nfsd/nfsxdr.c
@@ -257,6 +257,9 @@ nfssvc_decode_readargs(struct svc_rqst *rqstp, __be32 *p,
len = args->count = ntohl(*p++);
p++; /* totalcount - unused */
+ if (!xdr_argsize_check(rqstp, p))
+ return 0;
+
len = min_t(unsigned int, len, NFSSVC_MAXBLKSIZE_V2);
/* set up somewhere to store response.
@@ -272,7 +275,7 @@ nfssvc_decode_readargs(struct svc_rqst *rqstp, __be32 *p,
v++;
}
args->vlen = v;
- return xdr_argsize_check(rqstp, p);
+ return 1;
}
int
@@ -362,9 +365,11 @@ nfssvc_decode_readlinkargs(struct svc_rqst *rqstp, __be32 *p, struct nfsd_readli
p = decode_fh(p, &args->fh);
if (!p)
return 0;
+ if (!xdr_argsize_check(rqstp, p))
+ return 0;
args->buffer = page_address(*(rqstp->rq_next_page++));
- return xdr_argsize_check(rqstp, p);
+ return 1;
}
int
@@ -402,9 +407,11 @@ nfssvc_decode_readdirargs(struct svc_rqst *rqstp, __be32 *p,
args->cookie = ntohl(*p++);
args->count = ntohl(*p++);
args->count = min_t(u32, args->count, PAGE_SIZE);
+ if (!xdr_argsize_check(rqstp, p))
+ return 0;
args->buffer = page_address(*(rqstp->rq_next_page++));
- return xdr_argsize_check(rqstp, p);
+ return 1;
}
/*
diff --git a/fs/nfsd/vfs.c b/fs/nfsd/vfs.c
index 9aaf6ca77569..2be32955d7f2 100644
--- a/fs/nfsd/vfs.c
+++ b/fs/nfsd/vfs.c
@@ -94,6 +94,12 @@ nfsd_cross_mnt(struct svc_rqst *rqstp, struct dentry **dpp,
err = follow_down(&path);
if (err < 0)
goto out;
+ if (path.mnt == exp->ex_path.mnt && path.dentry == dentry &&
+ nfsd_mountpoint(dentry, exp) == 2) {
+ /* This is only a mountpoint in some other namespace */
+ path_put(&path);
+ goto out;
+ }
exp2 = rqst_exp_get_by_name(rqstp, &path);
if (IS_ERR(exp2)) {
@@ -167,16 +173,26 @@ static int nfsd_lookup_parent(struct svc_rqst *rqstp, struct dentry *dparent, st
/*
* For nfsd purposes, we treat V4ROOT exports as though there was an
* export at *every* directory.
+ * We return:
+ * '1' if this dentry *must* be an export point,
+ * '2' if it might be, if there is really a mount here, and
+ * '0' if there is no chance of an export point here.
*/
int nfsd_mountpoint(struct dentry *dentry, struct svc_export *exp)
{
- if (d_mountpoint(dentry))
+ if (!d_inode(dentry))
+ return 0;
+ if (exp->ex_flags & NFSEXP_V4ROOT)
return 1;
if (nfsd4_is_junction(dentry))
return 1;
- if (!(exp->ex_flags & NFSEXP_V4ROOT))
- return 0;
- return d_inode(dentry) != NULL;
+ if (d_mountpoint(dentry))
+ /*
+ * Might only be a mountpoint in a different namespace,
+ * but we need to check.
+ */
+ return 2;
+ return 0;
}
__be32
diff --git a/fs/open.c b/fs/open.c
index 373787afd638..cd0c5be8d012 100644
--- a/fs/open.c
+++ b/fs/open.c
@@ -193,7 +193,8 @@ static long do_sys_ftruncate(unsigned int fd, loff_t length, int small)
goto out_putf;
error = -EPERM;
- if (IS_APPEND(inode))
+ /* Check IS_APPEND on real upper inode */
+ if (IS_APPEND(file_inode(f.file)))
goto out_putf;
sb_start_write(inode->i_sb);
@@ -459,20 +460,17 @@ out:
SYSCALL_DEFINE1(fchdir, unsigned int, fd)
{
struct fd f = fdget_raw(fd);
- struct inode *inode;
- int error = -EBADF;
+ int error;
error = -EBADF;
if (!f.file)
goto out;
- inode = file_inode(f.file);
-
error = -ENOTDIR;
- if (!S_ISDIR(inode->i_mode))
+ if (!d_can_lookup(f.file->f_path.dentry))
goto out_putf;
- error = inode_permission(inode, MAY_EXEC | MAY_CHDIR);
+ error = inode_permission(file_inode(f.file), MAY_EXEC | MAY_CHDIR);
if (!error)
set_fs_pwd(current->fs, &f.file->f_path);
out_putf:
diff --git a/fs/overlayfs/copy_up.c b/fs/overlayfs/copy_up.c
index 906ea6c93260..9008ab9fbd2e 100644
--- a/fs/overlayfs/copy_up.c
+++ b/fs/overlayfs/copy_up.c
@@ -20,6 +20,7 @@
#include <linux/namei.h>
#include <linux/fdtable.h>
#include <linux/ratelimit.h>
+#include <linux/exportfs.h>
#include "overlayfs.h"
#include "ovl_entry.h"
@@ -232,6 +233,79 @@ int ovl_set_attr(struct dentry *upperdentry, struct kstat *stat)
return err;
}
+static struct ovl_fh *ovl_encode_fh(struct dentry *lower, uuid_be *uuid)
+{
+ struct ovl_fh *fh;
+ int fh_type, fh_len, dwords;
+ void *buf;
+ int buflen = MAX_HANDLE_SZ;
+
+ buf = kmalloc(buflen, GFP_TEMPORARY);
+ if (!buf)
+ return ERR_PTR(-ENOMEM);
+
+ /*
+ * We encode a non-connectable file handle for non-dir, because we
+ * only need to find the lower inode number and we don't want to pay
+ * the price or reconnecting the dentry.
+ */
+ dwords = buflen >> 2;
+ fh_type = exportfs_encode_fh(lower, buf, &dwords, 0);
+ buflen = (dwords << 2);
+
+ fh = ERR_PTR(-EIO);
+ if (WARN_ON(fh_type < 0) ||
+ WARN_ON(buflen > MAX_HANDLE_SZ) ||
+ WARN_ON(fh_type == FILEID_INVALID))
+ goto out;
+
+ BUILD_BUG_ON(MAX_HANDLE_SZ + offsetof(struct ovl_fh, fid) > 255);
+ fh_len = offsetof(struct ovl_fh, fid) + buflen;
+ fh = kmalloc(fh_len, GFP_KERNEL);
+ if (!fh) {
+ fh = ERR_PTR(-ENOMEM);
+ goto out;
+ }
+
+ fh->version = OVL_FH_VERSION;
+ fh->magic = OVL_FH_MAGIC;
+ fh->type = fh_type;
+ fh->flags = OVL_FH_FLAG_CPU_ENDIAN;
+ fh->len = fh_len;
+ fh->uuid = *uuid;
+ memcpy(fh->fid, buf, buflen);
+
+out:
+ kfree(buf);
+ return fh;
+}
+
+static int ovl_set_origin(struct dentry *dentry, struct dentry *lower,
+ struct dentry *upper)
+{
+ struct super_block *sb = lower->d_sb;
+ uuid_be *uuid = (uuid_be *) &sb->s_uuid;
+ const struct ovl_fh *fh = NULL;
+ int err;
+
+ /*
+ * When lower layer doesn't support export operations store a 'null' fh,
+ * so we can use the overlay.origin xattr to distignuish between a copy
+ * up and a pure upper inode.
+ */
+ if (sb->s_export_op && sb->s_export_op->fh_to_dentry &&
+ uuid_be_cmp(*uuid, NULL_UUID_BE)) {
+ fh = ovl_encode_fh(lower, uuid);
+ if (IS_ERR(fh))
+ return PTR_ERR(fh);
+ }
+
+ err = ovl_do_setxattr(upper, OVL_XATTR_ORIGIN, fh, fh ? fh->len : 0, 0);
+ kfree(fh);
+
+ return err;
+}
+
static int ovl_copy_up_locked(struct dentry *workdir, struct dentry *upperdir,
struct dentry *dentry, struct path *lowerpath,
struct kstat *stat, const char *link,
@@ -316,6 +390,14 @@ static int ovl_copy_up_locked(struct dentry *workdir, struct dentry *upperdir,
if (err)
goto out_cleanup;
+ /*
+ * Store identifier of lower inode in upper inode xattr to
+ * allow lookup of the copy up origin inode.
+ */
+ err = ovl_set_origin(dentry, lowerpath->dentry, temp);
+ if (err)
+ goto out_cleanup;
+
if (tmpfile)
err = ovl_do_link(temp, udir, upper, true);
else
diff --git a/fs/overlayfs/dir.c b/fs/overlayfs/dir.c
index 6515796460df..723b98b90698 100644
--- a/fs/overlayfs/dir.c
+++ b/fs/overlayfs/dir.c
@@ -138,36 +138,6 @@ static int ovl_set_opaque(struct dentry *dentry, struct dentry *upperdentry)
return err;
}
-static int ovl_dir_getattr(const struct path *path, struct kstat *stat,
- u32 request_mask, unsigned int flags)
-{
- struct dentry *dentry = path->dentry;
- int err;
- enum ovl_path_type type;
- struct path realpath;
- const struct cred *old_cred;
-
- type = ovl_path_real(dentry, &realpath);
- old_cred = ovl_override_creds(dentry->d_sb);
- err = vfs_getattr(&realpath, stat, request_mask, flags);
- revert_creds(old_cred);
- if (err)
- return err;
-
- stat->dev = dentry->d_sb->s_dev;
- stat->ino = dentry->d_inode->i_ino;
-
- /*
- * It's probably not worth it to count subdirs to get the
- * correct link count. nlink=1 seems to pacify 'find' and
- * other utilities.
- */
- if (OVL_TYPE_MERGE(type))
- stat->nlink = 1;
-
- return 0;
-}
-
/* Common operations required to be done after creation of file on upper */
static void ovl_instantiate(struct dentry *dentry, struct inode *inode,
struct dentry *newdentry, bool hardlink)
@@ -182,6 +152,9 @@ static void ovl_instantiate(struct dentry *dentry, struct inode *inode,
inc_nlink(inode);
}
d_instantiate(dentry, inode);
+ /* Force lookup of new upper hardlink to find its lower */
+ if (hardlink)
+ d_drop(dentry);
}
static bool ovl_type_merge(struct dentry *dentry)
@@ -210,7 +183,7 @@ static int ovl_create_upper(struct dentry *dentry, struct inode *inode,
if (err)
goto out_dput;
- if (ovl_type_merge(dentry->d_parent)) {
+ if (ovl_type_merge(dentry->d_parent) && d_is_dir(newdentry)) {
/* Setting opaque here is just an optimization, allow to fail */
ovl_set_opaque(dentry, newdentry);
}
@@ -1070,7 +1043,7 @@ const struct inode_operations ovl_dir_inode_operations = {
.create = ovl_create,
.mknod = ovl_mknod,
.permission = ovl_permission,
- .getattr = ovl_dir_getattr,
+ .getattr = ovl_getattr,
.listxattr = ovl_listxattr,
.get_acl = ovl_get_acl,
.update_time = ovl_update_time,
diff --git a/fs/overlayfs/inode.c b/fs/overlayfs/inode.c
index f8fe6bf2036d..ad9547f82da5 100644
--- a/fs/overlayfs/inode.c
+++ b/fs/overlayfs/inode.c
@@ -57,18 +57,78 @@ out:
return err;
}
-static int ovl_getattr(const struct path *path, struct kstat *stat,
- u32 request_mask, unsigned int flags)
+int ovl_getattr(const struct path *path, struct kstat *stat,
+ u32 request_mask, unsigned int flags)
{
struct dentry *dentry = path->dentry;
+ enum ovl_path_type type;
struct path realpath;
const struct cred *old_cred;
+ bool is_dir = S_ISDIR(dentry->d_inode->i_mode);
int err;
- ovl_path_real(dentry, &realpath);
+ type = ovl_path_real(dentry, &realpath);
old_cred = ovl_override_creds(dentry->d_sb);
err = vfs_getattr(&realpath, stat, request_mask, flags);
+ if (err)
+ goto out;
+
+ /*
+ * When all layers are on the same fs, all real inode number are
+ * unique, so we use the overlay st_dev, which is friendly to du -x.
+ *
+ * We also use st_ino of the copy up origin, if we know it.
+ * This guaranties constant st_dev/st_ino across copy up.
+ *
+ * If filesystem supports NFS export ops, this also guaranties
+ * persistent st_ino across mount cycle.
+ */
+ if (ovl_same_sb(dentry->d_sb)) {
+ if (OVL_TYPE_ORIGIN(type)) {
+ struct kstat lowerstat;
+ u32 lowermask = STATX_INO | (!is_dir ? STATX_NLINK : 0);
+
+ ovl_path_lower(dentry, &realpath);
+ err = vfs_getattr(&realpath, &lowerstat,
+ lowermask, flags);
+ if (err)
+ goto out;
+
+ WARN_ON_ONCE(stat->dev != lowerstat.dev);
+ /*
+ * Lower hardlinks are broken on copy up to different
+ * upper files, so we cannot use the lower origin st_ino
+ * for those different files, even for the same fs case.
+ */
+ if (is_dir || lowerstat.nlink == 1)
+ stat->ino = lowerstat.ino;
+ }
+ stat->dev = dentry->d_sb->s_dev;
+ } else if (is_dir) {
+ /*
+ * If not all layers are on the same fs the pair {real st_ino;
+ * overlay st_dev} is not unique, so use the non persistent
+ * overlay st_ino.
+ *
+ * Always use the overlay st_dev for directories, so 'find
+ * -xdev' will scan the entire overlay mount and won't cross the
+ * overlay mount boundaries.
+ */
+ stat->dev = dentry->d_sb->s_dev;
+ stat->ino = dentry->d_inode->i_ino;
+ }
+
+ /*
+ * It's probably not worth it to count subdirs to get the
+ * correct link count. nlink=1 seems to pacify 'find' and
+ * other utilities.
+ */
+ if (is_dir && OVL_TYPE_MERGE(type))
+ stat->nlink = 1;
+
+out:
revert_creds(old_cred);
+
return err;
}
@@ -303,6 +363,41 @@ static const struct inode_operations ovl_symlink_inode_operations = {
.update_time = ovl_update_time,
};
+/*
+ * It is possible to stack overlayfs instance on top of another
+ * overlayfs instance as lower layer. We need to annonate the
+ * stackable i_mutex locks according to stack level of the super
+ * block instance. An overlayfs instance can never be in stack
+ * depth 0 (there is always a real fs below it). An overlayfs
+ * inode lock will use the lockdep annotaion ovl_i_mutex_key[depth].
+ *
+ * For example, here is a snip from /proc/lockdep_chains after
+ * dir_iterate of nested overlayfs:
+ *
+ * [...] &ovl_i_mutex_dir_key[depth] (stack_depth=2)
+ * [...] &ovl_i_mutex_dir_key[depth]#2 (stack_depth=1)
+ * [...] &type->i_mutex_dir_key (stack_depth=0)
+ */
+#define OVL_MAX_NESTING FILESYSTEM_MAX_STACK_DEPTH
+
+static inline void ovl_lockdep_annotate_inode_mutex_key(struct inode *inode)
+{
+#ifdef CONFIG_LOCKDEP
+ static struct lock_class_key ovl_i_mutex_key[OVL_MAX_NESTING];
+ static struct lock_class_key ovl_i_mutex_dir_key[OVL_MAX_NESTING];
+
+ int depth = inode->i_sb->s_stack_depth - 1;
+
+ if (WARN_ON_ONCE(depth < 0 || depth >= OVL_MAX_NESTING))
+ depth = 0;
+
+ if (S_ISDIR(inode->i_mode))
+ lockdep_set_class(&inode->i_rwsem, &ovl_i_mutex_dir_key[depth]);
+ else
+ lockdep_set_class(&inode->i_rwsem, &ovl_i_mutex_key[depth]);
+#endif
+}
+
static void ovl_fill_inode(struct inode *inode, umode_t mode, dev_t rdev)
{
inode->i_ino = get_next_ino();
@@ -312,6 +407,8 @@ static void ovl_fill_inode(struct inode *inode, umode_t mode, dev_t rdev)
inode->i_acl = inode->i_default_acl = ACL_DONT_CACHE;
#endif
+ ovl_lockdep_annotate_inode_mutex_key(inode);
+
switch (mode & S_IFMT) {
case S_IFREG:
inode->i_op = &ovl_file_inode_operations;
diff --git a/fs/overlayfs/namei.c b/fs/overlayfs/namei.c
index b8b077821fb0..bad0f665a635 100644
--- a/fs/overlayfs/namei.c
+++ b/fs/overlayfs/namei.c
@@ -12,6 +12,8 @@
#include <linux/namei.h>
#include <linux/xattr.h>
#include <linux/ratelimit.h>
+#include <linux/mount.h>
+#include <linux/exportfs.h>
#include "overlayfs.h"
#include "ovl_entry.h"
@@ -81,6 +83,90 @@ invalid:
goto err_free;
}
+static int ovl_acceptable(void *ctx, struct dentry *dentry)
+{
+ return 1;
+}
+
+static struct dentry *ovl_get_origin(struct dentry *dentry,
+ struct vfsmount *mnt)
+{
+ int res;
+ struct ovl_fh *fh = NULL;
+ struct dentry *origin = NULL;
+ int bytes;
+
+ res = vfs_getxattr(dentry, OVL_XATTR_ORIGIN, NULL, 0);
+ if (res < 0) {
+ if (res == -ENODATA || res == -EOPNOTSUPP)
+ return NULL;
+ goto fail;
+ }
+ /* Zero size value means "copied up but origin unknown" */
+ if (res == 0)
+ return NULL;
+
+ fh = kzalloc(res, GFP_TEMPORARY);
+ if (!fh)
+ return ERR_PTR(-ENOMEM);
+
+ res = vfs_getxattr(dentry, OVL_XATTR_ORIGIN, fh, res);
+ if (res < 0)
+ goto fail;
+
+ if (res < sizeof(struct ovl_fh) || res < fh->len)
+ goto invalid;
+
+ if (fh->magic != OVL_FH_MAGIC)
+ goto invalid;
+
+ /* Treat larger version and unknown flags as "origin unknown" */
+ if (fh->version > OVL_FH_VERSION || fh->flags & ~OVL_FH_FLAG_ALL)
+ goto out;
+
+ /* Treat endianness mismatch as "origin unknown" */
+ if (!(fh->flags & OVL_FH_FLAG_ANY_ENDIAN) &&
+ (fh->flags & OVL_FH_FLAG_BIG_ENDIAN) != OVL_FH_FLAG_CPU_ENDIAN)
+ goto out;
+
+ bytes = (fh->len - offsetof(struct ovl_fh, fid));
+
+ /*
+ * Make sure that the stored uuid matches the uuid of the lower
+ * layer where file handle will be decoded.
+ */
+ if (uuid_be_cmp(fh->uuid, *(uuid_be *) &mnt->mnt_sb->s_uuid))
+ goto out;
+
+ origin = exportfs_decode_fh(mnt, (struct fid *)fh->fid,
+ bytes >> 2, (int)fh->type,
+ ovl_acceptable, NULL);
+ if (IS_ERR(origin)) {
+ /* Treat stale file handle as "origin unknown" */
+ if (origin == ERR_PTR(-ESTALE))
+ origin = NULL;
+ goto out;
+ }
+
+ if (ovl_dentry_weird(origin) ||
+ ((d_inode(origin)->i_mode ^ d_inode(dentry)->i_mode) & S_IFMT)) {
+ dput(origin);
+ origin = NULL;
+ goto invalid;
+ }
+
+out:
+ kfree(fh);
+ return origin;
+
+fail:
+ pr_warn_ratelimited("overlayfs: failed to get origin (%i)\n", res);
+ goto out;
+invalid:
+ pr_warn_ratelimited("overlayfs: invalid origin (%*phN)\n", res, fh);
+ goto out;
+}
+
static bool ovl_is_opaquedir(struct dentry *dentry)
{
int res;
@@ -192,6 +278,45 @@ static int ovl_lookup_layer(struct dentry *base, struct ovl_lookup_data *d,
return 0;
}
+
+static int ovl_check_origin(struct dentry *dentry, struct dentry *upperdentry,
+ struct path **stackp, unsigned int *ctrp)
+{
+ struct super_block *same_sb = ovl_same_sb(dentry->d_sb);
+ struct ovl_entry *roe = dentry->d_sb->s_root->d_fsdata;
+ struct vfsmount *mnt;
+ struct dentry *origin;
+
+ if (!same_sb || !roe->numlower)
+ return 0;
+
+ /*
+ * Since all layers are on the same fs, we use the first layer for
+ * decoding the file handle. We may get a disconnected dentry,
+ * which is fine, because we only need to hold the origin inode in
+ * cache and use its inode number. We may even get a connected dentry,
+ * that is not under the first layer's root. That is also fine for
+ * using it's inode number - it's the same as if we held a reference
+ * to a dentry in first layer that was moved under us.
+ */
+ mnt = roe->lowerstack[0].mnt;
+
+ origin = ovl_get_origin(upperdentry, mnt);
+ if (IS_ERR_OR_NULL(origin))
+ return PTR_ERR(origin);
+
+ BUG_ON(*stackp || *ctrp);
+ *stackp = kmalloc(sizeof(struct path), GFP_TEMPORARY);
+ if (!*stackp) {
+ dput(origin);
+ return -ENOMEM;
+ }
+ **stackp = (struct path) { .dentry = origin, .mnt = mnt };
+ *ctrp = 1;
+
+ return 0;
+}
+
/*
* Returns next layer in stack starting from top.
* Returns -1 if this is the last layer.
@@ -220,6 +345,7 @@ struct dentry *ovl_lookup(struct inode *dir, struct dentry *dentry,
const struct cred *old_cred;
struct ovl_fs *ofs = dentry->d_sb->s_fs_info;
struct ovl_entry *poe = dentry->d_parent->d_fsdata;
+ struct ovl_entry *roe = dentry->d_sb->s_root->d_fsdata;
struct path *stack = NULL;
struct dentry *upperdir, *upperdentry = NULL;
unsigned int ctr = 0;
@@ -253,13 +379,20 @@ struct dentry *ovl_lookup(struct inode *dir, struct dentry *dentry,
err = -EREMOTE;
goto out;
}
+ if (upperdentry && !d.is_dir) {
+ BUG_ON(!d.stop || d.redirect);
+ err = ovl_check_origin(dentry, upperdentry,
+ &stack, &ctr);
+ if (err)
+ goto out;
+ }
if (d.redirect) {
upperredirect = kstrdup(d.redirect, GFP_KERNEL);
if (!upperredirect)
goto out_put_upper;
if (d.redirect[0] == '/')
- poe = dentry->d_sb->s_root->d_fsdata;
+ poe = roe;
}
upperopaque = d.opaque;
}
@@ -290,10 +423,8 @@ struct dentry *ovl_lookup(struct inode *dir, struct dentry *dentry,
if (d.stop)
break;
- if (d.redirect &&
- d.redirect[0] == '/' &&
- poe != dentry->d_sb->s_root->d_fsdata) {
- poe = dentry->d_sb->s_root->d_fsdata;
+ if (d.redirect && d.redirect[0] == '/' && poe != roe) {
+ poe = roe;
/* Find the current layer on the root dentry */
for (i = 0; i < poe->numlower; i++)
diff --git a/fs/overlayfs/overlayfs.h b/fs/overlayfs/overlayfs.h
index 741dc0b6931f..caa36cb9c46d 100644
--- a/fs/overlayfs/overlayfs.h
+++ b/fs/overlayfs/overlayfs.h
@@ -8,18 +8,56 @@
*/
#include <linux/kernel.h>
+#include <linux/uuid.h>
enum ovl_path_type {
__OVL_PATH_UPPER = (1 << 0),
__OVL_PATH_MERGE = (1 << 1),
+ __OVL_PATH_ORIGIN = (1 << 2),
};
#define OVL_TYPE_UPPER(type) ((type) & __OVL_PATH_UPPER)
#define OVL_TYPE_MERGE(type) ((type) & __OVL_PATH_MERGE)
+#define OVL_TYPE_ORIGIN(type) ((type) & __OVL_PATH_ORIGIN)
#define OVL_XATTR_PREFIX XATTR_TRUSTED_PREFIX "overlay."
#define OVL_XATTR_OPAQUE OVL_XATTR_PREFIX "opaque"
#define OVL_XATTR_REDIRECT OVL_XATTR_PREFIX "redirect"
+#define OVL_XATTR_ORIGIN OVL_XATTR_PREFIX "origin"
+
+/*
+ * The tuple (fh,uuid) is a universal unique identifier for a copy up origin,
+ * where:
+ * origin.fh - exported file handle of the lower file
+ * origin.uuid - uuid of the lower filesystem
+ */
+#define OVL_FH_VERSION 0
+#define OVL_FH_MAGIC 0xfb
+
+/* CPU byte order required for fid decoding: */
+#define OVL_FH_FLAG_BIG_ENDIAN (1 << 0)
+#define OVL_FH_FLAG_ANY_ENDIAN (1 << 1)
+
+#define OVL_FH_FLAG_ALL (OVL_FH_FLAG_BIG_ENDIAN | OVL_FH_FLAG_ANY_ENDIAN)
+
+#if defined(__LITTLE_ENDIAN)
+#define OVL_FH_FLAG_CPU_ENDIAN 0
+#elif defined(__BIG_ENDIAN)
+#define OVL_FH_FLAG_CPU_ENDIAN OVL_FH_FLAG_BIG_ENDIAN
+#else
+#error Endianness not defined
+#endif
+
+/* On-disk and in-memeory format for redirect by file handle */
+struct ovl_fh {
+ u8 version; /* 0 */
+ u8 magic; /* 0xfb */
+ u8 len; /* size of this header + size of fid */
+ u8 flags; /* OVL_FH_FLAG_* */
+ u8 type; /* fid_type of fid */
+ uuid_be uuid; /* uuid of filesystem */
+ u8 fid[0]; /* file identifier */
+} __packed;
#define OVL_ISUPPER_MASK 1UL
@@ -151,6 +189,7 @@ int ovl_want_write(struct dentry *dentry);
void ovl_drop_write(struct dentry *dentry);
struct dentry *ovl_workdir(struct dentry *dentry);
const struct cred *ovl_override_creds(struct super_block *sb);
+struct super_block *ovl_same_sb(struct super_block *sb);
struct ovl_entry *ovl_alloc_entry(unsigned int numlower);
bool ovl_dentry_remote(struct dentry *dentry);
bool ovl_dentry_weird(struct dentry *dentry);
@@ -197,6 +236,8 @@ void ovl_workdir_cleanup(struct inode *dir, struct vfsmount *mnt,
/* inode.c */
int ovl_setattr(struct dentry *dentry, struct iattr *attr);
+int ovl_getattr(const struct path *path, struct kstat *stat,
+ u32 request_mask, unsigned int flags);
int ovl_permission(struct inode *inode, int mask);
int ovl_xattr_set(struct dentry *dentry, const char *name, const void *value,
size_t size, int flags);
diff --git a/fs/overlayfs/ovl_entry.h b/fs/overlayfs/ovl_entry.h
index 59614faa14c3..b2023ddb8532 100644
--- a/fs/overlayfs/ovl_entry.h
+++ b/fs/overlayfs/ovl_entry.h
@@ -29,6 +29,8 @@ struct ovl_fs {
const struct cred *creator_cred;
bool tmpfile;
wait_queue_head_t copyup_wq;
+ /* sb common to all layers */
+ struct super_block *same_sb;
};
/* private information held for every overlayfs dentry */
diff --git a/fs/overlayfs/super.c b/fs/overlayfs/super.c
index c9e70d39c1ea..9828b7de8999 100644
--- a/fs/overlayfs/super.c
+++ b/fs/overlayfs/super.c
@@ -49,11 +49,28 @@ static void ovl_dentry_release(struct dentry *dentry)
}
}
+static int ovl_check_append_only(struct inode *inode, int flag)
+{
+ /*
+ * This test was moot in vfs may_open() because overlay inode does
+ * not have the S_APPEND flag, so re-check on real upper inode
+ */
+ if (IS_APPEND(inode)) {
+ if ((flag & O_ACCMODE) != O_RDONLY && !(flag & O_APPEND))
+ return -EPERM;
+ if (flag & O_TRUNC)
+ return -EPERM;
+ }
+
+ return 0;
+}
+
static struct dentry *ovl_d_real(struct dentry *dentry,
const struct inode *inode,
unsigned int open_flags)
{
struct dentry *real;
+ int err;
if (!d_is_reg(dentry)) {
if (!inode || inode == d_inode(dentry))
@@ -65,15 +82,20 @@ static struct dentry *ovl_d_real(struct dentry *dentry,
return dentry;
if (open_flags) {
- int err = ovl_open_maybe_copy_up(dentry, open_flags);
-
+ err = ovl_open_maybe_copy_up(dentry, open_flags);
if (err)
return ERR_PTR(err);
}
real = ovl_dentry_upper(dentry);
- if (real && (!inode || inode == d_inode(real)))
+ if (real && (!inode || inode == d_inode(real))) {
+ if (!inode) {
+ err = ovl_check_append_only(d_inode(real), open_flags);
+ if (err)
+ return ERR_PTR(err);
+ }
return real;
+ }
real = ovl_dentry_lower(dentry);
if (!real)
@@ -709,8 +731,8 @@ static const struct xattr_handler *ovl_xattr_handlers[] = {
static int ovl_fill_super(struct super_block *sb, void *data, int silent)
{
- struct path upperpath = { NULL, NULL };
- struct path workpath = { NULL, NULL };
+ struct path upperpath = { };
+ struct path workpath = { };
struct dentry *root_dentry;
struct inode *realinode;
struct ovl_entry *oe;
@@ -892,11 +914,19 @@ static int ovl_fill_super(struct super_block *sb, void *data, int silent)
ufs->lower_mnt[ufs->numlower] = mnt;
ufs->numlower++;
+
+ /* Check if all lower layers are on same sb */
+ if (i == 0)
+ ufs->same_sb = mnt->mnt_sb;
+ else if (ufs->same_sb != mnt->mnt_sb)
+ ufs->same_sb = NULL;
}
/* If the upper fs is nonexistent, we mark overlayfs r/o too */
if (!ufs->upper_mnt)
sb->s_flags |= MS_RDONLY;
+ else if (ufs->upper_mnt->mnt_sb != ufs->same_sb)
+ ufs->same_sb = NULL;
if (remote)
sb->s_d_op = &ovl_reval_dentry_operations;
diff --git a/fs/overlayfs/util.c b/fs/overlayfs/util.c
index 6e610a205e15..cfdea47313a1 100644
--- a/fs/overlayfs/util.c
+++ b/fs/overlayfs/util.c
@@ -40,6 +40,13 @@ const struct cred *ovl_override_creds(struct super_block *sb)
return override_creds(ofs->creator_cred);
}
+struct super_block *ovl_same_sb(struct super_block *sb)
+{
+ struct ovl_fs *ofs = sb->s_fs_info;
+
+ return ofs->same_sb;
+}
+
struct ovl_entry *ovl_alloc_entry(unsigned int numlower)
{
size_t size = offsetof(struct ovl_entry, lowerstack[numlower]);
@@ -75,11 +82,13 @@ enum ovl_path_type ovl_path_type(struct dentry *dentry)
type = __OVL_PATH_UPPER;
/*
- * Non-dir dentry can hold lower dentry from previous
- * location.
+ * Non-dir dentry can hold lower dentry of its copy up origin.
*/
- if (oe->numlower && d_is_dir(dentry))
- type |= __OVL_PATH_MERGE;
+ if (oe->numlower) {
+ type |= __OVL_PATH_ORIGIN;
+ if (d_is_dir(dentry))
+ type |= __OVL_PATH_MERGE;
+ }
} else {
if (oe->numlower > 1)
type |= __OVL_PATH_MERGE;
@@ -100,7 +109,7 @@ void ovl_path_lower(struct dentry *dentry, struct path *path)
{
struct ovl_entry *oe = dentry->d_fsdata;
- *path = oe->numlower ? oe->lowerstack[0] : (struct path) { NULL, NULL };
+ *path = oe->numlower ? oe->lowerstack[0] : (struct path) { };
}
enum ovl_path_type ovl_path_real(struct dentry *dentry, struct path *path)
diff --git a/fs/pstore/ram.c b/fs/pstore/ram.c
index 5523df7f17ef..5cb022c8cd33 100644
--- a/fs/pstore/ram.c
+++ b/fs/pstore/ram.c
@@ -58,7 +58,7 @@ module_param_named(pmsg_size, ramoops_pmsg_size, ulong, 0400);
MODULE_PARM_DESC(pmsg_size, "size of user space message log");
static unsigned long long mem_address;
-module_param(mem_address, ullong, 0400);
+module_param_hw(mem_address, ullong, other, 0400);
MODULE_PARM_DESC(mem_address,
"start of reserved RAM used to store oops/panic logs");
diff --git a/fs/signalfd.c b/fs/signalfd.c
index 270221fcef42..7e3d71109f51 100644
--- a/fs/signalfd.c
+++ b/fs/signalfd.c
@@ -38,7 +38,7 @@ void signalfd_cleanup(struct sighand_struct *sighand)
/*
* The lockless check can race with remove_wait_queue() in progress,
* but in this case its caller should run under rcu_read_lock() and
- * sighand_cachep is SLAB_DESTROY_BY_RCU, we can safely return.
+ * sighand_cachep is SLAB_TYPESAFE_BY_RCU, we can safely return.
*/
if (likely(!waitqueue_active(wqh)))
return;
diff --git a/fs/ubifs/Kconfig b/fs/ubifs/Kconfig
index b0d0623c83ed..83a961bf7280 100644
--- a/fs/ubifs/Kconfig
+++ b/fs/ubifs/Kconfig
@@ -61,3 +61,16 @@ config UBIFS_FS_ENCRYPTION
feature is similar to ecryptfs, but it is more memory
efficient since it avoids caching the encrypted and
decrypted pages in the page cache.
+
+config UBIFS_FS_SECURITY
+ bool "UBIFS Security Labels"
+ depends on UBIFS_FS
+ default y
+ help
+ Security labels provide an access control facility to support Linux
+ Security Models (LSMs) accepted by AppArmor, SELinux, Smack and TOMOYO
+ Linux. This option enables an extended attribute handler for file
+ security labels in the ubifs filesystem, so that it requires enabling
+ the extended attribute support in advance.
+
+ If you are not using a security module, say N.
diff --git a/fs/ubifs/debug.c b/fs/ubifs/debug.c
index 718b749fa11a..7cd8a7b95299 100644
--- a/fs/ubifs/debug.c
+++ b/fs/ubifs/debug.c
@@ -2391,8 +2391,8 @@ int dbg_check_nondata_nodes_order(struct ubifs_info *c, struct list_head *head)
ubifs_dump_node(c, sa->node);
return -EINVAL;
}
- if (sa->type != UBIFS_INO_NODE && sa->type != UBIFS_DENT_NODE &&
- sa->type != UBIFS_XENT_NODE) {
+ if (sb->type != UBIFS_INO_NODE && sb->type != UBIFS_DENT_NODE &&
+ sb->type != UBIFS_XENT_NODE) {
ubifs_err(c, "bad node type %d", sb->type);
ubifs_dump_node(c, sb->node);
return -EINVAL;
diff --git a/fs/ubifs/ioctl.c b/fs/ubifs/ioctl.c
index 12b9eb5005ff..fdc311246807 100644
--- a/fs/ubifs/ioctl.c
+++ b/fs/ubifs/ioctl.c
@@ -53,7 +53,7 @@ void ubifs_set_inode_flags(struct inode *inode)
* ioctl2ubifs - convert ioctl inode flags to UBIFS inode flags.
* @ioctl_flags: flags to convert
*
- * This function convert ioctl flags (@FS_COMPR_FL, etc) to UBIFS inode flags
+ * This function converts ioctl flags (@FS_COMPR_FL, etc) to UBIFS inode flags
* (@UBIFS_COMPR_FL, etc).
*/
static int ioctl2ubifs(int ioctl_flags)
@@ -78,8 +78,8 @@ static int ioctl2ubifs(int ioctl_flags)
* ubifs2ioctl - convert UBIFS inode flags to ioctl inode flags.
* @ubifs_flags: flags to convert
*
- * This function convert UBIFS (@UBIFS_COMPR_FL, etc) to ioctl flags
- * (@FS_COMPR_FL, etc).
+ * This function converts UBIFS inode flags (@UBIFS_COMPR_FL, etc) to ioctl
+ * flags (@FS_COMPR_FL, etc).
*/
static int ubifs2ioctl(int ubifs_flags)
{
diff --git a/fs/ubifs/recovery.c b/fs/ubifs/recovery.c
index 586d59347fff..3af4472061cc 100644
--- a/fs/ubifs/recovery.c
+++ b/fs/ubifs/recovery.c
@@ -442,7 +442,6 @@ static void clean_buf(const struct ubifs_info *c, void **buf, int lnum,
{
int empty_offs, pad_len;
- lnum = lnum;
dbg_rcvry("cleaning corruption at %d:%d", lnum, *offs);
ubifs_assert(!(*offs & 7));
diff --git a/fs/ubifs/ubifs.h b/fs/ubifs/ubifs.h
index 4da10a6d702a..298b4d89eee9 100644
--- a/fs/ubifs/ubifs.h
+++ b/fs/ubifs/ubifs.h
@@ -1753,13 +1753,23 @@ int ubifs_check_dir_empty(struct inode *dir);
/* xattr.c */
extern const struct xattr_handler *ubifs_xattr_handlers[];
ssize_t ubifs_listxattr(struct dentry *dentry, char *buffer, size_t size);
-int ubifs_init_security(struct inode *dentry, struct inode *inode,
- const struct qstr *qstr);
int ubifs_xattr_set(struct inode *host, const char *name, const void *value,
size_t size, int flags);
ssize_t ubifs_xattr_get(struct inode *host, const char *name, void *buf,
size_t size);
+#ifdef CONFIG_UBIFS_FS_SECURITY
+extern int ubifs_init_security(struct inode *dentry, struct inode *inode,
+ const struct qstr *qstr);
+#else
+static inline int ubifs_init_security(struct inode *dentry,
+ struct inode *inode, const struct qstr *qstr)
+{
+ return 0;
+}
+#endif
+
+
/* super.c */
struct inode *ubifs_iget(struct super_block *sb, unsigned long inum);
diff --git a/fs/ubifs/xattr.c b/fs/ubifs/xattr.c
index 3e53fdbf7997..6c9e62c2ef55 100644
--- a/fs/ubifs/xattr.c
+++ b/fs/ubifs/xattr.c
@@ -559,6 +559,7 @@ out_free:
return err;
}
+#ifdef CONFIG_UBIFS_FS_SECURITY
static int init_xattrs(struct inode *inode, const struct xattr *xattr_array,
void *fs_info)
{
@@ -599,6 +600,7 @@ int ubifs_init_security(struct inode *dentry, struct inode *inode,
}
return err;
}
+#endif
static int xattr_get(const struct xattr_handler *handler,
struct dentry *dentry, struct inode *inode,
@@ -639,15 +641,19 @@ static const struct xattr_handler ubifs_trusted_xattr_handler = {
.set = xattr_set,
};
+#ifdef CONFIG_UBIFS_FS_SECURITY
static const struct xattr_handler ubifs_security_xattr_handler = {
.prefix = XATTR_SECURITY_PREFIX,
.get = xattr_get,
.set = xattr_set,
};
+#endif
const struct xattr_handler *ubifs_xattr_handlers[] = {
&ubifs_user_xattr_handler,
&ubifs_trusted_xattr_handler,
+#ifdef CONFIG_UBIFS_FS_SECURITY
&ubifs_security_xattr_handler,
+#endif
NULL
};
diff --git a/fs/xfs/xfs_super.c b/fs/xfs/xfs_super.c
index 47d239dcf3f4..455a575f101d 100644
--- a/fs/xfs/xfs_super.c
+++ b/fs/xfs/xfs_super.c
@@ -52,6 +52,7 @@
#include "xfs_reflink.h"
#include <linux/namei.h>
+#include <linux/dax.h>
#include <linux/init.h>
#include <linux/slab.h>
#include <linux/mount.h>