From 4e9906e7985b962ca3b9f8ab66c0353e6e3ab45e Mon Sep 17 00:00:00 2001
From: "Yan, Zheng" <zyan@redhat.com>
Date: Wed, 25 Apr 2018 17:14:05 +0800
Subject: ceph: use bit flags to define vxattr attributes

Signed-off-by: "Yan, Zheng" <zyan@redhat.com>
Acked-by: Jeff Layton <jlayton@redhat.com>
Signed-off-by: Ilya Dryomov <idryomov@gmail.com>
---
 fs/ceph/xattr.c | 34 ++++++++++++++++------------------
 1 file changed, 16 insertions(+), 18 deletions(-)

(limited to 'fs/ceph')

diff --git a/fs/ceph/xattr.c b/fs/ceph/xattr.c
index 315f7e63e7cc..f7dcafb7c5d4 100644
--- a/fs/ceph/xattr.c
+++ b/fs/ceph/xattr.c
@@ -50,10 +50,13 @@ struct ceph_vxattr {
 	size_t name_size;	/* strlen(name) + 1 (for '\0') */
 	size_t (*getxattr_cb)(struct ceph_inode_info *ci, char *val,
 			      size_t size);
-	bool readonly, hidden;
 	bool (*exists_cb)(struct ceph_inode_info *ci);
+	unsigned int flags;
 };
 
+#define VXATTR_FLAG_READONLY		(1<<0)
+#define VXATTR_FLAG_HIDDEN		(1<<1)
+
 /* layouts */
 
 static bool ceph_vxattrcb_layout_exists(struct ceph_inode_info *ci)
@@ -267,27 +270,24 @@ static size_t ceph_vxattrcb_quota_max_files(struct ceph_inode_info *ci,
 		.name = CEPH_XATTR_NAME(_type, _name),			\
 		.name_size = sizeof (CEPH_XATTR_NAME(_type, _name)), \
 		.getxattr_cb = ceph_vxattrcb_ ## _type ## _ ## _name, \
-		.readonly = true,				\
-		.hidden = false,				\
-		.exists_cb = NULL,			\
+		.exists_cb = NULL,				\
+		.flags = VXATTR_FLAG_READONLY,			\
 	}
 #define XATTR_LAYOUT_FIELD(_type, _name, _field)			\
 	{								\
 		.name = CEPH_XATTR_NAME2(_type, _name, _field),	\
 		.name_size = sizeof (CEPH_XATTR_NAME2(_type, _name, _field)), \
 		.getxattr_cb = ceph_vxattrcb_ ## _name ## _ ## _field, \
-		.readonly = false,				\
-		.hidden = true,			\
 		.exists_cb = ceph_vxattrcb_layout_exists,	\
+		.flags = VXATTR_FLAG_HIDDEN,			\
 	}
 #define XATTR_QUOTA_FIELD(_type, _name)					\
 	{								\
 		.name = CEPH_XATTR_NAME(_type, _name),			\
 		.name_size = sizeof(CEPH_XATTR_NAME(_type, _name)),	\
 		.getxattr_cb = ceph_vxattrcb_ ## _type ## _ ## _name,	\
-		.readonly = false,					\
-		.hidden = true,						\
 		.exists_cb = ceph_vxattrcb_quota_exists,		\
+		.flags = VXATTR_FLAG_HIDDEN,				\
 	}
 
 static struct ceph_vxattr ceph_dir_vxattrs[] = {
@@ -295,9 +295,8 @@ static struct ceph_vxattr ceph_dir_vxattrs[] = {
 		.name = "ceph.dir.layout",
 		.name_size = sizeof("ceph.dir.layout"),
 		.getxattr_cb = ceph_vxattrcb_layout,
-		.readonly = false,
-		.hidden = true,
 		.exists_cb = ceph_vxattrcb_layout_exists,
+		.flags = VXATTR_FLAG_HIDDEN,
 	},
 	XATTR_LAYOUT_FIELD(dir, layout, stripe_unit),
 	XATTR_LAYOUT_FIELD(dir, layout, stripe_count),
@@ -316,9 +315,8 @@ static struct ceph_vxattr ceph_dir_vxattrs[] = {
 		.name = "ceph.quota",
 		.name_size = sizeof("ceph.quota"),
 		.getxattr_cb = ceph_vxattrcb_quota,
-		.readonly = false,
-		.hidden = true,
 		.exists_cb = ceph_vxattrcb_quota_exists,
+		.flags = VXATTR_FLAG_HIDDEN,
 	},
 	XATTR_QUOTA_FIELD(quota, max_bytes),
 	XATTR_QUOTA_FIELD(quota, max_files),
@@ -333,9 +331,8 @@ static struct ceph_vxattr ceph_file_vxattrs[] = {
 		.name = "ceph.file.layout",
 		.name_size = sizeof("ceph.file.layout"),
 		.getxattr_cb = ceph_vxattrcb_layout,
-		.readonly = false,
-		.hidden = true,
 		.exists_cb = ceph_vxattrcb_layout_exists,
+		.flags = VXATTR_FLAG_HIDDEN,
 	},
 	XATTR_LAYOUT_FIELD(file, layout, stripe_unit),
 	XATTR_LAYOUT_FIELD(file, layout, stripe_count),
@@ -374,9 +371,10 @@ static size_t __init vxattrs_name_size(struct ceph_vxattr *vxattrs)
 	struct ceph_vxattr *vxattr;
 	size_t size = 0;
 
-	for (vxattr = vxattrs; vxattr->name; vxattr++)
-		if (!vxattr->hidden)
+	for (vxattr = vxattrs; vxattr->name; vxattr++) {
+		if (!(vxattr->flags & VXATTR_FLAG_HIDDEN))
 			size += vxattr->name_size;
+	}
 
 	return size;
 }
@@ -919,7 +917,7 @@ ssize_t ceph_listxattr(struct dentry *dentry, char *names, size_t size)
 	err = namelen;
 	if (vxattrs) {
 		for (i = 0; vxattrs[i].name; i++) {
-			if (!vxattrs[i].hidden &&
+			if (!(vxattrs[i].flags & VXATTR_FLAG_HIDDEN) &&
 			    !(vxattrs[i].exists_cb &&
 			      !vxattrs[i].exists_cb(ci))) {
 				len = sprintf(names, "%s", vxattrs[i].name);
@@ -1024,7 +1022,7 @@ int __ceph_setxattr(struct inode *inode, const char *name,
 
 	vxattr = ceph_match_vxattr(inode, name);
 	if (vxattr) {
-		if (vxattr->readonly)
+		if (vxattr->flags & VXATTR_FLAG_READONLY)
 			return -EOPNOTSUPP;
 		if (value && !strncmp(vxattr->name, "ceph.quota", 10))
 			check_realm = true;
-- 
cgit v1.2.3


From 49a9f4f6714ec0ca2c6ada2ce764fbdd694962ee Mon Sep 17 00:00:00 2001
From: "Yan, Zheng" <zyan@redhat.com>
Date: Wed, 25 Apr 2018 17:30:23 +0800
Subject: ceph: always get rstat from auth mds

rstat is not tracked by capabilities, so the client can't know whether
rstat from a non-auth MDS is up to date or not.

Link: http://tracker.ceph.com/issues/23538
Signed-off-by: "Yan, Zheng" <zyan@redhat.com>
Signed-off-by: Ilya Dryomov <idryomov@gmail.com>
---
 fs/ceph/caps.c               |  2 ++
 fs/ceph/inode.c              | 21 +++++++++++++++------
 fs/ceph/xattr.c              | 30 ++++++++++++++++++------------
 include/linux/ceph/ceph_fs.h |  1 +
 4 files changed, 36 insertions(+), 18 deletions(-)

(limited to 'fs/ceph')

diff --git a/fs/ceph/caps.c b/fs/ceph/caps.c
index 23dbfae16156..1b9f611c9dfe 100644
--- a/fs/ceph/caps.c
+++ b/fs/ceph/caps.c
@@ -69,6 +69,8 @@ static char *gcap_string(char *s, int c)
 		*s++ = 'w';
 	if (c & CEPH_CAP_GBUFFER)
 		*s++ = 'b';
+	if (c & CEPH_CAP_GWREXTEND)
+		*s++ = 'a';
 	if (c & CEPH_CAP_GLAZYIO)
 		*s++ = 'l';
 	return s;
diff --git a/fs/ceph/inode.c b/fs/ceph/inode.c
index ae056927080d..ec9441c2403b 100644
--- a/fs/ceph/inode.c
+++ b/fs/ceph/inode.c
@@ -854,6 +854,18 @@ static int fill_inode(struct inode *inode, struct page *locked_page,
 		}
 	}
 
+	/* layout and rstat are not tracked by capability, update them if
+	 * the inode info is from auth mds */
+	if (new_version || (info->cap.flags & CEPH_CAP_FLAG_AUTH)) {
+		if (S_ISDIR(inode->i_mode)) {
+			ci->i_dir_layout = iinfo->dir_layout;
+			ci->i_rbytes = le64_to_cpu(info->rbytes);
+			ci->i_rfiles = le64_to_cpu(info->rfiles);
+			ci->i_rsubdirs = le64_to_cpu(info->rsubdirs);
+			ceph_decode_timespec(&ci->i_rctime, &info->rctime);
+		}
+	}
+
 	/* xattrs */
 	/* note that if i_xattrs.len <= 4, i_xattrs.data will still be NULL. */
 	if ((ci->i_xattrs.version == 0 || !(issued & CEPH_CAP_XATTR_EXCL))  &&
@@ -919,14 +931,9 @@ static int fill_inode(struct inode *inode, struct page *locked_page,
 		inode->i_op = &ceph_dir_iops;
 		inode->i_fop = &ceph_dir_fops;
 
-		ci->i_dir_layout = iinfo->dir_layout;
 
 		ci->i_files = le64_to_cpu(info->files);
 		ci->i_subdirs = le64_to_cpu(info->subdirs);
-		ci->i_rbytes = le64_to_cpu(info->rbytes);
-		ci->i_rfiles = le64_to_cpu(info->rfiles);
-		ci->i_rsubdirs = le64_to_cpu(info->rsubdirs);
-		ceph_decode_timespec(&ci->i_rctime, &info->rctime);
 		break;
 	default:
 		pr_err("fill_inode %llx.%llx BAD mode 0%o\n",
@@ -2178,6 +2185,7 @@ int __ceph_do_getattr(struct inode *inode, struct page *locked_page,
 	struct ceph_fs_client *fsc = ceph_sb_to_client(inode->i_sb);
 	struct ceph_mds_client *mdsc = fsc->mdsc;
 	struct ceph_mds_request *req;
+	int mode;
 	int err;
 
 	if (ceph_snap(inode) == CEPH_SNAPDIR) {
@@ -2190,7 +2198,8 @@ int __ceph_do_getattr(struct inode *inode, struct page *locked_page,
 	if (!force && ceph_caps_issued_mask(ceph_inode(inode), mask, 1))
 		return 0;
 
-	req = ceph_mdsc_create_request(mdsc, CEPH_MDS_OP_GETATTR, USE_ANY_MDS);
+	mode = (mask & CEPH_STAT_RSTAT) ? USE_AUTH_MDS : USE_ANY_MDS;
+	req = ceph_mdsc_create_request(mdsc, CEPH_MDS_OP_GETATTR, mode);
 	if (IS_ERR(req))
 		return PTR_ERR(req);
 	req->r_inode = inode;
diff --git a/fs/ceph/xattr.c b/fs/ceph/xattr.c
index f7dcafb7c5d4..5bc8edb4c2a6 100644
--- a/fs/ceph/xattr.c
+++ b/fs/ceph/xattr.c
@@ -56,6 +56,7 @@ struct ceph_vxattr {
 
 #define VXATTR_FLAG_READONLY		(1<<0)
 #define VXATTR_FLAG_HIDDEN		(1<<1)
+#define VXATTR_FLAG_RSTAT		(1<<2)
 
 /* layouts */
 
@@ -265,14 +266,16 @@ static size_t ceph_vxattrcb_quota_max_files(struct ceph_inode_info *ci,
 #define CEPH_XATTR_NAME2(_type, _name, _name2)	\
 	XATTR_CEPH_PREFIX #_type "." #_name "." #_name2
 
-#define XATTR_NAME_CEPH(_type, _name)					\
+#define XATTR_NAME_CEPH(_type, _name, _flags)				\
 	{								\
 		.name = CEPH_XATTR_NAME(_type, _name),			\
 		.name_size = sizeof (CEPH_XATTR_NAME(_type, _name)), \
 		.getxattr_cb = ceph_vxattrcb_ ## _type ## _ ## _name, \
-		.exists_cb = NULL,				\
-		.flags = VXATTR_FLAG_READONLY,			\
+		.exists_cb = NULL,					\
+		.flags = (VXATTR_FLAG_READONLY | _flags),		\
 	}
+#define XATTR_RSTAT_FIELD(_type, _name)			\
+	XATTR_NAME_CEPH(_type, _name, VXATTR_FLAG_RSTAT)
 #define XATTR_LAYOUT_FIELD(_type, _name, _field)			\
 	{								\
 		.name = CEPH_XATTR_NAME2(_type, _name, _field),	\
@@ -303,14 +306,14 @@ static struct ceph_vxattr ceph_dir_vxattrs[] = {
 	XATTR_LAYOUT_FIELD(dir, layout, object_size),
 	XATTR_LAYOUT_FIELD(dir, layout, pool),
 	XATTR_LAYOUT_FIELD(dir, layout, pool_namespace),
-	XATTR_NAME_CEPH(dir, entries),
-	XATTR_NAME_CEPH(dir, files),
-	XATTR_NAME_CEPH(dir, subdirs),
-	XATTR_NAME_CEPH(dir, rentries),
-	XATTR_NAME_CEPH(dir, rfiles),
-	XATTR_NAME_CEPH(dir, rsubdirs),
-	XATTR_NAME_CEPH(dir, rbytes),
-	XATTR_NAME_CEPH(dir, rctime),
+	XATTR_NAME_CEPH(dir, entries, 0),
+	XATTR_NAME_CEPH(dir, files, 0),
+	XATTR_NAME_CEPH(dir, subdirs, 0),
+	XATTR_RSTAT_FIELD(dir, rentries),
+	XATTR_RSTAT_FIELD(dir, rfiles),
+	XATTR_RSTAT_FIELD(dir, rsubdirs),
+	XATTR_RSTAT_FIELD(dir, rbytes),
+	XATTR_RSTAT_FIELD(dir, rctime),
 	{
 		.name = "ceph.quota",
 		.name_size = sizeof("ceph.quota"),
@@ -807,7 +810,10 @@ ssize_t __ceph_getxattr(struct inode *inode, const char *name, void *value,
 	/* let's see if a virtual xattr was requested */
 	vxattr = ceph_match_vxattr(inode, name);
 	if (vxattr) {
-		err = ceph_do_getattr(inode, 0, true);
+		int mask = 0;
+		if (vxattr->flags & VXATTR_FLAG_RSTAT)
+			mask |= CEPH_STAT_RSTAT;
+		err = ceph_do_getattr(inode, mask, true);
 		if (err)
 			return err;
 		err = -ENODATA;
diff --git a/include/linux/ceph/ceph_fs.h b/include/linux/ceph/ceph_fs.h
index 7ecfc88314d8..4903deb0777a 100644
--- a/include/linux/ceph/ceph_fs.h
+++ b/include/linux/ceph/ceph_fs.h
@@ -628,6 +628,7 @@ int ceph_flags_to_mode(int flags);
 				 CEPH_CAP_XATTR_SHARED)
 #define CEPH_STAT_CAP_INLINE_DATA (CEPH_CAP_FILE_SHARED | \
 				   CEPH_CAP_FILE_RD)
+#define CEPH_STAT_RSTAT CEPH_CAP_FILE_WREXTEND
 
 #define CEPH_CAP_ANY_SHARED (CEPH_CAP_AUTH_SHARED |			\
 			      CEPH_CAP_LINK_SHARED |			\
-- 
cgit v1.2.3


From 2af54a72b585a7cc46fb28845a121635c2540563 Mon Sep 17 00:00:00 2001
From: "Yan, Zheng" <zyan@redhat.com>
Date: Fri, 27 Apr 2018 11:14:39 +0800
Subject: ceph: update i_files/i_subdirs only when Fs cap is issued

In MDS, file/subdir counts of a directory inode are protected by
filelock. In request reply without Fs cap, nfiles/nsubdirs can be
stale.

Signed-off-by: "Yan, Zheng" <zyan@redhat.com>
Signed-off-by: Ilya Dryomov <idryomov@gmail.com>
---
 fs/ceph/inode.c | 35 ++++++++++++++++++-----------------
 1 file changed, 18 insertions(+), 17 deletions(-)

(limited to 'fs/ceph')

diff --git a/fs/ceph/inode.c b/fs/ceph/inode.c
index ec9441c2403b..4712c943cdf7 100644
--- a/fs/ceph/inode.c
+++ b/fs/ceph/inode.c
@@ -739,7 +739,7 @@ static int fill_inode(struct inode *inode, struct page *locked_page,
 	struct ceph_mds_client *mdsc = ceph_inode_to_client(inode)->mdsc;
 	struct ceph_mds_reply_inode *info = iinfo->in;
 	struct ceph_inode_info *ci = ceph_inode(inode);
-	int issued = 0, implemented, new_issued;
+	int issued, new_issued, info_caps;
 	struct timespec mtime, atime, ctime;
 	struct ceph_buffer *xattr_blob = NULL;
 	struct ceph_string *pool_ns = NULL;
@@ -754,8 +754,10 @@ static int fill_inode(struct inode *inode, struct page *locked_page,
 	     inode, ceph_vinop(inode), le64_to_cpu(info->version),
 	     ci->i_version);
 
+	info_caps = le32_to_cpu(info->cap.caps);
+
 	/* prealloc new cap struct */
-	if (info->cap.caps && ceph_snap(inode) == CEPH_NOSNAP)
+	if (info_caps && ceph_snap(inode) == CEPH_NOSNAP)
 		new_cap = ceph_get_cap(mdsc, caps_reservation);
 
 	/*
@@ -792,9 +794,9 @@ static int fill_inode(struct inode *inode, struct page *locked_page,
 	     le64_to_cpu(info->version) > (ci->i_version & ~1)))
 		new_version = true;
 
-	issued = __ceph_caps_issued(ci, &implemented);
-	issued |= implemented | __ceph_caps_dirty(ci);
-	new_issued = ~issued & le32_to_cpu(info->cap.caps);
+	__ceph_caps_issued(ci, &issued);
+	issued |= __ceph_caps_dirty(ci);
+	new_issued = ~issued & info_caps;
 
 	/* update inode */
 	inode->i_rdev = le32_to_cpu(info->rdev);
@@ -826,6 +828,11 @@ static int fill_inode(struct inode *inode, struct page *locked_page,
 				&ctime, &mtime, &atime);
 	}
 
+	if (new_version || (info_caps & CEPH_CAP_FILE_SHARED)) {
+		ci->i_files = le64_to_cpu(info->files);
+		ci->i_subdirs = le64_to_cpu(info->subdirs);
+	}
+
 	if (new_version ||
 	    (new_issued & (CEPH_CAP_ANY_FILE_RD | CEPH_CAP_ANY_FILE_WR))) {
 		s64 old_pool = ci->i_layout.pool_id;
@@ -930,10 +937,6 @@ static int fill_inode(struct inode *inode, struct page *locked_page,
 	case S_IFDIR:
 		inode->i_op = &ceph_dir_iops;
 		inode->i_fop = &ceph_dir_fops;
-
-
-		ci->i_files = le64_to_cpu(info->files);
-		ci->i_subdirs = le64_to_cpu(info->subdirs);
 		break;
 	default:
 		pr_err("fill_inode %llx.%llx BAD mode 0%o\n",
@@ -941,12 +944,11 @@ static int fill_inode(struct inode *inode, struct page *locked_page,
 	}
 
 	/* were we issued a capability? */
-	if (info->cap.caps) {
+	if (info_caps) {
 		if (ceph_snap(inode) == CEPH_NOSNAP) {
-			unsigned caps = le32_to_cpu(info->cap.caps);
 			ceph_add_cap(inode, session,
 				     le64_to_cpu(info->cap.cap_id),
-				     cap_fmode, caps,
+				     cap_fmode, info_caps,
 				     le32_to_cpu(info->cap.wanted),
 				     le32_to_cpu(info->cap.seq),
 				     le32_to_cpu(info->cap.mseq),
@@ -956,7 +958,7 @@ static int fill_inode(struct inode *inode, struct page *locked_page,
 			/* set dir completion flag? */
 			if (S_ISDIR(inode->i_mode) &&
 			    ci->i_files == 0 && ci->i_subdirs == 0 &&
-			    (caps & CEPH_CAP_FILE_SHARED) &&
+			    (info_caps & CEPH_CAP_FILE_SHARED) &&
 			    (issued & CEPH_CAP_FILE_EXCL) == 0 &&
 			    !__ceph_dir_is_complete(ci)) {
 				dout(" marking %p complete (empty)\n", inode);
@@ -969,8 +971,8 @@ static int fill_inode(struct inode *inode, struct page *locked_page,
 			wake = true;
 		} else {
 			dout(" %p got snap_caps %s\n", inode,
-			     ceph_cap_string(le32_to_cpu(info->cap.caps)));
-			ci->i_snap_caps |= le32_to_cpu(info->cap.caps);
+			     ceph_cap_string(info_caps));
+			ci->i_snap_caps |= info_caps;
 			if (cap_fmode >= 0)
 				__ceph_get_fmode(ci, cap_fmode);
 		}
@@ -985,8 +987,7 @@ static int fill_inode(struct inode *inode, struct page *locked_page,
 		int cache_caps = CEPH_CAP_FILE_CACHE | CEPH_CAP_FILE_LAZYIO;
 		ci->i_inline_version = iinfo->inline_version;
 		if (ci->i_inline_version != CEPH_INLINE_NONE &&
-		    (locked_page ||
-		     (le32_to_cpu(info->cap.caps) & cache_caps)))
+		    (locked_page || (info_caps & cache_caps)))
 			fill_inline = true;
 	}
 
-- 
cgit v1.2.3


From a1c6b8358171c16db0f858a7fbb28aa574b07c09 Mon Sep 17 00:00:00 2001
From: "Yan, Zheng" <zyan@redhat.com>
Date: Fri, 27 Apr 2018 10:29:44 +0800
Subject: ceph: define argument structure for handle_cap_grant

The data structure includes the versioned fields of the cap message.

Signed-off-by: "Yan, Zheng" <zyan@redhat.com>
Signed-off-by: Ilya Dryomov <idryomov@gmail.com>
---
 fs/ceph/caps.c | 115 ++++++++++++++++++++++++++++++---------------------------
 1 file changed, 61 insertions(+), 54 deletions(-)

(limited to 'fs/ceph')

diff --git a/fs/ceph/caps.c b/fs/ceph/caps.c
index 1b9f611c9dfe..de7b7a34195e 100644
--- a/fs/ceph/caps.c
+++ b/fs/ceph/caps.c
@@ -3024,24 +3024,32 @@ static void invalidate_aliases(struct inode *inode)
 		dput(prev);
 }
 
+struct cap_extra_info {
+	struct ceph_string *pool_ns;
+	/* inline data */
+	u64 inline_version;
+	void *inline_data;
+	u32 inline_len;
+	/* currently issued */
+	int issued;
+};
+
 /*
  * Handle a cap GRANT message from the MDS.  (Note that a GRANT may
  * actually be a revocation if it specifies a smaller cap set.)
  *
  * caller holds s_mutex and i_ceph_lock, we drop both.
  */
-static void handle_cap_grant(struct ceph_mds_client *mdsc,
-			     struct inode *inode, struct ceph_mds_caps *grant,
-			     struct ceph_string **pns, u64 inline_version,
-			     void *inline_data, u32 inline_len,
-			     struct ceph_buffer *xattr_buf,
+static void handle_cap_grant(struct inode *inode,
 			     struct ceph_mds_session *session,
-			     struct ceph_cap *cap, int issued)
+			     struct ceph_cap *cap,
+			     struct ceph_mds_caps *grant,
+			     struct ceph_buffer *xattr_buf,
+			     struct cap_extra_info *extra_info)
 	__releases(ci->i_ceph_lock)
-	__releases(mdsc->snap_rwsem)
+	__releases(session->s_mdsc->snap_rwsem)
 {
 	struct ceph_inode_info *ci = ceph_inode(inode);
-	int mds = session->s_mds;
 	int seq = le32_to_cpu(grant->seq);
 	int newcaps = le32_to_cpu(grant->caps);
 	int used, wanted, dirty;
@@ -3057,7 +3065,7 @@ static void handle_cap_grant(struct ceph_mds_client *mdsc,
 	bool fill_inline = false;
 
 	dout("handle_cap_grant inode %p cap %p mds%d seq %d %s\n",
-	     inode, cap, mds, seq, ceph_cap_string(newcaps));
+	     inode, cap, session->s_mds, seq, ceph_cap_string(newcaps));
 	dout(" size %llu max_size %llu, i_size %llu\n", size, max_size,
 		inode->i_size);
 
@@ -3103,7 +3111,7 @@ static void handle_cap_grant(struct ceph_mds_client *mdsc,
 	__check_cap_issue(ci, cap, newcaps);
 
 	if ((newcaps & CEPH_CAP_AUTH_SHARED) &&
-	    (issued & CEPH_CAP_AUTH_EXCL) == 0) {
+	    (extra_info->issued & CEPH_CAP_AUTH_EXCL) == 0) {
 		inode->i_mode = le32_to_cpu(grant->mode);
 		inode->i_uid = make_kuid(&init_user_ns, le32_to_cpu(grant->uid));
 		inode->i_gid = make_kgid(&init_user_ns, le32_to_cpu(grant->gid));
@@ -3113,14 +3121,15 @@ static void handle_cap_grant(struct ceph_mds_client *mdsc,
 	}
 
 	if ((newcaps & CEPH_CAP_AUTH_SHARED) &&
-	    (issued & CEPH_CAP_LINK_EXCL) == 0) {
+	    (extra_info->issued & CEPH_CAP_LINK_EXCL) == 0) {
 		set_nlink(inode, le32_to_cpu(grant->nlink));
 		if (inode->i_nlink == 0 &&
 		    (newcaps & (CEPH_CAP_LINK_SHARED | CEPH_CAP_LINK_EXCL)))
 			deleted_inode = true;
 	}
 
-	if ((issued & CEPH_CAP_XATTR_EXCL) == 0 && grant->xattr_len) {
+	if ((extra_info->issued & CEPH_CAP_XATTR_EXCL) == 0 &&
+	    grant->xattr_len) {
 		int len = le32_to_cpu(grant->xattr_len);
 		u64 version = le64_to_cpu(grant->xattr_version);
 
@@ -3140,7 +3149,7 @@ static void handle_cap_grant(struct ceph_mds_client *mdsc,
 		ceph_decode_timespec(&mtime, &grant->mtime);
 		ceph_decode_timespec(&atime, &grant->atime);
 		ceph_decode_timespec(&ctime, &grant->ctime);
-		ceph_fill_file_time(inode, issued,
+		ceph_fill_file_time(inode, extra_info->issued,
 				    le32_to_cpu(grant->time_warp_seq),
 				    &ctime, &mtime, &atime);
 	}
@@ -3153,15 +3162,16 @@ static void handle_cap_grant(struct ceph_mds_client *mdsc,
 		ceph_file_layout_from_legacy(&ci->i_layout, &grant->layout);
 		old_ns = rcu_dereference_protected(ci->i_layout.pool_ns,
 					lockdep_is_held(&ci->i_ceph_lock));
-		rcu_assign_pointer(ci->i_layout.pool_ns, *pns);
+		rcu_assign_pointer(ci->i_layout.pool_ns, extra_info->pool_ns);
 
-		if (ci->i_layout.pool_id != old_pool || *pns != old_ns)
+		if (ci->i_layout.pool_id != old_pool ||
+		    extra_info->pool_ns != old_ns)
 			ci->i_ceph_flags &= ~CEPH_I_POOL_PERM;
 
-		*pns = old_ns;
+		extra_info->pool_ns = old_ns;
 
 		/* size/truncate_seq? */
-		queue_trunc = ceph_fill_file_size(inode, issued,
+		queue_trunc = ceph_fill_file_size(inode, extra_info->issued,
 					le32_to_cpu(grant->truncate_seq),
 					le64_to_cpu(grant->truncate_size),
 					size);
@@ -3240,24 +3250,26 @@ static void handle_cap_grant(struct ceph_mds_client *mdsc,
 	}
 	BUG_ON(cap->issued & ~cap->implemented);
 
-	if (inline_version > 0 && inline_version >= ci->i_inline_version) {
-		ci->i_inline_version = inline_version;
+	if (extra_info->inline_version > 0 &&
+	    extra_info->inline_version >= ci->i_inline_version) {
+		ci->i_inline_version = extra_info->inline_version;
 		if (ci->i_inline_version != CEPH_INLINE_NONE &&
 		    (newcaps & (CEPH_CAP_FILE_CACHE|CEPH_CAP_FILE_LAZYIO)))
 			fill_inline = true;
 	}
 
 	if (le32_to_cpu(grant->op) == CEPH_CAP_OP_IMPORT) {
-		if (newcaps & ~issued)
+		if (newcaps & ~extra_info->issued)
 			wake = true;
-		kick_flushing_inode_caps(mdsc, session, inode);
-		up_read(&mdsc->snap_rwsem);
+		kick_flushing_inode_caps(session->s_mdsc, session, inode);
+		up_read(&session->s_mdsc->snap_rwsem);
 	} else {
 		spin_unlock(&ci->i_ceph_lock);
 	}
 
 	if (fill_inline)
-		ceph_fill_inline_data(inode, NULL, inline_data, inline_len);
+		ceph_fill_inline_data(inode, NULL, extra_info->inline_data,
+				      extra_info->inline_len);
 
 	if (queue_trunc)
 		ceph_queue_vmtruncate(inode);
@@ -3722,31 +3734,24 @@ void ceph_handle_caps(struct ceph_mds_session *session,
 		      struct ceph_msg *msg)
 {
 	struct ceph_mds_client *mdsc = session->s_mdsc;
-	struct super_block *sb = mdsc->fsc->sb;
 	struct inode *inode;
 	struct ceph_inode_info *ci;
 	struct ceph_cap *cap;
 	struct ceph_mds_caps *h;
 	struct ceph_mds_cap_peer *peer = NULL;
 	struct ceph_snap_realm *realm = NULL;
-	struct ceph_string *pool_ns = NULL;
-	int mds = session->s_mds;
-	int op, issued;
+	int op;
 	u32 seq, mseq;
 	struct ceph_vino vino;
-	u64 tid;
-	u64 inline_version = 0;
-	void *inline_data = NULL;
-	u32  inline_len = 0;
 	void *snaptrace;
 	size_t snaptrace_len;
 	void *p, *end;
+	struct cap_extra_info extra_info = {};
 
-	dout("handle_caps from mds%d\n", mds);
+	dout("handle_caps from mds%d\n", session->s_mds);
 
 	/* decode */
 	end = msg->front.iov_base + msg->front.iov_len;
-	tid = le64_to_cpu(msg->hdr.tid);
 	if (msg->front.iov_len < sizeof(*h))
 		goto bad;
 	h = msg->front.iov_base;
@@ -3781,12 +3786,12 @@ void ceph_handle_caps(struct ceph_mds_session *session,
 	}
 
 	if (le16_to_cpu(msg->hdr.version) >= 4) {
-		ceph_decode_64_safe(&p, end, inline_version, bad);
-		ceph_decode_32_safe(&p, end, inline_len, bad);
-		if (p + inline_len > end)
+		ceph_decode_64_safe(&p, end, extra_info.inline_version, bad);
+		ceph_decode_32_safe(&p, end, extra_info.inline_len, bad);
+		if (p + extra_info.inline_len > end)
 			goto bad;
-		inline_data = p;
-		p += inline_len;
+		extra_info.inline_data = p;
+		p += extra_info.inline_len;
 	}
 
 	if (le16_to_cpu(msg->hdr.version) >= 5) {
@@ -3811,13 +3816,14 @@ void ceph_handle_caps(struct ceph_mds_session *session,
 		ceph_decode_32_safe(&p, end, pool_ns_len, bad);
 		if (pool_ns_len > 0) {
 			ceph_decode_need(&p, end, pool_ns_len, bad);
-			pool_ns = ceph_find_or_create_string(p, pool_ns_len);
+			extra_info.pool_ns =
+				ceph_find_or_create_string(p, pool_ns_len);
 			p += pool_ns_len;
 		}
 	}
 
 	/* lookup ino */
-	inode = ceph_find_inode(sb, vino);
+	inode = ceph_find_inode(mdsc->fsc->sb, vino);
 	ci = ceph_inode(inode);
 	dout(" op %s ino %llx.%llx inode %p\n", ceph_cap_op_name(op), vino.ino,
 	     vino.snap, inode);
@@ -3850,7 +3856,8 @@ void ceph_handle_caps(struct ceph_mds_session *session,
 	/* these will work even if we don't have a cap yet */
 	switch (op) {
 	case CEPH_CAP_OP_FLUSHSNAP_ACK:
-		handle_cap_flushsnap_ack(inode, tid, h, session);
+		handle_cap_flushsnap_ack(inode, le64_to_cpu(msg->hdr.tid),
+					 h, session);
 		goto done;
 
 	case CEPH_CAP_OP_EXPORT:
@@ -3869,10 +3876,9 @@ void ceph_handle_caps(struct ceph_mds_session *session,
 			down_read(&mdsc->snap_rwsem);
 		}
 		handle_cap_import(mdsc, inode, h, peer, session,
-				  &cap, &issued);
-		handle_cap_grant(mdsc, inode, h, &pool_ns,
-				 inline_version, inline_data, inline_len,
-				 msg->middle, session, cap, issued);
+				  &cap, &extra_info.issued);
+		handle_cap_grant(inode, session, cap,
+				 h, msg->middle, &extra_info);
 		if (realm)
 			ceph_put_snap_realm(mdsc, realm);
 		goto done_unlocked;
@@ -3880,10 +3886,11 @@ void ceph_handle_caps(struct ceph_mds_session *session,
 
 	/* the rest require a cap */
 	spin_lock(&ci->i_ceph_lock);
-	cap = __get_cap_for_mds(ceph_inode(inode), mds);
+	cap = __get_cap_for_mds(ceph_inode(inode), session->s_mds);
 	if (!cap) {
 		dout(" no cap on %p ino %llx.%llx from mds%d\n",
-		     inode, ceph_ino(inode), ceph_snap(inode), mds);
+		     inode, ceph_ino(inode), ceph_snap(inode),
+		     session->s_mds);
 		spin_unlock(&ci->i_ceph_lock);
 		goto flush_cap_releases;
 	}
@@ -3892,15 +3899,15 @@ void ceph_handle_caps(struct ceph_mds_session *session,
 	switch (op) {
 	case CEPH_CAP_OP_REVOKE:
 	case CEPH_CAP_OP_GRANT:
-		__ceph_caps_issued(ci, &issued);
-		issued |= __ceph_caps_dirty(ci);
-		handle_cap_grant(mdsc, inode, h, &pool_ns,
-				 inline_version, inline_data, inline_len,
-				 msg->middle, session, cap, issued);
+		__ceph_caps_issued(ci, &extra_info.issued);
+		extra_info.issued |= __ceph_caps_dirty(ci);
+		handle_cap_grant(inode, session, cap,
+				 h, msg->middle, &extra_info);
 		goto done_unlocked;
 
 	case CEPH_CAP_OP_FLUSH_ACK:
-		handle_cap_flush_ack(inode, tid, h, session, cap);
+		handle_cap_flush_ack(inode, le64_to_cpu(msg->hdr.tid),
+				     h, session, cap);
 		break;
 
 	case CEPH_CAP_OP_TRUNC:
@@ -3927,7 +3934,7 @@ done:
 	mutex_unlock(&session->s_mutex);
 done_unlocked:
 	iput(inode);
-	ceph_put_string(pool_ns);
+	ceph_put_string(extra_info.pool_ns);
 	return;
 
 bad:
-- 
cgit v1.2.3


From 4985d6f9e50fa48e35a9dbe1726434f987305cae Mon Sep 17 00:00:00 2001
From: "Yan, Zheng" <zyan@redhat.com>
Date: Fri, 27 Apr 2018 11:11:31 +0800
Subject: ceph: handle the new nfiles/nsubdirs fields in cap message

Without these new fields, a stale st_size is returned in the following
case.

1. MDS modifies a directory
2. MDS issues CEPH_CAP_ANY_SHARED to client
3. The client satisfies stat(2) from its cached metadata, setting st_size
   to "i_files + i_subdirs".

Link: http://tracker.ceph.com/issues/23855
Signed-off-by: "Yan, Zheng" <zyan@redhat.com>
Signed-off-by: Ilya Dryomov <idryomov@gmail.com>
---
 fs/ceph/caps.c | 39 ++++++++++++++++++++++++++++++++++-----
 1 file changed, 34 insertions(+), 5 deletions(-)

(limited to 'fs/ceph')

diff --git a/fs/ceph/caps.c b/fs/ceph/caps.c
index de7b7a34195e..477b822e6333 100644
--- a/fs/ceph/caps.c
+++ b/fs/ceph/caps.c
@@ -3030,6 +3030,10 @@ struct cap_extra_info {
 	u64 inline_version;
 	void *inline_data;
 	u32 inline_len;
+	/* dirstat */
+	bool dirstat_valid;
+	u64 nfiles;
+	u64 nsubdirs;
 	/* currently issued */
 	int issued;
 };
@@ -3154,6 +3158,11 @@ static void handle_cap_grant(struct inode *inode,
 				    &ctime, &mtime, &atime);
 	}
 
+	if ((newcaps & CEPH_CAP_FILE_SHARED) && extra_info->dirstat_valid) {
+		ci->i_files = extra_info->nfiles;
+		ci->i_subdirs = extra_info->nsubdirs;
+	}
+
 	if (newcaps & (CEPH_CAP_ANY_FILE_RD | CEPH_CAP_ANY_FILE_WR)) {
 		/* file layout may have changed */
 		s64 old_pool = ci->i_layout.pool_id;
@@ -3741,6 +3750,7 @@ void ceph_handle_caps(struct ceph_mds_session *session,
 	struct ceph_mds_cap_peer *peer = NULL;
 	struct ceph_snap_realm *realm = NULL;
 	int op;
+	int msg_version = le16_to_cpu(msg->hdr.version);
 	u32 seq, mseq;
 	struct ceph_vino vino;
 	void *snaptrace;
@@ -3765,7 +3775,7 @@ void ceph_handle_caps(struct ceph_mds_session *session,
 	snaptrace_len = le32_to_cpu(h->snap_trace_len);
 	p = snaptrace + snaptrace_len;
 
-	if (le16_to_cpu(msg->hdr.version) >= 2) {
+	if (msg_version >= 2) {
 		u32 flock_len;
 		ceph_decode_32_safe(&p, end, flock_len, bad);
 		if (p + flock_len > end)
@@ -3773,7 +3783,7 @@ void ceph_handle_caps(struct ceph_mds_session *session,
 		p += flock_len;
 	}
 
-	if (le16_to_cpu(msg->hdr.version) >= 3) {
+	if (msg_version >= 3) {
 		if (op == CEPH_CAP_OP_IMPORT) {
 			if (p + sizeof(*peer) > end)
 				goto bad;
@@ -3785,7 +3795,7 @@ void ceph_handle_caps(struct ceph_mds_session *session,
 		}
 	}
 
-	if (le16_to_cpu(msg->hdr.version) >= 4) {
+	if (msg_version >= 4) {
 		ceph_decode_64_safe(&p, end, extra_info.inline_version, bad);
 		ceph_decode_32_safe(&p, end, extra_info.inline_len, bad);
 		if (p + extra_info.inline_len > end)
@@ -3794,7 +3804,7 @@ void ceph_handle_caps(struct ceph_mds_session *session,
 		p += extra_info.inline_len;
 	}
 
-	if (le16_to_cpu(msg->hdr.version) >= 5) {
+	if (msg_version >= 5) {
 		struct ceph_osd_client	*osdc = &mdsc->fsc->client->osdc;
 		u32			epoch_barrier;
 
@@ -3802,7 +3812,7 @@ void ceph_handle_caps(struct ceph_mds_session *session,
 		ceph_osdc_update_epoch_barrier(osdc, epoch_barrier);
 	}
 
-	if (le16_to_cpu(msg->hdr.version) >= 8) {
+	if (msg_version >= 8) {
 		u64 flush_tid;
 		u32 caller_uid, caller_gid;
 		u32 pool_ns_len;
@@ -3822,6 +3832,25 @@ void ceph_handle_caps(struct ceph_mds_session *session,
 		}
 	}
 
+	if (msg_version >= 11) {
+		struct ceph_timespec *btime;
+		u64 change_attr;
+		u32 flags;
+
+		/* version >= 9 */
+		if (p + sizeof(*btime) > end)
+			goto bad;
+		btime = p;
+		p += sizeof(*btime);
+		ceph_decode_64_safe(&p, end, change_attr, bad);
+		/* version >= 10 */
+		ceph_decode_32_safe(&p, end, flags, bad);
+		/* version >= 11 */
+		extra_info.dirstat_valid = true;
+		ceph_decode_64_safe(&p, end, extra_info.nfiles, bad);
+		ceph_decode_64_safe(&p, end, extra_info.nsubdirs, bad);
+	}
+
 	/* lookup ino */
 	inode = ceph_find_inode(mdsc->fsc->sb, vino);
 	ci = ceph_inode(inode);
-- 
cgit v1.2.3


From 6dd4940ba5f96270ad428351cd88daf9ab871a97 Mon Sep 17 00:00:00 2001
From: Ilya Dryomov <idryomov@gmail.com>
Date: Thu, 3 May 2018 16:26:55 +0200
Subject: ceph: show wsize only if non-default

This is how it was before commit 95cca2b44e54 ("ceph: limit osd write
size") went in.

Signed-off-by: Ilya Dryomov <idryomov@gmail.com>
---
 fs/ceph/super.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'fs/ceph')

diff --git a/fs/ceph/super.c b/fs/ceph/super.c
index b33082e6878f..3c1155803444 100644
--- a/fs/ceph/super.c
+++ b/fs/ceph/super.c
@@ -551,7 +551,7 @@ static int ceph_show_options(struct seq_file *m, struct dentry *root)
 
 	if (fsopt->mds_namespace)
 		seq_show_option(m, "mds_namespace", fsopt->mds_namespace);
-	if (fsopt->wsize)
+	if (fsopt->wsize != CEPH_MAX_WRITE_SIZE)
 		seq_printf(m, ",wsize=%d", fsopt->wsize);
 	if (fsopt->rsize != CEPH_MAX_READ_SIZE)
 		seq_printf(m, ",rsize=%d", fsopt->rsize);
-- 
cgit v1.2.3


From 597817ddbbf27af5986d1f3df20390b2738411c6 Mon Sep 17 00:00:00 2001
From: "Yan, Zheng" <zyan@redhat.com>
Date: Tue, 15 May 2018 11:30:43 +0800
Subject: ceph: support file lock on directory

Link: http://tracker.ceph.com/issues/24028
Signed-off-by: "Yan, Zheng" <zyan@redhat.com>
Signed-off-by: Ilya Dryomov <idryomov@gmail.com>
---
 fs/ceph/dir.c | 2 ++
 1 file changed, 2 insertions(+)

(limited to 'fs/ceph')

diff --git a/fs/ceph/dir.c b/fs/ceph/dir.c
index 1a78dd6f8bf2..036ac0f3a393 100644
--- a/fs/ceph/dir.c
+++ b/fs/ceph/dir.c
@@ -1486,6 +1486,8 @@ const struct file_operations ceph_dir_fops = {
 	.release = ceph_release,
 	.unlocked_ioctl = ceph_ioctl,
 	.fsync = ceph_fsync,
+	.lock = ceph_lock,
+	.flock = ceph_flock,
 };
 
 const struct file_operations ceph_snapdir_fops = {
-- 
cgit v1.2.3


From 8c6286f1c69743ebdb2ee15f9165f9c4d44eef49 Mon Sep 17 00:00:00 2001
From: Luis Henriques <lhenriques@suse.com>
Date: Mon, 21 May 2018 10:27:29 +0100
Subject: ceph: fix st_nlink stat for directories

Currently, calling stat on a cephfs directory returns 1 for st_nlink.
This behaviour has recently changed in the fuse client, as some
applications seem to expect this value to be either 0 (if it's
unlinked) or 2 + number of subdirectories.  This behaviour was changed
in the fuse client with commit 67c7e4619188 ("client: use common
interp of st_nlink for dirs").

This patch modifies the kernel client to have a similar behaviour.

Link: https://tracker.ceph.com/issues/23873
Signed-off-by: Luis Henriques <lhenriques@suse.com>
Reviewed-by: "Yan, Zheng" <zyan@redhat.com>
Signed-off-by: Ilya Dryomov <idryomov@gmail.com>
---
 fs/ceph/inode.c | 8 ++++++++
 1 file changed, 8 insertions(+)

(limited to 'fs/ceph')

diff --git a/fs/ceph/inode.c b/fs/ceph/inode.c
index 4712c943cdf7..4aeccb13437b 100644
--- a/fs/ceph/inode.c
+++ b/fs/ceph/inode.c
@@ -2271,6 +2271,14 @@ int ceph_getattr(const struct path *path, struct kstat *stat,
 				stat->size = ci->i_files + ci->i_subdirs;
 			stat->blocks = 0;
 			stat->blksize = 65536;
+			/*
+			 * Some applications rely on the number of st_nlink
+			 * value on directories to be either 0 (if unlinked)
+			 * or 2 + number of subdirectories.
+			 */
+			if (stat->nlink == 1)
+				/* '.' + '..' + subdirs */
+				stat->nlink = 1 + 1 + ci->i_subdirs;
 		}
 	}
 	return err;
-- 
cgit v1.2.3


From 12b69d5f6fe4064147ddb7e7ea2d4fa4aea3eab5 Mon Sep 17 00:00:00 2001
From: "Yan, Zheng" <zyan@redhat.com>
Date: Fri, 11 May 2018 17:12:02 +0800
Subject: ceph: abort osd requests on force umount

This avoids force umount waiting on page writeback:

  io_schedule+0xd/0x30
  wait_on_page_bit_common+0xc6/0x130
  __filemap_fdatawait_range+0xbd/0x100
  filemap_fdatawait_keep_errors+0x15/0x40
  sync_inodes_sb+0x1cf/0x240
  sync_filesystem+0x52/0x90
  generic_shutdown_super+0x1d/0x110
  ceph_kill_sb+0x28/0x80 [ceph]
  deactivate_locked_super+0x35/0x60
  cleanup_mnt+0x36/0x70
  task_work_run+0x79/0xa0
  exit_to_usermode_loop+0x62/0x70
  do_syscall_64+0xdb/0xf0
  entry_SYSCALL_64_after_hwframe+0x44/0xa9
  0xffffffffffffffff

Signed-off-by: "Yan, Zheng" <zyan@redhat.com>
Signed-off-by: Ilya Dryomov <idryomov@gmail.com>
---
 fs/ceph/super.c | 1 +
 1 file changed, 1 insertion(+)

(limited to 'fs/ceph')

diff --git a/fs/ceph/super.c b/fs/ceph/super.c
index 3c1155803444..40664e13cc0f 100644
--- a/fs/ceph/super.c
+++ b/fs/ceph/super.c
@@ -793,6 +793,7 @@ static void ceph_umount_begin(struct super_block *sb)
 	if (!fsc)
 		return;
 	fsc->mount_state = CEPH_MOUNT_SHUTDOWN;
+	ceph_osdc_abort_requests(&fsc->client->osdc, -EIO);
 	ceph_mdsc_force_umount(fsc->mdsc);
 	return;
 }
-- 
cgit v1.2.3


From a57d9064e4ee4e9882b922d0627be3d426004c69 Mon Sep 17 00:00:00 2001
From: "Yan, Zheng" <zyan@redhat.com>
Date: Fri, 18 May 2018 16:05:51 +0800
Subject: ceph: flush pending works before shutdown super

Pending work items hold inode references, which causes the "Busy inodes
after unmount" warning.

Signed-off-by: "Yan, Zheng" <zyan@redhat.com>
Signed-off-by: Ilya Dryomov <idryomov@gmail.com>
---
 fs/ceph/super.c | 9 +++++++++
 1 file changed, 9 insertions(+)

(limited to 'fs/ceph')

diff --git a/fs/ceph/super.c b/fs/ceph/super.c
index 40664e13cc0f..a092cdb69288 100644
--- a/fs/ceph/super.c
+++ b/fs/ceph/super.c
@@ -674,6 +674,13 @@ fail:
 	return ERR_PTR(err);
 }
 
+static void flush_fs_workqueues(struct ceph_fs_client *fsc)
+{
+	flush_workqueue(fsc->wb_wq);
+	flush_workqueue(fsc->pg_inv_wq);
+	flush_workqueue(fsc->trunc_wq);
+}
+
 static void destroy_fs_client(struct ceph_fs_client *fsc)
 {
 	dout("destroy_fs_client %p\n", fsc);
@@ -1089,6 +1096,8 @@ static void ceph_kill_sb(struct super_block *s)
 	dout("kill_sb %p\n", s);
 
 	ceph_mdsc_pre_umount(fsc->mdsc);
+	flush_fs_workqueues(fsc);
+
 	generic_shutdown_super(s);
 
 	fsc->client->extra_mon_dispatch = NULL;
-- 
cgit v1.2.3


From c843d13caefad9f2f182f38d6bfe492c9f00e086 Mon Sep 17 00:00:00 2001
From: Ilya Dryomov <idryomov@gmail.com>
Date: Wed, 30 May 2018 16:29:14 +0200
Subject: libceph: make abort_on_full a per-osdc setting

The intent behind making it a per-request setting was that it would be
set for writes, but not for reads.  As it is, the flag is set for all
fs/ceph requests except for pool perm check stat request (technically
a read).

ceph_osdc_abort_on_full() skips reads since the previous commit and
I don't see a use case for marking individual requests.

Signed-off-by: Ilya Dryomov <idryomov@gmail.com>
Acked-by: Jeff Layton <jlayton@redhat.com>
Reviewed-by: "Yan, Zheng" <zyan@redhat.com>
---
 fs/ceph/addr.c                  | 1 -
 fs/ceph/file.c                  | 1 -
 fs/ceph/super.c                 | 2 ++
 include/linux/ceph/osd_client.h | 2 +-
 net/ceph/osd_client.c           | 9 ++++-----
 5 files changed, 7 insertions(+), 8 deletions(-)

(limited to 'fs/ceph')

diff --git a/fs/ceph/addr.c b/fs/ceph/addr.c
index 5f7ad3d0df2e..ca0d5510ed50 100644
--- a/fs/ceph/addr.c
+++ b/fs/ceph/addr.c
@@ -1935,7 +1935,6 @@ static int __ceph_pool_perm_get(struct ceph_inode_info *ci,
 	err = ceph_osdc_start_request(&fsc->client->osdc, rd_req, false);
 
 	wr_req->r_mtime = ci->vfs_inode.i_mtime;
-	wr_req->r_abort_on_full = true;
 	err2 = ceph_osdc_start_request(&fsc->client->osdc, wr_req, false);
 
 	if (!err)
diff --git a/fs/ceph/file.c b/fs/ceph/file.c
index cf0e45b10121..6b9f7f3cd237 100644
--- a/fs/ceph/file.c
+++ b/fs/ceph/file.c
@@ -895,7 +895,6 @@ static void ceph_aio_retry_work(struct work_struct *work)
 	req->r_callback = ceph_aio_complete_req;
 	req->r_inode = inode;
 	req->r_priv = aio_req;
-	req->r_abort_on_full = true;
 
 	ret = ceph_osdc_start_request(req->r_osdc, req, false);
 out:
diff --git a/fs/ceph/super.c b/fs/ceph/super.c
index a092cdb69288..cad046aa4fd0 100644
--- a/fs/ceph/super.c
+++ b/fs/ceph/super.c
@@ -616,7 +616,9 @@ static struct ceph_fs_client *create_fs_client(struct ceph_mount_options *fsopt,
 		err = PTR_ERR(fsc->client);
 		goto fail;
 	}
+
 	fsc->client->extra_mon_dispatch = extra_mon_dispatch;
+	fsc->client->osdc.abort_on_full = true;
 
 	if (!fsopt->mds_namespace) {
 		ceph_monc_want_map(&fsc->client->monc, CEPH_SUB_MDSMAP,
diff --git a/include/linux/ceph/osd_client.h b/include/linux/ceph/osd_client.h
index d4191bde95a4..0d6ee04b4c41 100644
--- a/include/linux/ceph/osd_client.h
+++ b/include/linux/ceph/osd_client.h
@@ -202,7 +202,6 @@ struct ceph_osd_request {
 	struct timespec r_mtime;              /* ditto */
 	u64 r_data_offset;                    /* ditto */
 	bool r_linger;                        /* don't resend on failure */
-	bool r_abort_on_full;		      /* return ENOSPC when full */
 
 	/* internal */
 	unsigned long r_stamp;                /* jiffies, send or check time */
@@ -348,6 +347,7 @@ struct ceph_osd_client {
 	struct rb_root         linger_map_checks;
 	atomic_t               num_requests;
 	atomic_t               num_homeless;
+	bool                   abort_on_full; /* abort w/ ENOSPC when full */
 	int                    abort_err;
 	struct delayed_work    timeout_work;
 	struct delayed_work    osds_timeout_work;
diff --git a/net/ceph/osd_client.c b/net/ceph/osd_client.c
index 3d055529189c..05c4d27d25fe 100644
--- a/net/ceph/osd_client.c
+++ b/net/ceph/osd_client.c
@@ -1030,7 +1030,6 @@ struct ceph_osd_request *ceph_osdc_new_request(struct ceph_osd_client *osdc,
 				       truncate_size, truncate_seq);
 	}
 
-	req->r_abort_on_full = true;
 	req->r_flags = flags;
 	req->r_base_oloc.pool = layout->pool_id;
 	req->r_base_oloc.pool_ns = ceph_try_get_string(layout->pool_ns);
@@ -2239,7 +2238,7 @@ again:
 		   (ceph_osdmap_flag(osdc, CEPH_OSDMAP_FULL) ||
 		    pool_full(osdc, req->r_t.base_oloc.pool))) {
 		dout("req %p full/pool_full\n", req);
-		if (req->r_abort_on_full) {
+		if (osdc->abort_on_full) {
 			err = -ENOSPC;
 		} else {
 			pr_warn_ratelimited("FULL or reached pool quota\n");
@@ -2446,8 +2445,7 @@ static int abort_on_full_fn(struct ceph_osd_request *req, void *arg)
 	struct ceph_osd_client *osdc = req->r_osdc;
 	bool *victims = arg;
 
-	if (req->r_abort_on_full &&
-	    (req->r_flags & CEPH_OSD_FLAG_WRITE) &&
+	if ((req->r_flags & CEPH_OSD_FLAG_WRITE) &&
 	    (ceph_osdmap_flag(osdc, CEPH_OSDMAP_FULL) ||
 	     pool_full(osdc, req->r_t.base_oloc.pool))) {
 		if (!*victims) {
@@ -2470,7 +2468,8 @@ static void ceph_osdc_abort_on_full(struct ceph_osd_client *osdc)
 {
 	bool victims = false;
 
-	if (ceph_osdmap_flag(osdc, CEPH_OSDMAP_FULL) || have_pool_full(osdc))
+	if (osdc->abort_on_full &&
+	    (ceph_osdmap_flag(osdc, CEPH_OSDMAP_FULL) || have_pool_full(osdc)))
 		for_each_request(osdc, abort_on_full_fn, &victims);
 }
 
-- 
cgit v1.2.3


From fa466743a9fc6e4a24ef22285fb384f9ef4a2edb Mon Sep 17 00:00:00 2001
From: "Yan, Zheng" <zyan@redhat.com>
Date: Fri, 25 May 2018 11:22:56 +0800
Subject: ceph: fix wrong check for the case of updating link count

Signed-off-by: "Yan, Zheng" <zyan@redhat.com>
Signed-off-by: Ilya Dryomov <idryomov@gmail.com>
---
 fs/ceph/caps.c | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

(limited to 'fs/ceph')

diff --git a/fs/ceph/caps.c b/fs/ceph/caps.c
index 477b822e6333..0ae41854d676 100644
--- a/fs/ceph/caps.c
+++ b/fs/ceph/caps.c
@@ -3059,7 +3059,6 @@ static void handle_cap_grant(struct inode *inode,
 	int used, wanted, dirty;
 	u64 size = le64_to_cpu(grant->size);
 	u64 max_size = le64_to_cpu(grant->max_size);
-	struct timespec mtime, atime, ctime;
 	int check_caps = 0;
 	bool wake = false;
 	bool writeback = false;
@@ -3124,7 +3123,7 @@ static void handle_cap_grant(struct inode *inode,
 		     from_kgid(&init_user_ns, inode->i_gid));
 	}
 
-	if ((newcaps & CEPH_CAP_AUTH_SHARED) &&
+	if ((newcaps & CEPH_CAP_LINK_SHARED) &&
 	    (extra_info->issued & CEPH_CAP_LINK_EXCL) == 0) {
 		set_nlink(inode, le32_to_cpu(grant->nlink));
 		if (inode->i_nlink == 0 &&
@@ -3149,6 +3148,7 @@ static void handle_cap_grant(struct inode *inode,
 	}
 
 	if (newcaps & CEPH_CAP_ANY_RD) {
+		struct timespec mtime, atime, ctime;
 		/* ctime/mtime/atime? */
 		ceph_decode_timespec(&mtime, &grant->mtime);
 		ceph_decode_timespec(&atime, &grant->atime);
-- 
cgit v1.2.3


From aae1a442f8eac6d5442ee479df66d278c73a6ecc Mon Sep 17 00:00:00 2001
From: "Yan, Zheng" <zyan@redhat.com>
Date: Sat, 26 May 2018 16:54:39 +0800
Subject: ceph: prevent i_version from going back

inode info from non-auth can be stale.

Signed-off-by: "Yan, Zheng" <zyan@redhat.com>
Signed-off-by: Ilya Dryomov <idryomov@gmail.com>
---
 fs/ceph/inode.c | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

(limited to 'fs/ceph')

diff --git a/fs/ceph/inode.c b/fs/ceph/inode.c
index 4aeccb13437b..4fda7a9d4c9d 100644
--- a/fs/ceph/inode.c
+++ b/fs/ceph/inode.c
@@ -889,7 +889,8 @@ static int fill_inode(struct inode *inode, struct page *locked_page,
 	}
 
 	/* finally update i_version */
-	ci->i_version = le64_to_cpu(info->version);
+	if (le64_to_cpu(info->version) > ci->i_version)
+		ci->i_version = le64_to_cpu(info->version);
 
 	inode->i_mapping->a_ops = &ceph_aops;
 
-- 
cgit v1.2.3


From 73fb0949cf246b212ff63d692a0ec88db954bb35 Mon Sep 17 00:00:00 2001
From: Luis Henriques <lhenriques@suse.com>
Date: Mon, 28 May 2018 18:37:40 +0100
Subject: ceph: fix use-after-free in ceph_statfs()

KASAN found an UAF in ceph_statfs.  This was a one-off bug but looking at
the code it looks like the monmap access needs to be protected as it can
be modified while we're accessing it.  Fix this by protecting the access
with the monc->mutex.

  BUG: KASAN: use-after-free in ceph_statfs+0x21d/0x2c0
  Read of size 8 at addr ffff88006844f2e0 by task trinity-c5/304

  CPU: 0 PID: 304 Comm: trinity-c5 Not tainted 4.17.0-rc6+ #172
  Hardware name: QEMU Standard PC (i440FX + PIIX, 1996), BIOS 1.0.0-prebuilt.qemu-project.org 04/01/2014
  Call Trace:
   dump_stack+0xa5/0x11b
   ? show_regs_print_info+0x5/0x5
   ? kmsg_dump_rewind+0x118/0x118
   ? ceph_statfs+0x21d/0x2c0
   print_address_description+0x73/0x2b0
   ? ceph_statfs+0x21d/0x2c0
   kasan_report+0x243/0x360
   ceph_statfs+0x21d/0x2c0
   ? ceph_umount_begin+0x80/0x80
   ? kmem_cache_alloc+0xdf/0x1a0
   statfs_by_dentry+0x79/0xb0
   vfs_statfs+0x28/0x110
   user_statfs+0x8c/0xe0
   ? vfs_statfs+0x110/0x110
   ? __fdget_raw+0x10/0x10
   __se_sys_statfs+0x5d/0xa0
   ? user_statfs+0xe0/0xe0
   ? mutex_unlock+0x1d/0x40
   ? __x64_sys_statfs+0x20/0x30
   do_syscall_64+0xee/0x290
   ? syscall_return_slowpath+0x1c0/0x1c0
   ? page_fault+0x1e/0x30
   ? syscall_return_slowpath+0x13c/0x1c0
   ? prepare_exit_to_usermode+0xdb/0x140
   ? syscall_trace_enter+0x330/0x330
   ? __put_user_4+0x1c/0x30
   entry_SYSCALL_64_after_hwframe+0x44/0xa9

  Allocated by task 130:
   __kmalloc+0x124/0x210
   ceph_monmap_decode+0x1c1/0x400
   dispatch+0x113/0xd20
   ceph_con_workfn+0xa7e/0x44e0
   process_one_work+0x5f0/0xa30
   worker_thread+0x184/0xa70
   kthread+0x1a0/0x1c0
   ret_from_fork+0x35/0x40

  Freed by task 130:
   kfree+0xb8/0x210
   dispatch+0x15a/0xd20
   ceph_con_workfn+0xa7e/0x44e0
   process_one_work+0x5f0/0xa30
   worker_thread+0x184/0xa70
   kthread+0x1a0/0x1c0
   ret_from_fork+0x35/0x40

Signed-off-by: Luis Henriques <lhenriques@suse.com>
Reviewed-by: "Yan, Zheng" <zyan@redhat.com>
Signed-off-by: Ilya Dryomov <idryomov@gmail.com>
---
 fs/ceph/super.c | 11 +++++++----
 1 file changed, 7 insertions(+), 4 deletions(-)

(limited to 'fs/ceph')

diff --git a/fs/ceph/super.c b/fs/ceph/super.c
index cad046aa4fd0..a8e8e2629fb4 100644
--- a/fs/ceph/super.c
+++ b/fs/ceph/super.c
@@ -45,7 +45,7 @@ static void ceph_put_super(struct super_block *s)
 static int ceph_statfs(struct dentry *dentry, struct kstatfs *buf)
 {
 	struct ceph_fs_client *fsc = ceph_inode_to_client(d_inode(dentry));
-	struct ceph_monmap *monmap = fsc->client->monc.monmap;
+	struct ceph_mon_client *monc = &fsc->client->monc;
 	struct ceph_statfs st;
 	u64 fsid;
 	int err;
@@ -58,7 +58,7 @@ static int ceph_statfs(struct dentry *dentry, struct kstatfs *buf)
 	}
 
 	dout("statfs\n");
-	err = ceph_monc_do_statfs(&fsc->client->monc, data_pool, &st);
+	err = ceph_monc_do_statfs(monc, data_pool, &st);
 	if (err < 0)
 		return err;
 
@@ -94,8 +94,11 @@ static int ceph_statfs(struct dentry *dentry, struct kstatfs *buf)
 	buf->f_namelen = NAME_MAX;
 
 	/* Must convert the fsid, for consistent values across arches */
-	fsid = le64_to_cpu(*(__le64 *)(&monmap->fsid)) ^
-	       le64_to_cpu(*((__le64 *)&monmap->fsid + 1));
+	mutex_lock(&monc->mutex);
+	fsid = le64_to_cpu(*(__le64 *)(&monc->monmap->fsid)) ^
+	       le64_to_cpu(*((__le64 *)&monc->monmap->fsid + 1));
+	mutex_unlock(&monc->mutex);
+
 	buf->f_fsid.val[0] = fsid & 0xffffffff;
 	buf->f_fsid.val[1] = fsid >> 32;
 
-- 
cgit v1.2.3


From c36ed50de2ad1649ce0369a4a6fc2cc11b20dfb7 Mon Sep 17 00:00:00 2001
From: Chengguang Xu <cgxu519@gmx.com>
Date: Wed, 30 May 2018 10:13:11 +0800
Subject: ceph: fix alignment of rasize

With the current logic:
when rasize=0~1 is specified, it becomes 4096;
when rasize=2~4097 is specified, it becomes 8192.

Make it the same as rsize & wsize.

Signed-off-by: Chengguang Xu <cgxu519@gmx.com>
Reviewed-by: "Yan, Zheng" <zyan@redhat.com>
Signed-off-by: Ilya Dryomov <idryomov@gmail.com>
---
 fs/ceph/super.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'fs/ceph')

diff --git a/fs/ceph/super.c b/fs/ceph/super.c
index a8e8e2629fb4..b4ff1392e333 100644
--- a/fs/ceph/super.c
+++ b/fs/ceph/super.c
@@ -271,7 +271,7 @@ static int parse_fsopt_token(char *c, void *private)
 	case Opt_rasize:
 		if (intval < 0)
 			return -EINVAL;
-		fsopt->rasize = ALIGN(intval + PAGE_SIZE - 1, PAGE_SIZE);
+		fsopt->rasize = ALIGN(intval, PAGE_SIZE);
 		break;
 	case Opt_caps_wanted_delay_min:
 		if (intval < 1)
-- 
cgit v1.2.3


From 8db0c7596f1258b28f32a38f2d5bbc0d63c104c9 Mon Sep 17 00:00:00 2001
From: Chengguang Xu <cgxu519@gmx.com>
Date: Wed, 30 May 2018 16:47:06 +0800
Subject: ceph: strengthen rsize/wsize/readdir_max_bytes validation

The check (intval < PAGE_SIZE) involves an implicit type conversion, so
even a negative value specified for rsize/wsize/readdir_max_bytes passes
the validation check.

Signed-off-by: Chengguang Xu <cgxu519@gmx.com>
Reviewed-by: Ilya Dryomov <idryomov@gmail.com>
Signed-off-by: Ilya Dryomov <idryomov@gmail.com>
---
 fs/ceph/super.c | 6 +++---
 1 file changed, 3 insertions(+), 3 deletions(-)

(limited to 'fs/ceph')

diff --git a/fs/ceph/super.c b/fs/ceph/super.c
index b4ff1392e333..cec1d3343742 100644
--- a/fs/ceph/super.c
+++ b/fs/ceph/super.c
@@ -259,12 +259,12 @@ static int parse_fsopt_token(char *c, void *private)
 		break;
 		/* misc */
 	case Opt_wsize:
-		if (intval < PAGE_SIZE || intval > CEPH_MAX_WRITE_SIZE)
+		if (intval < (int)PAGE_SIZE || intval > CEPH_MAX_WRITE_SIZE)
 			return -EINVAL;
 		fsopt->wsize = ALIGN(intval, PAGE_SIZE);
 		break;
 	case Opt_rsize:
-		if (intval < PAGE_SIZE || intval > CEPH_MAX_READ_SIZE)
+		if (intval < (int)PAGE_SIZE || intval > CEPH_MAX_READ_SIZE)
 			return -EINVAL;
 		fsopt->rsize = ALIGN(intval, PAGE_SIZE);
 		break;
@@ -289,7 +289,7 @@ static int parse_fsopt_token(char *c, void *private)
 		fsopt->max_readdir = intval;
 		break;
 	case Opt_readdir_max_bytes:
-		if (intval < PAGE_SIZE && intval != 0)
+		if (intval < (int)PAGE_SIZE && intval != 0)
 			return -EINVAL;
 		fsopt->max_readdir_bytes = intval;
 		break;
-- 
cgit v1.2.3


From 3619aa8b74490fe5f803f7e71af02845aede6b5c Mon Sep 17 00:00:00 2001
From: Chengguang Xu <cgxu519@gmx.com>
Date: Mon, 4 Jun 2018 16:03:51 +0800
Subject: ceph: show ino32 if the value is different with default

Currently ceph_show_options() has no item for showing 'ino32', so show
the 'ino32' mount option if its value differs from the default.

Signed-off-by: Chengguang Xu <cgxu519@gmx.com>
Reviewed-by: Ilya Dryomov <idryomov@gmail.com>
Signed-off-by: Ilya Dryomov <idryomov@gmail.com>
---
 fs/ceph/super.c | 2 ++
 1 file changed, 2 insertions(+)

(limited to 'fs/ceph')

diff --git a/fs/ceph/super.c b/fs/ceph/super.c
index cec1d3343742..95a3b3ac9b6e 100644
--- a/fs/ceph/super.c
+++ b/fs/ceph/super.c
@@ -537,6 +537,8 @@ static int ceph_show_options(struct seq_file *m, struct dentry *root)
 		seq_puts(m, ",noasyncreaddir");
 	if ((fsopt->flags & CEPH_MOUNT_OPT_DCACHE) == 0)
 		seq_puts(m, ",nodcache");
+	if (fsopt->flags & CEPH_MOUNT_OPT_INO32)
+		seq_puts(m, ",ino32");
 	if (fsopt->flags & CEPH_MOUNT_OPT_FSCACHE) {
 		seq_show_option(m, "fsc", fsopt->fscache_uniq);
 	}
-- 
cgit v1.2.3