diff options
author | Linus Torvalds <torvalds@linux-foundation.org> | 2019-05-16 16:24:01 -0700 |
---|---|---|
committer | Linus Torvalds <torvalds@linux-foundation.org> | 2019-05-16 16:24:01 -0700 |
commit | 1d9d7cbf28a1c2f84f2a0224466f8eb5f0a62ace (patch) | |
tree | 35aa9ec8433f757073f21e1229e97d736b0c5593 | |
parent | 2c45e7fbc962be1b03f2c2af817a76f5ba810af2 (diff) | |
parent | 00abf69dd24f4444d185982379c5cc3bb7b6d1fc (diff) | |
download | linux-1d9d7cbf28a1c2f84f2a0224466f8eb5f0a62ace.tar.bz2 |
Merge tag 'ceph-for-5.2-rc1' of git://github.com/ceph/ceph-client
Pull ceph updates from Ilya Dryomov:
"On the filesystem side we have:
- a fix to enforce quotas set above the mount point (Luis Henriques)
- support for exporting snapshots through NFS (Zheng Yan)
- proper statx implementation (Jeff Layton). statx flags are mapped
to MDS caps, with AT_STATX_{DONT,FORCE}_SYNC taken into account.
- some follow-up dentry name handling fixes, in particular
elimination of our hand-rolled helper and the switch to __getname()
as suggested by Al (Jeff Layton)
- a set of MDS client cleanups in preparation for async MDS requests
in the future (Jeff Layton)
- a fix to sync the filesystem before remounting (Jeff Layton)
On the rbd side, work is on-going on object-map and fast-diff image
features"
* tag 'ceph-for-5.2-rc1' of git://github.com/ceph/ceph-client: (29 commits)
ceph: flush dirty inodes before proceeding with remount
ceph: fix unaligned access in ceph_send_cap_releases
libceph: make ceph_pr_addr take an struct ceph_entity_addr pointer
libceph: fix unaligned accesses in ceph_entity_addr handling
rbd: don't assert on writes to snapshots
rbd: client_mutex is never nested
ceph: print inode number in __caps_issued_mask debugging messages
ceph: just call get_session in __ceph_lookup_mds_session
ceph: simplify arguments and return semantics of try_get_cap_refs
ceph: fix comment over ceph_drop_caps_for_unlink
ceph: move wait for mds request into helper function
ceph: have ceph_mdsc_do_request call ceph_mdsc_submit_request
ceph: after an MDS request, do callback and completions
ceph: use pathlen values returned by set_request_path_attr
ceph: use __getname/__putname in ceph_mdsc_build_path
ceph: use ceph_mdsc_build_path instead of clone_dentry_name
ceph: fix potential use-after-free in ceph_mdsc_build_path
ceph: dump granular cap info in "caps" debugfs file
ceph: make iterate_session_caps a public symbol
ceph: fix NULL pointer deref when debugging is enabled
...
-rw-r--r-- | drivers/block/rbd.c | 24 | ||||
-rw-r--r-- | fs/ceph/caps.c | 93 | ||||
-rw-r--r-- | fs/ceph/debugfs.c | 40 | ||||
-rw-r--r-- | fs/ceph/export.c | 356 | ||||
-rw-r--r-- | fs/ceph/file.c | 2 | ||||
-rw-r--r-- | fs/ceph/inode.c | 85 | ||||
-rw-r--r-- | fs/ceph/locks.c | 13 | ||||
-rw-r--r-- | fs/ceph/mds_client.c | 205 | ||||
-rw-r--r-- | fs/ceph/mds_client.h | 33 | ||||
-rw-r--r-- | fs/ceph/mdsmap.c | 2 | ||||
-rw-r--r-- | fs/ceph/quota.c | 177 | ||||
-rw-r--r-- | fs/ceph/super.c | 7 | ||||
-rw-r--r-- | fs/ceph/super.h | 2 | ||||
-rw-r--r-- | include/linux/ceph/ceph_fs.h | 6 | ||||
-rw-r--r-- | include/linux/ceph/messenger.h | 3 | ||||
-rw-r--r-- | include/linux/ceph/osdmap.h | 13 | ||||
-rw-r--r-- | net/ceph/cls_lock_client.c | 2 | ||||
-rw-r--r-- | net/ceph/debugfs.c | 4 | ||||
-rw-r--r-- | net/ceph/messenger.c | 121 | ||||
-rw-r--r-- | net/ceph/mon_client.c | 6 | ||||
-rw-r--r-- | net/ceph/osd_client.c | 2 |
21 files changed, 845 insertions, 351 deletions
diff --git a/drivers/block/rbd.c b/drivers/block/rbd.c index 2210c1b9491b..e5009a34f9c2 100644 --- a/drivers/block/rbd.c +++ b/drivers/block/rbd.c @@ -934,7 +934,7 @@ static struct rbd_client *rbd_get_client(struct ceph_options *ceph_opts) struct rbd_client *rbdc; int ret; - mutex_lock_nested(&client_mutex, SINGLE_DEPTH_NESTING); + mutex_lock(&client_mutex); rbdc = rbd_client_find(ceph_opts); if (rbdc) { ceph_destroy_options(ceph_opts); @@ -1326,7 +1326,7 @@ static void rbd_obj_zero_range(struct rbd_obj_request *obj_req, u32 off, zero_bvecs(&obj_req->bvec_pos, off, bytes); break; default: - rbd_assert(0); + BUG(); } } @@ -1581,7 +1581,7 @@ static void rbd_obj_request_destroy(struct kref *kref) kfree(obj_request->bvec_pos.bvecs); break; default: - rbd_assert(0); + BUG(); } kfree(obj_request->img_extents); @@ -1781,7 +1781,7 @@ static void rbd_osd_req_setup_data(struct rbd_obj_request *obj_req, u32 which) &obj_req->bvec_pos); break; default: - rbd_assert(0); + BUG(); } } @@ -2036,7 +2036,7 @@ static int __rbd_img_fill_request(struct rbd_img_request *img_req) ret = rbd_obj_setup_zeroout(obj_req); break; default: - rbd_assert(0); + BUG(); } if (ret < 0) return ret; @@ -2383,7 +2383,7 @@ static int rbd_obj_read_from_parent(struct rbd_obj_request *obj_req) &obj_req->bvec_pos); break; default: - rbd_assert(0); + BUG(); } } else { ret = rbd_img_fill_from_bvecs(child_img_req, @@ -2515,7 +2515,7 @@ static int rbd_obj_issue_copyup_ops(struct rbd_obj_request *obj_req, u32 bytes) num_osd_ops += count_zeroout_ops(obj_req); break; default: - rbd_assert(0); + BUG(); } obj_req->osd_req = rbd_osd_req_create(obj_req, num_osd_ops); @@ -2542,7 +2542,7 @@ static int rbd_obj_issue_copyup_ops(struct rbd_obj_request *obj_req, u32 bytes) __rbd_obj_setup_zeroout(obj_req, which); break; default: - rbd_assert(0); + BUG(); } ret = ceph_osdc_alloc_messages(obj_req->osd_req, GFP_NOIO); @@ -3842,8 +3842,12 @@ static void rbd_queue_workfn(struct work_struct *work) goto err_rq; } - rbd_assert(op_type == OBJ_OP_READ || - rbd_dev->spec->snap_id == CEPH_NOSNAP); + if (op_type != OBJ_OP_READ && rbd_dev->spec->snap_id != CEPH_NOSNAP) { + rbd_warn(rbd_dev, "%s on read-only snapshot", + obj_op_name(op_type)); + result = -EIO; + goto err; + } /* * Quit early if the mapped snapshot no longer exists. It's diff --git a/fs/ceph/caps.c b/fs/ceph/caps.c index 36a8dc699448..72f8e1311392 100644 --- a/fs/ceph/caps.c +++ b/fs/ceph/caps.c @@ -892,8 +892,8 @@ int __ceph_caps_issued_mask(struct ceph_inode_info *ci, int mask, int touch) int have = ci->i_snap_caps; if ((have & mask) == mask) { - dout("__ceph_caps_issued_mask %p snap issued %s" - " (mask %s)\n", &ci->vfs_inode, + dout("__ceph_caps_issued_mask ino 0x%lx snap issued %s" + " (mask %s)\n", ci->vfs_inode.i_ino, ceph_cap_string(have), ceph_cap_string(mask)); return 1; @@ -904,8 +904,8 @@ int __ceph_caps_issued_mask(struct ceph_inode_info *ci, int mask, int touch) if (!__cap_is_valid(cap)) continue; if ((cap->issued & mask) == mask) { - dout("__ceph_caps_issued_mask %p cap %p issued %s" - " (mask %s)\n", &ci->vfs_inode, cap, + dout("__ceph_caps_issued_mask ino 0x%lx cap %p issued %s" + " (mask %s)\n", ci->vfs_inode.i_ino, cap, ceph_cap_string(cap->issued), ceph_cap_string(mask)); if (touch) @@ -916,8 +916,8 @@ int __ceph_caps_issued_mask(struct ceph_inode_info *ci, int mask, int touch) /* does a combination of caps satisfy mask? */ have |= cap->issued; if ((have & mask) == mask) { - dout("__ceph_caps_issued_mask %p combo issued %s" - " (mask %s)\n", &ci->vfs_inode, + dout("__ceph_caps_issued_mask ino 0x%lx combo issued %s" + " (mask %s)\n", ci->vfs_inode.i_ino, ceph_cap_string(cap->issued), ceph_cap_string(mask)); if (touch) { @@ -2257,8 +2257,6 @@ int ceph_fsync(struct file *file, loff_t start, loff_t end, int datasync) if (datasync) goto out; - inode_lock(inode); - dirty = try_flush_caps(inode, &flush_tid); dout("fsync dirty caps are %s\n", ceph_cap_string(dirty)); @@ -2273,7 +2271,6 @@ int ceph_fsync(struct file *file, loff_t start, loff_t end, int datasync) ret = wait_event_interruptible(ci->i_cap_wq, caps_are_flushed(inode, flush_tid)); } - inode_unlock(inode); out: dout("fsync %p%s result=%d\n", inode, datasync ? " datasync" : "", ret); return ret; @@ -2528,9 +2525,14 @@ static void __take_cap_refs(struct ceph_inode_info *ci, int got, * to (when applicable), and check against max_size here as well. * Note that caller is responsible for ensuring max_size increases are * requested from the MDS. + * + * Returns 0 if caps were not able to be acquired (yet), a 1 if they were, + * or a negative error code. + * + * FIXME: how does a 0 return differ from -EAGAIN? */ static int try_get_cap_refs(struct ceph_inode_info *ci, int need, int want, - loff_t endoff, bool nonblock, int *got, int *err) + loff_t endoff, bool nonblock, int *got) { struct inode *inode = &ci->vfs_inode; struct ceph_mds_client *mdsc = ceph_inode_to_client(inode)->mdsc; @@ -2550,8 +2552,7 @@ again: if ((file_wanted & need) != need) { dout("try_get_cap_refs need %s file_wanted %s, EBADF\n", ceph_cap_string(need), ceph_cap_string(file_wanted)); - *err = -EBADF; - ret = 1; + ret = -EBADF; goto out_unlock; } @@ -2572,10 +2573,8 @@ again: if (endoff >= 0 && endoff > (loff_t)ci->i_max_size) { dout("get_cap_refs %p endoff %llu > maxsize %llu\n", inode, endoff, ci->i_max_size); - if (endoff > ci->i_requested_max_size) { - *err = -EAGAIN; - ret = 1; - } + if (endoff > ci->i_requested_max_size) + ret = -EAGAIN; goto out_unlock; } /* @@ -2610,8 +2609,7 @@ again: * task isn't in TASK_RUNNING state */ if (nonblock) { - *err = -EAGAIN; - ret = 1; + ret = -EAGAIN; goto out_unlock; } @@ -2640,8 +2638,7 @@ again: if (session_readonly) { dout("get_cap_refs %p needed %s but mds%d readonly\n", inode, ceph_cap_string(need), ci->i_auth_cap->mds); - *err = -EROFS; - ret = 1; + ret = -EROFS; goto out_unlock; } @@ -2650,16 +2647,14 @@ again: if (READ_ONCE(mdsc->fsc->mount_state) == CEPH_MOUNT_SHUTDOWN) { dout("get_cap_refs %p forced umount\n", inode); - *err = -EIO; - ret = 1; + ret = -EIO; goto out_unlock; } mds_wanted = __ceph_caps_mds_wanted(ci, false); if (need & ~(mds_wanted & need)) { dout("get_cap_refs %p caps were dropped" " (session killed?)\n", inode); - *err = -ESTALE; - ret = 1; + ret = -ESTALE; goto out_unlock; } if (!(file_wanted & ~mds_wanted)) @@ -2710,7 +2705,7 @@ static void check_max_size(struct inode *inode, loff_t endoff) int ceph_try_get_caps(struct ceph_inode_info *ci, int need, int want, bool nonblock, int *got) { - int ret, err = 0; + int ret; BUG_ON(need & ~CEPH_CAP_FILE_RD); BUG_ON(want & ~(CEPH_CAP_FILE_CACHE|CEPH_CAP_FILE_LAZYIO|CEPH_CAP_FILE_SHARED)); @@ -2718,15 +2713,8 @@ int ceph_try_get_caps(struct ceph_inode_info *ci, int need, int want, if (ret < 0) return ret; - ret = try_get_cap_refs(ci, need, want, 0, nonblock, got, &err); - if (ret) { - if (err == -EAGAIN) { - ret = 0; - } else if (err < 0) { - ret = err; - } - } - return ret; + ret = try_get_cap_refs(ci, need, want, 0, nonblock, got); + return ret == -EAGAIN ? 0 : ret; } /* @@ -2737,7 +2725,7 @@ int ceph_try_get_caps(struct ceph_inode_info *ci, int need, int want, int ceph_get_caps(struct ceph_inode_info *ci, int need, int want, loff_t endoff, int *got, struct page **pinned_page) { - int _got, ret, err = 0; + int _got, ret; ret = ceph_pool_perm_check(ci, need); if (ret < 0) @@ -2747,21 +2735,19 @@ int ceph_get_caps(struct ceph_inode_info *ci, int need, int want, if (endoff > 0) check_max_size(&ci->vfs_inode, endoff); - err = 0; _got = 0; ret = try_get_cap_refs(ci, need, want, endoff, - false, &_got, &err); - if (ret) { - if (err == -EAGAIN) - continue; - if (err < 0) - ret = err; - } else { + false, &_got); + if (ret == -EAGAIN) { + continue; + } else if (!ret) { + int err; + DEFINE_WAIT_FUNC(wait, woken_wake_function); add_wait_queue(&ci->i_cap_wq, &wait); - while (!try_get_cap_refs(ci, need, want, endoff, - true, &_got, &err)) { + while (!(err = try_get_cap_refs(ci, need, want, endoff, + true, &_got))) { if (signal_pending(current)) { ret = -ERESTARTSYS; break; @@ -2770,19 +2756,14 @@ int ceph_get_caps(struct ceph_inode_info *ci, int need, int want, } remove_wait_queue(&ci->i_cap_wq, &wait); - if (err == -EAGAIN) continue; - if (err < 0) - ret = err; } - if (ret < 0) { - if (err == -ESTALE) { - /* session was killed, try renew caps */ - ret = ceph_renew_caps(&ci->vfs_inode); - if (ret == 0) - continue; - } + if (ret == -ESTALE) { + /* session was killed, try renew caps */ + ret = ceph_renew_caps(&ci->vfs_inode); + if (ret == 0) + continue; return ret; } @@ -4099,7 +4080,7 @@ void ceph_put_fmode(struct ceph_inode_info *ci, int fmode) } /* - * For a soon-to-be unlinked file, drop the AUTH_RDCACHE caps. If it + * For a soon-to-be unlinked file, drop the LINK caps. If it * looks like the link count will hit 0, drop any other caps (other * than PIN) we don't specifically want (due to the file still being * open). diff --git a/fs/ceph/debugfs.c b/fs/ceph/debugfs.c index 98365e74cb4a..b3fc5fe26a1a 100644 --- a/fs/ceph/debugfs.c +++ b/fs/ceph/debugfs.c @@ -37,7 +37,7 @@ static int mdsmap_show(struct seq_file *s, void *p) struct ceph_entity_addr *addr = &mdsmap->m_info[i].addr; int state = mdsmap->m_info[i].state; seq_printf(s, "\tmds%d\t%s\t(%s)\n", i, - ceph_pr_addr(&addr->in_addr), + ceph_pr_addr(addr), ceph_mds_state_name(state)); } return 0; @@ -88,7 +88,7 @@ static int mdsc_show(struct seq_file *s, void *p) req->r_dentry, path ? path : ""); spin_unlock(&req->r_dentry->d_lock); - kfree(path); + ceph_mdsc_free_path(path, pathlen); } else if (req->r_path1) { seq_printf(s, " #%llx/%s", req->r_ino1.ino, req->r_path1); @@ -108,7 +108,7 @@ static int mdsc_show(struct seq_file *s, void *p) req->r_old_dentry, path ? path : ""); spin_unlock(&req->r_old_dentry->d_lock); - kfree(path); + ceph_mdsc_free_path(path, pathlen); } else if (req->r_path2 && req->r_op != CEPH_MDS_OP_SYMLINK) { if (req->r_ino2.ino) seq_printf(s, " #%llx/%s", req->r_ino2.ino, @@ -124,18 +124,48 @@ static int mdsc_show(struct seq_file *s, void *p) return 0; } +static int caps_show_cb(struct inode *inode, struct ceph_cap *cap, void *p) +{ + struct seq_file *s = p; + + seq_printf(s, "0x%-17lx%-17s%-17s\n", inode->i_ino, + ceph_cap_string(cap->issued), + ceph_cap_string(cap->implemented)); + return 0; +} + static int caps_show(struct seq_file *s, void *p) { struct ceph_fs_client *fsc = s->private; - int total, avail, used, reserved, min; + struct ceph_mds_client *mdsc = fsc->mdsc; + int total, avail, used, reserved, min, i; ceph_reservation_status(fsc, &total, &avail, &used, &reserved, &min); seq_printf(s, "total\t\t%d\n" "avail\t\t%d\n" "used\t\t%d\n" "reserved\t%d\n" - "min\t%d\n", + "min\t\t%d\n\n", total, avail, used, reserved, min); + seq_printf(s, "ino issued implemented\n"); + seq_printf(s, "-----------------------------------------------\n"); + + mutex_lock(&mdsc->mutex); + for (i = 0; i < mdsc->max_sessions; i++) { + struct ceph_mds_session *session; + + session = __ceph_lookup_mds_session(mdsc, i); + if (!session) + continue; + mutex_unlock(&mdsc->mutex); + mutex_lock(&session->s_mutex); + ceph_iterate_session_caps(session, caps_show_cb, s); + mutex_unlock(&session->s_mutex); + ceph_put_mds_session(session); + mutex_lock(&mdsc->mutex); + } + mutex_unlock(&mdsc->mutex); + return 0; } diff --git a/fs/ceph/export.c b/fs/ceph/export.c index 3c59ad180ef0..d3ef7ee429ec 100644 --- a/fs/ceph/export.c +++ b/fs/ceph/export.c @@ -22,18 +22,77 @@ struct ceph_nfs_confh { u64 ino, parent_ino; } __attribute__ ((packed)); +/* + * fh for snapped inode + */ +struct ceph_nfs_snapfh { + u64 ino; + u64 snapid; + u64 parent_ino; + u32 hash; +} __attribute__ ((packed)); + +static int ceph_encode_snapfh(struct inode *inode, u32 *rawfh, int *max_len, + struct inode *parent_inode) +{ + const static int snap_handle_length = + sizeof(struct ceph_nfs_snapfh) >> 2; + struct ceph_nfs_snapfh *sfh = (void *)rawfh; + u64 snapid = ceph_snap(inode); + int ret; + bool no_parent = true; + + if (*max_len < snap_handle_length) { + *max_len = snap_handle_length; + ret = FILEID_INVALID; + goto out; + } + + ret = -EINVAL; + if (snapid != CEPH_SNAPDIR) { + struct inode *dir; + struct dentry *dentry = d_find_alias(inode); + if (!dentry) + goto out; + + rcu_read_lock(); + dir = d_inode_rcu(dentry->d_parent); + if (ceph_snap(dir) != CEPH_SNAPDIR) { + sfh->parent_ino = ceph_ino(dir); + sfh->hash = ceph_dentry_hash(dir, dentry); + no_parent = false; + } + rcu_read_unlock(); + dput(dentry); + } + + if (no_parent) { + if (!S_ISDIR(inode->i_mode)) + goto out; + sfh->parent_ino = sfh->ino; + sfh->hash = 0; + } + sfh->ino = ceph_ino(inode); + sfh->snapid = snapid; + + *max_len = snap_handle_length; + ret = FILEID_BTRFS_WITH_PARENT; +out: + dout("encode_snapfh %llx.%llx ret=%d\n", ceph_vinop(inode), ret); + return ret; +} + static int ceph_encode_fh(struct inode *inode, u32 *rawfh, int *max_len, struct inode *parent_inode) { + const static int handle_length = + sizeof(struct ceph_nfs_fh) >> 2; + const static int connected_handle_length = + sizeof(struct ceph_nfs_confh) >> 2; int type; - struct ceph_nfs_fh *fh = (void *)rawfh; - struct ceph_nfs_confh *cfh = (void *)rawfh; - int connected_handle_length = sizeof(*cfh)/4; - int handle_length = sizeof(*fh)/4; - /* don't re-export snaps */ if (ceph_snap(inode) != CEPH_NOSNAP) - return -EINVAL; + return ceph_encode_snapfh(inode, rawfh, max_len, parent_inode); if (parent_inode && (*max_len < connected_handle_length)) { *max_len = connected_handle_length; @@ -44,6 +103,7 @@ static int ceph_encode_fh(struct inode *inode, u32 *rawfh, int *max_len, } if (parent_inode) { + struct ceph_nfs_confh *cfh = (void *)rawfh; dout("encode_fh %llx with parent %llx\n", ceph_ino(inode), ceph_ino(parent_inode)); cfh->ino = ceph_ino(inode); @@ -51,6 +111,7 @@ static int ceph_encode_fh(struct inode *inode, u32 *rawfh, int *max_len, *max_len = connected_handle_length; type = FILEID_INO32_GEN_PARENT; } else { + struct ceph_nfs_fh *fh = (void *)rawfh; dout("encode_fh %llx\n", ceph_ino(inode)); fh->ino = ceph_ino(inode); *max_len = handle_length; @@ -59,7 +120,7 @@ static int ceph_encode_fh(struct inode *inode, u32 *rawfh, int *max_len, return type; } -static struct dentry *__fh_to_dentry(struct super_block *sb, u64 ino) +static struct inode *__lookup_inode(struct super_block *sb, u64 ino) { struct ceph_mds_client *mdsc = ceph_sb_to_client(sb)->mdsc; struct inode *inode; @@ -81,7 +142,7 @@ static struct dentry *__fh_to_dentry(struct super_block *sb, u64 ino) mask = CEPH_STAT_CAP_INODE; if (ceph_security_xattr_wanted(d_inode(sb->s_root))) mask |= CEPH_CAP_XATTR_SHARED; - req->r_args.getattr.mask = cpu_to_le32(mask); + req->r_args.lookupino.mask = cpu_to_le32(mask); req->r_ino1 = vino; req->r_num_caps = 1; @@ -91,16 +152,114 @@ static struct dentry *__fh_to_dentry(struct super_block *sb, u64 ino) ihold(inode); ceph_mdsc_put_request(req); if (!inode) - return ERR_PTR(-ESTALE); - if (inode->i_nlink == 0) { - iput(inode); - return ERR_PTR(-ESTALE); - } + return err < 0 ? ERR_PTR(err) : ERR_PTR(-ESTALE); } + return inode; +} + +struct inode *ceph_lookup_inode(struct super_block *sb, u64 ino) +{ + struct inode *inode = __lookup_inode(sb, ino); + if (IS_ERR(inode)) + return inode; + if (inode->i_nlink == 0) { + iput(inode); + return ERR_PTR(-ESTALE); + } + return inode; +} +static struct dentry *__fh_to_dentry(struct super_block *sb, u64 ino) +{ + struct inode *inode = __lookup_inode(sb, ino); + if (IS_ERR(inode)) + return ERR_CAST(inode); + if (inode->i_nlink == 0) { + iput(inode); + return ERR_PTR(-ESTALE); + } return d_obtain_alias(inode); } +static struct dentry *__snapfh_to_dentry(struct super_block *sb, + struct ceph_nfs_snapfh *sfh, + bool want_parent) +{ + struct ceph_mds_client *mdsc = ceph_sb_to_client(sb)->mdsc; + struct ceph_mds_request *req; + struct inode *inode; + struct ceph_vino vino; + int mask; + int err; + bool unlinked = false; + + if (want_parent) { + vino.ino = sfh->parent_ino; + if (sfh->snapid == CEPH_SNAPDIR) + vino.snap = CEPH_NOSNAP; + else if (sfh->ino == sfh->parent_ino) + vino.snap = CEPH_SNAPDIR; + else + vino.snap = sfh->snapid; + } else { + vino.ino = sfh->ino; + vino.snap = sfh->snapid; + } + inode = ceph_find_inode(sb, vino); + if (inode) + return d_obtain_alias(inode); + + req = ceph_mdsc_create_request(mdsc, CEPH_MDS_OP_LOOKUPINO, + USE_ANY_MDS); + if (IS_ERR(req)) + return ERR_CAST(req); + + mask = CEPH_STAT_CAP_INODE; + if (ceph_security_xattr_wanted(d_inode(sb->s_root))) + mask |= CEPH_CAP_XATTR_SHARED; + req->r_args.lookupino.mask = cpu_to_le32(mask); + if (vino.snap < CEPH_NOSNAP) { + req->r_args.lookupino.snapid = cpu_to_le64(vino.snap); + if (!want_parent && sfh->ino != sfh->parent_ino) { + req->r_args.lookupino.parent = + cpu_to_le64(sfh->parent_ino); + req->r_args.lookupino.hash = + cpu_to_le32(sfh->hash); + } + } + + req->r_ino1 = vino; + req->r_num_caps = 1; + err = ceph_mdsc_do_request(mdsc, NULL, req); + inode = req->r_target_inode; + if (inode) { + if (vino.snap == CEPH_SNAPDIR) { + if (inode->i_nlink == 0) + unlinked = true; + inode = ceph_get_snapdir(inode); + } else if (ceph_snap(inode) == vino.snap) { + ihold(inode); + } else { + /* mds does not support lookup snapped inode */ + err = -EOPNOTSUPP; + inode = NULL; + } + } + ceph_mdsc_put_request(req); + + if (want_parent) { + dout("snapfh_to_parent %llx.%llx\n err=%d\n", + vino.ino, vino.snap, err); + } else { + dout("snapfh_to_dentry %llx.%llx parent %llx hash %x err=%d", + vino.ino, vino.snap, sfh->parent_ino, sfh->hash, err); + } + if (!inode) + return ERR_PTR(-ESTALE); + /* see comments in ceph_get_parent() */ + return unlinked ? d_obtain_root(inode) : d_obtain_alias(inode); +} + /* * convert regular fh to dentry */ @@ -110,6 +269,11 @@ static struct dentry *ceph_fh_to_dentry(struct super_block *sb, { struct ceph_nfs_fh *fh = (void *)fid->raw; + if (fh_type == FILEID_BTRFS_WITH_PARENT) { + struct ceph_nfs_snapfh *sfh = (void *)fid->raw; + return __snapfh_to_dentry(sb, sfh, false); + } + if (fh_type != FILEID_INO32_GEN && fh_type != FILEID_INO32_GEN_PARENT) return NULL; @@ -163,13 +327,49 @@ static struct dentry *__get_parent(struct super_block *sb, static struct dentry *ceph_get_parent(struct dentry *child) { - /* don't re-export snaps */ - if (ceph_snap(d_inode(child)) != CEPH_NOSNAP) - return ERR_PTR(-EINVAL); - - dout("get_parent %p ino %llx.%llx\n", - child, ceph_vinop(d_inode(child))); - return __get_parent(child->d_sb, child, 0); + struct inode *inode = d_inode(child); + struct dentry *dn; + + if (ceph_snap(inode) != CEPH_NOSNAP) { + struct inode* dir; + bool unlinked = false; + /* do not support non-directory */ + if (!d_is_dir(child)) { + dn = ERR_PTR(-EINVAL); + goto out; + } + dir = __lookup_inode(inode->i_sb, ceph_ino(inode)); + if (IS_ERR(dir)) { + dn = ERR_CAST(dir); + goto out; + } + /* There can be multiple paths to access snapped inode. + * For simplicity, treat snapdir of head inode as parent */ + if (ceph_snap(inode) != CEPH_SNAPDIR) { + struct inode *snapdir = ceph_get_snapdir(dir); + if (dir->i_nlink == 0) + unlinked = true; + iput(dir); + if (IS_ERR(snapdir)) { + dn = ERR_CAST(snapdir); + goto out; + } + dir = snapdir; + } + /* If directory has already been deleted, futher get_parent + * will fail. Do not mark snapdir dentry as disconnected, + * this prevent exportfs from doing futher get_parent. */ + if (unlinked) + dn = d_obtain_root(dir); + else + dn = d_obtain_alias(dir); + } else { + dn = __get_parent(child->d_sb, child, 0); + } +out: + dout("get_parent %p ino %llx.%llx err=%ld\n", + child, ceph_vinop(inode), (IS_ERR(dn) ? PTR_ERR(dn) : 0)); + return dn; } /* @@ -182,6 +382,11 @@ static struct dentry *ceph_fh_to_parent(struct super_block *sb, struct ceph_nfs_confh *cfh = (void *)fid->raw; struct dentry *dentry; + if (fh_type == FILEID_BTRFS_WITH_PARENT) { + struct ceph_nfs_snapfh *sfh = (void *)fid->raw; + return __snapfh_to_dentry(sb, sfh, true); + } + if (fh_type != FILEID_INO32_GEN_PARENT) return NULL; if (fh_len < sizeof(*cfh) / 4) @@ -194,14 +399,115 @@ static struct dentry *ceph_fh_to_parent(struct super_block *sb, return dentry; } +static int __get_snap_name(struct dentry *parent, char *name, + struct dentry *child) +{ + struct inode *inode = d_inode(child); + struct inode *dir = d_inode(parent); + struct ceph_fs_client *fsc = ceph_inode_to_client(inode); + struct ceph_mds_request *req = NULL; + char *last_name = NULL; + unsigned next_offset = 2; + int err = -EINVAL; + + if (ceph_ino(inode) != ceph_ino(dir)) + goto out; + if (ceph_snap(inode) == CEPH_SNAPDIR) { + if (ceph_snap(dir) == CEPH_NOSNAP) { + strcpy(name, fsc->mount_options->snapdir_name); + err = 0; + } + goto out; + } + if (ceph_snap(dir) != CEPH_SNAPDIR) + goto out; + + while (1) { + struct ceph_mds_reply_info_parsed *rinfo; + struct ceph_mds_reply_dir_entry *rde; + int i; + + req = ceph_mdsc_create_request(fsc->mdsc, CEPH_MDS_OP_LSSNAP, + USE_AUTH_MDS); + if (IS_ERR(req)) { + err = PTR_ERR(req); + req = NULL; + goto out; + } + err = ceph_alloc_readdir_reply_buffer(req, inode); + if (err) + goto out; + + req->r_direct_mode = USE_AUTH_MDS; + req->r_readdir_offset = next_offset; + req->r_args.readdir.flags = + cpu_to_le16(CEPH_READDIR_REPLY_BITFLAGS); + if (last_name) { + req->r_path2 = last_name; + last_name = NULL; + } + + req->r_inode = dir; + ihold(dir); + req->r_dentry = dget(parent); + + inode_lock(dir); + err = ceph_mdsc_do_request(fsc->mdsc, NULL, req); + inode_unlock(dir); + + if (err < 0) + goto out; + + rinfo = &req->r_reply_info; + for (i = 0; i < rinfo->dir_nr; i++) { + rde = rinfo->dir_entries + i; + BUG_ON(!rde->inode.in); + if (ceph_snap(inode) == + le64_to_cpu(rde->inode.in->snapid)) { + memcpy(name, rde->name, rde->name_len); + name[rde->name_len] = '\0'; + err = 0; + goto out; + } + } + + if (rinfo->dir_end) + break; + + BUG_ON(rinfo->dir_nr <= 0); + rde = rinfo->dir_entries + (rinfo->dir_nr - 1); + next_offset += rinfo->dir_nr; + last_name = kstrndup(rde->name, rde->name_len, GFP_KERNEL); + if (!last_name) { + err = -ENOMEM; + goto out; + } + + ceph_mdsc_put_request(req); + req = NULL; + } + err = -ENOENT; +out: + if (req) + ceph_mdsc_put_request(req); + kfree(last_name); + dout("get_snap_name %p ino %llx.%llx err=%d\n", + child, ceph_vinop(inode), err); + return err; +} + static int ceph_get_name(struct dentry *parent, char *name, struct dentry *child) { struct ceph_mds_client *mdsc; struct ceph_mds_request *req; + struct inode *inode = d_inode(child); int err; - mdsc = ceph_inode_to_client(d_inode(child))->mdsc; + if (ceph_snap(inode) != CEPH_NOSNAP) + return __get_snap_name(parent, name, child); + + mdsc = ceph_inode_to_client(inode)->mdsc; req = ceph_mdsc_create_request(mdsc, CEPH_MDS_OP_LOOKUPNAME, USE_ANY_MDS); if (IS_ERR(req)) @@ -209,8 +515,8 @@ static int ceph_get_name(struct dentry *parent, char *name, inode_lock(d_inode(parent)); - req->r_inode = d_inode(child); - ihold(d_inode(child)); + req->r_inode = inode; + ihold(inode); req->r_ino2 = ceph_vino(d_inode(parent)); req->r_parent = d_inode(parent); set_bit(CEPH_MDS_R_PARENT_LOCKED, &req->r_req_flags); @@ -224,10 +530,10 @@ static int ceph_get_name(struct dentry *parent, char *name, memcpy(name, rinfo->dname, rinfo->dname_len); name[rinfo->dname_len] = 0; dout("get_name %p ino %llx.%llx name %s\n", - child, ceph_vinop(d_inode(child)), name); + child, ceph_vinop(inode), name); } else { dout("get_name %p ino %llx.%llx err %d\n", - child, ceph_vinop(d_inode(child)), err); + child, ceph_vinop(inode), err); } ceph_mdsc_put_request(req); diff --git a/fs/ceph/file.c b/fs/ceph/file.c index 84725b53ac21..305daf043eb0 100644 --- a/fs/ceph/file.c +++ b/fs/ceph/file.c @@ -929,7 +929,7 @@ ceph_direct_read_write(struct kiocb *iocb, struct iov_iter *iter, dout("sync_direct_%s on file %p %lld~%u snapc %p seq %lld\n", (write ? "write" : "read"), file, pos, (unsigned)count, - snapc, snapc->seq); + snapc, snapc ? snapc->seq : 0); ret = filemap_write_and_wait_range(inode->i_mapping, pos, pos + count - 1); diff --git a/fs/ceph/inode.c b/fs/ceph/inode.c index 35dae6d5493a..f85355bf49c4 100644 --- a/fs/ceph/inode.c +++ b/fs/ceph/inode.c @@ -2266,43 +2266,72 @@ int ceph_permission(struct inode *inode, int mask) return err; } +/* Craft a mask of needed caps given a set of requested statx attrs. */ +static int statx_to_caps(u32 want) +{ + int mask = 0; + + if (want & (STATX_MODE|STATX_UID|STATX_GID|STATX_CTIME)) + mask |= CEPH_CAP_AUTH_SHARED; + + if (want & (STATX_NLINK|STATX_CTIME)) + mask |= CEPH_CAP_LINK_SHARED; + + if (want & (STATX_ATIME|STATX_MTIME|STATX_CTIME|STATX_SIZE| + STATX_BLOCKS)) + mask |= CEPH_CAP_FILE_SHARED; + + if (want & (STATX_CTIME)) + mask |= CEPH_CAP_XATTR_SHARED; + + return mask; +} + /* - * Get all attributes. Hopefully somedata we'll have a statlite() - * and can limit the fields we require to be accurate. + * Get all the attributes. If we have sufficient caps for the requested attrs, + * then we can avoid talking to the MDS at all. */ int ceph_getattr(const struct path *path, struct kstat *stat, u32 request_mask, unsigned int flags) { struct inode *inode = d_inode(path->dentry); struct ceph_inode_info *ci = ceph_inode(inode); - int err; + int err = 0; - err = ceph_do_getattr(inode, CEPH_STAT_CAP_INODE_ALL, false); - if (!err) { - generic_fillattr(inode, stat); - stat->ino = ceph_translate_ino(inode->i_sb, inode->i_ino); - if (ceph_snap(inode) == CEPH_NOSNAP) - stat->dev = inode->i_sb->s_dev; + /* Skip the getattr altogether if we're asked not to sync */ + if (!(flags & AT_STATX_DONT_SYNC)) { + err = ceph_do_getattr(inode, statx_to_caps(request_mask), + flags & AT_STATX_FORCE_SYNC); + if (err) + return err; + } + + generic_fillattr(inode, stat); + stat->ino = ceph_translate_ino(inode->i_sb, inode->i_ino); + if (ceph_snap(inode) == CEPH_NOSNAP) + stat->dev = inode->i_sb->s_dev; + else + stat->dev = ci->i_snapid_map ? ci->i_snapid_map->dev : 0; + + if (S_ISDIR(inode->i_mode)) { + if (ceph_test_mount_opt(ceph_sb_to_client(inode->i_sb), + RBYTES)) + stat->size = ci->i_rbytes; else - stat->dev = ci->i_snapid_map ? ci->i_snapid_map->dev : 0; - - if (S_ISDIR(inode->i_mode)) { - if (ceph_test_mount_opt(ceph_sb_to_client(inode->i_sb), - RBYTES)) - stat->size = ci->i_rbytes; - else - stat->size = ci->i_files + ci->i_subdirs; - stat->blocks = 0; - stat->blksize = 65536; - /* - * Some applications rely on the number of st_nlink - * value on directories to be either 0 (if unlinked) - * or 2 + number of subdirectories. - */ - if (stat->nlink == 1) - /* '.' + '..' + subdirs */ - stat->nlink = 1 + 1 + ci->i_subdirs; - } + stat->size = ci->i_files + ci->i_subdirs; + stat->blocks = 0; + stat->blksize = 65536; + /* + * Some applications rely on the number of st_nlink + * value on directories to be either 0 (if unlinked) + * or 2 + number of subdirectories. + */ + if (stat->nlink == 1) + /* '.' + '..' + subdirs */ + stat->nlink = 1 + 1 + ci->i_subdirs; } + + /* Mask off any higher bits (e.g. btime) until we have support */ + stat->result_mask = request_mask & STATX_BASIC_STATS; return err; } diff --git a/fs/ceph/locks.c b/fs/ceph/locks.c index 9dae2ec7e1fa..ac9b53b89365 100644 --- a/fs/ceph/locks.c +++ b/fs/ceph/locks.c @@ -237,15 +237,6 @@ int ceph_lock(struct file *file, int cmd, struct file_lock *fl) spin_lock(&ci->i_ceph_lock); if (ci->i_ceph_flags & CEPH_I_ERROR_FILELOCK) { err = -EIO; - } else if (op == CEPH_MDS_OP_SETFILELOCK) { - /* - * increasing i_filelock_ref closes race window between - * handling request reply and adding file_lock struct to - * inode. Otherwise, i_auth_cap may get trimmed in the - * window. Caller function will decrease the counter. - */ - fl->fl_ops = &ceph_fl_lock_ops; - atomic_inc(&ci->i_filelock_ref); } spin_unlock(&ci->i_ceph_lock); if (err < 0) { @@ -299,10 +290,6 @@ int ceph_flock(struct file *file, int cmd, struct file_lock *fl) spin_lock(&ci->i_ceph_lock); if (ci->i_ceph_flags & CEPH_I_ERROR_FILELOCK) { err = -EIO; - } else { - /* see comment in ceph_lock */ - fl->fl_ops = &ceph_fl_lock_ops; - atomic_inc(&ci->i_filelock_ref); } spin_unlock(&ci->i_ceph_lock); if (err < 0) { diff --git a/fs/ceph/mds_client.c b/fs/ceph/mds_client.c index 9049c2a3e972..959b1bf7c327 100644 --- a/fs/ceph/mds_client.c +++ b/fs/ceph/mds_client.c @@ -550,15 +550,9 @@ void ceph_put_mds_session(struct ceph_mds_session *s) struct ceph_mds_session *__ceph_lookup_mds_session(struct ceph_mds_client *mdsc, int mds) { - struct ceph_mds_session *session; - if (mds >= mdsc->max_sessions || !mdsc->sessions[mds]) return NULL; - session = mdsc->sessions[mds]; - dout("lookup_mds_session %p %d\n", session, - refcount_read(&session->s_ref)); - get_session(session); - return session; + return get_session(mdsc->sessions[mds]); } static bool __have_session(struct ceph_mds_client *mdsc, int mds) @@ -1284,9 +1278,9 @@ static void cleanup_session_requests(struct ceph_mds_client *mdsc, * * Caller must hold session s_mutex. */ -static int iterate_session_caps(struct ceph_mds_session *session, - int (*cb)(struct inode *, struct ceph_cap *, - void *), void *arg) +int ceph_iterate_session_caps(struct ceph_mds_session *session, + int (*cb)(struct inode *, struct ceph_cap *, + void *), void *arg) { struct list_head *p; struct ceph_cap *cap; @@ -1451,7 +1445,7 @@ static void remove_session_caps(struct ceph_mds_session *session) LIST_HEAD(dispose); dout("remove_session_caps on %p\n", session); - iterate_session_caps(session, remove_session_caps_cb, fsc); + ceph_iterate_session_caps(session, remove_session_caps_cb, fsc); wake_up_all(&fsc->mdsc->cap_flushing_wq); @@ -1534,8 +1528,8 @@ static int wake_up_session_cb(struct inode *inode, struct ceph_cap *cap, static void wake_up_session_caps(struct ceph_mds_session *session, int ev) { dout("wake_up_session_caps %p mds%d\n", session, session->s_mds); - iterate_session_caps(session, wake_up_session_cb, - (void *)(unsigned long)ev); + ceph_iterate_session_caps(session, wake_up_session_cb, + (void *)(unsigned long)ev); } /* @@ -1768,7 +1762,7 @@ int ceph_trim_caps(struct ceph_mds_client *mdsc, session->s_mds, session->s_nr_caps, max_caps, trim_caps); if (trim_caps > 0) { session->s_trim_caps = trim_caps; - iterate_session_caps(session, trim_caps_cb, session); + ceph_iterate_session_caps(session, trim_caps_cb, session); dout("trim_caps mds%d done: %d / %d, trimmed %d\n", session->s_mds, session->s_nr_caps, max_caps, trim_caps - session->s_trim_caps); @@ -1861,7 +1855,8 @@ again: num_cap_releases--; head = msg->front.iov_base; - le32_add_cpu(&head->num, 1); + put_unaligned_le32(get_unaligned_le32(&head->num) + 1, + &head->num); item = msg->front.iov_base + msg->front.iov_len; item->ino = cpu_to_le64(cap->cap_ino); item->cap_id = cpu_to_le64(cap->cap_id); @@ -2089,43 +2084,29 @@ static inline u64 __get_oldest_tid(struct ceph_mds_client *mdsc) * Encode hidden .snap dirs as a double /, i.e. * foo/.snap/bar -> foo//bar */ -char *ceph_mdsc_build_path(struct dentry *dentry, int *plen, u64 *base, +char *ceph_mdsc_build_path(struct dentry *dentry, int *plen, u64 *pbase, int stop_on_nosnap) { struct dentry *temp; char *path; - int len, pos; + int pos; unsigned seq; + u64 base; if (!dentry) return ERR_PTR(-EINVAL); -retry: - len = 0; - seq = read_seqbegin(&rename_lock); - rcu_read_lock(); - for (temp = dentry; !IS_ROOT(temp);) { - struct inode *inode = d_inode(temp); - if (inode && ceph_snap(inode) == CEPH_SNAPDIR) - len++; /* slash only */ - else if (stop_on_nosnap && inode && - ceph_snap(inode) == CEPH_NOSNAP) - break; - else - len += 1 + temp->d_name.len; - temp = temp->d_parent; - } - rcu_read_unlock(); - if (len) - len--; /* no leading '/' */ - - path = kmalloc(len+1, GFP_NOFS); + path = __getname(); if (!path) return ERR_PTR(-ENOMEM); - pos = len; - path[pos] = 0; /* trailing null */ +retry: + pos = PATH_MAX - 1; + path[pos] = '\0'; + + seq = read_seqbegin(&rename_lock); rcu_read_lock(); - for (temp = dentry; !IS_ROOT(temp) && pos != 0; ) { + temp = dentry; + for (;;) { struct inode *inode; spin_lock(&temp->d_lock); @@ -2143,83 +2124,54 @@ retry: spin_unlock(&temp->d_lock); break; } - strncpy(path + pos, temp->d_name.name, - temp->d_name.len); + memcpy(path + pos, temp->d_name.name, temp->d_name.len); } spin_unlock(&temp->d_lock); - if (pos) - path[--pos] = '/'; temp = temp->d_parent; + + /* Are we at the root? */ + if (IS_ROOT(temp)) + break; + + /* Are we out of buffer? */ + if (--pos < 0) + break; + + path[pos] = '/'; } + base = ceph_ino(d_inode(temp)); rcu_read_unlock(); - if (pos != 0 || read_seqretry(&rename_lock, seq)) { + if (pos < 0 || read_seqretry(&rename_lock, seq)) { pr_err("build_path did not end path lookup where " - "expected, namelen is %d, pos is %d\n", len, pos); + "expected, pos is %d\n", pos); /* presumably this is only possible if racing with a rename of one of the parent directories (we can not lock the dentries above us to prevent this, but retrying should be harmless) */ - kfree(path); goto retry; } - *base = ceph_ino(d_inode(temp)); - *plen = len; + *pbase = base; + *plen = PATH_MAX - 1 - pos; dout("build_path on %p %d built %llx '%.*s'\n", - dentry, d_count(dentry), *base, len, path); - return path; -} - -/* Duplicate the dentry->d_name.name safely */ -static int clone_dentry_name(struct dentry *dentry, const char **ppath, - int *ppathlen) -{ - u32 len; - char *name; - -retry: - len = READ_ONCE(dentry->d_name.len); - name = kmalloc(len + 1, GFP_NOFS); - if (!name) - return -ENOMEM; - - spin_lock(&dentry->d_lock); - if (dentry->d_name.len != len) { - spin_unlock(&dentry->d_lock); - kfree(name); - goto retry; - } - memcpy(name, dentry->d_name.name, len); - spin_unlock(&dentry->d_lock); - - name[len] = '\0'; - *ppath = name; - *ppathlen = len; - return 0; + dentry, d_count(dentry), base, *plen, path + pos); + return path + pos; } static int build_dentry_path(struct dentry *dentry, struct inode *dir, const char **ppath, int *ppathlen, u64 *pino, bool *pfreepath, bool parent_locked) { - int ret; char *path; rcu_read_lock(); if (!dir) dir = d_inode_rcu(dentry->d_parent); - if (dir && ceph_snap(dir) == CEPH_NOSNAP) { + if (dir && parent_locked && ceph_snap(dir) == CEPH_NOSNAP) { *pino = ceph_ino(dir); rcu_read_unlock(); - if (parent_locked) { - *ppath = dentry->d_name.name; - *ppathlen = dentry->d_name.len; - } else { - ret = clone_dentry_name(dentry, ppath, ppathlen); - if (ret) - return ret; - *pfreepath = true; - } + *ppath = dentry->d_name.name; + *ppathlen = dentry->d_name.len; return 0; } rcu_read_unlock(); @@ -2331,9 +2283,9 @@ static struct ceph_msg *create_request_message(struct ceph_mds_client *mdsc, (!!req->r_inode_drop + !!req->r_dentry_drop + !!req->r_old_inode_drop + !!req->r_old_dentry_drop); if (req->r_dentry_drop) - len += req->r_dentry->d_name.len; + len += pathlen1; if (req->r_old_dentry_drop) - len += req->r_old_dentry->d_name.len; + len += pathlen2; msg = ceph_msg_new2(CEPH_MSG_CLIENT_REQUEST, len, 1, GFP_NOFS, false); if (!msg) { @@ -2410,10 +2362,10 @@ static struct ceph_msg *create_request_message(struct ceph_mds_client *mdsc, out_free2: if (freepath2) - kfree((char *)path2); + ceph_mdsc_free_path((char *)path2, pathlen2); out_free1: if (freepath1) - kfree((char *)path1); + ceph_mdsc_free_path((char *)path1, pathlen1); out: return msg; } @@ -2427,8 +2379,7 @@ static void complete_request(struct ceph_mds_client *mdsc, { if (req->r_callback) req->r_callback(mdsc, req); - else - complete_all(&req->r_completion); + complete_all(&req->r_completion); } /* @@ -2670,28 +2621,11 @@ static void kick_requests(struct ceph_mds_client *mdsc, int mds) } } -void ceph_mdsc_submit_request(struct ceph_mds_client *mdsc, +int ceph_mdsc_submit_request(struct ceph_mds_client *mdsc, struct inode *dir, struct ceph_mds_request *req) { - dout("submit_request on %p\n", req); - mutex_lock(&mdsc->mutex); - __register_request(mdsc, req, NULL); - __do_request(mdsc, req); - mutex_unlock(&mdsc->mutex); -} - -/* - * Synchrously perform an mds request. Take care of all of the - * session setup, forwarding, retry details. - */ -int ceph_mdsc_do_request(struct ceph_mds_client *mdsc, - struct inode *dir, - struct ceph_mds_request *req) -{ int err; - dout("do_request on %p\n", req); - /* take CAP_PIN refs for r_inode, r_parent, r_old_dentry */ if (req->r_inode) ceph_get_cap_refs(ceph_inode(req->r_inode), CEPH_CAP_PIN); @@ -2701,18 +2635,21 @@ int ceph_mdsc_do_request(struct ceph_mds_client *mdsc, ceph_get_cap_refs(ceph_inode(req->r_old_dentry_dir), CEPH_CAP_PIN); - /* issue */ + dout("submit_request on %p for inode %p\n", req, dir); mutex_lock(&mdsc->mutex); __register_request(mdsc, req, dir); __do_request(mdsc, req); + err = req->r_err; + mutex_unlock(&mdsc->mutex); + return err; +} - if (req->r_err) { - err = req->r_err; - goto out; - } +static int ceph_mdsc_wait_request(struct ceph_mds_client *mdsc, + struct ceph_mds_request *req) +{ + int err; /* wait */ - mutex_unlock(&mdsc->mutex); dout("do_request waiting\n"); if (!req->r_timeout && req->r_wait_for_completion) { err = req->r_wait_for_completion(mdsc, req); @@ -2753,8 +2690,26 @@ int ceph_mdsc_do_request(struct ceph_mds_client *mdsc, err = req->r_err; } -out: mutex_unlock(&mdsc->mutex); + return err; +} + +/* + * Synchrously perform an mds request. Take care of all of the + * session setup, forwarding, retry details. + */ +int ceph_mdsc_do_request(struct ceph_mds_client *mdsc, + struct inode *dir, + struct ceph_mds_request *req) +{ + int err; + + dout("do_request on %p\n", req); + + /* issue */ + err = ceph_mdsc_submit_request(mdsc, dir, req); + if (!err) + err = ceph_mdsc_wait_request(mdsc, req); dout("do_request %p done, result %d\n", req, err); return err; } @@ -3485,7 +3440,7 @@ out_freeflocks: ceph_pagelist_encode_string(pagelist, path, pathlen); ceph_pagelist_append(pagelist, &rec, sizeof(rec.v1)); out_freepath: - kfree(path); + ceph_mdsc_free_path(path, pathlen); } out_err: @@ -3642,7 +3597,7 @@ static void send_mds_reconnect(struct ceph_mds_client *mdsc, recon_state.msg_version = 2; } /* trsaverse this session's caps */ - err = iterate_session_caps(session, encode_caps_cb, &recon_state); + err = ceph_iterate_session_caps(session, encode_caps_cb, &recon_state); spin_lock(&session->s_cap_lock); session->s_cap_reconnect = 0; @@ -4125,6 +4080,8 @@ int ceph_mdsc_init(struct ceph_fs_client *fsc) mdsc->max_sessions = 0; mdsc->stopping = 0; atomic64_set(&mdsc->quotarealms_count, 0); + mdsc->quotarealms_inodes = RB_ROOT; + mutex_init(&mdsc->quotarealms_inodes_mutex); mdsc->last_snap_seq = 0; init_rwsem(&mdsc->snap_rwsem); mdsc->snap_realms = RB_ROOT; @@ -4216,6 +4173,8 @@ void ceph_mdsc_pre_umount(struct ceph_mds_client *mdsc) * their inode/dcache refs */ ceph_msgr_flush(); + + ceph_cleanup_quotarealms_inodes(mdsc); } /* diff --git a/fs/ceph/mds_client.h b/fs/ceph/mds_client.h index 50385a481fdb..a83f28bc2387 100644 --- a/fs/ceph/mds_client.h +++ b/fs/ceph/mds_client.h @@ -326,6 +326,18 @@ struct ceph_snapid_map { }; /* + * node for list of quotarealm inodes that are not visible from the filesystem + * mountpoint, but required to handle, e.g. quotas. + */ +struct ceph_quotarealm_inode { + struct rb_node node; + u64 ino; + unsigned long timeout; /* last time a lookup failed for this inode */ + struct mutex mutex; + struct inode *inode; +}; + +/* * mds client state */ struct ceph_mds_client { @@ -344,6 +356,12 @@ struct ceph_mds_client { int stopping; /* true if shutting down */ atomic64_t quotarealms_count; /* # realms with quota */ + /* + * We keep a list of inodes we don't see in the mountpoint but that we + * need to track quota realms. + */ + struct rb_root quotarealms_inodes; + struct mutex quotarealms_inodes_mutex; /* * snap_rwsem will cover cap linkage into snaprealms, and @@ -447,8 +465,9 @@ extern int ceph_alloc_readdir_reply_buffer(struct ceph_mds_request *req, struct inode *dir); extern struct ceph_mds_request * ceph_mdsc_create_request(struct ceph_mds_client *mdsc, int op, int mode); -extern void ceph_mdsc_submit_request(struct ceph_mds_client *mdsc, - struct ceph_mds_request *req); +extern int ceph_mdsc_submit_request(struct ceph_mds_client *mdsc, + struct inode *dir, + struct ceph_mds_request *req); extern int ceph_mdsc_do_request(struct ceph_mds_client *mdsc, struct inode *dir, struct ceph_mds_request *req); @@ -468,8 +487,18 @@ extern void ceph_flush_cap_releases(struct ceph_mds_client *mdsc, struct ceph_mds_session *session); extern void ceph_queue_cap_reclaim_work(struct ceph_mds_client *mdsc); extern void ceph_reclaim_caps_nr(struct ceph_mds_client *mdsc, int nr); +extern int ceph_iterate_session_caps(struct ceph_mds_session *session, + int (*cb)(struct inode *, + struct ceph_cap *, void *), + void *arg); extern void ceph_mdsc_pre_umount(struct ceph_mds_client *mdsc); +static inline void ceph_mdsc_free_path(char *path, int len) +{ + if (path) + __putname(path - (PATH_MAX - 1 - len)); +} + extern char *ceph_mdsc_build_path(struct dentry *dentry, int *plen, u64 *base, int stop_on_nosnap); diff --git a/fs/ceph/mdsmap.c b/fs/ceph/mdsmap.c index 1a2c5d390f7f..701b4fb0fb5a 100644 --- a/fs/ceph/mdsmap.c +++ b/fs/ceph/mdsmap.c @@ -205,7 +205,7 @@ struct ceph_mdsmap *ceph_mdsmap_decode(void **p, void *end) dout("mdsmap_decode %d/%d %lld mds%d.%d %s %s\n", i+1, n, global_id, mds, inc, - ceph_pr_addr(&addr.in_addr), + ceph_pr_addr(&addr), ceph_mds_state_name(state)); if (mds < 0 || state <= 0) diff --git a/fs/ceph/quota.c b/fs/ceph/quota.c index 9455d3aef0c3..c4522212872c 100644 --- a/fs/ceph/quota.c +++ b/fs/ceph/quota.c @@ -22,7 +22,16 @@ void ceph_adjust_quota_realms_count(struct inode *inode, bool inc) static inline bool ceph_has_realms_with_quotas(struct inode *inode) { struct ceph_mds_client *mdsc = ceph_inode_to_client(inode)->mdsc; - return atomic64_read(&mdsc->quotarealms_count) > 0; + struct super_block *sb = mdsc->fsc->sb; + + if (atomic64_read(&mdsc->quotarealms_count) > 0) + return true; + /* if root is the real CephFS root, we don't have quota realms */ + if (sb->s_root->d_inode && + (sb->s_root->d_inode->i_ino == CEPH_INO_ROOT)) + return false; + /* otherwise, we can't know for sure */ + return true; } void ceph_handle_quota(struct ceph_mds_client *mdsc, @@ -68,6 +77,108 @@ void ceph_handle_quota(struct ceph_mds_client *mdsc, iput(inode); } +static struct ceph_quotarealm_inode * +find_quotarealm_inode(struct ceph_mds_client *mdsc, u64 ino) +{ + struct ceph_quotarealm_inode *qri = NULL; + struct rb_node **node, *parent = NULL; + + mutex_lock(&mdsc->quotarealms_inodes_mutex); + node = &(mdsc->quotarealms_inodes.rb_node); + while (*node) { + parent = *node; + qri = container_of(*node, struct ceph_quotarealm_inode, node); + + if (ino < qri->ino) + node = &((*node)->rb_left); + else if (ino > qri->ino) + node = &((*node)->rb_right); + else + break; + } + if (!qri || (qri->ino != ino)) { + /* Not found, create a new one and insert it */ + qri = kmalloc(sizeof(*qri), GFP_KERNEL); + if (qri) { + qri->ino = ino; + qri->inode = NULL; + qri->timeout = 0; + mutex_init(&qri->mutex); + rb_link_node(&qri->node, parent, node); + rb_insert_color(&qri->node, &mdsc->quotarealms_inodes); + } else + pr_warn("Failed to alloc quotarealms_inode\n"); + } + mutex_unlock(&mdsc->quotarealms_inodes_mutex); + + return qri; +} + +/* + * This function will try to lookup a realm inode which isn't visible in the + * filesystem mountpoint. A list of these kind of inodes (not visible) is + * maintained in the mdsc and freed only when the filesystem is umounted. + * + * Note that these inodes are kept in this list even if the lookup fails, which + * allows to prevent useless lookup requests. + */ +static struct inode *lookup_quotarealm_inode(struct ceph_mds_client *mdsc, + struct super_block *sb, + struct ceph_snap_realm *realm) +{ + struct ceph_quotarealm_inode *qri; + struct inode *in; + + qri = find_quotarealm_inode(mdsc, realm->ino); + if (!qri) + return NULL; + + mutex_lock(&qri->mutex); + if (qri->inode) { + /* A request has already returned the inode */ + mutex_unlock(&qri->mutex); + return qri->inode; + } + /* Check if this inode lookup has failed recently */ + if (qri->timeout && + time_before_eq(jiffies, qri->timeout)) { + mutex_unlock(&qri->mutex); + return NULL; + } + in = ceph_lookup_inode(sb, realm->ino); + if (IS_ERR(in)) { + pr_warn("Can't lookup inode %llx (err: %ld)\n", + realm->ino, PTR_ERR(in)); + qri->timeout = jiffies + msecs_to_jiffies(60 * 1000); /* XXX */ + } else { + qri->timeout = 0; + qri->inode = in; + } + mutex_unlock(&qri->mutex); + + return in; +} + +void ceph_cleanup_quotarealms_inodes(struct ceph_mds_client *mdsc) +{ + struct ceph_quotarealm_inode *qri; + struct rb_node *node; + + /* + * It should now be safe to clean quotarealms_inode tree without holding + * mdsc->quotarealms_inodes_mutex... + */ + mutex_lock(&mdsc->quotarealms_inodes_mutex); + while (!RB_EMPTY_ROOT(&mdsc->quotarealms_inodes)) { + node = rb_first(&mdsc->quotarealms_inodes); + qri = rb_entry(node, struct ceph_quotarealm_inode, node); + rb_erase(node, &mdsc->quotarealms_inodes); + iput(qri->inode); + kfree(qri); + } + mutex_unlock(&mdsc->quotarealms_inodes_mutex); +} + /* * This function walks through the snaprealm for an inode and returns the * ceph_snap_realm for the first snaprealm that has quotas set (either max_files @@ -76,9 +187,15 @@ void ceph_handle_quota(struct ceph_mds_client *mdsc, * * Note that the caller is responsible for calling ceph_put_snap_realm() on the * returned realm. + * + * Callers of this function need to hold mdsc->snap_rwsem. However, if there's + * a need to do an inode lookup, this rwsem will be temporarily dropped. Hence + * the 'retry' argument: if rwsem needs to be dropped and 'retry' is 'false' + * this function will return -EAGAIN; otherwise, the snaprealms walk-through + * will be restarted. */ static struct ceph_snap_realm *get_quota_realm(struct ceph_mds_client *mdsc, - struct inode *inode) + struct inode *inode, bool retry) { struct ceph_inode_info *ci = NULL; struct ceph_snap_realm *realm, *next; @@ -88,6 +205,7 @@ static struct ceph_snap_realm *get_quota_realm(struct ceph_mds_client *mdsc, if (ceph_snap(inode) != CEPH_NOSNAP) return NULL; +restart: realm = ceph_inode(inode)->i_snap_realm; if (realm) ceph_get_snap_realm(mdsc, realm); @@ -95,11 +213,25 @@ static struct ceph_snap_realm *get_quota_realm(struct ceph_mds_client *mdsc, pr_err_ratelimited("get_quota_realm: ino (%llx.%llx) " "null i_snap_realm\n", ceph_vinop(inode)); while (realm) { + bool has_inode; + spin_lock(&realm->inodes_with_caps_lock); - in = realm->inode ? igrab(realm->inode) : NULL; + has_inode = realm->inode; + in = has_inode ? igrab(realm->inode) : NULL; spin_unlock(&realm->inodes_with_caps_lock); - if (!in) + if (has_inode && !in) break; + if (!in) { + up_read(&mdsc->snap_rwsem); + in = lookup_quotarealm_inode(mdsc, inode->i_sb, realm); + down_read(&mdsc->snap_rwsem); + if (IS_ERR_OR_NULL(in)) + break; + ceph_put_snap_realm(mdsc, realm); + if (!retry) + return ERR_PTR(-EAGAIN); + goto restart; + } ci = ceph_inode(in); has_quota = __ceph_has_any_quota(ci); @@ -125,9 +257,22 @@ bool ceph_quota_is_same_realm(struct inode *old, struct inode *new) struct ceph_snap_realm *old_realm, *new_realm; bool is_same; +restart: + /* + * We need to lookup 2 quota realms atomically, i.e. with snap_rwsem. + * However, get_quota_realm may drop it temporarily. By setting the + * 'retry' parameter to 'false', we'll get -EAGAIN if the rwsem was + * dropped and we can then restart the whole operation. + */ down_read(&mdsc->snap_rwsem); - old_realm = get_quota_realm(mdsc, old); - new_realm = get_quota_realm(mdsc, new); + old_realm = get_quota_realm(mdsc, old, true); + new_realm = get_quota_realm(mdsc, new, false); + if (PTR_ERR(new_realm) == -EAGAIN) { + up_read(&mdsc->snap_rwsem); + if (old_realm) + ceph_put_snap_realm(mdsc, old_realm); + goto restart; + } is_same = (old_realm == new_realm); up_read(&mdsc->snap_rwsem); @@ -166,6 +311,7 @@ static bool check_quota_exceeded(struct inode *inode, enum quota_check_op op, return false; down_read(&mdsc->snap_rwsem); +restart: realm = ceph_inode(inode)->i_snap_realm; if (realm) ceph_get_snap_realm(mdsc, realm); @@ -173,12 +319,23 @@ static bool check_quota_exceeded(struct inode *inode, enum quota_check_op op, pr_err_ratelimited("check_quota_exceeded: ino (%llx.%llx) " "null i_snap_realm\n", ceph_vinop(inode)); while (realm) { + bool has_inode; + spin_lock(&realm->inodes_with_caps_lock); - in = realm->inode ? igrab(realm->inode) : NULL; + has_inode = realm->inode; + in = has_inode ? igrab(realm->inode) : NULL; spin_unlock(&realm->inodes_with_caps_lock); - if (!in) + if (has_inode && !in) break; - + if (!in) { + up_read(&mdsc->snap_rwsem); + in = lookup_quotarealm_inode(mdsc, inode->i_sb, realm); + down_read(&mdsc->snap_rwsem); + if (IS_ERR_OR_NULL(in)) + break; + ceph_put_snap_realm(mdsc, realm); + goto restart; + } ci = ceph_inode(in); spin_lock(&ci->i_ceph_lock); if (op == QUOTA_CHECK_MAX_FILES_OP) { @@ -314,7 +471,7 @@ bool ceph_quota_update_statfs(struct ceph_fs_client *fsc, struct kstatfs *buf) bool is_updated = false; down_read(&mdsc->snap_rwsem); - realm = get_quota_realm(mdsc, d_inode(fsc->sb->s_root)); + realm = get_quota_realm(mdsc, d_inode(fsc->sb->s_root), true); up_read(&mdsc->snap_rwsem); if (!realm) return false; diff --git a/fs/ceph/super.c b/fs/ceph/super.c index 285edda4fc3b..c864b44c8341 100644 --- a/fs/ceph/super.c +++ b/fs/ceph/super.c @@ -845,6 +845,12 @@ static void ceph_umount_begin(struct super_block *sb) return; } +static int ceph_remount(struct super_block *sb, int *flags, char *data) +{ + sync_filesystem(sb); + return 0; +} + static const struct super_operations ceph_super_ops = { .alloc_inode = ceph_alloc_inode, .destroy_inode = ceph_destroy_inode, @@ -853,6 +859,7 @@ static const struct super_operations ceph_super_ops = { .drop_inode = ceph_drop_inode, .sync_fs = ceph_sync_fs, .put_super = ceph_put_super, + .remount_fs = ceph_remount, .show_options = ceph_show_options, .statfs = ceph_statfs, .umount_begin = ceph_umount_begin, diff --git a/fs/ceph/super.h b/fs/ceph/super.h index c5b4a05905c0..6edab9a750f8 100644 --- a/fs/ceph/super.h +++ b/fs/ceph/super.h @@ -1083,6 +1083,7 @@ extern long ceph_ioctl(struct file *file, unsigned int cmd, unsigned long arg); /* export.c */ extern const struct export_operations ceph_export_ops; +struct inode *ceph_lookup_inode(struct super_block *sb, u64 ino); /* locks.c */ extern __init void ceph_flock_init(void); @@ -1133,5 +1134,6 @@ extern bool ceph_quota_is_max_bytes_approaching(struct inode *inode, loff_t newlen); extern bool ceph_quota_update_statfs(struct ceph_fs_client *fsc, struct kstatfs *buf); +extern void ceph_cleanup_quotarealms_inodes(struct ceph_mds_client *mdsc); #endif /* _FS_CEPH_SUPER_H */ diff --git a/include/linux/ceph/ceph_fs.h b/include/linux/ceph/ceph_fs.h index 4903deb0777a..3ac0feaf2b5e 100644 --- a/include/linux/ceph/ceph_fs.h +++ b/include/linux/ceph/ceph_fs.h @@ -436,6 +436,12 @@ union ceph_mds_request_args { __le64 length; /* num bytes to lock from start */ __u8 wait; /* will caller wait for lock to become available? */ } __attribute__ ((packed)) filelock_change; + struct { + __le32 mask; /* CEPH_CAP_* */ + __le64 snapid; + __le64 parent; + __le32 hash; + } __attribute__ ((packed)) lookupino; } __attribute__ ((packed)); #define CEPH_MDS_FLAG_REPLAY 1 /* this is a replayed op */ diff --git a/include/linux/ceph/messenger.h b/include/linux/ceph/messenger.h index 800a2128d411..23895d178149 100644 --- a/include/linux/ceph/messenger.h +++ b/include/linux/ceph/messenger.h @@ -323,7 +323,8 @@ struct ceph_connection { }; -extern const char *ceph_pr_addr(const struct sockaddr_storage *ss); +extern const char *ceph_pr_addr(const struct ceph_entity_addr *addr); + extern int ceph_parse_ips(const char *c, const char *end, struct ceph_entity_addr *addr, int max_count, int *count); diff --git a/include/linux/ceph/osdmap.h b/include/linux/ceph/osdmap.h index 5675b1f09bc5..e081b56f1c1d 100644 --- a/include/linux/ceph/osdmap.h +++ b/include/linux/ceph/osdmap.h @@ -110,17 +110,16 @@ struct ceph_object_id { int name_len; }; +#define __CEPH_OID_INITIALIZER(oid) { .name = (oid).inline_name } + +#define CEPH_DEFINE_OID_ONSTACK(oid) \ + struct ceph_object_id oid = __CEPH_OID_INITIALIZER(oid) + static inline void ceph_oid_init(struct ceph_object_id *oid) { - oid->name = oid->inline_name; - oid->name_len = 0; + *oid = (struct ceph_object_id) __CEPH_OID_INITIALIZER(*oid); } -#define CEPH_OID_INIT_ONSTACK(oid) \ - ({ ceph_oid_init(&oid); oid; }) -#define CEPH_DEFINE_OID_ONSTACK(oid) \ - struct ceph_object_id oid = CEPH_OID_INIT_ONSTACK(oid) - static inline bool ceph_oid_empty(const struct ceph_object_id *oid) { return oid->name == oid->inline_name && !oid->name_len; diff --git a/net/ceph/cls_lock_client.c b/net/ceph/cls_lock_client.c index 2105a6eaa66c..4cc28541281b 100644 --- a/net/ceph/cls_lock_client.c +++ b/net/ceph/cls_lock_client.c @@ -271,7 +271,7 @@ static int decode_locker(void **p, void *end, struct ceph_locker *locker) dout("%s %s%llu cookie %s addr %s\n", __func__, ENTITY_NAME(locker->id.name), locker->id.cookie, - ceph_pr_addr(&locker->info.addr.in_addr)); + ceph_pr_addr(&locker->info.addr)); return 0; } diff --git a/net/ceph/debugfs.c b/net/ceph/debugfs.c index 46f65709a6ff..63aef9915f75 100644 --- a/net/ceph/debugfs.c +++ b/net/ceph/debugfs.c @@ -46,7 +46,7 @@ static int monmap_show(struct seq_file *s, void *p) seq_printf(s, "\t%s%lld\t%s\n", ENTITY_NAME(inst->name), - ceph_pr_addr(&inst->addr.in_addr)); + ceph_pr_addr(&inst->addr)); } return 0; } @@ -82,7 +82,7 @@ static int osdmap_show(struct seq_file *s, void *p) char sb[64]; seq_printf(s, "osd%d\t%s\t%3d%%\t(%s)\t%3d%%\n", - i, ceph_pr_addr(&addr->in_addr), + i, ceph_pr_addr(addr), ((map->osd_weight[i]*100) >> 16), ceph_osdmap_state_str(sb, sizeof(sb), state), ((ceph_get_primary_affinity(map, i)*100) >> 16)); diff --git a/net/ceph/messenger.c b/net/ceph/messenger.c index 3083988ce729..3ee380758ddd 100644 --- a/net/ceph/messenger.c +++ b/net/ceph/messenger.c @@ -186,17 +186,18 @@ static atomic_t addr_str_seq = ATOMIC_INIT(0); static struct page *zero_page; /* used in certain error cases */ -const char *ceph_pr_addr(const struct sockaddr_storage *ss) +const char *ceph_pr_addr(const struct ceph_entity_addr *addr) { int i; char *s; - struct sockaddr_in *in4 = (struct sockaddr_in *) ss; - struct sockaddr_in6 *in6 = (struct sockaddr_in6 *) ss; + struct sockaddr_storage ss = addr->in_addr; /* align */ + struct sockaddr_in *in4 = (struct sockaddr_in *)&ss; + struct sockaddr_in6 *in6 = (struct sockaddr_in6 *)&ss; i = atomic_inc_return(&addr_str_seq) & ADDR_STR_COUNT_MASK; s = addr_str[i]; - switch (ss->ss_family) { + switch (ss.ss_family) { case AF_INET: snprintf(s, MAX_ADDR_STR_LEN, "%pI4:%hu", &in4->sin_addr, ntohs(in4->sin_port)); @@ -209,7 +210,7 @@ const char *ceph_pr_addr(const struct sockaddr_storage *ss) default: snprintf(s, MAX_ADDR_STR_LEN, "(unknown sockaddr family %hu)", - ss->ss_family); + ss.ss_family); } return s; @@ -449,7 +450,7 @@ static void set_sock_callbacks(struct socket *sock, */ static int ceph_tcp_connect(struct ceph_connection *con) { - struct sockaddr_storage *paddr = &con->peer_addr.in_addr; + struct sockaddr_storage ss = con->peer_addr.in_addr; /* align */ struct socket *sock; unsigned int noio_flag; int ret; @@ -458,7 +459,7 @@ static int ceph_tcp_connect(struct ceph_connection *con) /* sock_create_kern() allocates with GFP_KERNEL */ noio_flag = memalloc_noio_save(); - ret = sock_create_kern(read_pnet(&con->msgr->net), paddr->ss_family, + ret = sock_create_kern(read_pnet(&con->msgr->net), ss.ss_family, SOCK_STREAM, IPPROTO_TCP, &sock); memalloc_noio_restore(noio_flag); if (ret) @@ -471,18 +472,18 @@ static int ceph_tcp_connect(struct ceph_connection *con) set_sock_callbacks(sock, con); - dout("connect %s\n", ceph_pr_addr(&con->peer_addr.in_addr)); + dout("connect %s\n", ceph_pr_addr(&con->peer_addr)); con_sock_state_connecting(con); - ret = sock->ops->connect(sock, (struct sockaddr *)paddr, sizeof(*paddr), + ret = sock->ops->connect(sock, (struct sockaddr *)&ss, sizeof(ss), O_NONBLOCK); if (ret == -EINPROGRESS) { dout("connect %s EINPROGRESS sk_state = %u\n", - ceph_pr_addr(&con->peer_addr.in_addr), + ceph_pr_addr(&con->peer_addr), sock->sk->sk_state); } else if (ret < 0) { pr_err("connect %s error %d\n", - ceph_pr_addr(&con->peer_addr.in_addr), ret); + ceph_pr_addr(&con->peer_addr), ret); sock_release(sock); return ret; } @@ -669,8 +670,7 @@ static void reset_connection(struct ceph_connection *con) void ceph_con_close(struct ceph_connection *con) { mutex_lock(&con->mutex); - dout("con_close %p peer %s\n", con, - ceph_pr_addr(&con->peer_addr.in_addr)); + dout("con_close %p peer %s\n", con, ceph_pr_addr(&con->peer_addr)); con->state = CON_STATE_CLOSED; con_flag_clear(con, CON_FLAG_LOSSYTX); /* so we retry next connect */ @@ -694,7 +694,7 @@ void ceph_con_open(struct ceph_connection *con, struct ceph_entity_addr *addr) { mutex_lock(&con->mutex); - dout("con_open %p %s\n", con, ceph_pr_addr(&addr->in_addr)); + dout("con_open %p %s\n", con, ceph_pr_addr(addr)); WARN_ON(con->state != CON_STATE_CLOSED); con->state = CON_STATE_PREOPEN; @@ -1788,21 +1788,22 @@ static int verify_hello(struct ceph_connection *con) { if (memcmp(con->in_banner, CEPH_BANNER, strlen(CEPH_BANNER))) { pr_err("connect to %s got bad banner\n", - ceph_pr_addr(&con->peer_addr.in_addr)); + ceph_pr_addr(&con->peer_addr)); con->error_msg = "protocol error, bad banner"; return -1; } return 0; } -static bool addr_is_blank(struct sockaddr_storage *ss) +static bool addr_is_blank(struct ceph_entity_addr *addr) { - struct in_addr *addr = &((struct sockaddr_in *)ss)->sin_addr; - struct in6_addr *addr6 = &((struct sockaddr_in6 *)ss)->sin6_addr; + struct sockaddr_storage ss = addr->in_addr; /* align */ + struct in_addr *addr4 = &((struct sockaddr_in *)&ss)->sin_addr; + struct in6_addr *addr6 = &((struct sockaddr_in6 *)&ss)->sin6_addr; - switch (ss->ss_family) { + switch (ss.ss_family) { case AF_INET: - return addr->s_addr == htonl(INADDR_ANY); + return addr4->s_addr == htonl(INADDR_ANY); case AF_INET6: return ipv6_addr_any(addr6); default: @@ -1810,25 +1811,25 @@ static bool addr_is_blank(struct sockaddr_storage *ss) } } -static int addr_port(struct sockaddr_storage *ss) +static int addr_port(struct ceph_entity_addr *addr) { - switch (ss->ss_family) { + switch (get_unaligned(&addr->in_addr.ss_family)) { case AF_INET: - return ntohs(((struct sockaddr_in *)ss)->sin_port); + return ntohs(get_unaligned(&((struct sockaddr_in *)&addr->in_addr)->sin_port)); case AF_INET6: - return ntohs(((struct sockaddr_in6 *)ss)->sin6_port); + return ntohs(get_unaligned(&((struct sockaddr_in6 *)&addr->in_addr)->sin6_port)); } return 0; } -static void addr_set_port(struct sockaddr_storage *ss, int p) +static void addr_set_port(struct ceph_entity_addr *addr, int p) { - switch (ss->ss_family) { + switch (get_unaligned(&addr->in_addr.ss_family)) { case AF_INET: - ((struct sockaddr_in *)ss)->sin_port = htons(p); + put_unaligned(htons(p), &((struct sockaddr_in *)&addr->in_addr)->sin_port); break; case AF_INET6: - ((struct sockaddr_in6 *)ss)->sin6_port = htons(p); + put_unaligned(htons(p), &((struct sockaddr_in6 *)&addr->in_addr)->sin6_port); break; } } @@ -1836,21 +1837,18 @@ static void addr_set_port(struct sockaddr_storage *ss, int p) /* * Unlike other *_pton function semantics, zero indicates success. */ -static int ceph_pton(const char *str, size_t len, struct sockaddr_storage *ss, +static int ceph_pton(const char *str, size_t len, struct ceph_entity_addr *addr, char delim, const char **ipend) { - struct sockaddr_in *in4 = (struct sockaddr_in *) ss; - struct sockaddr_in6 *in6 = (struct sockaddr_in6 *) ss; + memset(&addr->in_addr, 0, sizeof(addr->in_addr)); - memset(ss, 0, sizeof(*ss)); - - if (in4_pton(str, len, (u8 *)&in4->sin_addr.s_addr, delim, ipend)) { - ss->ss_family = AF_INET; + if (in4_pton(str, len, (u8 *)&((struct sockaddr_in *)&addr->in_addr)->sin_addr.s_addr, delim, ipend)) { + put_unaligned(AF_INET, &addr->in_addr.ss_family); return 0; } - if (in6_pton(str, len, (u8 *)&in6->sin6_addr.s6_addr, delim, ipend)) { - ss->ss_family = AF_INET6; + if (in6_pton(str, len, (u8 *)&((struct sockaddr_in6 *)&addr->in_addr)->sin6_addr.s6_addr, delim, ipend)) { + put_unaligned(AF_INET6, &addr->in_addr.ss_family); return 0; } @@ -1862,7 +1860,7 @@ static int ceph_pton(const char *str, size_t len, struct sockaddr_storage *ss, */ #ifdef CONFIG_CEPH_LIB_USE_DNS_RESOLVER static int ceph_dns_resolve_name(const char *name, size_t namelen, - struct sockaddr_storage *ss, char delim, const char **ipend) + struct ceph_entity_addr *addr, char delim, const char **ipend) { const char *end, *delim_p; char *colon_p, *ip_addr = NULL; @@ -1891,7 +1889,7 @@ static int ceph_dns_resolve_name(const char *name, size_t namelen, /* do dns_resolve upcall */ ip_len = dns_query(NULL, name, end - name, NULL, &ip_addr, NULL); if (ip_len > 0) - ret = ceph_pton(ip_addr, ip_len, ss, -1, NULL); + ret = ceph_pton(ip_addr, ip_len, addr, -1, NULL); else ret = -ESRCH; @@ -1900,13 +1898,13 @@ static int ceph_dns_resolve_name(const char *name, size_t namelen, *ipend = end; pr_info("resolve '%.*s' (ret=%d): %s\n", (int)(end - name), name, - ret, ret ? "failed" : ceph_pr_addr(ss)); + ret, ret ? "failed" : ceph_pr_addr(addr)); return ret; } #else static inline int ceph_dns_resolve_name(const char *name, size_t namelen, - struct sockaddr_storage *ss, char delim, const char **ipend) + struct ceph_entity_addr *addr, char delim, const char **ipend) { return -EINVAL; } @@ -1917,13 +1915,13 @@ static inline int ceph_dns_resolve_name(const char *name, size_t namelen, * then try to extract a hostname to resolve using userspace DNS upcall. */ static int ceph_parse_server_name(const char *name, size_t namelen, - struct sockaddr_storage *ss, char delim, const char **ipend) + struct ceph_entity_addr *addr, char delim, const char **ipend) { int ret; - ret = ceph_pton(name, namelen, ss, delim, ipend); + ret = ceph_pton(name, namelen, addr, delim, ipend); if (ret) - ret = ceph_dns_resolve_name(name, namelen, ss, delim, ipend); + ret = ceph_dns_resolve_name(name, namelen, addr, delim, ipend); return ret; } @@ -1942,7 +1940,6 @@ int ceph_parse_ips(const char *c, const char *end, dout("parse_ips on '%.*s'\n", (int)(end-c), c); for (i = 0; i < max_count; i++) { const char *ipend; - struct sockaddr_storage *ss = &addr[i].in_addr; int port; char delim = ','; @@ -1951,7 +1948,7 @@ int ceph_parse_ips(const char *c, const char *end, p++; } - ret = ceph_parse_server_name(p, end - p, ss, delim, &ipend); + ret = ceph_parse_server_name(p, end - p, &addr[i], delim, &ipend); if (ret) goto bad; ret = -EINVAL; @@ -1982,9 +1979,9 @@ int ceph_parse_ips(const char *c, const char *end, port = CEPH_MON_PORT; } - addr_set_port(ss, port); + addr_set_port(&addr[i], port); - dout("parse_ips got %s\n", ceph_pr_addr(ss)); + dout("parse_ips got %s\n", ceph_pr_addr(&addr[i])); if (p == end) break; @@ -2023,12 +2020,12 @@ static int process_banner(struct ceph_connection *con) */ if (memcmp(&con->peer_addr, &con->actual_peer_addr, sizeof(con->peer_addr)) != 0 && - !(addr_is_blank(&con->actual_peer_addr.in_addr) && + !(addr_is_blank(&con->actual_peer_addr) && con->actual_peer_addr.nonce == con->peer_addr.nonce)) { pr_warn("wrong peer, want %s/%d, got %s/%d\n", - ceph_pr_addr(&con->peer_addr.in_addr), + ceph_pr_addr(&con->peer_addr), (int)le32_to_cpu(con->peer_addr.nonce), - ceph_pr_addr(&con->actual_peer_addr.in_addr), + ceph_pr_addr(&con->actual_peer_addr), (int)le32_to_cpu(con->actual_peer_addr.nonce)); con->error_msg = "wrong peer at address"; return -1; @@ -2037,16 +2034,16 @@ static int process_banner(struct ceph_connection *con) /* * did we learn our address? */ - if (addr_is_blank(&con->msgr->inst.addr.in_addr)) { - int port = addr_port(&con->msgr->inst.addr.in_addr); + if (addr_is_blank(&con->msgr->inst.addr)) { + int port = addr_port(&con->msgr->inst.addr); memcpy(&con->msgr->inst.addr.in_addr, &con->peer_addr_for_me.in_addr, sizeof(con->peer_addr_for_me.in_addr)); - addr_set_port(&con->msgr->inst.addr.in_addr, port); + addr_set_port(&con->msgr->inst.addr, port); encode_my_addr(con->msgr); dout("process_banner learned my addr is %s\n", - ceph_pr_addr(&con->msgr->inst.addr.in_addr)); + ceph_pr_addr(&con->msgr->inst.addr)); } return 0; @@ -2097,7 +2094,7 @@ static int process_connect(struct ceph_connection *con) pr_err("%s%lld %s feature set mismatch," " my %llx < server's %llx, missing %llx\n", ENTITY_NAME(con->peer_name), - ceph_pr_addr(&con->peer_addr.in_addr), + ceph_pr_addr(&con->peer_addr), sup_feat, server_feat, server_feat & ~sup_feat); con->error_msg = "missing required protocol features"; reset_connection(con); @@ -2107,7 +2104,7 @@ static int process_connect(struct ceph_connection *con) pr_err("%s%lld %s protocol version mismatch," " my %d != server's %d\n", ENTITY_NAME(con->peer_name), - ceph_pr_addr(&con->peer_addr.in_addr), + ceph_pr_addr(&con->peer_addr), le32_to_cpu(con->out_connect.protocol_version), le32_to_cpu(con->in_reply.protocol_version)); con->error_msg = "protocol version mismatch"; @@ -2141,7 +2138,7 @@ static int process_connect(struct ceph_connection *con) le32_to_cpu(con->in_reply.connect_seq)); pr_err("%s%lld %s connection reset\n", ENTITY_NAME(con->peer_name), - ceph_pr_addr(&con->peer_addr.in_addr)); + ceph_pr_addr(&con->peer_addr)); reset_connection(con); con_out_kvec_reset(con); ret = prepare_write_connect(con); @@ -2198,7 +2195,7 @@ static int process_connect(struct ceph_connection *con) pr_err("%s%lld %s protocol feature mismatch," " my required %llx > server's %llx, need %llx\n", ENTITY_NAME(con->peer_name), - ceph_pr_addr(&con->peer_addr.in_addr), + ceph_pr_addr(&con->peer_addr), req_feat, server_feat, req_feat & ~server_feat); con->error_msg = "missing required protocol features"; reset_connection(con); @@ -2405,7 +2402,7 @@ static int read_partial_message(struct ceph_connection *con) if ((s64)seq - (s64)con->in_seq < 1) { pr_info("skipping %s%lld %s seq %lld expected %lld\n", ENTITY_NAME(con->peer_name), - ceph_pr_addr(&con->peer_addr.in_addr), + ceph_pr_addr(&con->peer_addr), seq, con->in_seq + 1); con->in_base_pos = -front_len - middle_len - data_len - sizeof_footer(con); @@ -2984,10 +2981,10 @@ static void ceph_con_workfn(struct work_struct *work) static void con_fault(struct ceph_connection *con) { dout("fault %p state %lu to peer %s\n", - con, con->state, ceph_pr_addr(&con->peer_addr.in_addr)); + con, con->state, ceph_pr_addr(&con->peer_addr)); pr_warn("%s%lld %s %s\n", ENTITY_NAME(con->peer_name), - ceph_pr_addr(&con->peer_addr.in_addr), con->error_msg); + ceph_pr_addr(&con->peer_addr), con->error_msg); con->error_msg = NULL; WARN_ON(con->state != CON_STATE_CONNECTING && diff --git a/net/ceph/mon_client.c b/net/ceph/mon_client.c index a53e4fbb6319..895679d3529b 100644 --- a/net/ceph/mon_client.c +++ b/net/ceph/mon_client.c @@ -76,7 +76,7 @@ struct ceph_monmap *ceph_monmap_decode(void *p, void *end) m->num_mon); for (i = 0; i < m->num_mon; i++) dout("monmap_decode mon%d is %s\n", i, - ceph_pr_addr(&m->mon_inst[i].addr.in_addr)); + ceph_pr_addr(&m->mon_inst[i].addr)); return m; bad: @@ -203,7 +203,7 @@ static void reopen_session(struct ceph_mon_client *monc) { if (!monc->hunting) pr_info("mon%d %s session lost, hunting for new mon\n", - monc->cur_mon, ceph_pr_addr(&monc->con.peer_addr.in_addr)); + monc->cur_mon, ceph_pr_addr(&monc->con.peer_addr)); __close_session(monc); __open_session(monc); @@ -1178,7 +1178,7 @@ static void handle_auth_reply(struct ceph_mon_client *monc, __resend_generic_request(monc); pr_info("mon%d %s session established\n", monc->cur_mon, - ceph_pr_addr(&monc->con.peer_addr.in_addr)); + ceph_pr_addr(&monc->con.peer_addr)); } out: diff --git a/net/ceph/osd_client.c b/net/ceph/osd_client.c index 6f739de28918..9a8eca5eda65 100644 --- a/net/ceph/osd_client.c +++ b/net/ceph/osd_client.c @@ -4926,7 +4926,7 @@ static int decode_watcher(void **p, void *end, struct ceph_watch_item *item) dout("%s %s%llu cookie %llu addr %s\n", __func__, ENTITY_NAME(item->name), item->cookie, - ceph_pr_addr(&item->addr.in_addr)); + ceph_pr_addr(&item->addr)); return 0; } |