summaryrefslogtreecommitdiffstats
path: root/net
diff options
context:
space:
mode:
Diffstat (limited to 'net')
-rw-r--r--net/ceph/Kconfig3
-rw-r--r--net/ceph/Makefile3
-rw-r--r--net/ceph/auth.c408
-rw-r--r--net/ceph/auth_none.c5
-rw-r--r--net/ceph/auth_x.c298
-rw-r--r--net/ceph/auth_x_protocol.h3
-rw-r--r--net/ceph/ceph_common.c63
-rw-r--r--net/ceph/ceph_strings.c28
-rw-r--r--net/ceph/crypto.h3
-rw-r--r--net/ceph/decode.c101
-rw-r--r--net/ceph/messenger.c1958
-rw-r--r--net/ceph/messenger_v1.c1506
-rw-r--r--net/ceph/messenger_v2.c3443
-rw-r--r--net/ceph/mon_client.c320
-rw-r--r--net/ceph/osd_client.c111
-rw-r--r--net/ceph/osdmap.c45
-rw-r--r--net/dccp/ipv4.c2
-rw-r--r--net/dccp/ipv6.c6
-rw-r--r--net/ipv4/icmp.c4
-rw-r--r--net/ipv4/inet_connection_sock.c4
-rw-r--r--net/ipv4/ip_output.c2
-rw-r--r--net/ipv4/ping.c2
-rw-r--r--net/ipv4/raw.c2
-rw-r--r--net/ipv4/syncookies.c2
-rw-r--r--net/ipv4/udp.c2
-rw-r--r--net/ipv6/af_inet6.c2
-rw-r--r--net/ipv6/datagram.c2
-rw-r--r--net/ipv6/icmp.c6
-rw-r--r--net/ipv6/inet6_connection_sock.c4
-rw-r--r--net/ipv6/netfilter/nf_reject_ipv6.c2
-rw-r--r--net/ipv6/ping.c2
-rw-r--r--net/ipv6/raw.c2
-rw-r--r--net/ipv6/syncookies.c2
-rw-r--r--net/ipv6/tcp_ipv6.c4
-rw-r--r--net/ipv6/udp.c2
-rw-r--r--net/l2tp/l2tp_ip6.c2
-rw-r--r--net/netfilter/nf_synproxy_core.c2
-rw-r--r--net/rds/ib.c10
-rw-r--r--net/rds/ib.h13
-rw-r--r--net/rds/ib_cm.c128
-rw-r--r--net/rds/ib_recv.c18
-rw-r--r--net/rds/ib_send.c8
-rw-r--r--net/socket.c15
-rw-r--r--net/sunrpc/auth_gss/gss_rpc_upcall.c15
-rw-r--r--net/sunrpc/auth_gss/gss_rpc_xdr.c3
-rw-r--r--net/sunrpc/cache.c41
-rw-r--r--net/sunrpc/clnt.c5
-rw-r--r--net/sunrpc/debugfs.c4
-rw-r--r--net/sunrpc/sched.c65
-rw-r--r--net/sunrpc/svc.c16
-rw-r--r--net/sunrpc/svc_xprt.c4
-rw-r--r--net/sunrpc/svcsock.c8
-rw-r--r--net/sunrpc/xdr.c895
-rw-r--r--net/sunrpc/xprt.c117
-rw-r--r--net/sunrpc/xprtrdma/Makefile2
-rw-r--r--net/sunrpc/xprtrdma/backchannel.c6
-rw-r--r--net/sunrpc/xprtrdma/frwr_ops.c81
-rw-r--r--net/sunrpc/xprtrdma/module.c1
-rw-r--r--net/sunrpc/xprtrdma/rpc_rdma.c72
-rw-r--r--net/sunrpc/xprtrdma/svc_rdma_backchannel.c14
-rw-r--r--net/sunrpc/xprtrdma/svc_rdma_pcl.c306
-rw-r--r--net/sunrpc/xprtrdma/svc_rdma_recvfrom.c314
-rw-r--r--net/sunrpc/xprtrdma/svc_rdma_rw.c600
-rw-r--r--net/sunrpc/xprtrdma/svc_rdma_sendto.c562
-rw-r--r--net/sunrpc/xprtrdma/svc_rdma_transport.c2
-rw-r--r--net/sunrpc/xprtrdma/transport.c8
-rw-r--r--net/sunrpc/xprtrdma/verbs.c30
-rw-r--r--net/sunrpc/xprtrdma/xprt_rdma.h9
-rw-r--r--net/sunrpc/xprtsock.c7
-rw-r--r--net/xfrm/xfrm_state.c6
70 files changed, 8614 insertions, 3127 deletions
diff --git a/net/ceph/Kconfig b/net/ceph/Kconfig
index f36f9a3a4e20..c5c4eef3a9ff 100644
--- a/net/ceph/Kconfig
+++ b/net/ceph/Kconfig
@@ -5,6 +5,9 @@ config CEPH_LIB
select LIBCRC32C
select CRYPTO_AES
select CRYPTO_CBC
+ select CRYPTO_GCM
+ select CRYPTO_HMAC
+ select CRYPTO_SHA256
select CRYPTO
select KEYS
default n
diff --git a/net/ceph/Makefile b/net/ceph/Makefile
index ce09bb4fb249..8802a0c0155d 100644
--- a/net/ceph/Makefile
+++ b/net/ceph/Makefile
@@ -14,4 +14,5 @@ libceph-y := ceph_common.o messenger.o msgpool.o buffer.o pagelist.o \
crypto.o armor.o \
auth_x.o \
ceph_strings.o ceph_hash.o \
- pagevec.o snapshot.o string_table.o
+ pagevec.o snapshot.o string_table.o \
+ messenger_v1.o messenger_v2.o
diff --git a/net/ceph/auth.c b/net/ceph/auth.c
index fbeee068ea14..eb261aa5fe18 100644
--- a/net/ceph/auth.c
+++ b/net/ceph/auth.c
@@ -21,28 +21,31 @@ static u32 supported_protocols[] = {
CEPH_AUTH_CEPHX
};
-static int ceph_auth_init_protocol(struct ceph_auth_client *ac, int protocol)
+static int init_protocol(struct ceph_auth_client *ac, int proto)
{
- switch (protocol) {
+ dout("%s proto %d\n", __func__, proto);
+
+ switch (proto) {
case CEPH_AUTH_NONE:
return ceph_auth_none_init(ac);
case CEPH_AUTH_CEPHX:
return ceph_x_init(ac);
default:
- return -ENOENT;
+ pr_err("bad auth protocol %d\n", proto);
+ return -EINVAL;
}
}
/*
* setup, teardown.
*/
-struct ceph_auth_client *ceph_auth_init(const char *name, const struct ceph_crypto_key *key)
+struct ceph_auth_client *ceph_auth_init(const char *name,
+ const struct ceph_crypto_key *key,
+ const int *con_modes)
{
struct ceph_auth_client *ac;
int ret;
- dout("auth_init name '%s'\n", name);
-
ret = -ENOMEM;
ac = kzalloc(sizeof(*ac), GFP_NOFS);
if (!ac)
@@ -54,8 +57,12 @@ struct ceph_auth_client *ceph_auth_init(const char *name, const struct ceph_cryp
ac->name = name;
else
ac->name = CEPH_AUTH_NAME_DEFAULT;
- dout("auth_init name %s\n", ac->name);
ac->key = key;
+ ac->preferred_mode = con_modes[0];
+ ac->fallback_mode = con_modes[1];
+
+ dout("%s name '%s' preferred_mode %d fallback_mode %d\n", __func__,
+ ac->name, ac->preferred_mode, ac->fallback_mode);
return ac;
out:
@@ -145,31 +152,35 @@ bad:
goto out;
}
-static int ceph_build_auth_request(struct ceph_auth_client *ac,
- void *msg_buf, size_t msg_len)
+static int build_request(struct ceph_auth_client *ac, bool add_header,
+ void *buf, int buf_len)
{
- struct ceph_mon_request_header *monhdr = msg_buf;
- void *p = monhdr + 1;
- void *end = msg_buf + msg_len;
+ void *end = buf + buf_len;
+ void *p;
int ret;
- monhdr->have_version = 0;
- monhdr->session_mon = cpu_to_le16(-1);
- monhdr->session_mon_tid = 0;
-
- ceph_encode_32(&p, ac->protocol);
+ p = buf;
+ if (add_header) {
+ /* struct ceph_mon_request_header + protocol */
+ ceph_encode_64_safe(&p, end, 0, e_range);
+ ceph_encode_16_safe(&p, end, -1, e_range);
+ ceph_encode_64_safe(&p, end, 0, e_range);
+ ceph_encode_32_safe(&p, end, ac->protocol, e_range);
+ }
+ ceph_encode_need(&p, end, sizeof(u32), e_range);
ret = ac->ops->build_request(ac, p + sizeof(u32), end);
if (ret < 0) {
- pr_err("error %d building auth method %s request\n", ret,
- ac->ops->name);
- goto out;
+ pr_err("auth protocol '%s' building request failed: %d\n",
+ ceph_auth_proto_name(ac->protocol), ret);
+ return ret;
}
dout(" built request %d bytes\n", ret);
ceph_encode_32(&p, ret);
- ret = p + ret - msg_buf;
-out:
- return ret;
+ return p + ret - buf;
+
+e_range:
+ return -ERANGE;
}
/*
@@ -229,10 +240,10 @@ int ceph_handle_auth_reply(struct ceph_auth_client *ac,
ac->ops = NULL;
}
if (ac->protocol != protocol) {
- ret = ceph_auth_init_protocol(ac, protocol);
+ ret = init_protocol(ac, protocol);
if (ret) {
- pr_err("error %d on auth protocol %d init\n",
- ret, protocol);
+ pr_err("auth protocol '%s' init failed: %d\n",
+ ceph_auth_proto_name(protocol), ret);
goto out;
}
}
@@ -240,12 +251,13 @@ int ceph_handle_auth_reply(struct ceph_auth_client *ac,
ac->negotiating = false;
}
- ret = ac->ops->handle_reply(ac, result, payload, payload_end);
- if (ret == -EAGAIN) {
- ret = ceph_build_auth_request(ac, reply_buf, reply_len);
- } else if (ret) {
- pr_err("auth method '%s' error %d\n", ac->ops->name, ret);
- }
+ ret = ac->ops->handle_reply(ac, result, payload, payload_end,
+ NULL, NULL, NULL, NULL);
+ if (ret == -EAGAIN)
+ ret = build_request(ac, true, reply_buf, reply_len);
+ else if (ret)
+ pr_err("auth protocol '%s' mauth authentication failed: %d\n",
+ ceph_auth_proto_name(ac->protocol), result);
out:
mutex_unlock(&ac->mutex);
@@ -264,7 +276,7 @@ int ceph_build_auth(struct ceph_auth_client *ac,
mutex_lock(&ac->mutex);
if (ac->ops->should_authenticate(ac))
- ret = ceph_build_auth_request(ac, msg_buf, msg_len);
+ ret = build_request(ac, true, msg_buf, msg_len);
mutex_unlock(&ac->mutex);
return ret;
}
@@ -281,19 +293,38 @@ int ceph_auth_is_authenticated(struct ceph_auth_client *ac)
}
EXPORT_SYMBOL(ceph_auth_is_authenticated);
-int ceph_auth_create_authorizer(struct ceph_auth_client *ac,
- int peer_type,
- struct ceph_auth_handshake *auth)
+int __ceph_auth_get_authorizer(struct ceph_auth_client *ac,
+ struct ceph_auth_handshake *auth,
+ int peer_type, bool force_new,
+ int *proto, int *pref_mode, int *fallb_mode)
{
- int ret = 0;
+ int ret;
mutex_lock(&ac->mutex);
- if (ac->ops && ac->ops->create_authorizer)
+ if (force_new && auth->authorizer) {
+ ceph_auth_destroy_authorizer(auth->authorizer);
+ auth->authorizer = NULL;
+ }
+ if (!auth->authorizer)
ret = ac->ops->create_authorizer(ac, peer_type, auth);
+ else if (ac->ops->update_authorizer)
+ ret = ac->ops->update_authorizer(ac, peer_type, auth);
+ else
+ ret = 0;
+ if (ret)
+ goto out;
+
+ *proto = ac->protocol;
+ if (pref_mode && fallb_mode) {
+ *pref_mode = ac->preferred_mode;
+ *fallb_mode = ac->fallback_mode;
+ }
+
+out:
mutex_unlock(&ac->mutex);
return ret;
}
-EXPORT_SYMBOL(ceph_auth_create_authorizer);
+EXPORT_SYMBOL(__ceph_auth_get_authorizer);
void ceph_auth_destroy_authorizer(struct ceph_authorizer *a)
{
@@ -301,20 +332,6 @@ void ceph_auth_destroy_authorizer(struct ceph_authorizer *a)
}
EXPORT_SYMBOL(ceph_auth_destroy_authorizer);
-int ceph_auth_update_authorizer(struct ceph_auth_client *ac,
- int peer_type,
- struct ceph_auth_handshake *a)
-{
- int ret = 0;
-
- mutex_lock(&ac->mutex);
- if (ac->ops && ac->ops->update_authorizer)
- ret = ac->ops->update_authorizer(ac, peer_type, a);
- mutex_unlock(&ac->mutex);
- return ret;
-}
-EXPORT_SYMBOL(ceph_auth_update_authorizer);
-
int ceph_auth_add_authorizer_challenge(struct ceph_auth_client *ac,
struct ceph_authorizer *a,
void *challenge_buf,
@@ -332,13 +349,18 @@ int ceph_auth_add_authorizer_challenge(struct ceph_auth_client *ac,
EXPORT_SYMBOL(ceph_auth_add_authorizer_challenge);
int ceph_auth_verify_authorizer_reply(struct ceph_auth_client *ac,
- struct ceph_authorizer *a)
+ struct ceph_authorizer *a,
+ void *reply, int reply_len,
+ u8 *session_key, int *session_key_len,
+ u8 *con_secret, int *con_secret_len)
{
int ret = 0;
mutex_lock(&ac->mutex);
if (ac->ops && ac->ops->verify_authorizer_reply)
- ret = ac->ops->verify_authorizer_reply(ac, a);
+ ret = ac->ops->verify_authorizer_reply(ac, a,
+ reply, reply_len, session_key, session_key_len,
+ con_secret, con_secret_len);
mutex_unlock(&ac->mutex);
return ret;
}
@@ -352,3 +374,279 @@ void ceph_auth_invalidate_authorizer(struct ceph_auth_client *ac, int peer_type)
mutex_unlock(&ac->mutex);
}
EXPORT_SYMBOL(ceph_auth_invalidate_authorizer);
+
+/*
+ * msgr2 authentication
+ */
+
+static bool contains(const int *arr, int cnt, int val)
+{
+ int i;
+
+ for (i = 0; i < cnt; i++) {
+ if (arr[i] == val)
+ return true;
+ }
+
+ return false;
+}
+
+static int encode_con_modes(void **p, void *end, int pref_mode, int fallb_mode)
+{
+ WARN_ON(pref_mode == CEPH_CON_MODE_UNKNOWN);
+ if (fallb_mode != CEPH_CON_MODE_UNKNOWN) {
+ ceph_encode_32_safe(p, end, 2, e_range);
+ ceph_encode_32_safe(p, end, pref_mode, e_range);
+ ceph_encode_32_safe(p, end, fallb_mode, e_range);
+ } else {
+ ceph_encode_32_safe(p, end, 1, e_range);
+ ceph_encode_32_safe(p, end, pref_mode, e_range);
+ }
+
+ return 0;
+
+e_range:
+ return -ERANGE;
+}
+
+/*
+ * Similar to ceph_auth_build_hello().
+ */
+int ceph_auth_get_request(struct ceph_auth_client *ac, void *buf, int buf_len)
+{
+ int proto = ac->key ? CEPH_AUTH_CEPHX : CEPH_AUTH_NONE;
+ void *end = buf + buf_len;
+ void *lenp;
+ void *p;
+ int ret;
+
+ mutex_lock(&ac->mutex);
+ if (ac->protocol == CEPH_AUTH_UNKNOWN) {
+ ret = init_protocol(ac, proto);
+ if (ret) {
+ pr_err("auth protocol '%s' init failed: %d\n",
+ ceph_auth_proto_name(proto), ret);
+ goto out;
+ }
+ } else {
+ WARN_ON(ac->protocol != proto);
+ ac->ops->reset(ac);
+ }
+
+ p = buf;
+ ceph_encode_32_safe(&p, end, ac->protocol, e_range);
+ ret = encode_con_modes(&p, end, ac->preferred_mode, ac->fallback_mode);
+ if (ret)
+ goto out;
+
+ lenp = p;
+ p += 4; /* space for len */
+
+ ceph_encode_8_safe(&p, end, CEPH_AUTH_MODE_MON, e_range);
+ ret = ceph_auth_entity_name_encode(ac->name, &p, end);
+ if (ret)
+ goto out;
+
+ ceph_encode_64_safe(&p, end, ac->global_id, e_range);
+ ceph_encode_32(&lenp, p - lenp - 4);
+ ret = p - buf;
+
+out:
+ mutex_unlock(&ac->mutex);
+ return ret;
+
+e_range:
+ ret = -ERANGE;
+ goto out;
+}
+
+int ceph_auth_handle_reply_more(struct ceph_auth_client *ac, void *reply,
+ int reply_len, void *buf, int buf_len)
+{
+ int ret;
+
+ mutex_lock(&ac->mutex);
+ ret = ac->ops->handle_reply(ac, 0, reply, reply + reply_len,
+ NULL, NULL, NULL, NULL);
+ if (ret == -EAGAIN)
+ ret = build_request(ac, false, buf, buf_len);
+ else
+ WARN_ON(ret >= 0);
+ mutex_unlock(&ac->mutex);
+ return ret;
+}
+
+int ceph_auth_handle_reply_done(struct ceph_auth_client *ac,
+ u64 global_id, void *reply, int reply_len,
+ u8 *session_key, int *session_key_len,
+ u8 *con_secret, int *con_secret_len)
+{
+ int ret;
+
+ mutex_lock(&ac->mutex);
+ if (global_id && ac->global_id != global_id) {
+ dout("%s global_id %llu -> %llu\n", __func__, ac->global_id,
+ global_id);
+ ac->global_id = global_id;
+ }
+
+ ret = ac->ops->handle_reply(ac, 0, reply, reply + reply_len,
+ session_key, session_key_len,
+ con_secret, con_secret_len);
+ mutex_unlock(&ac->mutex);
+ return ret;
+}
+
+bool ceph_auth_handle_bad_method(struct ceph_auth_client *ac,
+ int used_proto, int result,
+ const int *allowed_protos, int proto_cnt,
+ const int *allowed_modes, int mode_cnt)
+{
+ mutex_lock(&ac->mutex);
+ WARN_ON(used_proto != ac->protocol);
+
+ if (result == -EOPNOTSUPP) {
+ if (!contains(allowed_protos, proto_cnt, ac->protocol)) {
+ pr_err("auth protocol '%s' not allowed\n",
+ ceph_auth_proto_name(ac->protocol));
+ goto not_allowed;
+ }
+ if (!contains(allowed_modes, mode_cnt, ac->preferred_mode) &&
+ (ac->fallback_mode == CEPH_CON_MODE_UNKNOWN ||
+ !contains(allowed_modes, mode_cnt, ac->fallback_mode))) {
+ pr_err("preferred mode '%s' not allowed\n",
+ ceph_con_mode_name(ac->preferred_mode));
+ if (ac->fallback_mode == CEPH_CON_MODE_UNKNOWN)
+ pr_err("no fallback mode\n");
+ else
+ pr_err("fallback mode '%s' not allowed\n",
+ ceph_con_mode_name(ac->fallback_mode));
+ goto not_allowed;
+ }
+ }
+
+ WARN_ON(result == -EOPNOTSUPP || result >= 0);
+ pr_err("auth protocol '%s' msgr authentication failed: %d\n",
+ ceph_auth_proto_name(ac->protocol), result);
+
+ mutex_unlock(&ac->mutex);
+ return true;
+
+not_allowed:
+ mutex_unlock(&ac->mutex);
+ return false;
+}
+
+int ceph_auth_get_authorizer(struct ceph_auth_client *ac,
+ struct ceph_auth_handshake *auth,
+ int peer_type, void *buf, int *buf_len)
+{
+ void *end = buf + *buf_len;
+ int pref_mode, fallb_mode;
+ int proto;
+ void *p;
+ int ret;
+
+ ret = __ceph_auth_get_authorizer(ac, auth, peer_type, true, &proto,
+ &pref_mode, &fallb_mode);
+ if (ret)
+ return ret;
+
+ p = buf;
+ ceph_encode_32_safe(&p, end, proto, e_range);
+ ret = encode_con_modes(&p, end, pref_mode, fallb_mode);
+ if (ret)
+ return ret;
+
+ ceph_encode_32_safe(&p, end, auth->authorizer_buf_len, e_range);
+ *buf_len = p - buf;
+ return 0;
+
+e_range:
+ return -ERANGE;
+}
+EXPORT_SYMBOL(ceph_auth_get_authorizer);
+
+int ceph_auth_handle_svc_reply_more(struct ceph_auth_client *ac,
+ struct ceph_auth_handshake *auth,
+ void *reply, int reply_len,
+ void *buf, int *buf_len)
+{
+ void *end = buf + *buf_len;
+ void *p;
+ int ret;
+
+ ret = ceph_auth_add_authorizer_challenge(ac, auth->authorizer,
+ reply, reply_len);
+ if (ret)
+ return ret;
+
+ p = buf;
+ ceph_encode_32_safe(&p, end, auth->authorizer_buf_len, e_range);
+ *buf_len = p - buf;
+ return 0;
+
+e_range:
+ return -ERANGE;
+}
+EXPORT_SYMBOL(ceph_auth_handle_svc_reply_more);
+
+int ceph_auth_handle_svc_reply_done(struct ceph_auth_client *ac,
+ struct ceph_auth_handshake *auth,
+ void *reply, int reply_len,
+ u8 *session_key, int *session_key_len,
+ u8 *con_secret, int *con_secret_len)
+{
+ return ceph_auth_verify_authorizer_reply(ac, auth->authorizer,
+ reply, reply_len, session_key, session_key_len,
+ con_secret, con_secret_len);
+}
+EXPORT_SYMBOL(ceph_auth_handle_svc_reply_done);
+
+bool ceph_auth_handle_bad_authorizer(struct ceph_auth_client *ac,
+ int peer_type, int used_proto, int result,
+ const int *allowed_protos, int proto_cnt,
+ const int *allowed_modes, int mode_cnt)
+{
+ mutex_lock(&ac->mutex);
+ WARN_ON(used_proto != ac->protocol);
+
+ if (result == -EOPNOTSUPP) {
+ if (!contains(allowed_protos, proto_cnt, ac->protocol)) {
+ pr_err("auth protocol '%s' not allowed by %s\n",
+ ceph_auth_proto_name(ac->protocol),
+ ceph_entity_type_name(peer_type));
+ goto not_allowed;
+ }
+ if (!contains(allowed_modes, mode_cnt, ac->preferred_mode) &&
+ (ac->fallback_mode == CEPH_CON_MODE_UNKNOWN ||
+ !contains(allowed_modes, mode_cnt, ac->fallback_mode))) {
+ pr_err("preferred mode '%s' not allowed by %s\n",
+ ceph_con_mode_name(ac->preferred_mode),
+ ceph_entity_type_name(peer_type));
+ if (ac->fallback_mode == CEPH_CON_MODE_UNKNOWN)
+ pr_err("no fallback mode\n");
+ else
+ pr_err("fallback mode '%s' not allowed by %s\n",
+ ceph_con_mode_name(ac->fallback_mode),
+ ceph_entity_type_name(peer_type));
+ goto not_allowed;
+ }
+ }
+
+ WARN_ON(result == -EOPNOTSUPP || result >= 0);
+ pr_err("auth protocol '%s' authorization to %s failed: %d\n",
+ ceph_auth_proto_name(ac->protocol),
+ ceph_entity_type_name(peer_type), result);
+
+ if (ac->ops->invalidate_authorizer)
+ ac->ops->invalidate_authorizer(ac, peer_type);
+
+ mutex_unlock(&ac->mutex);
+ return true;
+
+not_allowed:
+ mutex_unlock(&ac->mutex);
+ return false;
+}
+EXPORT_SYMBOL(ceph_auth_handle_bad_authorizer);
diff --git a/net/ceph/auth_none.c b/net/ceph/auth_none.c
index edb7042479ed..70e86e462250 100644
--- a/net/ceph/auth_none.c
+++ b/net/ceph/auth_none.c
@@ -70,7 +70,9 @@ static int build_request(struct ceph_auth_client *ac, void *buf, void *end)
* authenticate state, so nothing happens here.
*/
static int handle_reply(struct ceph_auth_client *ac, int result,
- void *buf, void *end)
+ void *buf, void *end, u8 *session_key,
+ int *session_key_len, u8 *con_secret,
+ int *con_secret_len)
{
struct ceph_auth_none_info *xi = ac->private;
@@ -116,7 +118,6 @@ static int ceph_auth_none_create_authorizer(
}
static const struct ceph_auth_client_ops ceph_auth_none_ops = {
- .name = "none",
.reset = reset,
.destroy = destroy,
.is_authenticated = is_authenticated,
diff --git a/net/ceph/auth_x.c b/net/ceph/auth_x.c
index b52732337ca6..9815cfe42af0 100644
--- a/net/ceph/auth_x.c
+++ b/net/ceph/auth_x.c
@@ -22,12 +22,15 @@ static void ceph_x_validate_tickets(struct ceph_auth_client *ac, int *pneed);
static int ceph_x_is_authenticated(struct ceph_auth_client *ac)
{
struct ceph_x_info *xi = ac->private;
- int need;
+ int missing;
+ int need; /* missing + need renewal */
ceph_x_validate_tickets(ac, &need);
- dout("ceph_x_is_authenticated want=%d need=%d have=%d\n",
- ac->want_keys, need, xi->have_keys);
- return (ac->want_keys & xi->have_keys) == ac->want_keys;
+ missing = ac->want_keys & ~xi->have_keys;
+ WARN_ON((need & missing) != missing);
+ dout("%s want 0x%x have 0x%x missing 0x%x -> %d\n", __func__,
+ ac->want_keys, xi->have_keys, missing, !missing);
+ return !missing;
}
static int ceph_x_should_authenticate(struct ceph_auth_client *ac)
@@ -36,9 +39,9 @@ static int ceph_x_should_authenticate(struct ceph_auth_client *ac)
int need;
ceph_x_validate_tickets(ac, &need);
- dout("ceph_x_should_authenticate want=%d need=%d have=%d\n",
- ac->want_keys, need, xi->have_keys);
- return need != 0;
+ dout("%s want 0x%x have 0x%x need 0x%x -> %d\n", __func__,
+ ac->want_keys, xi->have_keys, need, !!need);
+ return !!need;
}
static int ceph_x_encrypt_offset(void)
@@ -197,7 +200,7 @@ static int process_one_ticket(struct ceph_auth_client *ac,
dout(" decrypted %d bytes\n", ret);
dend = dp + ret;
- tkt_struct_v = ceph_decode_8(&dp);
+ ceph_decode_8_safe(&dp, dend, tkt_struct_v, bad);
if (tkt_struct_v != 1)
goto bad;
@@ -205,6 +208,7 @@ static int process_one_ticket(struct ceph_auth_client *ac,
if (ret)
goto out;
+ ceph_decode_need(&dp, dend, sizeof(struct ceph_timespec), bad);
ceph_decode_timespec64(&validity, dp);
dp += sizeof(struct ceph_timespec);
new_expires = ktime_get_real_seconds() + validity.tv_sec;
@@ -265,22 +269,21 @@ out:
static int ceph_x_proc_ticket_reply(struct ceph_auth_client *ac,
struct ceph_crypto_key *secret,
- void *buf, void *end)
+ void **p, void *end)
{
- void *p = buf;
u8 reply_struct_v;
u32 num;
int ret;
- ceph_decode_8_safe(&p, end, reply_struct_v, bad);
+ ceph_decode_8_safe(p, end, reply_struct_v, bad);
if (reply_struct_v != 1)
return -EINVAL;
- ceph_decode_32_safe(&p, end, num, bad);
+ ceph_decode_32_safe(p, end, num, bad);
dout("%d tickets\n", num);
while (num--) {
- ret = process_one_ticket(ac, secret, &p, end);
+ ret = process_one_ticket(ac, secret, p, end);
if (ret)
return ret;
}
@@ -379,6 +382,7 @@ static int ceph_x_build_authorizer(struct ceph_auth_client *ac,
}
}
au->service = th->service;
+ WARN_ON(!th->secret_id);
au->secret_id = th->secret_id;
msg_a = au->buf->vec.iov_base;
@@ -442,9 +446,10 @@ static bool need_key(struct ceph_x_ticket_handler *th)
static bool have_key(struct ceph_x_ticket_handler *th)
{
- if (th->have_key) {
- if (ktime_get_real_seconds() >= th->expires)
- th->have_key = false;
+ if (th->have_key && ktime_get_real_seconds() >= th->expires) {
+ dout("ticket %d (%s) secret_id %llu expired\n", th->service,
+ ceph_entity_type_name(th->service), th->secret_id);
+ th->have_key = false;
}
return th->have_key;
@@ -486,6 +491,7 @@ static int ceph_x_build_request(struct ceph_auth_client *ac,
struct ceph_x_info *xi = ac->private;
int need;
struct ceph_x_request_header *head = buf;
+ void *p;
int ret;
struct ceph_x_ticket_handler *th =
get_ticket_handler(ac, CEPH_ENTITY_TYPE_AUTH);
@@ -494,18 +500,17 @@ static int ceph_x_build_request(struct ceph_auth_client *ac,
return PTR_ERR(th);
ceph_x_validate_tickets(ac, &need);
-
- dout("build_request want %x have %x need %x\n",
- ac->want_keys, xi->have_keys, need);
+ dout("%s want 0x%x have 0x%x need 0x%x\n", __func__, ac->want_keys,
+ xi->have_keys, need);
if (need & CEPH_ENTITY_TYPE_AUTH) {
struct ceph_x_authenticate *auth = (void *)(head + 1);
- void *p = auth + 1;
void *enc_buf = xi->auth_authorizer.enc_buf;
struct ceph_x_challenge_blob *blob = enc_buf +
ceph_x_encrypt_offset();
u64 *u;
+ p = auth + 1;
if (p > end)
return -ERANGE;
@@ -521,7 +526,7 @@ static int ceph_x_build_request(struct ceph_auth_client *ac,
if (ret < 0)
return ret;
- auth->struct_v = 1;
+ auth->struct_v = 2; /* nautilus+ */
auth->key = 0;
for (u = (u64 *)enc_buf; u + 1 <= (u64 *)(enc_buf + ret); u++)
auth->key ^= *(__le64 *)u;
@@ -534,39 +539,117 @@ static int ceph_x_build_request(struct ceph_auth_client *ac,
if (ret < 0)
return ret;
+ /* nautilus+: request service tickets at the same time */
+ need = ac->want_keys & ~CEPH_ENTITY_TYPE_AUTH;
+ WARN_ON(!need);
+ ceph_encode_32_safe(&p, end, need, e_range);
return p - buf;
}
if (need) {
- void *p = head + 1;
- struct ceph_x_service_ticket_request *req;
-
- if (p > end)
- return -ERANGE;
- head->op = cpu_to_le16(CEPHX_GET_PRINCIPAL_SESSION_KEY);
-
+ dout(" get_principal_session_key\n");
ret = ceph_x_build_authorizer(ac, th, &xi->auth_authorizer);
if (ret)
return ret;
- ceph_encode_copy(&p, xi->auth_authorizer.buf->vec.iov_base,
- xi->auth_authorizer.buf->vec.iov_len);
- req = p;
- req->keys = cpu_to_le32(need);
- p += sizeof(*req);
+ p = buf;
+ ceph_encode_16_safe(&p, end, CEPHX_GET_PRINCIPAL_SESSION_KEY,
+ e_range);
+ ceph_encode_copy_safe(&p, end,
+ xi->auth_authorizer.buf->vec.iov_base,
+ xi->auth_authorizer.buf->vec.iov_len, e_range);
+ ceph_encode_8_safe(&p, end, 1, e_range);
+ ceph_encode_32_safe(&p, end, need, e_range);
return p - buf;
}
return 0;
+
+e_range:
+ return -ERANGE;
+}
+
+static int handle_auth_session_key(struct ceph_auth_client *ac,
+ void **p, void *end,
+ u8 *session_key, int *session_key_len,
+ u8 *con_secret, int *con_secret_len)
+{
+ struct ceph_x_info *xi = ac->private;
+ struct ceph_x_ticket_handler *th;
+ void *dp, *dend;
+ int len;
+ int ret;
+
+ /* AUTH ticket */
+ ret = ceph_x_proc_ticket_reply(ac, &xi->secret, p, end);
+ if (ret)
+ return ret;
+
+ if (*p == end) {
+ /* pre-nautilus (or didn't request service tickets!) */
+ WARN_ON(session_key || con_secret);
+ return 0;
+ }
+
+ th = get_ticket_handler(ac, CEPH_ENTITY_TYPE_AUTH);
+ if (IS_ERR(th))
+ return PTR_ERR(th);
+
+ if (session_key) {
+ memcpy(session_key, th->session_key.key, th->session_key.len);
+ *session_key_len = th->session_key.len;
+ }
+
+ /* connection secret */
+ ceph_decode_32_safe(p, end, len, e_inval);
+ dout("%s connection secret blob len %d\n", __func__, len);
+ if (len > 0) {
+ dp = *p + ceph_x_encrypt_offset();
+ ret = ceph_x_decrypt(&th->session_key, p, *p + len);
+ if (ret < 0)
+ return ret;
+
+ dout("%s decrypted %d bytes\n", __func__, ret);
+ dend = dp + ret;
+
+ ceph_decode_32_safe(&dp, dend, len, e_inval);
+ if (len > CEPH_MAX_CON_SECRET_LEN) {
+ pr_err("connection secret too big %d\n", len);
+ return -EINVAL;
+ }
+
+ dout("%s connection secret len %d\n", __func__, len);
+ if (con_secret) {
+ memcpy(con_secret, dp, len);
+ *con_secret_len = len;
+ }
+ }
+
+ /* service tickets */
+ ceph_decode_32_safe(p, end, len, e_inval);
+ dout("%s service tickets blob len %d\n", __func__, len);
+ if (len > 0) {
+ ret = ceph_x_proc_ticket_reply(ac, &th->session_key,
+ p, *p + len);
+ if (ret)
+ return ret;
+ }
+
+ return 0;
+
+e_inval:
+ return -EINVAL;
}
static int ceph_x_handle_reply(struct ceph_auth_client *ac, int result,
- void *buf, void *end)
+ void *buf, void *end,
+ u8 *session_key, int *session_key_len,
+ u8 *con_secret, int *con_secret_len)
{
struct ceph_x_info *xi = ac->private;
- struct ceph_x_reply_header *head = buf;
struct ceph_x_ticket_handler *th;
int len = end - buf;
+ void *p;
int op;
int ret;
@@ -587,22 +670,25 @@ static int ceph_x_handle_reply(struct ceph_auth_client *ac, int result,
return -EAGAIN;
}
- op = le16_to_cpu(head->op);
- result = le32_to_cpu(head->result);
+ p = buf;
+ ceph_decode_16_safe(&p, end, op, e_inval);
+ ceph_decode_32_safe(&p, end, result, e_inval);
dout("handle_reply op %d result %d\n", op, result);
switch (op) {
case CEPHX_GET_AUTH_SESSION_KEY:
- /* verify auth key */
- ret = ceph_x_proc_ticket_reply(ac, &xi->secret,
- buf + sizeof(*head), end);
+ /* AUTH ticket + [connection secret] + service tickets */
+ ret = handle_auth_session_key(ac, &p, end, session_key,
+ session_key_len, con_secret,
+ con_secret_len);
break;
case CEPHX_GET_PRINCIPAL_SESSION_KEY:
th = get_ticket_handler(ac, CEPH_ENTITY_TYPE_AUTH);
if (IS_ERR(th))
return PTR_ERR(th);
- ret = ceph_x_proc_ticket_reply(ac, &th->session_key,
- buf + sizeof(*head), end);
+
+ /* service tickets */
+ ret = ceph_x_proc_ticket_reply(ac, &th->session_key, &p, end);
break;
default:
@@ -613,6 +699,9 @@ static int ceph_x_handle_reply(struct ceph_auth_client *ac, int result,
if (ac->want_keys == xi->have_keys)
return 0;
return -EAGAIN;
+
+e_inval:
+ return -EINVAL;
}
static void ceph_x_destroy_authorizer(struct ceph_authorizer *a)
@@ -678,40 +767,44 @@ static int ceph_x_update_authorizer(
return 0;
}
-static int decrypt_authorize_challenge(struct ceph_x_authorizer *au,
- void *challenge_buf,
- int challenge_buf_len,
- u64 *server_challenge)
+/*
+ * CephXAuthorizeChallenge
+ */
+static int decrypt_authorizer_challenge(struct ceph_crypto_key *secret,
+ void *challenge, int challenge_len,
+ u64 *server_challenge)
{
- struct ceph_x_authorize_challenge *ch =
- challenge_buf + sizeof(struct ceph_x_encrypt_header);
+ void *dp, *dend;
int ret;
/* no leading len */
- ret = __ceph_x_decrypt(&au->session_key, challenge_buf,
- challenge_buf_len);
+ ret = __ceph_x_decrypt(secret, challenge, challenge_len);
if (ret < 0)
return ret;
- if (ret < sizeof(*ch)) {
- pr_err("bad size %d for ceph_x_authorize_challenge\n", ret);
- return -EINVAL;
- }
- *server_challenge = le64_to_cpu(ch->server_challenge);
+ dout("%s decrypted %d bytes\n", __func__, ret);
+ dp = challenge + sizeof(struct ceph_x_encrypt_header);
+ dend = dp + ret;
+
+ ceph_decode_skip_8(&dp, dend, e_inval); /* struct_v */
+ ceph_decode_64_safe(&dp, dend, *server_challenge, e_inval);
+ dout("%s server_challenge %llu\n", __func__, *server_challenge);
return 0;
+
+e_inval:
+ return -EINVAL;
}
static int ceph_x_add_authorizer_challenge(struct ceph_auth_client *ac,
struct ceph_authorizer *a,
- void *challenge_buf,
- int challenge_buf_len)
+ void *challenge, int challenge_len)
{
struct ceph_x_authorizer *au = (void *)a;
u64 server_challenge;
int ret;
- ret = decrypt_authorize_challenge(au, challenge_buf, challenge_buf_len,
- &server_challenge);
+ ret = decrypt_authorizer_challenge(&au->session_key, challenge,
+ challenge_len, &server_challenge);
if (ret) {
pr_err("failed to decrypt authorize challenge: %d", ret);
return ret;
@@ -726,29 +819,76 @@ static int ceph_x_add_authorizer_challenge(struct ceph_auth_client *ac,
return 0;
}
+/*
+ * CephXAuthorizeReply
+ */
+static int decrypt_authorizer_reply(struct ceph_crypto_key *secret,
+ void **p, void *end, u64 *nonce_plus_one,
+ u8 *con_secret, int *con_secret_len)
+{
+ void *dp, *dend;
+ u8 struct_v;
+ int len;
+ int ret;
+
+ dp = *p + ceph_x_encrypt_offset();
+ ret = ceph_x_decrypt(secret, p, end);
+ if (ret < 0)
+ return ret;
+
+ dout("%s decrypted %d bytes\n", __func__, ret);
+ dend = dp + ret;
+
+ ceph_decode_8_safe(&dp, dend, struct_v, e_inval);
+ ceph_decode_64_safe(&dp, dend, *nonce_plus_one, e_inval);
+ dout("%s nonce_plus_one %llu\n", __func__, *nonce_plus_one);
+ if (struct_v >= 2) {
+ ceph_decode_32_safe(&dp, dend, len, e_inval);
+ if (len > CEPH_MAX_CON_SECRET_LEN) {
+ pr_err("connection secret too big %d\n", len);
+ return -EINVAL;
+ }
+
+ dout("%s connection secret len %d\n", __func__, len);
+ if (con_secret) {
+ memcpy(con_secret, dp, len);
+ *con_secret_len = len;
+ }
+ }
+
+ return 0;
+
+e_inval:
+ return -EINVAL;
+}
+
static int ceph_x_verify_authorizer_reply(struct ceph_auth_client *ac,
- struct ceph_authorizer *a)
+ struct ceph_authorizer *a,
+ void *reply, int reply_len,
+ u8 *session_key, int *session_key_len,
+ u8 *con_secret, int *con_secret_len)
{
struct ceph_x_authorizer *au = (void *)a;
- void *p = au->enc_buf;
- struct ceph_x_authorize_reply *reply = p + ceph_x_encrypt_offset();
+ u64 nonce_plus_one;
int ret;
- ret = ceph_x_decrypt(&au->session_key, &p, p + CEPHX_AU_ENC_BUF_LEN);
- if (ret < 0)
+ if (session_key) {
+ memcpy(session_key, au->session_key.key, au->session_key.len);
+ *session_key_len = au->session_key.len;
+ }
+
+ ret = decrypt_authorizer_reply(&au->session_key, &reply,
+ reply + reply_len, &nonce_plus_one,
+ con_secret, con_secret_len);
+ if (ret)
return ret;
- if (ret < sizeof(*reply)) {
- pr_err("bad size %d for ceph_x_authorize_reply\n", ret);
- return -EINVAL;
+
+ if (nonce_plus_one != au->nonce + 1) {
+ pr_err("failed to authenticate server\n");
+ return -EPERM;
}
- if (au->nonce + 1 != le64_to_cpu(reply->nonce_plus_one))
- ret = -EPERM;
- else
- ret = 0;
- dout("verify_authorizer_reply nonce %llx got %llx ret %d\n",
- au->nonce, le64_to_cpu(reply->nonce_plus_one), ret);
- return ret;
+ return 0;
}
static void ceph_x_reset(struct ceph_auth_client *ac)
@@ -785,8 +925,15 @@ static void invalidate_ticket(struct ceph_auth_client *ac, int peer_type)
struct ceph_x_ticket_handler *th;
th = get_ticket_handler(ac, peer_type);
- if (!IS_ERR(th))
+ if (IS_ERR(th))
+ return;
+
+ if (th->have_key) {
+ dout("ticket %d (%s) secret_id %llu invalidated\n",
+ th->service, ceph_entity_type_name(th->service),
+ th->secret_id);
th->have_key = false;
+ }
}
static void ceph_x_invalidate_authorizer(struct ceph_auth_client *ac,
@@ -911,7 +1058,6 @@ static int ceph_x_check_message_signature(struct ceph_auth_handshake *auth,
}
static const struct ceph_auth_client_ops ceph_x_ops = {
- .name = "x",
.is_authenticated = ceph_x_is_authenticated,
.should_authenticate = ceph_x_should_authenticate,
.build_request = ceph_x_build_request,
diff --git a/net/ceph/auth_x_protocol.h b/net/ceph/auth_x_protocol.h
index 24b0b74564d0..792fcb974dc3 100644
--- a/net/ceph/auth_x_protocol.h
+++ b/net/ceph/auth_x_protocol.h
@@ -38,7 +38,8 @@ struct ceph_x_authenticate {
__u8 struct_v;
__le64 client_challenge;
__le64 key;
- /* ticket blob */
+ /* old_ticket blob */
+ /* nautilus+: other_keys */
} __attribute__ ((packed));
struct ceph_x_service_ticket_request {
diff --git a/net/ceph/ceph_common.c b/net/ceph/ceph_common.c
index 4e7edd707a14..271287c5ec12 100644
--- a/net/ceph/ceph_common.c
+++ b/net/ceph/ceph_common.c
@@ -265,6 +265,7 @@ enum {
Opt_ip,
Opt_crush_location,
Opt_read_from_replica,
+ Opt_ms_mode,
/* string args above */
Opt_share,
Opt_crc,
@@ -287,6 +288,23 @@ static const struct constant_table ceph_param_read_from_replica[] = {
{}
};
+enum ceph_ms_mode {
+ Opt_ms_mode_legacy,
+ Opt_ms_mode_crc,
+ Opt_ms_mode_secure,
+ Opt_ms_mode_prefer_crc,
+ Opt_ms_mode_prefer_secure
+};
+
+static const struct constant_table ceph_param_ms_mode[] = {
+ {"legacy", Opt_ms_mode_legacy},
+ {"crc", Opt_ms_mode_crc},
+ {"secure", Opt_ms_mode_secure},
+ {"prefer-crc", Opt_ms_mode_prefer_crc},
+ {"prefer-secure", Opt_ms_mode_prefer_secure},
+ {}
+};
+
static const struct fs_parameter_spec ceph_parameters[] = {
fsparam_flag ("abort_on_full", Opt_abort_on_full),
fsparam_flag_no ("cephx_require_signatures", Opt_cephx_require_signatures),
@@ -305,6 +323,8 @@ static const struct fs_parameter_spec ceph_parameters[] = {
fs_param_deprecated, NULL),
fsparam_enum ("read_from_replica", Opt_read_from_replica,
ceph_param_read_from_replica),
+ fsparam_enum ("ms_mode", Opt_ms_mode,
+ ceph_param_ms_mode),
fsparam_string ("secret", Opt_secret),
fsparam_flag_no ("share", Opt_share),
fsparam_flag_no ("tcp_nodelay", Opt_tcp_nodelay),
@@ -333,6 +353,8 @@ struct ceph_options *ceph_alloc_options(void)
opt->osd_idle_ttl = CEPH_OSD_IDLE_TTL_DEFAULT;
opt->osd_request_timeout = CEPH_OSD_REQUEST_TIMEOUT_DEFAULT;
opt->read_from_replica = CEPH_READ_FROM_REPLICA_DEFAULT;
+ opt->con_modes[0] = CEPH_CON_MODE_UNKNOWN;
+ opt->con_modes[1] = CEPH_CON_MODE_UNKNOWN;
return opt;
}
EXPORT_SYMBOL(ceph_alloc_options);
@@ -503,6 +525,32 @@ int ceph_parse_param(struct fs_parameter *param, struct ceph_options *opt,
BUG();
}
break;
+ case Opt_ms_mode:
+ switch (result.uint_32) {
+ case Opt_ms_mode_legacy:
+ opt->con_modes[0] = CEPH_CON_MODE_UNKNOWN;
+ opt->con_modes[1] = CEPH_CON_MODE_UNKNOWN;
+ break;
+ case Opt_ms_mode_crc:
+ opt->con_modes[0] = CEPH_CON_MODE_CRC;
+ opt->con_modes[1] = CEPH_CON_MODE_UNKNOWN;
+ break;
+ case Opt_ms_mode_secure:
+ opt->con_modes[0] = CEPH_CON_MODE_SECURE;
+ opt->con_modes[1] = CEPH_CON_MODE_UNKNOWN;
+ break;
+ case Opt_ms_mode_prefer_crc:
+ opt->con_modes[0] = CEPH_CON_MODE_CRC;
+ opt->con_modes[1] = CEPH_CON_MODE_SECURE;
+ break;
+ case Opt_ms_mode_prefer_secure:
+ opt->con_modes[0] = CEPH_CON_MODE_SECURE;
+ opt->con_modes[1] = CEPH_CON_MODE_CRC;
+ break;
+ default:
+ BUG();
+ }
+ break;
case Opt_osdtimeout:
warn_plog(&log, "Ignoring osdtimeout");
@@ -616,6 +664,21 @@ int ceph_print_client_options(struct seq_file *m, struct ceph_client *client,
} else if (opt->read_from_replica == CEPH_OSD_FLAG_LOCALIZE_READS) {
seq_puts(m, "read_from_replica=localize,");
}
+ if (opt->con_modes[0] != CEPH_CON_MODE_UNKNOWN) {
+ if (opt->con_modes[0] == CEPH_CON_MODE_CRC &&
+ opt->con_modes[1] == CEPH_CON_MODE_UNKNOWN) {
+ seq_puts(m, "ms_mode=crc,");
+ } else if (opt->con_modes[0] == CEPH_CON_MODE_SECURE &&
+ opt->con_modes[1] == CEPH_CON_MODE_UNKNOWN) {
+ seq_puts(m, "ms_mode=secure,");
+ } else if (opt->con_modes[0] == CEPH_CON_MODE_CRC &&
+ opt->con_modes[1] == CEPH_CON_MODE_SECURE) {
+ seq_puts(m, "ms_mode=prefer-crc,");
+ } else if (opt->con_modes[0] == CEPH_CON_MODE_SECURE &&
+ opt->con_modes[1] == CEPH_CON_MODE_CRC) {
+ seq_puts(m, "ms_mode=prefer-secure,");
+ }
+ }
if (opt->flags & CEPH_OPT_FSID)
seq_printf(m, "fsid=%pU,", &opt->fsid);
diff --git a/net/ceph/ceph_strings.c b/net/ceph/ceph_strings.c
index 10e01494993c..355fea272120 100644
--- a/net/ceph/ceph_strings.c
+++ b/net/ceph/ceph_strings.c
@@ -18,6 +18,34 @@ const char *ceph_entity_type_name(int type)
}
EXPORT_SYMBOL(ceph_entity_type_name);
+const char *ceph_auth_proto_name(int proto)
+{
+ switch (proto) {
+ case CEPH_AUTH_UNKNOWN:
+ return "unknown";
+ case CEPH_AUTH_NONE:
+ return "none";
+ case CEPH_AUTH_CEPHX:
+ return "cephx";
+ default:
+ return "???";
+ }
+}
+
+const char *ceph_con_mode_name(int mode)
+{
+ switch (mode) {
+ case CEPH_CON_MODE_UNKNOWN:
+ return "unknown";
+ case CEPH_CON_MODE_CRC:
+ return "crc";
+ case CEPH_CON_MODE_SECURE:
+ return "secure";
+ default:
+ return "???";
+ }
+}
+
const char *ceph_osd_op_name(int op)
{
switch (op) {
diff --git a/net/ceph/crypto.h b/net/ceph/crypto.h
index 96ef4d860bc9..13bd526349fa 100644
--- a/net/ceph/crypto.h
+++ b/net/ceph/crypto.h
@@ -5,6 +5,9 @@
#include <linux/ceph/types.h>
#include <linux/ceph/buffer.h>
+#define CEPH_KEY_LEN 16
+#define CEPH_MAX_CON_SECRET_LEN 64
+
/*
* cryptographic secret
*/
diff --git a/net/ceph/decode.c b/net/ceph/decode.c
index eea529595a7a..b44f7651be04 100644
--- a/net/ceph/decode.c
+++ b/net/ceph/decode.c
@@ -1,4 +1,7 @@
// SPDX-License-Identifier: GPL-2.0
+#include <linux/ceph/ceph_debug.h>
+
+#include <linux/inet.h>
#include <linux/ceph/decode.h>
@@ -82,3 +85,101 @@ bad:
}
EXPORT_SYMBOL(ceph_decode_entity_addr);
+/*
+ * Return addr of desired type (MSGR2 or LEGACY) or error.
+ * Make sure there is only one match.
+ *
+ * Assume encoding with MSG_ADDR2.
+ */
+int ceph_decode_entity_addrvec(void **p, void *end, bool msgr2,
+ struct ceph_entity_addr *addr)
+{
+ __le32 my_type = msgr2 ? CEPH_ENTITY_ADDR_TYPE_MSGR2 :
+ CEPH_ENTITY_ADDR_TYPE_LEGACY;
+ struct ceph_entity_addr tmp_addr;
+ int addr_cnt;
+ bool found;
+ u8 marker;
+ int ret;
+ int i;
+
+ ceph_decode_8_safe(p, end, marker, e_inval);
+ if (marker != 2) {
+ pr_err("bad addrvec marker %d\n", marker);
+ return -EINVAL;
+ }
+
+ ceph_decode_32_safe(p, end, addr_cnt, e_inval);
+
+ found = false;
+ for (i = 0; i < addr_cnt; i++) {
+ ret = ceph_decode_entity_addr(p, end, &tmp_addr);
+ if (ret)
+ return ret;
+
+ if (tmp_addr.type == my_type) {
+ if (found) {
+ pr_err("another match of type %d in addrvec\n",
+ le32_to_cpu(my_type));
+ return -EINVAL;
+ }
+
+ memcpy(addr, &tmp_addr, sizeof(*addr));
+ found = true;
+ }
+ }
+ if (!found && addr_cnt != 0) {
+ pr_err("no match of type %d in addrvec\n",
+ le32_to_cpu(my_type));
+ return -ENOENT;
+ }
+
+ return 0;
+
+e_inval:
+ return -EINVAL;
+}
+EXPORT_SYMBOL(ceph_decode_entity_addrvec);
+
+static int get_sockaddr_encoding_len(sa_family_t family)
+{
+ union {
+ struct sockaddr sa;
+ struct sockaddr_in sin;
+ struct sockaddr_in6 sin6;
+ } u;
+
+ switch (family) {
+ case AF_INET:
+ return sizeof(u.sin);
+ case AF_INET6:
+ return sizeof(u.sin6);
+ default:
+ return sizeof(u);
+ }
+}
+
+int ceph_entity_addr_encoding_len(const struct ceph_entity_addr *addr)
+{
+ sa_family_t family = get_unaligned(&addr->in_addr.ss_family);
+ int addr_len = get_sockaddr_encoding_len(family);
+
+ return 1 + CEPH_ENCODING_START_BLK_LEN + 4 + 4 + 4 + addr_len;
+}
+
+void ceph_encode_entity_addr(void **p, const struct ceph_entity_addr *addr)
+{
+ sa_family_t family = get_unaligned(&addr->in_addr.ss_family);
+ int addr_len = get_sockaddr_encoding_len(family);
+
+ ceph_encode_8(p, 1); /* marker */
+ ceph_start_encoding(p, 1, 1, sizeof(addr->type) +
+ sizeof(addr->nonce) +
+ sizeof(u32) + addr_len);
+ ceph_encode_copy(p, &addr->type, sizeof(addr->type));
+ ceph_encode_copy(p, &addr->nonce, sizeof(addr->nonce));
+
+ ceph_encode_32(p, addr_len);
+ ceph_encode_16(p, family);
+ ceph_encode_copy(p, addr->in_addr.__data, addr_len - sizeof(family));
+}
diff --git a/net/ceph/messenger.c b/net/ceph/messenger.c
index af0f1fa24937..57d043b382ed 100644
--- a/net/ceph/messenger.c
+++ b/net/ceph/messenger.c
@@ -82,71 +82,51 @@
#define CON_SOCK_STATE_CONNECTED 3 /* -> CLOSING or -> CLOSED */
#define CON_SOCK_STATE_CLOSING 4 /* -> CLOSED */
-/*
- * connection states
- */
-#define CON_STATE_CLOSED 1 /* -> PREOPEN */
-#define CON_STATE_PREOPEN 2 /* -> CONNECTING, CLOSED */
-#define CON_STATE_CONNECTING 3 /* -> NEGOTIATING, CLOSED */
-#define CON_STATE_NEGOTIATING 4 /* -> OPEN, CLOSED */
-#define CON_STATE_OPEN 5 /* -> STANDBY, CLOSED */
-#define CON_STATE_STANDBY 6 /* -> PREOPEN, CLOSED */
-
-/*
- * ceph_connection flag bits
- */
-#define CON_FLAG_LOSSYTX 0 /* we can close channel or drop
- * messages on errors */
-#define CON_FLAG_KEEPALIVE_PENDING 1 /* we need to send a keepalive */
-#define CON_FLAG_WRITE_PENDING 2 /* we have data ready to send */
-#define CON_FLAG_SOCK_CLOSED 3 /* socket state changed to closed */
-#define CON_FLAG_BACKOFF 4 /* need to retry queuing delayed work */
-
static bool con_flag_valid(unsigned long con_flag)
{
switch (con_flag) {
- case CON_FLAG_LOSSYTX:
- case CON_FLAG_KEEPALIVE_PENDING:
- case CON_FLAG_WRITE_PENDING:
- case CON_FLAG_SOCK_CLOSED:
- case CON_FLAG_BACKOFF:
+ case CEPH_CON_F_LOSSYTX:
+ case CEPH_CON_F_KEEPALIVE_PENDING:
+ case CEPH_CON_F_WRITE_PENDING:
+ case CEPH_CON_F_SOCK_CLOSED:
+ case CEPH_CON_F_BACKOFF:
return true;
default:
return false;
}
}
-static void con_flag_clear(struct ceph_connection *con, unsigned long con_flag)
+void ceph_con_flag_clear(struct ceph_connection *con, unsigned long con_flag)
{
BUG_ON(!con_flag_valid(con_flag));
clear_bit(con_flag, &con->flags);
}
-static void con_flag_set(struct ceph_connection *con, unsigned long con_flag)
+void ceph_con_flag_set(struct ceph_connection *con, unsigned long con_flag)
{
BUG_ON(!con_flag_valid(con_flag));
set_bit(con_flag, &con->flags);
}
-static bool con_flag_test(struct ceph_connection *con, unsigned long con_flag)
+bool ceph_con_flag_test(struct ceph_connection *con, unsigned long con_flag)
{
BUG_ON(!con_flag_valid(con_flag));
return test_bit(con_flag, &con->flags);
}
-static bool con_flag_test_and_clear(struct ceph_connection *con,
- unsigned long con_flag)
+bool ceph_con_flag_test_and_clear(struct ceph_connection *con,
+ unsigned long con_flag)
{
BUG_ON(!con_flag_valid(con_flag));
return test_and_clear_bit(con_flag, &con->flags);
}
-static bool con_flag_test_and_set(struct ceph_connection *con,
- unsigned long con_flag)
+bool ceph_con_flag_test_and_set(struct ceph_connection *con,
+ unsigned long con_flag)
{
BUG_ON(!con_flag_valid(con_flag));
@@ -157,12 +137,6 @@ static bool con_flag_test_and_set(struct ceph_connection *con,
static struct kmem_cache *ceph_msg_cache;
-/* static tag bytes (protocol control messages) */
-static char tag_msg = CEPH_MSGR_TAG_MSG;
-static char tag_ack = CEPH_MSGR_TAG_ACK;
-static char tag_keepalive = CEPH_MSGR_TAG_KEEPALIVE;
-static char tag_keepalive2 = CEPH_MSGR_TAG_KEEPALIVE2;
-
#ifdef CONFIG_LOCKDEP
static struct lock_class_key socket_class;
#endif
@@ -184,7 +158,7 @@ static void con_fault(struct ceph_connection *con);
static char addr_str[ADDR_STR_COUNT][MAX_ADDR_STR_LEN];
static atomic_t addr_str_seq = ATOMIC_INIT(0);
-static struct page *zero_page; /* used in certain error cases */
+struct page *ceph_zero_page; /* used in certain error cases */
const char *ceph_pr_addr(const struct ceph_entity_addr *addr)
{
@@ -219,10 +193,13 @@ const char *ceph_pr_addr(const struct ceph_entity_addr *addr)
}
EXPORT_SYMBOL(ceph_pr_addr);
-static void encode_my_addr(struct ceph_messenger *msgr)
+void ceph_encode_my_addr(struct ceph_messenger *msgr)
{
- memcpy(&msgr->my_enc_addr, &msgr->inst.addr, sizeof(msgr->my_enc_addr));
- ceph_encode_banner_addr(&msgr->my_enc_addr);
+ if (!ceph_msgr2(from_msgr(msgr))) {
+ memcpy(&msgr->my_enc_addr, &msgr->inst.addr,
+ sizeof(msgr->my_enc_addr));
+ ceph_encode_banner_addr(&msgr->my_enc_addr);
+ }
}
/*
@@ -254,9 +231,9 @@ static void _ceph_msgr_exit(void)
ceph_msgr_wq = NULL;
}
- BUG_ON(zero_page == NULL);
- put_page(zero_page);
- zero_page = NULL;
+ BUG_ON(!ceph_zero_page);
+ put_page(ceph_zero_page);
+ ceph_zero_page = NULL;
ceph_msgr_slab_exit();
}
@@ -266,9 +243,9 @@ int __init ceph_msgr_init(void)
if (ceph_msgr_slab_init())
return -ENOMEM;
- BUG_ON(zero_page != NULL);
- zero_page = ZERO_PAGE(0);
- get_page(zero_page);
+ BUG_ON(ceph_zero_page);
+ ceph_zero_page = ZERO_PAGE(0);
+ get_page(ceph_zero_page);
/*
* The number of active work items is limited by the number of
@@ -372,7 +349,7 @@ static void ceph_sock_data_ready(struct sock *sk)
}
if (sk->sk_state != TCP_CLOSE_WAIT) {
- dout("%s on %p state = %lu, queueing work\n", __func__,
+ dout("%s %p state = %d, queueing work\n", __func__,
con, con->state);
queue_con(con);
}
@@ -390,7 +367,7 @@ static void ceph_sock_write_space(struct sock *sk)
* buffer. See net/ipv4/tcp_input.c:tcp_check_space()
* and net/core/stream.c:sk_stream_write_space().
*/
- if (con_flag_test(con, CON_FLAG_WRITE_PENDING)) {
+ if (ceph_con_flag_test(con, CEPH_CON_F_WRITE_PENDING)) {
if (sk_stream_is_writeable(sk)) {
dout("%s %p queueing write work\n", __func__, con);
clear_bit(SOCK_NOSPACE, &sk->sk_socket->flags);
@@ -406,7 +383,7 @@ static void ceph_sock_state_change(struct sock *sk)
{
struct ceph_connection *con = sk->sk_user_data;
- dout("%s %p state = %lu sk_state = %u\n", __func__,
+ dout("%s %p state = %d sk_state = %u\n", __func__,
con, con->state, sk->sk_state);
switch (sk->sk_state) {
@@ -416,7 +393,7 @@ static void ceph_sock_state_change(struct sock *sk)
case TCP_CLOSE_WAIT:
dout("%s TCP_CLOSE_WAIT\n", __func__);
con_sock_state_closing(con);
- con_flag_set(con, CON_FLAG_SOCK_CLOSED);
+ ceph_con_flag_set(con, CEPH_CON_F_SOCK_CLOSED);
queue_con(con);
break;
case TCP_ESTABLISHED:
@@ -450,13 +427,15 @@ static void set_sock_callbacks(struct socket *sock,
/*
* initiate connection to a remote socket.
*/
-static int ceph_tcp_connect(struct ceph_connection *con)
+int ceph_tcp_connect(struct ceph_connection *con)
{
struct sockaddr_storage ss = con->peer_addr.in_addr; /* align */
struct socket *sock;
unsigned int noio_flag;
int ret;
+ dout("%s con %p peer_addr %s\n", __func__, con,
+ ceph_pr_addr(&con->peer_addr));
BUG_ON(con->sock);
/* sock_create_kern() allocates with GFP_KERNEL */
@@ -474,8 +453,6 @@ static int ceph_tcp_connect(struct ceph_connection *con)
set_sock_callbacks(sock, con);
- dout("connect %s\n", ceph_pr_addr(&con->peer_addr));
-
con_sock_state_connecting(con);
ret = sock->ops->connect(sock, (struct sockaddr *)&ss, sizeof(ss),
O_NONBLOCK);
@@ -498,103 +475,13 @@ static int ceph_tcp_connect(struct ceph_connection *con)
}
/*
- * If @buf is NULL, discard up to @len bytes.
- */
-static int ceph_tcp_recvmsg(struct socket *sock, void *buf, size_t len)
-{
- struct kvec iov = {buf, len};
- struct msghdr msg = { .msg_flags = MSG_DONTWAIT | MSG_NOSIGNAL };
- int r;
-
- if (!buf)
- msg.msg_flags |= MSG_TRUNC;
-
- iov_iter_kvec(&msg.msg_iter, READ, &iov, 1, len);
- r = sock_recvmsg(sock, &msg, msg.msg_flags);
- if (r == -EAGAIN)
- r = 0;
- return r;
-}
-
-static int ceph_tcp_recvpage(struct socket *sock, struct page *page,
- int page_offset, size_t length)
-{
- struct bio_vec bvec = {
- .bv_page = page,
- .bv_offset = page_offset,
- .bv_len = length
- };
- struct msghdr msg = { .msg_flags = MSG_DONTWAIT | MSG_NOSIGNAL };
- int r;
-
- BUG_ON(page_offset + length > PAGE_SIZE);
- iov_iter_bvec(&msg.msg_iter, READ, &bvec, 1, length);
- r = sock_recvmsg(sock, &msg, msg.msg_flags);
- if (r == -EAGAIN)
- r = 0;
- return r;
-}
-
-/*
- * write something. @more is true if caller will be sending more data
- * shortly.
- */
-static int ceph_tcp_sendmsg(struct socket *sock, struct kvec *iov,
- size_t kvlen, size_t len, bool more)
-{
- struct msghdr msg = { .msg_flags = MSG_DONTWAIT | MSG_NOSIGNAL };
- int r;
-
- if (more)
- msg.msg_flags |= MSG_MORE;
- else
- msg.msg_flags |= MSG_EOR; /* superfluous, but what the hell */
-
- r = kernel_sendmsg(sock, &msg, iov, kvlen, len);
- if (r == -EAGAIN)
- r = 0;
- return r;
-}
-
-/*
- * @more: either or both of MSG_MORE and MSG_SENDPAGE_NOTLAST
- */
-static int ceph_tcp_sendpage(struct socket *sock, struct page *page,
- int offset, size_t size, int more)
-{
- ssize_t (*sendpage)(struct socket *sock, struct page *page,
- int offset, size_t size, int flags);
- int flags = MSG_DONTWAIT | MSG_NOSIGNAL | more;
- int ret;
-
- /*
- * sendpage cannot properly handle pages with page_count == 0,
- * we need to fall back to sendmsg if that's the case.
- *
- * Same goes for slab pages: skb_can_coalesce() allows
- * coalescing neighboring slab objects into a single frag which
- * triggers one of hardened usercopy checks.
- */
- if (sendpage_ok(page))
- sendpage = sock->ops->sendpage;
- else
- sendpage = sock_no_sendpage;
-
- ret = sendpage(sock, page, offset, size, flags);
- if (ret == -EAGAIN)
- ret = 0;
-
- return ret;
-}
-
-/*
* Shutdown/close the socket for the given connection.
*/
-static int con_close_socket(struct ceph_connection *con)
+int ceph_con_close_socket(struct ceph_connection *con)
{
int rc = 0;
- dout("con_close_socket on %p sock %p\n", con, con->sock);
+ dout("%s con %p sock %p\n", __func__, con, con->sock);
if (con->sock) {
rc = con->sock->ops->shutdown(con->sock, SHUT_RDWR);
sock_release(con->sock);
@@ -607,12 +494,34 @@ static int con_close_socket(struct ceph_connection *con)
* received a socket close event before we had the chance to
* shut the socket down.
*/
- con_flag_clear(con, CON_FLAG_SOCK_CLOSED);
+ ceph_con_flag_clear(con, CEPH_CON_F_SOCK_CLOSED);
con_sock_state_closed(con);
return rc;
}
+static void ceph_con_reset_protocol(struct ceph_connection *con)
+{
+ dout("%s con %p\n", __func__, con);
+
+ ceph_con_close_socket(con);
+ if (con->in_msg) {
+ WARN_ON(con->in_msg->con != con);
+ ceph_msg_put(con->in_msg);
+ con->in_msg = NULL;
+ }
+ if (con->out_msg) {
+ WARN_ON(con->out_msg->con != con);
+ ceph_msg_put(con->out_msg);
+ con->out_msg = NULL;
+ }
+
+ if (ceph_msgr2(from_msgr(con->msgr)))
+ ceph_con_v2_reset_protocol(con);
+ else
+ ceph_con_v1_reset_protocol(con);
+}
+
/*
* Reset a connection. Discard all incoming and outgoing messages
* and clear *_seq state.
@@ -623,6 +532,7 @@ static void ceph_msg_remove(struct ceph_msg *msg)
ceph_msg_put(msg);
}
+
static void ceph_msg_remove_list(struct list_head *head)
{
while (!list_empty(head)) {
@@ -632,31 +542,22 @@ static void ceph_msg_remove_list(struct list_head *head)
}
}
-static void reset_connection(struct ceph_connection *con)
+void ceph_con_reset_session(struct ceph_connection *con)
{
- /* reset connection, out_queue, msg_ and connect_seq */
- /* discard existing out_queue and msg_seq */
- dout("reset_connection %p\n", con);
+ dout("%s con %p\n", __func__, con);
+
+ WARN_ON(con->in_msg);
+ WARN_ON(con->out_msg);
ceph_msg_remove_list(&con->out_queue);
ceph_msg_remove_list(&con->out_sent);
-
- if (con->in_msg) {
- BUG_ON(con->in_msg->con != con);
- ceph_msg_put(con->in_msg);
- con->in_msg = NULL;
- }
-
- con->connect_seq = 0;
con->out_seq = 0;
- if (con->out_msg) {
- BUG_ON(con->out_msg->con != con);
- ceph_msg_put(con->out_msg);
- con->out_msg = NULL;
- }
con->in_seq = 0;
con->in_seq_acked = 0;
- con->out_skip = 0;
+ if (ceph_msgr2(from_msgr(con->msgr)))
+ ceph_con_v2_reset_session(con);
+ else
+ ceph_con_v1_reset_session(con);
}
/*
@@ -666,17 +567,17 @@ void ceph_con_close(struct ceph_connection *con)
{
mutex_lock(&con->mutex);
dout("con_close %p peer %s\n", con, ceph_pr_addr(&con->peer_addr));
- con->state = CON_STATE_CLOSED;
+ con->state = CEPH_CON_S_CLOSED;
- con_flag_clear(con, CON_FLAG_LOSSYTX); /* so we retry next connect */
- con_flag_clear(con, CON_FLAG_KEEPALIVE_PENDING);
- con_flag_clear(con, CON_FLAG_WRITE_PENDING);
- con_flag_clear(con, CON_FLAG_BACKOFF);
+ ceph_con_flag_clear(con, CEPH_CON_F_LOSSYTX); /* so we retry next
+ connect */
+ ceph_con_flag_clear(con, CEPH_CON_F_KEEPALIVE_PENDING);
+ ceph_con_flag_clear(con, CEPH_CON_F_WRITE_PENDING);
+ ceph_con_flag_clear(con, CEPH_CON_F_BACKOFF);
- reset_connection(con);
- con->peer_global_seq = 0;
+ ceph_con_reset_protocol(con);
+ ceph_con_reset_session(con);
cancel_con(con);
- con_close_socket(con);
mutex_unlock(&con->mutex);
}
EXPORT_SYMBOL(ceph_con_close);
@@ -691,8 +592,8 @@ void ceph_con_open(struct ceph_connection *con,
mutex_lock(&con->mutex);
dout("con_open %p %s\n", con, ceph_pr_addr(addr));
- WARN_ON(con->state != CON_STATE_CLOSED);
- con->state = CON_STATE_PREOPEN;
+ WARN_ON(con->state != CEPH_CON_S_CLOSED);
+ con->state = CEPH_CON_S_PREOPEN;
con->peer_name.type = (__u8) entity_type;
con->peer_name.num = cpu_to_le64(entity_num);
@@ -709,7 +610,10 @@ EXPORT_SYMBOL(ceph_con_open);
*/
bool ceph_con_opened(struct ceph_connection *con)
{
- return con->connect_seq > 0;
+ if (ceph_msgr2(from_msgr(con->msgr)))
+ return ceph_con_v2_opened(con);
+
+ return ceph_con_v1_opened(con);
}
/*
@@ -732,16 +636,15 @@ void ceph_con_init(struct ceph_connection *con, void *private,
INIT_LIST_HEAD(&con->out_sent);
INIT_DELAYED_WORK(&con->work, ceph_con_workfn);
- con->state = CON_STATE_CLOSED;
+ con->state = CEPH_CON_S_CLOSED;
}
EXPORT_SYMBOL(ceph_con_init);
-
/*
* We maintain a global counter to order connection attempts. Get
* a unique seq greater than @gt.
*/
-static u32 get_global_seq(struct ceph_messenger *msgr, u32 gt)
+u32 ceph_get_global_seq(struct ceph_messenger *msgr, u32 gt)
{
u32 ret;
@@ -753,48 +656,53 @@ static u32 get_global_seq(struct ceph_messenger *msgr, u32 gt)
return ret;
}
-static void con_out_kvec_reset(struct ceph_connection *con)
-{
- BUG_ON(con->out_skip);
-
- con->out_kvec_left = 0;
- con->out_kvec_bytes = 0;
- con->out_kvec_cur = &con->out_kvec[0];
-}
-
-static void con_out_kvec_add(struct ceph_connection *con,
- size_t size, void *data)
+/*
+ * Discard messages that have been acked by the server.
+ */
+void ceph_con_discard_sent(struct ceph_connection *con, u64 ack_seq)
{
- int index = con->out_kvec_left;
+ struct ceph_msg *msg;
+ u64 seq;
- BUG_ON(con->out_skip);
- BUG_ON(index >= ARRAY_SIZE(con->out_kvec));
+ dout("%s con %p ack_seq %llu\n", __func__, con, ack_seq);
+ while (!list_empty(&con->out_sent)) {
+ msg = list_first_entry(&con->out_sent, struct ceph_msg,
+ list_head);
+ WARN_ON(msg->needs_out_seq);
+ seq = le64_to_cpu(msg->hdr.seq);
+ if (seq > ack_seq)
+ break;
- con->out_kvec[index].iov_len = size;
- con->out_kvec[index].iov_base = data;
- con->out_kvec_left++;
- con->out_kvec_bytes += size;
+ dout("%s con %p discarding msg %p seq %llu\n", __func__, con,
+ msg, seq);
+ ceph_msg_remove(msg);
+ }
}
/*
- * Chop off a kvec from the end. Return residual number of bytes for
- * that kvec, i.e. how many bytes would have been written if the kvec
- * hadn't been nuked.
+ * Discard messages that have been requeued in con_fault(), up to
+ * reconnect_seq. This avoids gratuitously resending messages that
+ * the server had received and handled prior to reconnect.
*/
-static int con_out_kvec_skip(struct ceph_connection *con)
+void ceph_con_discard_requeued(struct ceph_connection *con, u64 reconnect_seq)
{
- int off = con->out_kvec_cur - con->out_kvec;
- int skip = 0;
+ struct ceph_msg *msg;
+ u64 seq;
- if (con->out_kvec_bytes > 0) {
- skip = con->out_kvec[off + con->out_kvec_left - 1].iov_len;
- BUG_ON(con->out_kvec_bytes < skip);
- BUG_ON(!con->out_kvec_left);
- con->out_kvec_bytes -= skip;
- con->out_kvec_left--;
- }
+ dout("%s con %p reconnect_seq %llu\n", __func__, con, reconnect_seq);
+ while (!list_empty(&con->out_queue)) {
+ msg = list_first_entry(&con->out_queue, struct ceph_msg,
+ list_head);
+ if (msg->needs_out_seq)
+ break;
+ seq = le64_to_cpu(msg->hdr.seq);
+ if (seq > reconnect_seq)
+ break;
- return skip;
+ dout("%s con %p discarding msg %p seq %llu\n", __func__, con,
+ msg, seq);
+ ceph_msg_remove(msg);
+ }
}
#ifdef CONFIG_BLOCK
@@ -1113,10 +1021,9 @@ static void __ceph_msg_data_cursor_init(struct ceph_msg_data_cursor *cursor)
cursor->need_crc = true;
}
-static void ceph_msg_data_cursor_init(struct ceph_msg *msg, size_t length)
+void ceph_msg_data_cursor_init(struct ceph_msg_data_cursor *cursor,
+ struct ceph_msg *msg, size_t length)
{
- struct ceph_msg_data_cursor *cursor = &msg->cursor;
-
BUG_ON(!length);
BUG_ON(length > msg->data_length);
BUG_ON(!msg->num_data_items);
@@ -1132,9 +1039,9 @@ static void ceph_msg_data_cursor_init(struct ceph_msg *msg, size_t length)
* data item, and supply the page offset and length of that piece.
* Indicate whether this is the last piece in this data item.
*/
-static struct page *ceph_msg_data_next(struct ceph_msg_data_cursor *cursor,
- size_t *page_offset, size_t *length,
- bool *last_piece)
+struct page *ceph_msg_data_next(struct ceph_msg_data_cursor *cursor,
+ size_t *page_offset, size_t *length,
+ bool *last_piece)
{
struct page *page;
@@ -1173,8 +1080,7 @@ static struct page *ceph_msg_data_next(struct ceph_msg_data_cursor *cursor,
* Returns true if the result moves the cursor on to the next piece
* of the data item.
*/
-static void ceph_msg_data_advance(struct ceph_msg_data_cursor *cursor,
- size_t bytes)
+void ceph_msg_data_advance(struct ceph_msg_data_cursor *cursor, size_t bytes)
{
bool new_piece;
@@ -1210,328 +1116,8 @@ static void ceph_msg_data_advance(struct ceph_msg_data_cursor *cursor,
cursor->need_crc = new_piece;
}
-static size_t sizeof_footer(struct ceph_connection *con)
-{
- return (con->peer_features & CEPH_FEATURE_MSG_AUTH) ?
- sizeof(struct ceph_msg_footer) :
- sizeof(struct ceph_msg_footer_old);
-}
-
-static void prepare_message_data(struct ceph_msg *msg, u32 data_len)
-{
- /* Initialize data cursor */
-
- ceph_msg_data_cursor_init(msg, (size_t)data_len);
-}
-
-/*
- * Prepare footer for currently outgoing message, and finish things
- * off. Assumes out_kvec* are already valid.. we just add on to the end.
- */
-static void prepare_write_message_footer(struct ceph_connection *con)
-{
- struct ceph_msg *m = con->out_msg;
-
- m->footer.flags |= CEPH_MSG_FOOTER_COMPLETE;
-
- dout("prepare_write_message_footer %p\n", con);
- con_out_kvec_add(con, sizeof_footer(con), &m->footer);
- if (con->peer_features & CEPH_FEATURE_MSG_AUTH) {
- if (con->ops->sign_message)
- con->ops->sign_message(m);
- else
- m->footer.sig = 0;
- } else {
- m->old_footer.flags = m->footer.flags;
- }
- con->out_more = m->more_to_follow;
- con->out_msg_done = true;
-}
-
-/*
- * Prepare headers for the next outgoing message.
- */
-static void prepare_write_message(struct ceph_connection *con)
-{
- struct ceph_msg *m;
- u32 crc;
-
- con_out_kvec_reset(con);
- con->out_msg_done = false;
-
- /* Sneak an ack in there first? If we can get it into the same
- * TCP packet that's a good thing. */
- if (con->in_seq > con->in_seq_acked) {
- con->in_seq_acked = con->in_seq;
- con_out_kvec_add(con, sizeof (tag_ack), &tag_ack);
- con->out_temp_ack = cpu_to_le64(con->in_seq_acked);
- con_out_kvec_add(con, sizeof (con->out_temp_ack),
- &con->out_temp_ack);
- }
-
- BUG_ON(list_empty(&con->out_queue));
- m = list_first_entry(&con->out_queue, struct ceph_msg, list_head);
- con->out_msg = m;
- BUG_ON(m->con != con);
-
- /* put message on sent list */
- ceph_msg_get(m);
- list_move_tail(&m->list_head, &con->out_sent);
-
- /*
- * only assign outgoing seq # if we haven't sent this message
- * yet. if it is requeued, resend with it's original seq.
- */
- if (m->needs_out_seq) {
- m->hdr.seq = cpu_to_le64(++con->out_seq);
- m->needs_out_seq = false;
-
- if (con->ops->reencode_message)
- con->ops->reencode_message(m);
- }
-
- dout("prepare_write_message %p seq %lld type %d len %d+%d+%zd\n",
- m, con->out_seq, le16_to_cpu(m->hdr.type),
- le32_to_cpu(m->hdr.front_len), le32_to_cpu(m->hdr.middle_len),
- m->data_length);
- WARN_ON(m->front.iov_len != le32_to_cpu(m->hdr.front_len));
- WARN_ON(m->data_length != le32_to_cpu(m->hdr.data_len));
-
- /* tag + hdr + front + middle */
- con_out_kvec_add(con, sizeof (tag_msg), &tag_msg);
- con_out_kvec_add(con, sizeof(con->out_hdr), &con->out_hdr);
- con_out_kvec_add(con, m->front.iov_len, m->front.iov_base);
-
- if (m->middle)
- con_out_kvec_add(con, m->middle->vec.iov_len,
- m->middle->vec.iov_base);
-
- /* fill in hdr crc and finalize hdr */
- crc = crc32c(0, &m->hdr, offsetof(struct ceph_msg_header, crc));
- con->out_msg->hdr.crc = cpu_to_le32(crc);
- memcpy(&con->out_hdr, &con->out_msg->hdr, sizeof(con->out_hdr));
-
- /* fill in front and middle crc, footer */
- crc = crc32c(0, m->front.iov_base, m->front.iov_len);
- con->out_msg->footer.front_crc = cpu_to_le32(crc);
- if (m->middle) {
- crc = crc32c(0, m->middle->vec.iov_base,
- m->middle->vec.iov_len);
- con->out_msg->footer.middle_crc = cpu_to_le32(crc);
- } else
- con->out_msg->footer.middle_crc = 0;
- dout("%s front_crc %u middle_crc %u\n", __func__,
- le32_to_cpu(con->out_msg->footer.front_crc),
- le32_to_cpu(con->out_msg->footer.middle_crc));
- con->out_msg->footer.flags = 0;
-
- /* is there a data payload? */
- con->out_msg->footer.data_crc = 0;
- if (m->data_length) {
- prepare_message_data(con->out_msg, m->data_length);
- con->out_more = 1; /* data + footer will follow */
- } else {
- /* no, queue up footer too and be done */
- prepare_write_message_footer(con);
- }
-
- con_flag_set(con, CON_FLAG_WRITE_PENDING);
-}
-
-/*
- * Prepare an ack.
- */
-static void prepare_write_ack(struct ceph_connection *con)
-{
- dout("prepare_write_ack %p %llu -> %llu\n", con,
- con->in_seq_acked, con->in_seq);
- con->in_seq_acked = con->in_seq;
-
- con_out_kvec_reset(con);
-
- con_out_kvec_add(con, sizeof (tag_ack), &tag_ack);
-
- con->out_temp_ack = cpu_to_le64(con->in_seq_acked);
- con_out_kvec_add(con, sizeof (con->out_temp_ack),
- &con->out_temp_ack);
-
- con->out_more = 1; /* more will follow.. eventually.. */
- con_flag_set(con, CON_FLAG_WRITE_PENDING);
-}
-
-/*
- * Prepare to share the seq during handshake
- */
-static void prepare_write_seq(struct ceph_connection *con)
-{
- dout("prepare_write_seq %p %llu -> %llu\n", con,
- con->in_seq_acked, con->in_seq);
- con->in_seq_acked = con->in_seq;
-
- con_out_kvec_reset(con);
-
- con->out_temp_ack = cpu_to_le64(con->in_seq_acked);
- con_out_kvec_add(con, sizeof (con->out_temp_ack),
- &con->out_temp_ack);
-
- con_flag_set(con, CON_FLAG_WRITE_PENDING);
-}
-
-/*
- * Prepare to write keepalive byte.
- */
-static void prepare_write_keepalive(struct ceph_connection *con)
-{
- dout("prepare_write_keepalive %p\n", con);
- con_out_kvec_reset(con);
- if (con->peer_features & CEPH_FEATURE_MSGR_KEEPALIVE2) {
- struct timespec64 now;
-
- ktime_get_real_ts64(&now);
- con_out_kvec_add(con, sizeof(tag_keepalive2), &tag_keepalive2);
- ceph_encode_timespec64(&con->out_temp_keepalive2, &now);
- con_out_kvec_add(con, sizeof(con->out_temp_keepalive2),
- &con->out_temp_keepalive2);
- } else {
- con_out_kvec_add(con, sizeof(tag_keepalive), &tag_keepalive);
- }
- con_flag_set(con, CON_FLAG_WRITE_PENDING);
-}
-
-/*
- * Connection negotiation.
- */
-
-static int get_connect_authorizer(struct ceph_connection *con)
-{
- struct ceph_auth_handshake *auth;
- int auth_proto;
-
- if (!con->ops->get_authorizer) {
- con->auth = NULL;
- con->out_connect.authorizer_protocol = CEPH_AUTH_UNKNOWN;
- con->out_connect.authorizer_len = 0;
- return 0;
- }
-
- auth = con->ops->get_authorizer(con, &auth_proto, con->auth_retry);
- if (IS_ERR(auth))
- return PTR_ERR(auth);
-
- con->auth = auth;
- con->out_connect.authorizer_protocol = cpu_to_le32(auth_proto);
- con->out_connect.authorizer_len = cpu_to_le32(auth->authorizer_buf_len);
- return 0;
-}
-
-/*
- * We connected to a peer and are saying hello.
- */
-static void prepare_write_banner(struct ceph_connection *con)
-{
- con_out_kvec_add(con, strlen(CEPH_BANNER), CEPH_BANNER);
- con_out_kvec_add(con, sizeof (con->msgr->my_enc_addr),
- &con->msgr->my_enc_addr);
-
- con->out_more = 0;
- con_flag_set(con, CON_FLAG_WRITE_PENDING);
-}
-
-static void __prepare_write_connect(struct ceph_connection *con)
-{
- con_out_kvec_add(con, sizeof(con->out_connect), &con->out_connect);
- if (con->auth)
- con_out_kvec_add(con, con->auth->authorizer_buf_len,
- con->auth->authorizer_buf);
-
- con->out_more = 0;
- con_flag_set(con, CON_FLAG_WRITE_PENDING);
-}
-
-static int prepare_write_connect(struct ceph_connection *con)
-{
- unsigned int global_seq = get_global_seq(con->msgr, 0);
- int proto;
- int ret;
-
- switch (con->peer_name.type) {
- case CEPH_ENTITY_TYPE_MON:
- proto = CEPH_MONC_PROTOCOL;
- break;
- case CEPH_ENTITY_TYPE_OSD:
- proto = CEPH_OSDC_PROTOCOL;
- break;
- case CEPH_ENTITY_TYPE_MDS:
- proto = CEPH_MDSC_PROTOCOL;
- break;
- default:
- BUG();
- }
-
- dout("prepare_write_connect %p cseq=%d gseq=%d proto=%d\n", con,
- con->connect_seq, global_seq, proto);
-
- con->out_connect.features =
- cpu_to_le64(from_msgr(con->msgr)->supported_features);
- con->out_connect.host_type = cpu_to_le32(CEPH_ENTITY_TYPE_CLIENT);
- con->out_connect.connect_seq = cpu_to_le32(con->connect_seq);
- con->out_connect.global_seq = cpu_to_le32(global_seq);
- con->out_connect.protocol_version = cpu_to_le32(proto);
- con->out_connect.flags = 0;
-
- ret = get_connect_authorizer(con);
- if (ret)
- return ret;
-
- __prepare_write_connect(con);
- return 0;
-}
-
-/*
- * write as much of pending kvecs to the socket as we can.
- * 1 -> done
- * 0 -> socket full, but more to do
- * <0 -> error
- */
-static int write_partial_kvec(struct ceph_connection *con)
-{
- int ret;
-
- dout("write_partial_kvec %p %d left\n", con, con->out_kvec_bytes);
- while (con->out_kvec_bytes > 0) {
- ret = ceph_tcp_sendmsg(con->sock, con->out_kvec_cur,
- con->out_kvec_left, con->out_kvec_bytes,
- con->out_more);
- if (ret <= 0)
- goto out;
- con->out_kvec_bytes -= ret;
- if (con->out_kvec_bytes == 0)
- break; /* done */
-
- /* account for full iov entries consumed */
- while (ret >= con->out_kvec_cur->iov_len) {
- BUG_ON(!con->out_kvec_left);
- ret -= con->out_kvec_cur->iov_len;
- con->out_kvec_cur++;
- con->out_kvec_left--;
- }
- /* and for a partially-consumed entry */
- if (ret) {
- con->out_kvec_cur->iov_len -= ret;
- con->out_kvec_cur->iov_base += ret;
- }
- }
- con->out_kvec_left = 0;
- ret = 1;
-out:
- dout("write_partial_kvec %p %d left in %d kvecs ret = %d\n", con,
- con->out_kvec_bytes, con->out_kvec_left, ret);
- return ret; /* done! */
-}
-
-static u32 ceph_crc32c_page(u32 crc, struct page *page,
- unsigned int page_offset,
- unsigned int length)
+u32 ceph_crc32c_page(u32 crc, struct page *page, unsigned int page_offset,
+ unsigned int length)
{
char *kaddr;
@@ -1542,257 +1128,8 @@ static u32 ceph_crc32c_page(u32 crc, struct page *page,
return crc;
}
-/*
- * Write as much message data payload as we can. If we finish, queue
- * up the footer.
- * 1 -> done, footer is now queued in out_kvec[].
- * 0 -> socket full, but more to do
- * <0 -> error
- */
-static int write_partial_message_data(struct ceph_connection *con)
-{
- struct ceph_msg *msg = con->out_msg;
- struct ceph_msg_data_cursor *cursor = &msg->cursor;
- bool do_datacrc = !ceph_test_opt(from_msgr(con->msgr), NOCRC);
- int more = MSG_MORE | MSG_SENDPAGE_NOTLAST;
- u32 crc;
-
- dout("%s %p msg %p\n", __func__, con, msg);
-
- if (!msg->num_data_items)
- return -EINVAL;
-
- /*
- * Iterate through each page that contains data to be
- * written, and send as much as possible for each.
- *
- * If we are calculating the data crc (the default), we will
- * need to map the page. If we have no pages, they have
- * been revoked, so use the zero page.
- */
- crc = do_datacrc ? le32_to_cpu(msg->footer.data_crc) : 0;
- while (cursor->total_resid) {
- struct page *page;
- size_t page_offset;
- size_t length;
- int ret;
-
- if (!cursor->resid) {
- ceph_msg_data_advance(cursor, 0);
- continue;
- }
-
- page = ceph_msg_data_next(cursor, &page_offset, &length, NULL);
- if (length == cursor->total_resid)
- more = MSG_MORE;
- ret = ceph_tcp_sendpage(con->sock, page, page_offset, length,
- more);
- if (ret <= 0) {
- if (do_datacrc)
- msg->footer.data_crc = cpu_to_le32(crc);
-
- return ret;
- }
- if (do_datacrc && cursor->need_crc)
- crc = ceph_crc32c_page(crc, page, page_offset, length);
- ceph_msg_data_advance(cursor, (size_t)ret);
- }
-
- dout("%s %p msg %p done\n", __func__, con, msg);
-
- /* prepare and queue up footer, too */
- if (do_datacrc)
- msg->footer.data_crc = cpu_to_le32(crc);
- else
- msg->footer.flags |= CEPH_MSG_FOOTER_NOCRC;
- con_out_kvec_reset(con);
- prepare_write_message_footer(con);
-
- return 1; /* must return > 0 to indicate success */
-}
-
-/*
- * write some zeros
- */
-static int write_partial_skip(struct ceph_connection *con)
-{
- int more = MSG_MORE | MSG_SENDPAGE_NOTLAST;
- int ret;
-
- dout("%s %p %d left\n", __func__, con, con->out_skip);
- while (con->out_skip > 0) {
- size_t size = min(con->out_skip, (int) PAGE_SIZE);
-
- if (size == con->out_skip)
- more = MSG_MORE;
- ret = ceph_tcp_sendpage(con->sock, zero_page, 0, size, more);
- if (ret <= 0)
- goto out;
- con->out_skip -= ret;
- }
- ret = 1;
-out:
- return ret;
-}
-
-/*
- * Prepare to read connection handshake, or an ack.
- */
-static void prepare_read_banner(struct ceph_connection *con)
-{
- dout("prepare_read_banner %p\n", con);
- con->in_base_pos = 0;
-}
-
-static void prepare_read_connect(struct ceph_connection *con)
-{
- dout("prepare_read_connect %p\n", con);
- con->in_base_pos = 0;
-}
-
-static void prepare_read_ack(struct ceph_connection *con)
-{
- dout("prepare_read_ack %p\n", con);
- con->in_base_pos = 0;
-}
-
-static void prepare_read_seq(struct ceph_connection *con)
-{
- dout("prepare_read_seq %p\n", con);
- con->in_base_pos = 0;
- con->in_tag = CEPH_MSGR_TAG_SEQ;
-}
-
-static void prepare_read_tag(struct ceph_connection *con)
-{
- dout("prepare_read_tag %p\n", con);
- con->in_base_pos = 0;
- con->in_tag = CEPH_MSGR_TAG_READY;
-}
-
-static void prepare_read_keepalive_ack(struct ceph_connection *con)
-{
- dout("prepare_read_keepalive_ack %p\n", con);
- con->in_base_pos = 0;
-}
-
-/*
- * Prepare to read a message.
- */
-static int prepare_read_message(struct ceph_connection *con)
-{
- dout("prepare_read_message %p\n", con);
- BUG_ON(con->in_msg != NULL);
- con->in_base_pos = 0;
- con->in_front_crc = con->in_middle_crc = con->in_data_crc = 0;
- return 0;
-}
-
-
-static int read_partial(struct ceph_connection *con,
- int end, int size, void *object)
-{
- while (con->in_base_pos < end) {
- int left = end - con->in_base_pos;
- int have = size - left;
- int ret = ceph_tcp_recvmsg(con->sock, object + have, left);
- if (ret <= 0)
- return ret;
- con->in_base_pos += ret;
- }
- return 1;
-}
-
-
-/*
- * Read all or part of the connect-side handshake on a new connection
- */
-static int read_partial_banner(struct ceph_connection *con)
-{
- int size;
- int end;
- int ret;
-
- dout("read_partial_banner %p at %d\n", con, con->in_base_pos);
-
- /* peer's banner */
- size = strlen(CEPH_BANNER);
- end = size;
- ret = read_partial(con, end, size, con->in_banner);
- if (ret <= 0)
- goto out;
-
- size = sizeof (con->actual_peer_addr);
- end += size;
- ret = read_partial(con, end, size, &con->actual_peer_addr);
- if (ret <= 0)
- goto out;
- ceph_decode_banner_addr(&con->actual_peer_addr);
-
- size = sizeof (con->peer_addr_for_me);
- end += size;
- ret = read_partial(con, end, size, &con->peer_addr_for_me);
- if (ret <= 0)
- goto out;
- ceph_decode_banner_addr(&con->peer_addr_for_me);
-
-out:
- return ret;
-}
-
-static int read_partial_connect(struct ceph_connection *con)
-{
- int size;
- int end;
- int ret;
-
- dout("read_partial_connect %p at %d\n", con, con->in_base_pos);
-
- size = sizeof (con->in_reply);
- end = size;
- ret = read_partial(con, end, size, &con->in_reply);
- if (ret <= 0)
- goto out;
-
- if (con->auth) {
- size = le32_to_cpu(con->in_reply.authorizer_len);
- if (size > con->auth->authorizer_reply_buf_len) {
- pr_err("authorizer reply too big: %d > %zu\n", size,
- con->auth->authorizer_reply_buf_len);
- ret = -EINVAL;
- goto out;
- }
-
- end += size;
- ret = read_partial(con, end, size,
- con->auth->authorizer_reply_buf);
- if (ret <= 0)
- goto out;
- }
-
- dout("read_partial_connect %p tag %d, con_seq = %u, g_seq = %u\n",
- con, (int)con->in_reply.tag,
- le32_to_cpu(con->in_reply.connect_seq),
- le32_to_cpu(con->in_reply.global_seq));
-out:
- return ret;
-}
-/*
- * Verify the hello banner looks okay.
- */
-static int verify_hello(struct ceph_connection *con)
-{
- if (memcmp(con->in_banner, CEPH_BANNER, strlen(CEPH_BANNER))) {
- pr_err("connect to %s got bad banner\n",
- ceph_pr_addr(&con->peer_addr));
- con->error_msg = "protocol error, bad banner";
- return -1;
- }
- return 0;
-}
-
-static bool addr_is_blank(struct ceph_entity_addr *addr)
+bool ceph_addr_is_blank(const struct ceph_entity_addr *addr)
{
struct sockaddr_storage ss = addr->in_addr; /* align */
struct in_addr *addr4 = &((struct sockaddr_in *)&ss)->sin_addr;
@@ -1808,7 +1145,7 @@ static bool addr_is_blank(struct ceph_entity_addr *addr)
}
}
-static int addr_port(struct ceph_entity_addr *addr)
+int ceph_addr_port(const struct ceph_entity_addr *addr)
{
switch (get_unaligned(&addr->in_addr.ss_family)) {
case AF_INET:
@@ -1819,7 +1156,7 @@ static int addr_port(struct ceph_entity_addr *addr)
return 0;
}
-static void addr_set_port(struct ceph_entity_addr *addr, int p)
+void ceph_addr_set_port(struct ceph_entity_addr *addr, int p)
{
switch (get_unaligned(&addr->in_addr.ss_family)) {
case AF_INET:
@@ -1977,8 +1314,17 @@ int ceph_parse_ips(const char *c, const char *end,
port = CEPH_MON_PORT;
}
- addr_set_port(&addr[i], port);
+ ceph_addr_set_port(&addr[i], port);
+ /*
+ * We want the type to be set according to ms_mode
+ * option, but options are normally parsed after mon
+ * addresses. Rather than complicating parsing, set
+ * to LEGACY and override in build_initial_monmap()
+ * for mon addresses and ceph_messenger_init() for
+ * ip option.
+ */
addr[i].type = CEPH_ENTITY_ADDR_TYPE_LEGACY;
+ addr[i].nonce = 0;
dout("parse_ips got %s\n", ceph_pr_addr(&addr[i]));
@@ -2000,521 +1346,12 @@ bad:
return ret;
}
-static int process_banner(struct ceph_connection *con)
-{
- dout("process_banner on %p\n", con);
-
- if (verify_hello(con) < 0)
- return -1;
-
- /*
- * Make sure the other end is who we wanted. note that the other
- * end may not yet know their ip address, so if it's 0.0.0.0, give
- * them the benefit of the doubt.
- */
- if (memcmp(&con->peer_addr, &con->actual_peer_addr,
- sizeof(con->peer_addr)) != 0 &&
- !(addr_is_blank(&con->actual_peer_addr) &&
- con->actual_peer_addr.nonce == con->peer_addr.nonce)) {
- pr_warn("wrong peer, want %s/%u, got %s/%u\n",
- ceph_pr_addr(&con->peer_addr),
- le32_to_cpu(con->peer_addr.nonce),
- ceph_pr_addr(&con->actual_peer_addr),
- le32_to_cpu(con->actual_peer_addr.nonce));
- con->error_msg = "wrong peer at address";
- return -1;
- }
-
- /*
- * did we learn our address?
- */
- if (addr_is_blank(&con->msgr->inst.addr)) {
- int port = addr_port(&con->msgr->inst.addr);
-
- memcpy(&con->msgr->inst.addr.in_addr,
- &con->peer_addr_for_me.in_addr,
- sizeof(con->peer_addr_for_me.in_addr));
- addr_set_port(&con->msgr->inst.addr, port);
- encode_my_addr(con->msgr);
- dout("process_banner learned my addr is %s\n",
- ceph_pr_addr(&con->msgr->inst.addr));
- }
-
- return 0;
-}
-
-static int process_connect(struct ceph_connection *con)
-{
- u64 sup_feat = from_msgr(con->msgr)->supported_features;
- u64 req_feat = from_msgr(con->msgr)->required_features;
- u64 server_feat = le64_to_cpu(con->in_reply.features);
- int ret;
-
- dout("process_connect on %p tag %d\n", con, (int)con->in_tag);
-
- if (con->auth) {
- int len = le32_to_cpu(con->in_reply.authorizer_len);
-
- /*
- * Any connection that defines ->get_authorizer()
- * should also define ->add_authorizer_challenge() and
- * ->verify_authorizer_reply().
- *
- * See get_connect_authorizer().
- */
- if (con->in_reply.tag == CEPH_MSGR_TAG_CHALLENGE_AUTHORIZER) {
- ret = con->ops->add_authorizer_challenge(
- con, con->auth->authorizer_reply_buf, len);
- if (ret < 0)
- return ret;
-
- con_out_kvec_reset(con);
- __prepare_write_connect(con);
- prepare_read_connect(con);
- return 0;
- }
-
- if (len) {
- ret = con->ops->verify_authorizer_reply(con);
- if (ret < 0) {
- con->error_msg = "bad authorize reply";
- return ret;
- }
- }
- }
-
- switch (con->in_reply.tag) {
- case CEPH_MSGR_TAG_FEATURES:
- pr_err("%s%lld %s feature set mismatch,"
- " my %llx < server's %llx, missing %llx\n",
- ENTITY_NAME(con->peer_name),
- ceph_pr_addr(&con->peer_addr),
- sup_feat, server_feat, server_feat & ~sup_feat);
- con->error_msg = "missing required protocol features";
- reset_connection(con);
- return -1;
-
- case CEPH_MSGR_TAG_BADPROTOVER:
- pr_err("%s%lld %s protocol version mismatch,"
- " my %d != server's %d\n",
- ENTITY_NAME(con->peer_name),
- ceph_pr_addr(&con->peer_addr),
- le32_to_cpu(con->out_connect.protocol_version),
- le32_to_cpu(con->in_reply.protocol_version));
- con->error_msg = "protocol version mismatch";
- reset_connection(con);
- return -1;
-
- case CEPH_MSGR_TAG_BADAUTHORIZER:
- con->auth_retry++;
- dout("process_connect %p got BADAUTHORIZER attempt %d\n", con,
- con->auth_retry);
- if (con->auth_retry == 2) {
- con->error_msg = "connect authorization failure";
- return -1;
- }
- con_out_kvec_reset(con);
- ret = prepare_write_connect(con);
- if (ret < 0)
- return ret;
- prepare_read_connect(con);
- break;
-
- case CEPH_MSGR_TAG_RESETSESSION:
- /*
- * If we connected with a large connect_seq but the peer
- * has no record of a session with us (no connection, or
- * connect_seq == 0), they will send RESETSESION to indicate
- * that they must have reset their session, and may have
- * dropped messages.
- */
- dout("process_connect got RESET peer seq %u\n",
- le32_to_cpu(con->in_reply.connect_seq));
- pr_err("%s%lld %s connection reset\n",
- ENTITY_NAME(con->peer_name),
- ceph_pr_addr(&con->peer_addr));
- reset_connection(con);
- con_out_kvec_reset(con);
- ret = prepare_write_connect(con);
- if (ret < 0)
- return ret;
- prepare_read_connect(con);
-
- /* Tell ceph about it. */
- mutex_unlock(&con->mutex);
- pr_info("reset on %s%lld\n", ENTITY_NAME(con->peer_name));
- if (con->ops->peer_reset)
- con->ops->peer_reset(con);
- mutex_lock(&con->mutex);
- if (con->state != CON_STATE_NEGOTIATING)
- return -EAGAIN;
- break;
-
- case CEPH_MSGR_TAG_RETRY_SESSION:
- /*
- * If we sent a smaller connect_seq than the peer has, try
- * again with a larger value.
- */
- dout("process_connect got RETRY_SESSION my seq %u, peer %u\n",
- le32_to_cpu(con->out_connect.connect_seq),
- le32_to_cpu(con->in_reply.connect_seq));
- con->connect_seq = le32_to_cpu(con->in_reply.connect_seq);
- con_out_kvec_reset(con);
- ret = prepare_write_connect(con);
- if (ret < 0)
- return ret;
- prepare_read_connect(con);
- break;
-
- case CEPH_MSGR_TAG_RETRY_GLOBAL:
- /*
- * If we sent a smaller global_seq than the peer has, try
- * again with a larger value.
- */
- dout("process_connect got RETRY_GLOBAL my %u peer_gseq %u\n",
- con->peer_global_seq,
- le32_to_cpu(con->in_reply.global_seq));
- get_global_seq(con->msgr,
- le32_to_cpu(con->in_reply.global_seq));
- con_out_kvec_reset(con);
- ret = prepare_write_connect(con);
- if (ret < 0)
- return ret;
- prepare_read_connect(con);
- break;
-
- case CEPH_MSGR_TAG_SEQ:
- case CEPH_MSGR_TAG_READY:
- if (req_feat & ~server_feat) {
- pr_err("%s%lld %s protocol feature mismatch,"
- " my required %llx > server's %llx, need %llx\n",
- ENTITY_NAME(con->peer_name),
- ceph_pr_addr(&con->peer_addr),
- req_feat, server_feat, req_feat & ~server_feat);
- con->error_msg = "missing required protocol features";
- reset_connection(con);
- return -1;
- }
-
- WARN_ON(con->state != CON_STATE_NEGOTIATING);
- con->state = CON_STATE_OPEN;
- con->auth_retry = 0; /* we authenticated; clear flag */
- con->peer_global_seq = le32_to_cpu(con->in_reply.global_seq);
- con->connect_seq++;
- con->peer_features = server_feat;
- dout("process_connect got READY gseq %d cseq %d (%d)\n",
- con->peer_global_seq,
- le32_to_cpu(con->in_reply.connect_seq),
- con->connect_seq);
- WARN_ON(con->connect_seq !=
- le32_to_cpu(con->in_reply.connect_seq));
-
- if (con->in_reply.flags & CEPH_MSG_CONNECT_LOSSY)
- con_flag_set(con, CON_FLAG_LOSSYTX);
-
- con->delay = 0; /* reset backoff memory */
-
- if (con->in_reply.tag == CEPH_MSGR_TAG_SEQ) {
- prepare_write_seq(con);
- prepare_read_seq(con);
- } else {
- prepare_read_tag(con);
- }
- break;
-
- case CEPH_MSGR_TAG_WAIT:
- /*
- * If there is a connection race (we are opening
- * connections to each other), one of us may just have
- * to WAIT. This shouldn't happen if we are the
- * client.
- */
- con->error_msg = "protocol error, got WAIT as client";
- return -1;
-
- default:
- con->error_msg = "protocol error, garbage tag during connect";
- return -1;
- }
- return 0;
-}
-
-
-/*
- * read (part of) an ack
- */
-static int read_partial_ack(struct ceph_connection *con)
-{
- int size = sizeof (con->in_temp_ack);
- int end = size;
-
- return read_partial(con, end, size, &con->in_temp_ack);
-}
-
-/*
- * We can finally discard anything that's been acked.
- */
-static void process_ack(struct ceph_connection *con)
-{
- struct ceph_msg *m;
- u64 ack = le64_to_cpu(con->in_temp_ack);
- u64 seq;
- bool reconnect = (con->in_tag == CEPH_MSGR_TAG_SEQ);
- struct list_head *list = reconnect ? &con->out_queue : &con->out_sent;
-
- /*
- * In the reconnect case, con_fault() has requeued messages
- * in out_sent. We should cleanup old messages according to
- * the reconnect seq.
- */
- while (!list_empty(list)) {
- m = list_first_entry(list, struct ceph_msg, list_head);
- if (reconnect && m->needs_out_seq)
- break;
- seq = le64_to_cpu(m->hdr.seq);
- if (seq > ack)
- break;
- dout("got ack for seq %llu type %d at %p\n", seq,
- le16_to_cpu(m->hdr.type), m);
- m->ack_stamp = jiffies;
- ceph_msg_remove(m);
- }
-
- prepare_read_tag(con);
-}
-
-
-static int read_partial_message_section(struct ceph_connection *con,
- struct kvec *section,
- unsigned int sec_len, u32 *crc)
-{
- int ret, left;
-
- BUG_ON(!section);
-
- while (section->iov_len < sec_len) {
- BUG_ON(section->iov_base == NULL);
- left = sec_len - section->iov_len;
- ret = ceph_tcp_recvmsg(con->sock, (char *)section->iov_base +
- section->iov_len, left);
- if (ret <= 0)
- return ret;
- section->iov_len += ret;
- }
- if (section->iov_len == sec_len)
- *crc = crc32c(0, section->iov_base, section->iov_len);
-
- return 1;
-}
-
-static int read_partial_msg_data(struct ceph_connection *con)
-{
- struct ceph_msg *msg = con->in_msg;
- struct ceph_msg_data_cursor *cursor = &msg->cursor;
- bool do_datacrc = !ceph_test_opt(from_msgr(con->msgr), NOCRC);
- struct page *page;
- size_t page_offset;
- size_t length;
- u32 crc = 0;
- int ret;
-
- if (!msg->num_data_items)
- return -EIO;
-
- if (do_datacrc)
- crc = con->in_data_crc;
- while (cursor->total_resid) {
- if (!cursor->resid) {
- ceph_msg_data_advance(cursor, 0);
- continue;
- }
-
- page = ceph_msg_data_next(cursor, &page_offset, &length, NULL);
- ret = ceph_tcp_recvpage(con->sock, page, page_offset, length);
- if (ret <= 0) {
- if (do_datacrc)
- con->in_data_crc = crc;
-
- return ret;
- }
-
- if (do_datacrc)
- crc = ceph_crc32c_page(crc, page, page_offset, ret);
- ceph_msg_data_advance(cursor, (size_t)ret);
- }
- if (do_datacrc)
- con->in_data_crc = crc;
-
- return 1; /* must return > 0 to indicate success */
-}
-
-/*
- * read (part of) a message.
- */
-static int ceph_con_in_msg_alloc(struct ceph_connection *con, int *skip);
-
-static int read_partial_message(struct ceph_connection *con)
-{
- struct ceph_msg *m = con->in_msg;
- int size;
- int end;
- int ret;
- unsigned int front_len, middle_len, data_len;
- bool do_datacrc = !ceph_test_opt(from_msgr(con->msgr), NOCRC);
- bool need_sign = (con->peer_features & CEPH_FEATURE_MSG_AUTH);
- u64 seq;
- u32 crc;
-
- dout("read_partial_message con %p msg %p\n", con, m);
-
- /* header */
- size = sizeof (con->in_hdr);
- end = size;
- ret = read_partial(con, end, size, &con->in_hdr);
- if (ret <= 0)
- return ret;
-
- crc = crc32c(0, &con->in_hdr, offsetof(struct ceph_msg_header, crc));
- if (cpu_to_le32(crc) != con->in_hdr.crc) {
- pr_err("read_partial_message bad hdr crc %u != expected %u\n",
- crc, con->in_hdr.crc);
- return -EBADMSG;
- }
-
- front_len = le32_to_cpu(con->in_hdr.front_len);
- if (front_len > CEPH_MSG_MAX_FRONT_LEN)
- return -EIO;
- middle_len = le32_to_cpu(con->in_hdr.middle_len);
- if (middle_len > CEPH_MSG_MAX_MIDDLE_LEN)
- return -EIO;
- data_len = le32_to_cpu(con->in_hdr.data_len);
- if (data_len > CEPH_MSG_MAX_DATA_LEN)
- return -EIO;
-
- /* verify seq# */
- seq = le64_to_cpu(con->in_hdr.seq);
- if ((s64)seq - (s64)con->in_seq < 1) {
- pr_info("skipping %s%lld %s seq %lld expected %lld\n",
- ENTITY_NAME(con->peer_name),
- ceph_pr_addr(&con->peer_addr),
- seq, con->in_seq + 1);
- con->in_base_pos = -front_len - middle_len - data_len -
- sizeof_footer(con);
- con->in_tag = CEPH_MSGR_TAG_READY;
- return 1;
- } else if ((s64)seq - (s64)con->in_seq > 1) {
- pr_err("read_partial_message bad seq %lld expected %lld\n",
- seq, con->in_seq + 1);
- con->error_msg = "bad message sequence # for incoming message";
- return -EBADE;
- }
-
- /* allocate message? */
- if (!con->in_msg) {
- int skip = 0;
-
- dout("got hdr type %d front %d data %d\n", con->in_hdr.type,
- front_len, data_len);
- ret = ceph_con_in_msg_alloc(con, &skip);
- if (ret < 0)
- return ret;
-
- BUG_ON(!con->in_msg ^ skip);
- if (skip) {
- /* skip this message */
- dout("alloc_msg said skip message\n");
- con->in_base_pos = -front_len - middle_len - data_len -
- sizeof_footer(con);
- con->in_tag = CEPH_MSGR_TAG_READY;
- con->in_seq++;
- return 1;
- }
-
- BUG_ON(!con->in_msg);
- BUG_ON(con->in_msg->con != con);
- m = con->in_msg;
- m->front.iov_len = 0; /* haven't read it yet */
- if (m->middle)
- m->middle->vec.iov_len = 0;
-
- /* prepare for data payload, if any */
-
- if (data_len)
- prepare_message_data(con->in_msg, data_len);
- }
-
- /* front */
- ret = read_partial_message_section(con, &m->front, front_len,
- &con->in_front_crc);
- if (ret <= 0)
- return ret;
-
- /* middle */
- if (m->middle) {
- ret = read_partial_message_section(con, &m->middle->vec,
- middle_len,
- &con->in_middle_crc);
- if (ret <= 0)
- return ret;
- }
-
- /* (page) data */
- if (data_len) {
- ret = read_partial_msg_data(con);
- if (ret <= 0)
- return ret;
- }
-
- /* footer */
- size = sizeof_footer(con);
- end += size;
- ret = read_partial(con, end, size, &m->footer);
- if (ret <= 0)
- return ret;
-
- if (!need_sign) {
- m->footer.flags = m->old_footer.flags;
- m->footer.sig = 0;
- }
-
- dout("read_partial_message got msg %p %d (%u) + %d (%u) + %d (%u)\n",
- m, front_len, m->footer.front_crc, middle_len,
- m->footer.middle_crc, data_len, m->footer.data_crc);
-
- /* crc ok? */
- if (con->in_front_crc != le32_to_cpu(m->footer.front_crc)) {
- pr_err("read_partial_message %p front crc %u != exp. %u\n",
- m, con->in_front_crc, m->footer.front_crc);
- return -EBADMSG;
- }
- if (con->in_middle_crc != le32_to_cpu(m->footer.middle_crc)) {
- pr_err("read_partial_message %p middle crc %u != exp %u\n",
- m, con->in_middle_crc, m->footer.middle_crc);
- return -EBADMSG;
- }
- if (do_datacrc &&
- (m->footer.flags & CEPH_MSG_FOOTER_NOCRC) == 0 &&
- con->in_data_crc != le32_to_cpu(m->footer.data_crc)) {
- pr_err("read_partial_message %p data crc %u != exp. %u\n", m,
- con->in_data_crc, le32_to_cpu(m->footer.data_crc));
- return -EBADMSG;
- }
-
- if (need_sign && con->ops->check_message_signature &&
- con->ops->check_message_signature(m)) {
- pr_err("read_partial_message %p signature check failed\n", m);
- return -EBADMSG;
- }
-
- return 1; /* done! */
-}
-
/*
* Process message. This happens in the worker thread. The callback should
* be careful not to do anything that waits on other incoming messages or it
* may deadlock.
*/
-static void process_message(struct ceph_connection *con)
+void ceph_con_process_message(struct ceph_connection *con)
{
struct ceph_msg *msg = con->in_msg;
@@ -2528,12 +1365,13 @@ static void process_message(struct ceph_connection *con)
con->in_seq++;
mutex_unlock(&con->mutex);
- dout("===== %p %llu from %s%lld %d=%s len %d+%d (%u %u %u) =====\n",
+ dout("===== %p %llu from %s%lld %d=%s len %d+%d+%d (%u %u %u) =====\n",
msg, le64_to_cpu(msg->hdr.seq),
ENTITY_NAME(msg->hdr.src),
le16_to_cpu(msg->hdr.type),
ceph_msg_type_name(le16_to_cpu(msg->hdr.type)),
le32_to_cpu(msg->hdr.front_len),
+ le32_to_cpu(msg->hdr.middle_len),
le32_to_cpu(msg->hdr.data_len),
con->in_front_crc, con->in_middle_crc, con->in_data_crc);
con->ops->dispatch(con, msg);
@@ -2541,264 +1379,6 @@ static void process_message(struct ceph_connection *con)
mutex_lock(&con->mutex);
}
-static int read_keepalive_ack(struct ceph_connection *con)
-{
- struct ceph_timespec ceph_ts;
- size_t size = sizeof(ceph_ts);
- int ret = read_partial(con, size, size, &ceph_ts);
- if (ret <= 0)
- return ret;
- ceph_decode_timespec64(&con->last_keepalive_ack, &ceph_ts);
- prepare_read_tag(con);
- return 1;
-}
-
-/*
- * Write something to the socket. Called in a worker thread when the
- * socket appears to be writeable and we have something ready to send.
- */
-static int try_write(struct ceph_connection *con)
-{
- int ret = 1;
-
- dout("try_write start %p state %lu\n", con, con->state);
- if (con->state != CON_STATE_PREOPEN &&
- con->state != CON_STATE_CONNECTING &&
- con->state != CON_STATE_NEGOTIATING &&
- con->state != CON_STATE_OPEN)
- return 0;
-
- /* open the socket first? */
- if (con->state == CON_STATE_PREOPEN) {
- BUG_ON(con->sock);
- con->state = CON_STATE_CONNECTING;
-
- con_out_kvec_reset(con);
- prepare_write_banner(con);
- prepare_read_banner(con);
-
- BUG_ON(con->in_msg);
- con->in_tag = CEPH_MSGR_TAG_READY;
- dout("try_write initiating connect on %p new state %lu\n",
- con, con->state);
- ret = ceph_tcp_connect(con);
- if (ret < 0) {
- con->error_msg = "connect error";
- goto out;
- }
- }
-
-more:
- dout("try_write out_kvec_bytes %d\n", con->out_kvec_bytes);
- BUG_ON(!con->sock);
-
- /* kvec data queued? */
- if (con->out_kvec_left) {
- ret = write_partial_kvec(con);
- if (ret <= 0)
- goto out;
- }
- if (con->out_skip) {
- ret = write_partial_skip(con);
- if (ret <= 0)
- goto out;
- }
-
- /* msg pages? */
- if (con->out_msg) {
- if (con->out_msg_done) {
- ceph_msg_put(con->out_msg);
- con->out_msg = NULL; /* we're done with this one */
- goto do_next;
- }
-
- ret = write_partial_message_data(con);
- if (ret == 1)
- goto more; /* we need to send the footer, too! */
- if (ret == 0)
- goto out;
- if (ret < 0) {
- dout("try_write write_partial_message_data err %d\n",
- ret);
- goto out;
- }
- }
-
-do_next:
- if (con->state == CON_STATE_OPEN) {
- if (con_flag_test_and_clear(con, CON_FLAG_KEEPALIVE_PENDING)) {
- prepare_write_keepalive(con);
- goto more;
- }
- /* is anything else pending? */
- if (!list_empty(&con->out_queue)) {
- prepare_write_message(con);
- goto more;
- }
- if (con->in_seq > con->in_seq_acked) {
- prepare_write_ack(con);
- goto more;
- }
- }
-
- /* Nothing to do! */
- con_flag_clear(con, CON_FLAG_WRITE_PENDING);
- dout("try_write nothing else to write.\n");
- ret = 0;
-out:
- dout("try_write done on %p ret %d\n", con, ret);
- return ret;
-}
-
-/*
- * Read what we can from the socket.
- */
-static int try_read(struct ceph_connection *con)
-{
- int ret = -1;
-
-more:
- dout("try_read start on %p state %lu\n", con, con->state);
- if (con->state != CON_STATE_CONNECTING &&
- con->state != CON_STATE_NEGOTIATING &&
- con->state != CON_STATE_OPEN)
- return 0;
-
- BUG_ON(!con->sock);
-
- dout("try_read tag %d in_base_pos %d\n", (int)con->in_tag,
- con->in_base_pos);
-
- if (con->state == CON_STATE_CONNECTING) {
- dout("try_read connecting\n");
- ret = read_partial_banner(con);
- if (ret <= 0)
- goto out;
- ret = process_banner(con);
- if (ret < 0)
- goto out;
-
- con->state = CON_STATE_NEGOTIATING;
-
- /*
- * Received banner is good, exchange connection info.
- * Do not reset out_kvec, as sending our banner raced
- * with receiving peer banner after connect completed.
- */
- ret = prepare_write_connect(con);
- if (ret < 0)
- goto out;
- prepare_read_connect(con);
-
- /* Send connection info before awaiting response */
- goto out;
- }
-
- if (con->state == CON_STATE_NEGOTIATING) {
- dout("try_read negotiating\n");
- ret = read_partial_connect(con);
- if (ret <= 0)
- goto out;
- ret = process_connect(con);
- if (ret < 0)
- goto out;
- goto more;
- }
-
- WARN_ON(con->state != CON_STATE_OPEN);
-
- if (con->in_base_pos < 0) {
- /*
- * skipping + discarding content.
- */
- ret = ceph_tcp_recvmsg(con->sock, NULL, -con->in_base_pos);
- if (ret <= 0)
- goto out;
- dout("skipped %d / %d bytes\n", ret, -con->in_base_pos);
- con->in_base_pos += ret;
- if (con->in_base_pos)
- goto more;
- }
- if (con->in_tag == CEPH_MSGR_TAG_READY) {
- /*
- * what's next?
- */
- ret = ceph_tcp_recvmsg(con->sock, &con->in_tag, 1);
- if (ret <= 0)
- goto out;
- dout("try_read got tag %d\n", (int)con->in_tag);
- switch (con->in_tag) {
- case CEPH_MSGR_TAG_MSG:
- prepare_read_message(con);
- break;
- case CEPH_MSGR_TAG_ACK:
- prepare_read_ack(con);
- break;
- case CEPH_MSGR_TAG_KEEPALIVE2_ACK:
- prepare_read_keepalive_ack(con);
- break;
- case CEPH_MSGR_TAG_CLOSE:
- con_close_socket(con);
- con->state = CON_STATE_CLOSED;
- goto out;
- default:
- goto bad_tag;
- }
- }
- if (con->in_tag == CEPH_MSGR_TAG_MSG) {
- ret = read_partial_message(con);
- if (ret <= 0) {
- switch (ret) {
- case -EBADMSG:
- con->error_msg = "bad crc/signature";
- fallthrough;
- case -EBADE:
- ret = -EIO;
- break;
- case -EIO:
- con->error_msg = "io error";
- break;
- }
- goto out;
- }
- if (con->in_tag == CEPH_MSGR_TAG_READY)
- goto more;
- process_message(con);
- if (con->state == CON_STATE_OPEN)
- prepare_read_tag(con);
- goto more;
- }
- if (con->in_tag == CEPH_MSGR_TAG_ACK ||
- con->in_tag == CEPH_MSGR_TAG_SEQ) {
- /*
- * the final handshake seq exchange is semantically
- * equivalent to an ACK
- */
- ret = read_partial_ack(con);
- if (ret <= 0)
- goto out;
- process_ack(con);
- goto more;
- }
- if (con->in_tag == CEPH_MSGR_TAG_KEEPALIVE2_ACK) {
- ret = read_keepalive_ack(con);
- if (ret <= 0)
- goto out;
- goto more;
- }
-
-out:
- dout("try_read done on %p ret %d\n", con, ret);
- return ret;
-
-bad_tag:
- pr_err("try_read bad con->in_tag = %d\n", (int)con->in_tag);
- con->error_msg = "protocol error, garbage tag";
- ret = -1;
- goto out;
-}
-
-
/*
* Atomically queue work on a connection after the specified delay.
* Bump @con reference to avoid races with connection teardown.
@@ -2811,6 +1391,9 @@ static int queue_con_delay(struct ceph_connection *con, unsigned long delay)
return -ENOENT;
}
+ if (delay >= HZ)
+ delay = round_jiffies_relative(delay);
+
dout("%s %p %lu\n", __func__, con, delay);
if (!queue_delayed_work(ceph_msgr_wq, &con->work, delay)) {
dout("%s %p - already queued\n", __func__, con);
@@ -2836,27 +1419,30 @@ static void cancel_con(struct ceph_connection *con)
static bool con_sock_closed(struct ceph_connection *con)
{
- if (!con_flag_test_and_clear(con, CON_FLAG_SOCK_CLOSED))
+ if (!ceph_con_flag_test_and_clear(con, CEPH_CON_F_SOCK_CLOSED))
return false;
#define CASE(x) \
- case CON_STATE_ ## x: \
+ case CEPH_CON_S_ ## x: \
con->error_msg = "socket closed (con state " #x ")"; \
break;
switch (con->state) {
CASE(CLOSED);
CASE(PREOPEN);
- CASE(CONNECTING);
- CASE(NEGOTIATING);
+ CASE(V1_BANNER);
+ CASE(V1_CONNECT_MSG);
+ CASE(V2_BANNER_PREFIX);
+ CASE(V2_BANNER_PAYLOAD);
+ CASE(V2_HELLO);
+ CASE(V2_AUTH);
+ CASE(V2_AUTH_SIGNATURE);
+ CASE(V2_SESSION_CONNECT);
+ CASE(V2_SESSION_RECONNECT);
CASE(OPEN);
CASE(STANDBY);
default:
- pr_warn("%s con %p unrecognized state %lu\n",
- __func__, con, con->state);
- con->error_msg = "unrecognized con state";
BUG();
- break;
}
#undef CASE
@@ -2867,15 +1453,15 @@ static bool con_backoff(struct ceph_connection *con)
{
int ret;
- if (!con_flag_test_and_clear(con, CON_FLAG_BACKOFF))
+ if (!ceph_con_flag_test_and_clear(con, CEPH_CON_F_BACKOFF))
return false;
- ret = queue_con_delay(con, round_jiffies_relative(con->delay));
+ ret = queue_con_delay(con, con->delay);
if (ret) {
dout("%s: con %p FAILED to back off %lu\n", __func__,
con, con->delay);
BUG_ON(ret == -ENOENT);
- con_flag_set(con, CON_FLAG_BACKOFF);
+ ceph_con_flag_set(con, CEPH_CON_F_BACKOFF);
}
return true;
@@ -2891,11 +1477,11 @@ static void con_fault_finish(struct ceph_connection *con)
* in case we faulted due to authentication, invalidate our
* current tickets so that we can get new ones.
*/
- if (con->auth_retry) {
- dout("auth_retry %d, invalidating\n", con->auth_retry);
+ if (con->v1.auth_retry) {
+ dout("auth_retry %d, invalidating\n", con->v1.auth_retry);
if (con->ops->invalidate_authorizer)
con->ops->invalidate_authorizer(con);
- con->auth_retry = 0;
+ con->v1.auth_retry = 0;
}
if (con->ops->fault)
@@ -2923,21 +1509,24 @@ static void ceph_con_workfn(struct work_struct *work)
dout("%s: con %p BACKOFF\n", __func__, con);
break;
}
- if (con->state == CON_STATE_STANDBY) {
+ if (con->state == CEPH_CON_S_STANDBY) {
dout("%s: con %p STANDBY\n", __func__, con);
break;
}
- if (con->state == CON_STATE_CLOSED) {
+ if (con->state == CEPH_CON_S_CLOSED) {
dout("%s: con %p CLOSED\n", __func__, con);
BUG_ON(con->sock);
break;
}
- if (con->state == CON_STATE_PREOPEN) {
+ if (con->state == CEPH_CON_S_PREOPEN) {
dout("%s: con %p PREOPEN\n", __func__, con);
BUG_ON(con->sock);
}
- ret = try_read(con);
+ if (ceph_msgr2(from_msgr(con->msgr)))
+ ret = ceph_con_v2_try_read(con);
+ else
+ ret = ceph_con_v1_try_read(con);
if (ret < 0) {
if (ret == -EAGAIN)
continue;
@@ -2947,7 +1536,10 @@ static void ceph_con_workfn(struct work_struct *work)
break;
}
- ret = try_write(con);
+ if (ceph_msgr2(from_msgr(con->msgr)))
+ ret = ceph_con_v2_try_write(con);
+ else
+ ret = ceph_con_v1_try_write(con);
if (ret < 0) {
if (ret == -EAGAIN)
continue;
@@ -2974,64 +1566,54 @@ static void ceph_con_workfn(struct work_struct *work)
*/
static void con_fault(struct ceph_connection *con)
{
- dout("fault %p state %lu to peer %s\n",
+ dout("fault %p state %d to peer %s\n",
con, con->state, ceph_pr_addr(&con->peer_addr));
pr_warn("%s%lld %s %s\n", ENTITY_NAME(con->peer_name),
ceph_pr_addr(&con->peer_addr), con->error_msg);
con->error_msg = NULL;
- WARN_ON(con->state != CON_STATE_CONNECTING &&
- con->state != CON_STATE_NEGOTIATING &&
- con->state != CON_STATE_OPEN);
+ WARN_ON(con->state == CEPH_CON_S_STANDBY ||
+ con->state == CEPH_CON_S_CLOSED);
- con_close_socket(con);
+ ceph_con_reset_protocol(con);
- if (con_flag_test(con, CON_FLAG_LOSSYTX)) {
+ if (ceph_con_flag_test(con, CEPH_CON_F_LOSSYTX)) {
dout("fault on LOSSYTX channel, marking CLOSED\n");
- con->state = CON_STATE_CLOSED;
+ con->state = CEPH_CON_S_CLOSED;
return;
}
- if (con->in_msg) {
- BUG_ON(con->in_msg->con != con);
- ceph_msg_put(con->in_msg);
- con->in_msg = NULL;
- }
- if (con->out_msg) {
- BUG_ON(con->out_msg->con != con);
- ceph_msg_put(con->out_msg);
- con->out_msg = NULL;
- }
-
/* Requeue anything that hasn't been acked */
list_splice_init(&con->out_sent, &con->out_queue);
/* If there are no messages queued or keepalive pending, place
* the connection in a STANDBY state */
if (list_empty(&con->out_queue) &&
- !con_flag_test(con, CON_FLAG_KEEPALIVE_PENDING)) {
+ !ceph_con_flag_test(con, CEPH_CON_F_KEEPALIVE_PENDING)) {
dout("fault %p setting STANDBY clearing WRITE_PENDING\n", con);
- con_flag_clear(con, CON_FLAG_WRITE_PENDING);
- con->state = CON_STATE_STANDBY;
+ ceph_con_flag_clear(con, CEPH_CON_F_WRITE_PENDING);
+ con->state = CEPH_CON_S_STANDBY;
} else {
/* retry after a delay. */
- con->state = CON_STATE_PREOPEN;
- if (con->delay == 0)
+ con->state = CEPH_CON_S_PREOPEN;
+ if (!con->delay) {
con->delay = BASE_DELAY_INTERVAL;
- else if (con->delay < MAX_DELAY_INTERVAL)
+ } else if (con->delay < MAX_DELAY_INTERVAL) {
con->delay *= 2;
- con_flag_set(con, CON_FLAG_BACKOFF);
+ if (con->delay > MAX_DELAY_INTERVAL)
+ con->delay = MAX_DELAY_INTERVAL;
+ }
+ ceph_con_flag_set(con, CEPH_CON_F_BACKOFF);
queue_con(con);
}
}
-
void ceph_messenger_reset_nonce(struct ceph_messenger *msgr)
{
u32 nonce = le32_to_cpu(msgr->inst.addr.nonce) + 1000000;
msgr->inst.addr.nonce = cpu_to_le32(nonce);
- encode_my_addr(msgr);
+ ceph_encode_my_addr(msgr);
}
/*
@@ -3042,26 +1624,35 @@ void ceph_messenger_init(struct ceph_messenger *msgr,
{
spin_lock_init(&msgr->global_seq_lock);
- if (myaddr)
- msgr->inst.addr = *myaddr;
+ if (myaddr) {
+ memcpy(&msgr->inst.addr.in_addr, &myaddr->in_addr,
+ sizeof(msgr->inst.addr.in_addr));
+ ceph_addr_set_port(&msgr->inst.addr, 0);
+ }
- /* select a random nonce */
- msgr->inst.addr.type = 0;
- get_random_bytes(&msgr->inst.addr.nonce, sizeof(msgr->inst.addr.nonce));
- encode_my_addr(msgr);
+ /*
+ * Since nautilus, clients are identified using type ANY.
+ * For msgr1, ceph_encode_banner_addr() munges it to NONE.
+ */
+ msgr->inst.addr.type = CEPH_ENTITY_ADDR_TYPE_ANY;
+
+ /* generate a random non-zero nonce */
+ do {
+ get_random_bytes(&msgr->inst.addr.nonce,
+ sizeof(msgr->inst.addr.nonce));
+ } while (!msgr->inst.addr.nonce);
+ ceph_encode_my_addr(msgr);
atomic_set(&msgr->stopping, 0);
write_pnet(&msgr->net, get_net(current->nsproxy->net_ns));
dout("%s %p\n", __func__, msgr);
}
-EXPORT_SYMBOL(ceph_messenger_init);
void ceph_messenger_fini(struct ceph_messenger *msgr)
{
put_net(read_pnet(&msgr->net));
}
-EXPORT_SYMBOL(ceph_messenger_fini);
static void msg_con_set(struct ceph_msg *msg, struct ceph_connection *con)
{
@@ -3075,17 +1666,19 @@ static void msg_con_set(struct ceph_msg *msg, struct ceph_connection *con)
static void clear_standby(struct ceph_connection *con)
{
/* come back from STANDBY? */
- if (con->state == CON_STATE_STANDBY) {
+ if (con->state == CEPH_CON_S_STANDBY) {
dout("clear_standby %p and ++connect_seq\n", con);
- con->state = CON_STATE_PREOPEN;
- con->connect_seq++;
- WARN_ON(con_flag_test(con, CON_FLAG_WRITE_PENDING));
- WARN_ON(con_flag_test(con, CON_FLAG_KEEPALIVE_PENDING));
+ con->state = CEPH_CON_S_PREOPEN;
+ con->v1.connect_seq++;
+ WARN_ON(ceph_con_flag_test(con, CEPH_CON_F_WRITE_PENDING));
+ WARN_ON(ceph_con_flag_test(con, CEPH_CON_F_KEEPALIVE_PENDING));
}
}
/*
* Queue up an outgoing message on the given connection.
+ *
+ * Consumes a ref on @msg.
*/
void ceph_con_send(struct ceph_connection *con, struct ceph_msg *msg)
{
@@ -3096,7 +1689,7 @@ void ceph_con_send(struct ceph_connection *con, struct ceph_msg *msg)
mutex_lock(&con->mutex);
- if (con->state == CON_STATE_CLOSED) {
+ if (con->state == CEPH_CON_S_CLOSED) {
dout("con_send %p closed, dropping %p\n", con, msg);
ceph_msg_put(msg);
mutex_unlock(&con->mutex);
@@ -3119,7 +1712,7 @@ void ceph_con_send(struct ceph_connection *con, struct ceph_msg *msg)
/* if there wasn't anything waiting to send before, queue
* new work */
- if (con_flag_test_and_set(con, CON_FLAG_WRITE_PENDING) == 0)
+ if (!ceph_con_flag_test_and_set(con, CEPH_CON_F_WRITE_PENDING))
queue_con(con);
}
EXPORT_SYMBOL(ceph_con_send);
@@ -3137,36 +1730,30 @@ void ceph_msg_revoke(struct ceph_msg *msg)
}
mutex_lock(&con->mutex);
- if (!list_empty(&msg->list_head)) {
- dout("%s %p msg %p - was on queue\n", __func__, con, msg);
- list_del_init(&msg->list_head);
- msg->hdr.seq = 0;
-
- ceph_msg_put(msg);
+ if (list_empty(&msg->list_head)) {
+ WARN_ON(con->out_msg == msg);
+ dout("%s con %p msg %p not linked\n", __func__, con, msg);
+ mutex_unlock(&con->mutex);
+ return;
}
+
+ dout("%s con %p msg %p was linked\n", __func__, con, msg);
+ msg->hdr.seq = 0;
+ ceph_msg_remove(msg);
+
if (con->out_msg == msg) {
- BUG_ON(con->out_skip);
- /* footer */
- if (con->out_msg_done) {
- con->out_skip += con_out_kvec_skip(con);
- } else {
- BUG_ON(!msg->data_length);
- con->out_skip += sizeof_footer(con);
- }
- /* data, middle, front */
- if (msg->data_length)
- con->out_skip += msg->cursor.total_resid;
- if (msg->middle)
- con->out_skip += con_out_kvec_skip(con);
- con->out_skip += con_out_kvec_skip(con);
-
- dout("%s %p msg %p - was sending, will write %d skip %d\n",
- __func__, con, msg, con->out_kvec_bytes, con->out_skip);
- msg->hdr.seq = 0;
+ WARN_ON(con->state != CEPH_CON_S_OPEN);
+ dout("%s con %p msg %p was sending\n", __func__, con, msg);
+ if (ceph_msgr2(from_msgr(con->msgr)))
+ ceph_con_v2_revoke(con);
+ else
+ ceph_con_v1_revoke(con);
+ ceph_msg_put(con->out_msg);
con->out_msg = NULL;
- ceph_msg_put(msg);
+ } else {
+ dout("%s con %p msg %p not current, out_msg %p\n", __func__,
+ con, msg, con->out_msg);
}
-
mutex_unlock(&con->mutex);
}
@@ -3184,25 +1771,17 @@ void ceph_msg_revoke_incoming(struct ceph_msg *msg)
mutex_lock(&con->mutex);
if (con->in_msg == msg) {
- unsigned int front_len = le32_to_cpu(con->in_hdr.front_len);
- unsigned int middle_len = le32_to_cpu(con->in_hdr.middle_len);
- unsigned int data_len = le32_to_cpu(con->in_hdr.data_len);
-
- /* skip rest of message */
- dout("%s %p msg %p revoked\n", __func__, con, msg);
- con->in_base_pos = con->in_base_pos -
- sizeof(struct ceph_msg_header) -
- front_len -
- middle_len -
- data_len -
- sizeof(struct ceph_msg_footer);
+ WARN_ON(con->state != CEPH_CON_S_OPEN);
+ dout("%s con %p msg %p was recving\n", __func__, con, msg);
+ if (ceph_msgr2(from_msgr(con->msgr)))
+ ceph_con_v2_revoke_incoming(con);
+ else
+ ceph_con_v1_revoke_incoming(con);
ceph_msg_put(con->in_msg);
con->in_msg = NULL;
- con->in_tag = CEPH_MSGR_TAG_READY;
- con->in_seq++;
} else {
- dout("%s %p in_msg %p msg %p no-op\n",
- __func__, con, con->in_msg, msg);
+ dout("%s con %p msg %p not current, in_msg %p\n", __func__,
+ con, msg, con->in_msg);
}
mutex_unlock(&con->mutex);
}
@@ -3215,10 +1794,10 @@ void ceph_con_keepalive(struct ceph_connection *con)
dout("con_keepalive %p\n", con);
mutex_lock(&con->mutex);
clear_standby(con);
- con_flag_set(con, CON_FLAG_KEEPALIVE_PENDING);
+ ceph_con_flag_set(con, CEPH_CON_F_KEEPALIVE_PENDING);
mutex_unlock(&con->mutex);
- if (con_flag_test_and_set(con, CON_FLAG_WRITE_PENDING) == 0)
+ if (!ceph_con_flag_test_and_set(con, CEPH_CON_F_WRITE_PENDING))
queue_con(con);
}
EXPORT_SYMBOL(ceph_con_keepalive);
@@ -3424,9 +2003,9 @@ static int ceph_alloc_middle(struct ceph_connection *con, struct ceph_msg *msg)
* On error (ENOMEM, EAGAIN, ...),
* - con->in_msg == NULL
*/
-static int ceph_con_in_msg_alloc(struct ceph_connection *con, int *skip)
+int ceph_con_in_msg_alloc(struct ceph_connection *con,
+ struct ceph_msg_header *hdr, int *skip)
{
- struct ceph_msg_header *hdr = &con->in_hdr;
int middle_len = le32_to_cpu(hdr->middle_len);
struct ceph_msg *msg;
int ret = 0;
@@ -3437,7 +2016,7 @@ static int ceph_con_in_msg_alloc(struct ceph_connection *con, int *skip)
mutex_unlock(&con->mutex);
msg = con->ops->alloc_msg(con, hdr, skip);
mutex_lock(&con->mutex);
- if (con->state != CON_STATE_OPEN) {
+ if (con->state != CEPH_CON_S_OPEN) {
if (msg)
ceph_msg_put(msg);
return -EAGAIN;
@@ -3458,7 +2037,7 @@ static int ceph_con_in_msg_alloc(struct ceph_connection *con, int *skip)
con->error_msg = "error allocating memory for incoming message";
return -ENOMEM;
}
- memcpy(&con->in_msg->hdr, &con->in_hdr, sizeof(con->in_hdr));
+ memcpy(&con->in_msg->hdr, hdr, sizeof(*hdr));
if (middle_len && !con->in_msg->middle) {
ret = ceph_alloc_middle(con, con->in_msg);
@@ -3471,6 +2050,39 @@ static int ceph_con_in_msg_alloc(struct ceph_connection *con, int *skip)
return ret;
}
+void ceph_con_get_out_msg(struct ceph_connection *con)
+{
+ struct ceph_msg *msg;
+
+ BUG_ON(list_empty(&con->out_queue));
+ msg = list_first_entry(&con->out_queue, struct ceph_msg, list_head);
+ WARN_ON(msg->con != con);
+
+ /*
+ * Put the message on "sent" list using a ref from ceph_con_send().
+ * It is put when the message is acked or revoked.
+ */
+ list_move_tail(&msg->list_head, &con->out_sent);
+
+ /*
+ * Only assign outgoing seq # if we haven't sent this message
+ * yet. If it is requeued, resend with it's original seq.
+ */
+ if (msg->needs_out_seq) {
+ msg->hdr.seq = cpu_to_le64(++con->out_seq);
+ msg->needs_out_seq = false;
+
+ if (con->ops->reencode_message)
+ con->ops->reencode_message(msg);
+ }
+
+ /*
+ * Get a ref for out_msg. It is put when we are done sending the
+ * message or in case of a fault.
+ */
+ WARN_ON(con->out_msg);
+ con->out_msg = ceph_msg_get(msg);
+}
/*
* Free a generically kmalloc'd message.
diff --git a/net/ceph/messenger_v1.c b/net/ceph/messenger_v1.c
new file mode 100644
index 000000000000..04f653b3c897
--- /dev/null
+++ b/net/ceph/messenger_v1.c
@@ -0,0 +1,1506 @@
+// SPDX-License-Identifier: GPL-2.0
+#include <linux/ceph/ceph_debug.h>
+
+#include <linux/bvec.h>
+#include <linux/crc32c.h>
+#include <linux/net.h>
+#include <linux/socket.h>
+#include <net/sock.h>
+
+#include <linux/ceph/ceph_features.h>
+#include <linux/ceph/decode.h>
+#include <linux/ceph/libceph.h>
+#include <linux/ceph/messenger.h>
+
+/* static tag bytes (protocol control messages) */
+static char tag_msg = CEPH_MSGR_TAG_MSG;
+static char tag_ack = CEPH_MSGR_TAG_ACK;
+static char tag_keepalive = CEPH_MSGR_TAG_KEEPALIVE;
+static char tag_keepalive2 = CEPH_MSGR_TAG_KEEPALIVE2;
+
+/*
+ * If @buf is NULL, discard up to @len bytes.
+ */
+static int ceph_tcp_recvmsg(struct socket *sock, void *buf, size_t len)
+{
+ struct kvec iov = {buf, len};
+ struct msghdr msg = { .msg_flags = MSG_DONTWAIT | MSG_NOSIGNAL };
+ int r;
+
+ if (!buf)
+ msg.msg_flags |= MSG_TRUNC;
+
+ iov_iter_kvec(&msg.msg_iter, READ, &iov, 1, len);
+ r = sock_recvmsg(sock, &msg, msg.msg_flags);
+ if (r == -EAGAIN)
+ r = 0;
+ return r;
+}
+
+static int ceph_tcp_recvpage(struct socket *sock, struct page *page,
+ int page_offset, size_t length)
+{
+ struct bio_vec bvec = {
+ .bv_page = page,
+ .bv_offset = page_offset,
+ .bv_len = length
+ };
+ struct msghdr msg = { .msg_flags = MSG_DONTWAIT | MSG_NOSIGNAL };
+ int r;
+
+ BUG_ON(page_offset + length > PAGE_SIZE);
+ iov_iter_bvec(&msg.msg_iter, READ, &bvec, 1, length);
+ r = sock_recvmsg(sock, &msg, msg.msg_flags);
+ if (r == -EAGAIN)
+ r = 0;
+ return r;
+}
+
+/*
+ * write something. @more is true if caller will be sending more data
+ * shortly.
+ */
+static int ceph_tcp_sendmsg(struct socket *sock, struct kvec *iov,
+ size_t kvlen, size_t len, bool more)
+{
+ struct msghdr msg = { .msg_flags = MSG_DONTWAIT | MSG_NOSIGNAL };
+ int r;
+
+ if (more)
+ msg.msg_flags |= MSG_MORE;
+ else
+ msg.msg_flags |= MSG_EOR; /* superfluous, but what the hell */
+
+ r = kernel_sendmsg(sock, &msg, iov, kvlen, len);
+ if (r == -EAGAIN)
+ r = 0;
+ return r;
+}
+
+/*
+ * @more: either or both of MSG_MORE and MSG_SENDPAGE_NOTLAST
+ */
+static int ceph_tcp_sendpage(struct socket *sock, struct page *page,
+ int offset, size_t size, int more)
+{
+ ssize_t (*sendpage)(struct socket *sock, struct page *page,
+ int offset, size_t size, int flags);
+ int flags = MSG_DONTWAIT | MSG_NOSIGNAL | more;
+ int ret;
+
+ /*
+ * sendpage cannot properly handle pages with page_count == 0,
+ * we need to fall back to sendmsg if that's the case.
+ *
+ * Same goes for slab pages: skb_can_coalesce() allows
+ * coalescing neighboring slab objects into a single frag which
+ * triggers one of hardened usercopy checks.
+ */
+ if (sendpage_ok(page))
+ sendpage = sock->ops->sendpage;
+ else
+ sendpage = sock_no_sendpage;
+
+ ret = sendpage(sock, page, offset, size, flags);
+ if (ret == -EAGAIN)
+ ret = 0;
+
+ return ret;
+}
+
+static void con_out_kvec_reset(struct ceph_connection *con)
+{
+ BUG_ON(con->v1.out_skip);
+
+ con->v1.out_kvec_left = 0;
+ con->v1.out_kvec_bytes = 0;
+ con->v1.out_kvec_cur = &con->v1.out_kvec[0];
+}
+
+static void con_out_kvec_add(struct ceph_connection *con,
+ size_t size, void *data)
+{
+ int index = con->v1.out_kvec_left;
+
+ BUG_ON(con->v1.out_skip);
+ BUG_ON(index >= ARRAY_SIZE(con->v1.out_kvec));
+
+ con->v1.out_kvec[index].iov_len = size;
+ con->v1.out_kvec[index].iov_base = data;
+ con->v1.out_kvec_left++;
+ con->v1.out_kvec_bytes += size;
+}
+
+/*
+ * Chop off a kvec from the end. Return residual number of bytes for
+ * that kvec, i.e. how many bytes would have been written if the kvec
+ * hadn't been nuked.
+ */
+static int con_out_kvec_skip(struct ceph_connection *con)
+{
+ int skip = 0;
+
+ if (con->v1.out_kvec_bytes > 0) {
+ skip = con->v1.out_kvec_cur[con->v1.out_kvec_left - 1].iov_len;
+ BUG_ON(con->v1.out_kvec_bytes < skip);
+ BUG_ON(!con->v1.out_kvec_left);
+ con->v1.out_kvec_bytes -= skip;
+ con->v1.out_kvec_left--;
+ }
+
+ return skip;
+}
+
+static size_t sizeof_footer(struct ceph_connection *con)
+{
+ return (con->peer_features & CEPH_FEATURE_MSG_AUTH) ?
+ sizeof(struct ceph_msg_footer) :
+ sizeof(struct ceph_msg_footer_old);
+}
+
+static void prepare_message_data(struct ceph_msg *msg, u32 data_len)
+{
+ /* Initialize data cursor */
+
+ ceph_msg_data_cursor_init(&msg->cursor, msg, data_len);
+}
+
+/*
+ * Prepare footer for currently outgoing message, and finish things
+ * off. Assumes out_kvec* are already valid.. we just add on to the end.
+ */
+static void prepare_write_message_footer(struct ceph_connection *con)
+{
+ struct ceph_msg *m = con->out_msg;
+
+ m->footer.flags |= CEPH_MSG_FOOTER_COMPLETE;
+
+ dout("prepare_write_message_footer %p\n", con);
+ con_out_kvec_add(con, sizeof_footer(con), &m->footer);
+ if (con->peer_features & CEPH_FEATURE_MSG_AUTH) {
+ if (con->ops->sign_message)
+ con->ops->sign_message(m);
+ else
+ m->footer.sig = 0;
+ } else {
+ m->old_footer.flags = m->footer.flags;
+ }
+ con->v1.out_more = m->more_to_follow;
+ con->v1.out_msg_done = true;
+}
+
+/*
+ * Prepare headers for the next outgoing message.
+ */
+static void prepare_write_message(struct ceph_connection *con)
+{
+ struct ceph_msg *m;
+ u32 crc;
+
+ con_out_kvec_reset(con);
+ con->v1.out_msg_done = false;
+
+ /* Sneak an ack in there first? If we can get it into the same
+ * TCP packet that's a good thing. */
+ if (con->in_seq > con->in_seq_acked) {
+ con->in_seq_acked = con->in_seq;
+ con_out_kvec_add(con, sizeof (tag_ack), &tag_ack);
+ con->v1.out_temp_ack = cpu_to_le64(con->in_seq_acked);
+ con_out_kvec_add(con, sizeof(con->v1.out_temp_ack),
+ &con->v1.out_temp_ack);
+ }
+
+ ceph_con_get_out_msg(con);
+ m = con->out_msg;
+
+ dout("prepare_write_message %p seq %lld type %d len %d+%d+%zd\n",
+ m, con->out_seq, le16_to_cpu(m->hdr.type),
+ le32_to_cpu(m->hdr.front_len), le32_to_cpu(m->hdr.middle_len),
+ m->data_length);
+ WARN_ON(m->front.iov_len != le32_to_cpu(m->hdr.front_len));
+ WARN_ON(m->data_length != le32_to_cpu(m->hdr.data_len));
+
+ /* tag + hdr + front + middle */
+ con_out_kvec_add(con, sizeof (tag_msg), &tag_msg);
+ con_out_kvec_add(con, sizeof(con->v1.out_hdr), &con->v1.out_hdr);
+ con_out_kvec_add(con, m->front.iov_len, m->front.iov_base);
+
+ if (m->middle)
+ con_out_kvec_add(con, m->middle->vec.iov_len,
+ m->middle->vec.iov_base);
+
+ /* fill in hdr crc and finalize hdr */
+ crc = crc32c(0, &m->hdr, offsetof(struct ceph_msg_header, crc));
+ con->out_msg->hdr.crc = cpu_to_le32(crc);
+ memcpy(&con->v1.out_hdr, &con->out_msg->hdr, sizeof(con->v1.out_hdr));
+
+ /* fill in front and middle crc, footer */
+ crc = crc32c(0, m->front.iov_base, m->front.iov_len);
+ con->out_msg->footer.front_crc = cpu_to_le32(crc);
+ if (m->middle) {
+ crc = crc32c(0, m->middle->vec.iov_base,
+ m->middle->vec.iov_len);
+ con->out_msg->footer.middle_crc = cpu_to_le32(crc);
+ } else
+ con->out_msg->footer.middle_crc = 0;
+ dout("%s front_crc %u middle_crc %u\n", __func__,
+ le32_to_cpu(con->out_msg->footer.front_crc),
+ le32_to_cpu(con->out_msg->footer.middle_crc));
+ con->out_msg->footer.flags = 0;
+
+ /* is there a data payload? */
+ con->out_msg->footer.data_crc = 0;
+ if (m->data_length) {
+ prepare_message_data(con->out_msg, m->data_length);
+ con->v1.out_more = 1; /* data + footer will follow */
+ } else {
+ /* no, queue up footer too and be done */
+ prepare_write_message_footer(con);
+ }
+
+ ceph_con_flag_set(con, CEPH_CON_F_WRITE_PENDING);
+}
+
+/*
+ * Prepare an ack.
+ */
+static void prepare_write_ack(struct ceph_connection *con)
+{
+ dout("prepare_write_ack %p %llu -> %llu\n", con,
+ con->in_seq_acked, con->in_seq);
+ con->in_seq_acked = con->in_seq;
+
+ con_out_kvec_reset(con);
+
+ con_out_kvec_add(con, sizeof (tag_ack), &tag_ack);
+
+ con->v1.out_temp_ack = cpu_to_le64(con->in_seq_acked);
+ con_out_kvec_add(con, sizeof(con->v1.out_temp_ack),
+ &con->v1.out_temp_ack);
+
+ con->v1.out_more = 1; /* more will follow.. eventually.. */
+ ceph_con_flag_set(con, CEPH_CON_F_WRITE_PENDING);
+}
+
+/*
+ * Prepare to share the seq during handshake
+ */
+static void prepare_write_seq(struct ceph_connection *con)
+{
+ dout("prepare_write_seq %p %llu -> %llu\n", con,
+ con->in_seq_acked, con->in_seq);
+ con->in_seq_acked = con->in_seq;
+
+ con_out_kvec_reset(con);
+
+ con->v1.out_temp_ack = cpu_to_le64(con->in_seq_acked);
+ con_out_kvec_add(con, sizeof(con->v1.out_temp_ack),
+ &con->v1.out_temp_ack);
+
+ ceph_con_flag_set(con, CEPH_CON_F_WRITE_PENDING);
+}
+
+/*
+ * Prepare to write keepalive byte.
+ */
+static void prepare_write_keepalive(struct ceph_connection *con)
+{
+ dout("prepare_write_keepalive %p\n", con);
+ con_out_kvec_reset(con);
+ if (con->peer_features & CEPH_FEATURE_MSGR_KEEPALIVE2) {
+ struct timespec64 now;
+
+ ktime_get_real_ts64(&now);
+ con_out_kvec_add(con, sizeof(tag_keepalive2), &tag_keepalive2);
+ ceph_encode_timespec64(&con->v1.out_temp_keepalive2, &now);
+ con_out_kvec_add(con, sizeof(con->v1.out_temp_keepalive2),
+ &con->v1.out_temp_keepalive2);
+ } else {
+ con_out_kvec_add(con, sizeof(tag_keepalive), &tag_keepalive);
+ }
+ ceph_con_flag_set(con, CEPH_CON_F_WRITE_PENDING);
+}
+
+/*
+ * Connection negotiation.
+ */
+
+static int get_connect_authorizer(struct ceph_connection *con)
+{
+ struct ceph_auth_handshake *auth;
+ int auth_proto;
+
+ if (!con->ops->get_authorizer) {
+ con->v1.auth = NULL;
+ con->v1.out_connect.authorizer_protocol = CEPH_AUTH_UNKNOWN;
+ con->v1.out_connect.authorizer_len = 0;
+ return 0;
+ }
+
+ auth = con->ops->get_authorizer(con, &auth_proto, con->v1.auth_retry);
+ if (IS_ERR(auth))
+ return PTR_ERR(auth);
+
+ con->v1.auth = auth;
+ con->v1.out_connect.authorizer_protocol = cpu_to_le32(auth_proto);
+ con->v1.out_connect.authorizer_len =
+ cpu_to_le32(auth->authorizer_buf_len);
+ return 0;
+}
+
+/*
+ * We connected to a peer and are saying hello.
+ */
+static void prepare_write_banner(struct ceph_connection *con)
+{
+ con_out_kvec_add(con, strlen(CEPH_BANNER), CEPH_BANNER);
+ con_out_kvec_add(con, sizeof (con->msgr->my_enc_addr),
+ &con->msgr->my_enc_addr);
+
+ con->v1.out_more = 0;
+ ceph_con_flag_set(con, CEPH_CON_F_WRITE_PENDING);
+}
+
+static void __prepare_write_connect(struct ceph_connection *con)
+{
+ con_out_kvec_add(con, sizeof(con->v1.out_connect),
+ &con->v1.out_connect);
+ if (con->v1.auth)
+ con_out_kvec_add(con, con->v1.auth->authorizer_buf_len,
+ con->v1.auth->authorizer_buf);
+
+ con->v1.out_more = 0;
+ ceph_con_flag_set(con, CEPH_CON_F_WRITE_PENDING);
+}
+
+static int prepare_write_connect(struct ceph_connection *con)
+{
+ unsigned int global_seq = ceph_get_global_seq(con->msgr, 0);
+ int proto;
+ int ret;
+
+ switch (con->peer_name.type) {
+ case CEPH_ENTITY_TYPE_MON:
+ proto = CEPH_MONC_PROTOCOL;
+ break;
+ case CEPH_ENTITY_TYPE_OSD:
+ proto = CEPH_OSDC_PROTOCOL;
+ break;
+ case CEPH_ENTITY_TYPE_MDS:
+ proto = CEPH_MDSC_PROTOCOL;
+ break;
+ default:
+ BUG();
+ }
+
+ dout("prepare_write_connect %p cseq=%d gseq=%d proto=%d\n", con,
+ con->v1.connect_seq, global_seq, proto);
+
+ con->v1.out_connect.features =
+ cpu_to_le64(from_msgr(con->msgr)->supported_features);
+ con->v1.out_connect.host_type = cpu_to_le32(CEPH_ENTITY_TYPE_CLIENT);
+ con->v1.out_connect.connect_seq = cpu_to_le32(con->v1.connect_seq);
+ con->v1.out_connect.global_seq = cpu_to_le32(global_seq);
+ con->v1.out_connect.protocol_version = cpu_to_le32(proto);
+ con->v1.out_connect.flags = 0;
+
+ ret = get_connect_authorizer(con);
+ if (ret)
+ return ret;
+
+ __prepare_write_connect(con);
+ return 0;
+}
+
+/*
+ * write as much of pending kvecs to the socket as we can.
+ * 1 -> done
+ * 0 -> socket full, but more to do
+ * <0 -> error
+ */
+static int write_partial_kvec(struct ceph_connection *con)
+{
+ int ret;
+
+ dout("write_partial_kvec %p %d left\n", con, con->v1.out_kvec_bytes);
+ while (con->v1.out_kvec_bytes > 0) {
+ ret = ceph_tcp_sendmsg(con->sock, con->v1.out_kvec_cur,
+ con->v1.out_kvec_left,
+ con->v1.out_kvec_bytes,
+ con->v1.out_more);
+ if (ret <= 0)
+ goto out;
+ con->v1.out_kvec_bytes -= ret;
+ if (!con->v1.out_kvec_bytes)
+ break; /* done */
+
+ /* account for full iov entries consumed */
+ while (ret >= con->v1.out_kvec_cur->iov_len) {
+ BUG_ON(!con->v1.out_kvec_left);
+ ret -= con->v1.out_kvec_cur->iov_len;
+ con->v1.out_kvec_cur++;
+ con->v1.out_kvec_left--;
+ }
+ /* and for a partially-consumed entry */
+ if (ret) {
+ con->v1.out_kvec_cur->iov_len -= ret;
+ con->v1.out_kvec_cur->iov_base += ret;
+ }
+ }
+ con->v1.out_kvec_left = 0;
+ ret = 1;
+out:
+ dout("write_partial_kvec %p %d left in %d kvecs ret = %d\n", con,
+ con->v1.out_kvec_bytes, con->v1.out_kvec_left, ret);
+ return ret; /* done! */
+}
+
+/*
+ * Write as much message data payload as we can. If we finish, queue
+ * up the footer.
+ * 1 -> done, footer is now queued in out_kvec[].
+ * 0 -> socket full, but more to do
+ * <0 -> error
+ */
+static int write_partial_message_data(struct ceph_connection *con)
+{
+ struct ceph_msg *msg = con->out_msg;
+ struct ceph_msg_data_cursor *cursor = &msg->cursor;
+ bool do_datacrc = !ceph_test_opt(from_msgr(con->msgr), NOCRC);
+ int more = MSG_MORE | MSG_SENDPAGE_NOTLAST;
+ u32 crc;
+
+ dout("%s %p msg %p\n", __func__, con, msg);
+
+ if (!msg->num_data_items)
+ return -EINVAL;
+
+ /*
+ * Iterate through each page that contains data to be
+ * written, and send as much as possible for each.
+ *
+ * If we are calculating the data crc (the default), we will
+ * need to map the page. If we have no pages, they have
+ * been revoked, so use the zero page.
+ */
+ crc = do_datacrc ? le32_to_cpu(msg->footer.data_crc) : 0;
+ while (cursor->total_resid) {
+ struct page *page;
+ size_t page_offset;
+ size_t length;
+ int ret;
+
+ if (!cursor->resid) {
+ ceph_msg_data_advance(cursor, 0);
+ continue;
+ }
+
+ page = ceph_msg_data_next(cursor, &page_offset, &length, NULL);
+ if (length == cursor->total_resid)
+ more = MSG_MORE;
+ ret = ceph_tcp_sendpage(con->sock, page, page_offset, length,
+ more);
+ if (ret <= 0) {
+ if (do_datacrc)
+ msg->footer.data_crc = cpu_to_le32(crc);
+
+ return ret;
+ }
+ if (do_datacrc && cursor->need_crc)
+ crc = ceph_crc32c_page(crc, page, page_offset, length);
+ ceph_msg_data_advance(cursor, (size_t)ret);
+ }
+
+ dout("%s %p msg %p done\n", __func__, con, msg);
+
+ /* prepare and queue up footer, too */
+ if (do_datacrc)
+ msg->footer.data_crc = cpu_to_le32(crc);
+ else
+ msg->footer.flags |= CEPH_MSG_FOOTER_NOCRC;
+ con_out_kvec_reset(con);
+ prepare_write_message_footer(con);
+
+ return 1; /* must return > 0 to indicate success */
+}
+
+/*
+ * write some zeros
+ */
+static int write_partial_skip(struct ceph_connection *con)
+{
+ int more = MSG_MORE | MSG_SENDPAGE_NOTLAST;
+ int ret;
+
+ dout("%s %p %d left\n", __func__, con, con->v1.out_skip);
+ while (con->v1.out_skip > 0) {
+ size_t size = min(con->v1.out_skip, (int)PAGE_SIZE);
+
+ if (size == con->v1.out_skip)
+ more = MSG_MORE;
+ ret = ceph_tcp_sendpage(con->sock, ceph_zero_page, 0, size,
+ more);
+ if (ret <= 0)
+ goto out;
+ con->v1.out_skip -= ret;
+ }
+ ret = 1;
+out:
+ return ret;
+}
+
+/*
+ * Prepare to read connection handshake, or an ack.
+ */
+static void prepare_read_banner(struct ceph_connection *con)
+{
+ dout("prepare_read_banner %p\n", con);
+ con->v1.in_base_pos = 0;
+}
+
+static void prepare_read_connect(struct ceph_connection *con)
+{
+ dout("prepare_read_connect %p\n", con);
+ con->v1.in_base_pos = 0;
+}
+
+static void prepare_read_ack(struct ceph_connection *con)
+{
+ dout("prepare_read_ack %p\n", con);
+ con->v1.in_base_pos = 0;
+}
+
+static void prepare_read_seq(struct ceph_connection *con)
+{
+ dout("prepare_read_seq %p\n", con);
+ con->v1.in_base_pos = 0;
+ con->v1.in_tag = CEPH_MSGR_TAG_SEQ;
+}
+
+static void prepare_read_tag(struct ceph_connection *con)
+{
+ dout("prepare_read_tag %p\n", con);
+ con->v1.in_base_pos = 0;
+ con->v1.in_tag = CEPH_MSGR_TAG_READY;
+}
+
+static void prepare_read_keepalive_ack(struct ceph_connection *con)
+{
+ dout("prepare_read_keepalive_ack %p\n", con);
+ con->v1.in_base_pos = 0;
+}
+
+/*
+ * Prepare to read a message.
+ */
+static int prepare_read_message(struct ceph_connection *con)
+{
+ dout("prepare_read_message %p\n", con);
+ BUG_ON(con->in_msg != NULL);
+ con->v1.in_base_pos = 0;
+ con->in_front_crc = con->in_middle_crc = con->in_data_crc = 0;
+ return 0;
+}
+
+static int read_partial(struct ceph_connection *con,
+ int end, int size, void *object)
+{
+ while (con->v1.in_base_pos < end) {
+ int left = end - con->v1.in_base_pos;
+ int have = size - left;
+ int ret = ceph_tcp_recvmsg(con->sock, object + have, left);
+ if (ret <= 0)
+ return ret;
+ con->v1.in_base_pos += ret;
+ }
+ return 1;
+}
+
+/*
+ * Read all or part of the connect-side handshake on a new connection
+ */
+static int read_partial_banner(struct ceph_connection *con)
+{
+ int size;
+ int end;
+ int ret;
+
+ dout("read_partial_banner %p at %d\n", con, con->v1.in_base_pos);
+
+ /* peer's banner */
+ size = strlen(CEPH_BANNER);
+ end = size;
+ ret = read_partial(con, end, size, con->v1.in_banner);
+ if (ret <= 0)
+ goto out;
+
+ size = sizeof(con->v1.actual_peer_addr);
+ end += size;
+ ret = read_partial(con, end, size, &con->v1.actual_peer_addr);
+ if (ret <= 0)
+ goto out;
+ ceph_decode_banner_addr(&con->v1.actual_peer_addr);
+
+ size = sizeof(con->v1.peer_addr_for_me);
+ end += size;
+ ret = read_partial(con, end, size, &con->v1.peer_addr_for_me);
+ if (ret <= 0)
+ goto out;
+ ceph_decode_banner_addr(&con->v1.peer_addr_for_me);
+
+out:
+ return ret;
+}
+
+static int read_partial_connect(struct ceph_connection *con)
+{
+ int size;
+ int end;
+ int ret;
+
+ dout("read_partial_connect %p at %d\n", con, con->v1.in_base_pos);
+
+ size = sizeof(con->v1.in_reply);
+ end = size;
+ ret = read_partial(con, end, size, &con->v1.in_reply);
+ if (ret <= 0)
+ goto out;
+
+ if (con->v1.auth) {
+ size = le32_to_cpu(con->v1.in_reply.authorizer_len);
+ if (size > con->v1.auth->authorizer_reply_buf_len) {
+ pr_err("authorizer reply too big: %d > %zu\n", size,
+ con->v1.auth->authorizer_reply_buf_len);
+ ret = -EINVAL;
+ goto out;
+ }
+
+ end += size;
+ ret = read_partial(con, end, size,
+ con->v1.auth->authorizer_reply_buf);
+ if (ret <= 0)
+ goto out;
+ }
+
+ dout("read_partial_connect %p tag %d, con_seq = %u, g_seq = %u\n",
+ con, con->v1.in_reply.tag,
+ le32_to_cpu(con->v1.in_reply.connect_seq),
+ le32_to_cpu(con->v1.in_reply.global_seq));
+out:
+ return ret;
+}
+
+/*
+ * Verify the hello banner looks okay.
+ */
+static int verify_hello(struct ceph_connection *con)
+{
+ if (memcmp(con->v1.in_banner, CEPH_BANNER, strlen(CEPH_BANNER))) {
+ pr_err("connect to %s got bad banner\n",
+ ceph_pr_addr(&con->peer_addr));
+ con->error_msg = "protocol error, bad banner";
+ return -1;
+ }
+ return 0;
+}
+
+static int process_banner(struct ceph_connection *con)
+{
+ struct ceph_entity_addr *my_addr = &con->msgr->inst.addr;
+
+ dout("process_banner on %p\n", con);
+
+ if (verify_hello(con) < 0)
+ return -1;
+
+ /*
+ * Make sure the other end is who we wanted. note that the other
+ * end may not yet know their ip address, so if it's 0.0.0.0, give
+ * them the benefit of the doubt.
+ */
+ if (memcmp(&con->peer_addr, &con->v1.actual_peer_addr,
+ sizeof(con->peer_addr)) != 0 &&
+ !(ceph_addr_is_blank(&con->v1.actual_peer_addr) &&
+ con->v1.actual_peer_addr.nonce == con->peer_addr.nonce)) {
+ pr_warn("wrong peer, want %s/%u, got %s/%u\n",
+ ceph_pr_addr(&con->peer_addr),
+ le32_to_cpu(con->peer_addr.nonce),
+ ceph_pr_addr(&con->v1.actual_peer_addr),
+ le32_to_cpu(con->v1.actual_peer_addr.nonce));
+ con->error_msg = "wrong peer at address";
+ return -1;
+ }
+
+ /*
+ * did we learn our address?
+ */
+ if (ceph_addr_is_blank(my_addr)) {
+ memcpy(&my_addr->in_addr,
+ &con->v1.peer_addr_for_me.in_addr,
+ sizeof(con->v1.peer_addr_for_me.in_addr));
+ ceph_addr_set_port(my_addr, 0);
+ ceph_encode_my_addr(con->msgr);
+ dout("process_banner learned my addr is %s\n",
+ ceph_pr_addr(my_addr));
+ }
+
+ return 0;
+}
+
+static int process_connect(struct ceph_connection *con)
+{
+ u64 sup_feat = from_msgr(con->msgr)->supported_features;
+ u64 req_feat = from_msgr(con->msgr)->required_features;
+ u64 server_feat = le64_to_cpu(con->v1.in_reply.features);
+ int ret;
+
+ dout("process_connect on %p tag %d\n", con, con->v1.in_tag);
+
+ if (con->v1.auth) {
+ int len = le32_to_cpu(con->v1.in_reply.authorizer_len);
+
+ /*
+ * Any connection that defines ->get_authorizer()
+ * should also define ->add_authorizer_challenge() and
+ * ->verify_authorizer_reply().
+ *
+ * See get_connect_authorizer().
+ */
+ if (con->v1.in_reply.tag ==
+ CEPH_MSGR_TAG_CHALLENGE_AUTHORIZER) {
+ ret = con->ops->add_authorizer_challenge(
+ con, con->v1.auth->authorizer_reply_buf, len);
+ if (ret < 0)
+ return ret;
+
+ con_out_kvec_reset(con);
+ __prepare_write_connect(con);
+ prepare_read_connect(con);
+ return 0;
+ }
+
+ if (len) {
+ ret = con->ops->verify_authorizer_reply(con);
+ if (ret < 0) {
+ con->error_msg = "bad authorize reply";
+ return ret;
+ }
+ }
+ }
+
+ switch (con->v1.in_reply.tag) {
+ case CEPH_MSGR_TAG_FEATURES:
+ pr_err("%s%lld %s feature set mismatch,"
+ " my %llx < server's %llx, missing %llx\n",
+ ENTITY_NAME(con->peer_name),
+ ceph_pr_addr(&con->peer_addr),
+ sup_feat, server_feat, server_feat & ~sup_feat);
+ con->error_msg = "missing required protocol features";
+ return -1;
+
+ case CEPH_MSGR_TAG_BADPROTOVER:
+ pr_err("%s%lld %s protocol version mismatch,"
+ " my %d != server's %d\n",
+ ENTITY_NAME(con->peer_name),
+ ceph_pr_addr(&con->peer_addr),
+ le32_to_cpu(con->v1.out_connect.protocol_version),
+ le32_to_cpu(con->v1.in_reply.protocol_version));
+ con->error_msg = "protocol version mismatch";
+ return -1;
+
+ case CEPH_MSGR_TAG_BADAUTHORIZER:
+ con->v1.auth_retry++;
+ dout("process_connect %p got BADAUTHORIZER attempt %d\n", con,
+ con->v1.auth_retry);
+ if (con->v1.auth_retry == 2) {
+ con->error_msg = "connect authorization failure";
+ return -1;
+ }
+ con_out_kvec_reset(con);
+ ret = prepare_write_connect(con);
+ if (ret < 0)
+ return ret;
+ prepare_read_connect(con);
+ break;
+
+ case CEPH_MSGR_TAG_RESETSESSION:
+ /*
+ * If we connected with a large connect_seq but the peer
+ * has no record of a session with us (no connection, or
+ * connect_seq == 0), they will send RESETSESION to indicate
+ * that they must have reset their session, and may have
+ * dropped messages.
+ */
+ dout("process_connect got RESET peer seq %u\n",
+ le32_to_cpu(con->v1.in_reply.connect_seq));
+ pr_info("%s%lld %s session reset\n",
+ ENTITY_NAME(con->peer_name),
+ ceph_pr_addr(&con->peer_addr));
+ ceph_con_reset_session(con);
+ con_out_kvec_reset(con);
+ ret = prepare_write_connect(con);
+ if (ret < 0)
+ return ret;
+ prepare_read_connect(con);
+
+ /* Tell ceph about it. */
+ mutex_unlock(&con->mutex);
+ if (con->ops->peer_reset)
+ con->ops->peer_reset(con);
+ mutex_lock(&con->mutex);
+ if (con->state != CEPH_CON_S_V1_CONNECT_MSG)
+ return -EAGAIN;
+ break;
+
+ case CEPH_MSGR_TAG_RETRY_SESSION:
+ /*
+ * If we sent a smaller connect_seq than the peer has, try
+ * again with a larger value.
+ */
+ dout("process_connect got RETRY_SESSION my seq %u, peer %u\n",
+ le32_to_cpu(con->v1.out_connect.connect_seq),
+ le32_to_cpu(con->v1.in_reply.connect_seq));
+ con->v1.connect_seq = le32_to_cpu(con->v1.in_reply.connect_seq);
+ con_out_kvec_reset(con);
+ ret = prepare_write_connect(con);
+ if (ret < 0)
+ return ret;
+ prepare_read_connect(con);
+ break;
+
+ case CEPH_MSGR_TAG_RETRY_GLOBAL:
+ /*
+ * If we sent a smaller global_seq than the peer has, try
+ * again with a larger value.
+ */
+ dout("process_connect got RETRY_GLOBAL my %u peer_gseq %u\n",
+ con->v1.peer_global_seq,
+ le32_to_cpu(con->v1.in_reply.global_seq));
+ ceph_get_global_seq(con->msgr,
+ le32_to_cpu(con->v1.in_reply.global_seq));
+ con_out_kvec_reset(con);
+ ret = prepare_write_connect(con);
+ if (ret < 0)
+ return ret;
+ prepare_read_connect(con);
+ break;
+
+ case CEPH_MSGR_TAG_SEQ:
+ case CEPH_MSGR_TAG_READY:
+ if (req_feat & ~server_feat) {
+ pr_err("%s%lld %s protocol feature mismatch,"
+ " my required %llx > server's %llx, need %llx\n",
+ ENTITY_NAME(con->peer_name),
+ ceph_pr_addr(&con->peer_addr),
+ req_feat, server_feat, req_feat & ~server_feat);
+ con->error_msg = "missing required protocol features";
+ return -1;
+ }
+
+ WARN_ON(con->state != CEPH_CON_S_V1_CONNECT_MSG);
+ con->state = CEPH_CON_S_OPEN;
+ con->v1.auth_retry = 0; /* we authenticated; clear flag */
+ con->v1.peer_global_seq =
+ le32_to_cpu(con->v1.in_reply.global_seq);
+ con->v1.connect_seq++;
+ con->peer_features = server_feat;
+ dout("process_connect got READY gseq %d cseq %d (%d)\n",
+ con->v1.peer_global_seq,
+ le32_to_cpu(con->v1.in_reply.connect_seq),
+ con->v1.connect_seq);
+ WARN_ON(con->v1.connect_seq !=
+ le32_to_cpu(con->v1.in_reply.connect_seq));
+
+ if (con->v1.in_reply.flags & CEPH_MSG_CONNECT_LOSSY)
+ ceph_con_flag_set(con, CEPH_CON_F_LOSSYTX);
+
+ con->delay = 0; /* reset backoff memory */
+
+ if (con->v1.in_reply.tag == CEPH_MSGR_TAG_SEQ) {
+ prepare_write_seq(con);
+ prepare_read_seq(con);
+ } else {
+ prepare_read_tag(con);
+ }
+ break;
+
+ case CEPH_MSGR_TAG_WAIT:
+ /*
+ * If there is a connection race (we are opening
+ * connections to each other), one of us may just have
+ * to WAIT. This shouldn't happen if we are the
+ * client.
+ */
+ con->error_msg = "protocol error, got WAIT as client";
+ return -1;
+
+ default:
+ con->error_msg = "protocol error, garbage tag during connect";
+ return -1;
+ }
+ return 0;
+}
+
+/*
+ * read (part of) an ack
+ */
+static int read_partial_ack(struct ceph_connection *con)
+{
+ int size = sizeof(con->v1.in_temp_ack);
+ int end = size;
+
+ return read_partial(con, end, size, &con->v1.in_temp_ack);
+}
+
+/*
+ * We can finally discard anything that's been acked.
+ */
+static void process_ack(struct ceph_connection *con)
+{
+ u64 ack = le64_to_cpu(con->v1.in_temp_ack);
+
+ if (con->v1.in_tag == CEPH_MSGR_TAG_ACK)
+ ceph_con_discard_sent(con, ack);
+ else
+ ceph_con_discard_requeued(con, ack);
+
+ prepare_read_tag(con);
+}
+
+static int read_partial_message_section(struct ceph_connection *con,
+ struct kvec *section,
+ unsigned int sec_len, u32 *crc)
+{
+ int ret, left;
+
+ BUG_ON(!section);
+
+ while (section->iov_len < sec_len) {
+ BUG_ON(section->iov_base == NULL);
+ left = sec_len - section->iov_len;
+ ret = ceph_tcp_recvmsg(con->sock, (char *)section->iov_base +
+ section->iov_len, left);
+ if (ret <= 0)
+ return ret;
+ section->iov_len += ret;
+ }
+ if (section->iov_len == sec_len)
+ *crc = crc32c(0, section->iov_base, section->iov_len);
+
+ return 1;
+}
+
+static int read_partial_msg_data(struct ceph_connection *con)
+{
+ struct ceph_msg *msg = con->in_msg;
+ struct ceph_msg_data_cursor *cursor = &msg->cursor;
+ bool do_datacrc = !ceph_test_opt(from_msgr(con->msgr), NOCRC);
+ struct page *page;
+ size_t page_offset;
+ size_t length;
+ u32 crc = 0;
+ int ret;
+
+ if (!msg->num_data_items)
+ return -EIO;
+
+ if (do_datacrc)
+ crc = con->in_data_crc;
+ while (cursor->total_resid) {
+ if (!cursor->resid) {
+ ceph_msg_data_advance(cursor, 0);
+ continue;
+ }
+
+ page = ceph_msg_data_next(cursor, &page_offset, &length, NULL);
+ ret = ceph_tcp_recvpage(con->sock, page, page_offset, length);
+ if (ret <= 0) {
+ if (do_datacrc)
+ con->in_data_crc = crc;
+
+ return ret;
+ }
+
+ if (do_datacrc)
+ crc = ceph_crc32c_page(crc, page, page_offset, ret);
+ ceph_msg_data_advance(cursor, (size_t)ret);
+ }
+ if (do_datacrc)
+ con->in_data_crc = crc;
+
+ return 1; /* must return > 0 to indicate success */
+}
+
+/*
+ * read (part of) a message.
+ */
+static int read_partial_message(struct ceph_connection *con)
+{
+ struct ceph_msg *m = con->in_msg;
+ int size;
+ int end;
+ int ret;
+ unsigned int front_len, middle_len, data_len;
+ bool do_datacrc = !ceph_test_opt(from_msgr(con->msgr), NOCRC);
+ bool need_sign = (con->peer_features & CEPH_FEATURE_MSG_AUTH);
+ u64 seq;
+ u32 crc;
+
+ dout("read_partial_message con %p msg %p\n", con, m);
+
+ /* header */
+ size = sizeof(con->v1.in_hdr);
+ end = size;
+ ret = read_partial(con, end, size, &con->v1.in_hdr);
+ if (ret <= 0)
+ return ret;
+
+ crc = crc32c(0, &con->v1.in_hdr, offsetof(struct ceph_msg_header, crc));
+ if (cpu_to_le32(crc) != con->v1.in_hdr.crc) {
+ pr_err("read_partial_message bad hdr crc %u != expected %u\n",
+ crc, con->v1.in_hdr.crc);
+ return -EBADMSG;
+ }
+
+ front_len = le32_to_cpu(con->v1.in_hdr.front_len);
+ if (front_len > CEPH_MSG_MAX_FRONT_LEN)
+ return -EIO;
+ middle_len = le32_to_cpu(con->v1.in_hdr.middle_len);
+ if (middle_len > CEPH_MSG_MAX_MIDDLE_LEN)
+ return -EIO;
+ data_len = le32_to_cpu(con->v1.in_hdr.data_len);
+ if (data_len > CEPH_MSG_MAX_DATA_LEN)
+ return -EIO;
+
+ /* verify seq# */
+ seq = le64_to_cpu(con->v1.in_hdr.seq);
+ if ((s64)seq - (s64)con->in_seq < 1) {
+ pr_info("skipping %s%lld %s seq %lld expected %lld\n",
+ ENTITY_NAME(con->peer_name),
+ ceph_pr_addr(&con->peer_addr),
+ seq, con->in_seq + 1);
+ con->v1.in_base_pos = -front_len - middle_len - data_len -
+ sizeof_footer(con);
+ con->v1.in_tag = CEPH_MSGR_TAG_READY;
+ return 1;
+ } else if ((s64)seq - (s64)con->in_seq > 1) {
+ pr_err("read_partial_message bad seq %lld expected %lld\n",
+ seq, con->in_seq + 1);
+ con->error_msg = "bad message sequence # for incoming message";
+ return -EBADE;
+ }
+
+ /* allocate message? */
+ if (!con->in_msg) {
+ int skip = 0;
+
+ dout("got hdr type %d front %d data %d\n", con->v1.in_hdr.type,
+ front_len, data_len);
+ ret = ceph_con_in_msg_alloc(con, &con->v1.in_hdr, &skip);
+ if (ret < 0)
+ return ret;
+
+ BUG_ON(!con->in_msg ^ skip);
+ if (skip) {
+ /* skip this message */
+ dout("alloc_msg said skip message\n");
+ con->v1.in_base_pos = -front_len - middle_len -
+ data_len - sizeof_footer(con);
+ con->v1.in_tag = CEPH_MSGR_TAG_READY;
+ con->in_seq++;
+ return 1;
+ }
+
+ BUG_ON(!con->in_msg);
+ BUG_ON(con->in_msg->con != con);
+ m = con->in_msg;
+ m->front.iov_len = 0; /* haven't read it yet */
+ if (m->middle)
+ m->middle->vec.iov_len = 0;
+
+ /* prepare for data payload, if any */
+
+ if (data_len)
+ prepare_message_data(con->in_msg, data_len);
+ }
+
+ /* front */
+ ret = read_partial_message_section(con, &m->front, front_len,
+ &con->in_front_crc);
+ if (ret <= 0)
+ return ret;
+
+ /* middle */
+ if (m->middle) {
+ ret = read_partial_message_section(con, &m->middle->vec,
+ middle_len,
+ &con->in_middle_crc);
+ if (ret <= 0)
+ return ret;
+ }
+
+ /* (page) data */
+ if (data_len) {
+ ret = read_partial_msg_data(con);
+ if (ret <= 0)
+ return ret;
+ }
+
+ /* footer */
+ size = sizeof_footer(con);
+ end += size;
+ ret = read_partial(con, end, size, &m->footer);
+ if (ret <= 0)
+ return ret;
+
+ if (!need_sign) {
+ m->footer.flags = m->old_footer.flags;
+ m->footer.sig = 0;
+ }
+
+ dout("read_partial_message got msg %p %d (%u) + %d (%u) + %d (%u)\n",
+ m, front_len, m->footer.front_crc, middle_len,
+ m->footer.middle_crc, data_len, m->footer.data_crc);
+
+ /* crc ok? */
+ if (con->in_front_crc != le32_to_cpu(m->footer.front_crc)) {
+ pr_err("read_partial_message %p front crc %u != exp. %u\n",
+ m, con->in_front_crc, m->footer.front_crc);
+ return -EBADMSG;
+ }
+ if (con->in_middle_crc != le32_to_cpu(m->footer.middle_crc)) {
+ pr_err("read_partial_message %p middle crc %u != exp %u\n",
+ m, con->in_middle_crc, m->footer.middle_crc);
+ return -EBADMSG;
+ }
+ if (do_datacrc &&
+ (m->footer.flags & CEPH_MSG_FOOTER_NOCRC) == 0 &&
+ con->in_data_crc != le32_to_cpu(m->footer.data_crc)) {
+ pr_err("read_partial_message %p data crc %u != exp. %u\n", m,
+ con->in_data_crc, le32_to_cpu(m->footer.data_crc));
+ return -EBADMSG;
+ }
+
+ if (need_sign && con->ops->check_message_signature &&
+ con->ops->check_message_signature(m)) {
+ pr_err("read_partial_message %p signature check failed\n", m);
+ return -EBADMSG;
+ }
+
+ return 1; /* done! */
+}
+
+static int read_keepalive_ack(struct ceph_connection *con)
+{
+ struct ceph_timespec ceph_ts;
+ size_t size = sizeof(ceph_ts);
+ int ret = read_partial(con, size, size, &ceph_ts);
+ if (ret <= 0)
+ return ret;
+ ceph_decode_timespec64(&con->last_keepalive_ack, &ceph_ts);
+ prepare_read_tag(con);
+ return 1;
+}
+
+/*
+ * Read what we can from the socket.
+ */
+int ceph_con_v1_try_read(struct ceph_connection *con)
+{
+ int ret = -1;
+
+more:
+ dout("try_read start %p state %d\n", con, con->state);
+ if (con->state != CEPH_CON_S_V1_BANNER &&
+ con->state != CEPH_CON_S_V1_CONNECT_MSG &&
+ con->state != CEPH_CON_S_OPEN)
+ return 0;
+
+ BUG_ON(!con->sock);
+
+ dout("try_read tag %d in_base_pos %d\n", con->v1.in_tag,
+ con->v1.in_base_pos);
+
+ if (con->state == CEPH_CON_S_V1_BANNER) {
+ ret = read_partial_banner(con);
+ if (ret <= 0)
+ goto out;
+ ret = process_banner(con);
+ if (ret < 0)
+ goto out;
+
+ con->state = CEPH_CON_S_V1_CONNECT_MSG;
+
+ /*
+ * Received banner is good, exchange connection info.
+ * Do not reset out_kvec, as sending our banner raced
+ * with receiving peer banner after connect completed.
+ */
+ ret = prepare_write_connect(con);
+ if (ret < 0)
+ goto out;
+ prepare_read_connect(con);
+
+ /* Send connection info before awaiting response */
+ goto out;
+ }
+
+ if (con->state == CEPH_CON_S_V1_CONNECT_MSG) {
+ ret = read_partial_connect(con);
+ if (ret <= 0)
+ goto out;
+ ret = process_connect(con);
+ if (ret < 0)
+ goto out;
+ goto more;
+ }
+
+ WARN_ON(con->state != CEPH_CON_S_OPEN);
+
+ if (con->v1.in_base_pos < 0) {
+ /*
+ * skipping + discarding content.
+ */
+ ret = ceph_tcp_recvmsg(con->sock, NULL, -con->v1.in_base_pos);
+ if (ret <= 0)
+ goto out;
+ dout("skipped %d / %d bytes\n", ret, -con->v1.in_base_pos);
+ con->v1.in_base_pos += ret;
+ if (con->v1.in_base_pos)
+ goto more;
+ }
+ if (con->v1.in_tag == CEPH_MSGR_TAG_READY) {
+ /*
+ * what's next?
+ */
+ ret = ceph_tcp_recvmsg(con->sock, &con->v1.in_tag, 1);
+ if (ret <= 0)
+ goto out;
+ dout("try_read got tag %d\n", con->v1.in_tag);
+ switch (con->v1.in_tag) {
+ case CEPH_MSGR_TAG_MSG:
+ prepare_read_message(con);
+ break;
+ case CEPH_MSGR_TAG_ACK:
+ prepare_read_ack(con);
+ break;
+ case CEPH_MSGR_TAG_KEEPALIVE2_ACK:
+ prepare_read_keepalive_ack(con);
+ break;
+ case CEPH_MSGR_TAG_CLOSE:
+ ceph_con_close_socket(con);
+ con->state = CEPH_CON_S_CLOSED;
+ goto out;
+ default:
+ goto bad_tag;
+ }
+ }
+ if (con->v1.in_tag == CEPH_MSGR_TAG_MSG) {
+ ret = read_partial_message(con);
+ if (ret <= 0) {
+ switch (ret) {
+ case -EBADMSG:
+ con->error_msg = "bad crc/signature";
+ fallthrough;
+ case -EBADE:
+ ret = -EIO;
+ break;
+ case -EIO:
+ con->error_msg = "io error";
+ break;
+ }
+ goto out;
+ }
+ if (con->v1.in_tag == CEPH_MSGR_TAG_READY)
+ goto more;
+ ceph_con_process_message(con);
+ if (con->state == CEPH_CON_S_OPEN)
+ prepare_read_tag(con);
+ goto more;
+ }
+ if (con->v1.in_tag == CEPH_MSGR_TAG_ACK ||
+ con->v1.in_tag == CEPH_MSGR_TAG_SEQ) {
+ /*
+ * the final handshake seq exchange is semantically
+ * equivalent to an ACK
+ */
+ ret = read_partial_ack(con);
+ if (ret <= 0)
+ goto out;
+ process_ack(con);
+ goto more;
+ }
+ if (con->v1.in_tag == CEPH_MSGR_TAG_KEEPALIVE2_ACK) {
+ ret = read_keepalive_ack(con);
+ if (ret <= 0)
+ goto out;
+ goto more;
+ }
+
+out:
+ dout("try_read done on %p ret %d\n", con, ret);
+ return ret;
+
+bad_tag:
+ pr_err("try_read bad tag %d\n", con->v1.in_tag);
+ con->error_msg = "protocol error, garbage tag";
+ ret = -1;
+ goto out;
+}
+
+/*
+ * Write something to the socket. Called in a worker thread when the
+ * socket appears to be writeable and we have something ready to send.
+ */
+int ceph_con_v1_try_write(struct ceph_connection *con)
+{
+ int ret = 1;
+
+ dout("try_write start %p state %d\n", con, con->state);
+ if (con->state != CEPH_CON_S_PREOPEN &&
+ con->state != CEPH_CON_S_V1_BANNER &&
+ con->state != CEPH_CON_S_V1_CONNECT_MSG &&
+ con->state != CEPH_CON_S_OPEN)
+ return 0;
+
+ /* open the socket first? */
+ if (con->state == CEPH_CON_S_PREOPEN) {
+ BUG_ON(con->sock);
+ con->state = CEPH_CON_S_V1_BANNER;
+
+ con_out_kvec_reset(con);
+ prepare_write_banner(con);
+ prepare_read_banner(con);
+
+ BUG_ON(con->in_msg);
+ con->v1.in_tag = CEPH_MSGR_TAG_READY;
+ dout("try_write initiating connect on %p new state %d\n",
+ con, con->state);
+ ret = ceph_tcp_connect(con);
+ if (ret < 0) {
+ con->error_msg = "connect error";
+ goto out;
+ }
+ }
+
+more:
+ dout("try_write out_kvec_bytes %d\n", con->v1.out_kvec_bytes);
+ BUG_ON(!con->sock);
+
+ /* kvec data queued? */
+ if (con->v1.out_kvec_left) {
+ ret = write_partial_kvec(con);
+ if (ret <= 0)
+ goto out;
+ }
+ if (con->v1.out_skip) {
+ ret = write_partial_skip(con);
+ if (ret <= 0)
+ goto out;
+ }
+
+ /* msg pages? */
+ if (con->out_msg) {
+ if (con->v1.out_msg_done) {
+ ceph_msg_put(con->out_msg);
+ con->out_msg = NULL; /* we're done with this one */
+ goto do_next;
+ }
+
+ ret = write_partial_message_data(con);
+ if (ret == 1)
+ goto more; /* we need to send the footer, too! */
+ if (ret == 0)
+ goto out;
+ if (ret < 0) {
+ dout("try_write write_partial_message_data err %d\n",
+ ret);
+ goto out;
+ }
+ }
+
+do_next:
+ if (con->state == CEPH_CON_S_OPEN) {
+ if (ceph_con_flag_test_and_clear(con,
+ CEPH_CON_F_KEEPALIVE_PENDING)) {
+ prepare_write_keepalive(con);
+ goto more;
+ }
+ /* is anything else pending? */
+ if (!list_empty(&con->out_queue)) {
+ prepare_write_message(con);
+ goto more;
+ }
+ if (con->in_seq > con->in_seq_acked) {
+ prepare_write_ack(con);
+ goto more;
+ }
+ }
+
+ /* Nothing to do! */
+ ceph_con_flag_clear(con, CEPH_CON_F_WRITE_PENDING);
+ dout("try_write nothing else to write.\n");
+ ret = 0;
+out:
+ dout("try_write done on %p ret %d\n", con, ret);
+ return ret;
+}
+
+void ceph_con_v1_revoke(struct ceph_connection *con)
+{
+ struct ceph_msg *msg = con->out_msg;
+
+ WARN_ON(con->v1.out_skip);
+ /* footer */
+ if (con->v1.out_msg_done) {
+ con->v1.out_skip += con_out_kvec_skip(con);
+ } else {
+ WARN_ON(!msg->data_length);
+ con->v1.out_skip += sizeof_footer(con);
+ }
+ /* data, middle, front */
+ if (msg->data_length)
+ con->v1.out_skip += msg->cursor.total_resid;
+ if (msg->middle)
+ con->v1.out_skip += con_out_kvec_skip(con);
+ con->v1.out_skip += con_out_kvec_skip(con);
+
+ dout("%s con %p out_kvec_bytes %d out_skip %d\n", __func__, con,
+ con->v1.out_kvec_bytes, con->v1.out_skip);
+}
+
+void ceph_con_v1_revoke_incoming(struct ceph_connection *con)
+{
+ unsigned int front_len = le32_to_cpu(con->v1.in_hdr.front_len);
+ unsigned int middle_len = le32_to_cpu(con->v1.in_hdr.middle_len);
+ unsigned int data_len = le32_to_cpu(con->v1.in_hdr.data_len);
+
+ /* skip rest of message */
+ con->v1.in_base_pos = con->v1.in_base_pos -
+ sizeof(struct ceph_msg_header) -
+ front_len -
+ middle_len -
+ data_len -
+ sizeof(struct ceph_msg_footer);
+
+ con->v1.in_tag = CEPH_MSGR_TAG_READY;
+ con->in_seq++;
+
+ dout("%s con %p in_base_pos %d\n", __func__, con, con->v1.in_base_pos);
+}
+
+bool ceph_con_v1_opened(struct ceph_connection *con)
+{
+ return con->v1.connect_seq;
+}
+
+void ceph_con_v1_reset_session(struct ceph_connection *con)
+{
+ con->v1.connect_seq = 0;
+ con->v1.peer_global_seq = 0;
+}
+
+void ceph_con_v1_reset_protocol(struct ceph_connection *con)
+{
+ con->v1.out_skip = 0;
+}
diff --git a/net/ceph/messenger_v2.c b/net/ceph/messenger_v2.c
new file mode 100644
index 000000000000..c1ebb2aa08b5
--- /dev/null
+++ b/net/ceph/messenger_v2.c
@@ -0,0 +1,3443 @@
+// SPDX-License-Identifier: GPL-2.0
+/*
+ * Ceph msgr2 protocol implementation
+ *
+ * Copyright (C) 2020 Ilya Dryomov <idryomov@gmail.com>
+ */
+
+#include <linux/ceph/ceph_debug.h>
+
+#include <crypto/aead.h>
+#include <crypto/algapi.h> /* for crypto_memneq() */
+#include <crypto/hash.h>
+#include <crypto/sha2.h>
+#include <linux/bvec.h>
+#include <linux/crc32c.h>
+#include <linux/net.h>
+#include <linux/scatterlist.h>
+#include <linux/socket.h>
+#include <linux/sched/mm.h>
+#include <net/sock.h>
+#include <net/tcp.h>
+
+#include <linux/ceph/ceph_features.h>
+#include <linux/ceph/decode.h>
+#include <linux/ceph/libceph.h>
+#include <linux/ceph/messenger.h>
+
+#include "crypto.h" /* for CEPH_KEY_LEN and CEPH_MAX_CON_SECRET_LEN */
+
+#define FRAME_TAG_HELLO 1
+#define FRAME_TAG_AUTH_REQUEST 2
+#define FRAME_TAG_AUTH_BAD_METHOD 3
+#define FRAME_TAG_AUTH_REPLY_MORE 4
+#define FRAME_TAG_AUTH_REQUEST_MORE 5
+#define FRAME_TAG_AUTH_DONE 6
+#define FRAME_TAG_AUTH_SIGNATURE 7
+#define FRAME_TAG_CLIENT_IDENT 8
+#define FRAME_TAG_SERVER_IDENT 9
+#define FRAME_TAG_IDENT_MISSING_FEATURES 10
+#define FRAME_TAG_SESSION_RECONNECT 11
+#define FRAME_TAG_SESSION_RESET 12
+#define FRAME_TAG_SESSION_RETRY 13
+#define FRAME_TAG_SESSION_RETRY_GLOBAL 14
+#define FRAME_TAG_SESSION_RECONNECT_OK 15
+#define FRAME_TAG_WAIT 16
+#define FRAME_TAG_MESSAGE 17
+#define FRAME_TAG_KEEPALIVE2 18
+#define FRAME_TAG_KEEPALIVE2_ACK 19
+#define FRAME_TAG_ACK 20
+
+#define FRAME_LATE_STATUS_ABORTED 0x1
+#define FRAME_LATE_STATUS_COMPLETE 0xe
+#define FRAME_LATE_STATUS_ABORTED_MASK 0xf
+
+#define IN_S_HANDLE_PREAMBLE 1
+#define IN_S_HANDLE_CONTROL 2
+#define IN_S_HANDLE_CONTROL_REMAINDER 3
+#define IN_S_PREPARE_READ_DATA 4
+#define IN_S_PREPARE_READ_DATA_CONT 5
+#define IN_S_HANDLE_EPILOGUE 6
+#define IN_S_FINISH_SKIP 7
+
+#define OUT_S_QUEUE_DATA 1
+#define OUT_S_QUEUE_DATA_CONT 2
+#define OUT_S_QUEUE_ENC_PAGE 3
+#define OUT_S_QUEUE_ZEROS 4
+#define OUT_S_FINISH_MESSAGE 5
+#define OUT_S_GET_NEXT 6
+
+#define CTRL_BODY(p) ((void *)(p) + CEPH_PREAMBLE_LEN)
+#define FRONT_PAD(p) ((void *)(p) + CEPH_EPILOGUE_SECURE_LEN)
+#define MIDDLE_PAD(p) (FRONT_PAD(p) + CEPH_GCM_BLOCK_LEN)
+#define DATA_PAD(p) (MIDDLE_PAD(p) + CEPH_GCM_BLOCK_LEN)
+
+#define CEPH_MSG_FLAGS (MSG_DONTWAIT | MSG_NOSIGNAL)
+
+static int do_recvmsg(struct socket *sock, struct iov_iter *it)
+{
+ struct msghdr msg = { .msg_flags = CEPH_MSG_FLAGS };
+ int ret;
+
+ msg.msg_iter = *it;
+ while (iov_iter_count(it)) {
+ ret = sock_recvmsg(sock, &msg, msg.msg_flags);
+ if (ret <= 0) {
+ if (ret == -EAGAIN)
+ ret = 0;
+ return ret;
+ }
+
+ iov_iter_advance(it, ret);
+ }
+
+ WARN_ON(msg_data_left(&msg));
+ return 1;
+}
+
+/*
+ * Read as much as possible.
+ *
+ * Return:
+ * 1 - done, nothing (else) to read
+ * 0 - socket is empty, need to wait
+ * <0 - error
+ */
+static int ceph_tcp_recv(struct ceph_connection *con)
+{
+ int ret;
+
+ dout("%s con %p %s %zu\n", __func__, con,
+ iov_iter_is_discard(&con->v2.in_iter) ? "discard" : "need",
+ iov_iter_count(&con->v2.in_iter));
+ ret = do_recvmsg(con->sock, &con->v2.in_iter);
+ dout("%s con %p ret %d left %zu\n", __func__, con, ret,
+ iov_iter_count(&con->v2.in_iter));
+ return ret;
+}
+
+static int do_sendmsg(struct socket *sock, struct iov_iter *it)
+{
+ struct msghdr msg = { .msg_flags = CEPH_MSG_FLAGS };
+ int ret;
+
+ msg.msg_iter = *it;
+ while (iov_iter_count(it)) {
+ ret = sock_sendmsg(sock, &msg);
+ if (ret <= 0) {
+ if (ret == -EAGAIN)
+ ret = 0;
+ return ret;
+ }
+
+ iov_iter_advance(it, ret);
+ }
+
+ WARN_ON(msg_data_left(&msg));
+ return 1;
+}
+
+static int do_try_sendpage(struct socket *sock, struct iov_iter *it)
+{
+ struct msghdr msg = { .msg_flags = CEPH_MSG_FLAGS };
+ struct bio_vec bv;
+ int ret;
+
+ if (WARN_ON(!iov_iter_is_bvec(it)))
+ return -EINVAL;
+
+ while (iov_iter_count(it)) {
+ /* iov_iter_iovec() for ITER_BVEC */
+ bv.bv_page = it->bvec->bv_page;
+ bv.bv_offset = it->bvec->bv_offset + it->iov_offset;
+ bv.bv_len = min(iov_iter_count(it),
+ it->bvec->bv_len - it->iov_offset);
+
+ /*
+ * sendpage cannot properly handle pages with
+ * page_count == 0, we need to fall back to sendmsg if
+ * that's the case.
+ *
+ * Same goes for slab pages: skb_can_coalesce() allows
+ * coalescing neighboring slab objects into a single frag
+ * which triggers one of hardened usercopy checks.
+ */
+ if (sendpage_ok(bv.bv_page)) {
+ ret = sock->ops->sendpage(sock, bv.bv_page,
+ bv.bv_offset, bv.bv_len,
+ CEPH_MSG_FLAGS);
+ } else {
+ iov_iter_bvec(&msg.msg_iter, WRITE, &bv, 1, bv.bv_len);
+ ret = sock_sendmsg(sock, &msg);
+ }
+ if (ret <= 0) {
+ if (ret == -EAGAIN)
+ ret = 0;
+ return ret;
+ }
+
+ iov_iter_advance(it, ret);
+ }
+
+ return 1;
+}
+
+/*
+ * Write as much as possible. The socket is expected to be corked,
+ * so we don't bother with MSG_MORE/MSG_SENDPAGE_NOTLAST here.
+ *
+ * Return:
+ * 1 - done, nothing (else) to write
+ * 0 - socket is full, need to wait
+ * <0 - error
+ */
+static int ceph_tcp_send(struct ceph_connection *con)
+{
+ int ret;
+
+ dout("%s con %p have %zu try_sendpage %d\n", __func__, con,
+ iov_iter_count(&con->v2.out_iter), con->v2.out_iter_sendpage);
+ if (con->v2.out_iter_sendpage)
+ ret = do_try_sendpage(con->sock, &con->v2.out_iter);
+ else
+ ret = do_sendmsg(con->sock, &con->v2.out_iter);
+ dout("%s con %p ret %d left %zu\n", __func__, con, ret,
+ iov_iter_count(&con->v2.out_iter));
+ return ret;
+}
+
+static void add_in_kvec(struct ceph_connection *con, void *buf, int len)
+{
+ BUG_ON(con->v2.in_kvec_cnt >= ARRAY_SIZE(con->v2.in_kvecs));
+ WARN_ON(!iov_iter_is_kvec(&con->v2.in_iter));
+
+ con->v2.in_kvecs[con->v2.in_kvec_cnt].iov_base = buf;
+ con->v2.in_kvecs[con->v2.in_kvec_cnt].iov_len = len;
+ con->v2.in_kvec_cnt++;
+
+ con->v2.in_iter.nr_segs++;
+ con->v2.in_iter.count += len;
+}
+
+static void reset_in_kvecs(struct ceph_connection *con)
+{
+ WARN_ON(iov_iter_count(&con->v2.in_iter));
+
+ con->v2.in_kvec_cnt = 0;
+ iov_iter_kvec(&con->v2.in_iter, READ, con->v2.in_kvecs, 0, 0);
+}
+
+static void set_in_bvec(struct ceph_connection *con, const struct bio_vec *bv)
+{
+ WARN_ON(iov_iter_count(&con->v2.in_iter));
+
+ con->v2.in_bvec = *bv;
+ iov_iter_bvec(&con->v2.in_iter, READ, &con->v2.in_bvec, 1, bv->bv_len);
+}
+
+static void set_in_skip(struct ceph_connection *con, int len)
+{
+ WARN_ON(iov_iter_count(&con->v2.in_iter));
+
+ dout("%s con %p len %d\n", __func__, con, len);
+ iov_iter_discard(&con->v2.in_iter, READ, len);
+}
+
+static void add_out_kvec(struct ceph_connection *con, void *buf, int len)
+{
+ BUG_ON(con->v2.out_kvec_cnt >= ARRAY_SIZE(con->v2.out_kvecs));
+ WARN_ON(!iov_iter_is_kvec(&con->v2.out_iter));
+ WARN_ON(con->v2.out_zero);
+
+ con->v2.out_kvecs[con->v2.out_kvec_cnt].iov_base = buf;
+ con->v2.out_kvecs[con->v2.out_kvec_cnt].iov_len = len;
+ con->v2.out_kvec_cnt++;
+
+ con->v2.out_iter.nr_segs++;
+ con->v2.out_iter.count += len;
+}
+
+static void reset_out_kvecs(struct ceph_connection *con)
+{
+ WARN_ON(iov_iter_count(&con->v2.out_iter));
+ WARN_ON(con->v2.out_zero);
+
+ con->v2.out_kvec_cnt = 0;
+
+ iov_iter_kvec(&con->v2.out_iter, WRITE, con->v2.out_kvecs, 0, 0);
+ con->v2.out_iter_sendpage = false;
+}
+
+static void set_out_bvec(struct ceph_connection *con, const struct bio_vec *bv,
+ bool zerocopy)
+{
+ WARN_ON(iov_iter_count(&con->v2.out_iter));
+ WARN_ON(con->v2.out_zero);
+
+ con->v2.out_bvec = *bv;
+ con->v2.out_iter_sendpage = zerocopy;
+ iov_iter_bvec(&con->v2.out_iter, WRITE, &con->v2.out_bvec, 1,
+ con->v2.out_bvec.bv_len);
+}
+
+static void set_out_bvec_zero(struct ceph_connection *con)
+{
+ WARN_ON(iov_iter_count(&con->v2.out_iter));
+ WARN_ON(!con->v2.out_zero);
+
+ con->v2.out_bvec.bv_page = ceph_zero_page;
+ con->v2.out_bvec.bv_offset = 0;
+ con->v2.out_bvec.bv_len = min(con->v2.out_zero, (int)PAGE_SIZE);
+ con->v2.out_iter_sendpage = true;
+ iov_iter_bvec(&con->v2.out_iter, WRITE, &con->v2.out_bvec, 1,
+ con->v2.out_bvec.bv_len);
+}
+
+static void out_zero_add(struct ceph_connection *con, int len)
+{
+ dout("%s con %p len %d\n", __func__, con, len);
+ con->v2.out_zero += len;
+}
+
+static void *alloc_conn_buf(struct ceph_connection *con, int len)
+{
+ void *buf;
+
+ dout("%s con %p len %d\n", __func__, con, len);
+
+ if (WARN_ON(con->v2.conn_buf_cnt >= ARRAY_SIZE(con->v2.conn_bufs)))
+ return NULL;
+
+ buf = ceph_kvmalloc(len, GFP_NOIO);
+ if (!buf)
+ return NULL;
+
+ con->v2.conn_bufs[con->v2.conn_buf_cnt++] = buf;
+ return buf;
+}
+
+static void free_conn_bufs(struct ceph_connection *con)
+{
+ while (con->v2.conn_buf_cnt)
+ kvfree(con->v2.conn_bufs[--con->v2.conn_buf_cnt]);
+}
+
+static void add_in_sign_kvec(struct ceph_connection *con, void *buf, int len)
+{
+ BUG_ON(con->v2.in_sign_kvec_cnt >= ARRAY_SIZE(con->v2.in_sign_kvecs));
+
+ con->v2.in_sign_kvecs[con->v2.in_sign_kvec_cnt].iov_base = buf;
+ con->v2.in_sign_kvecs[con->v2.in_sign_kvec_cnt].iov_len = len;
+ con->v2.in_sign_kvec_cnt++;
+}
+
+static void clear_in_sign_kvecs(struct ceph_connection *con)
+{
+ con->v2.in_sign_kvec_cnt = 0;
+}
+
+static void add_out_sign_kvec(struct ceph_connection *con, void *buf, int len)
+{
+ BUG_ON(con->v2.out_sign_kvec_cnt >= ARRAY_SIZE(con->v2.out_sign_kvecs));
+
+ con->v2.out_sign_kvecs[con->v2.out_sign_kvec_cnt].iov_base = buf;
+ con->v2.out_sign_kvecs[con->v2.out_sign_kvec_cnt].iov_len = len;
+ con->v2.out_sign_kvec_cnt++;
+}
+
+static void clear_out_sign_kvecs(struct ceph_connection *con)
+{
+ con->v2.out_sign_kvec_cnt = 0;
+}
+
+static bool con_secure(struct ceph_connection *con)
+{
+ return con->v2.con_mode == CEPH_CON_MODE_SECURE;
+}
+
+static int front_len(const struct ceph_msg *msg)
+{
+ return le32_to_cpu(msg->hdr.front_len);
+}
+
+static int middle_len(const struct ceph_msg *msg)
+{
+ return le32_to_cpu(msg->hdr.middle_len);
+}
+
+static int data_len(const struct ceph_msg *msg)
+{
+ return le32_to_cpu(msg->hdr.data_len);
+}
+
+static bool need_padding(int len)
+{
+ return !IS_ALIGNED(len, CEPH_GCM_BLOCK_LEN);
+}
+
+static int padded_len(int len)
+{
+ return ALIGN(len, CEPH_GCM_BLOCK_LEN);
+}
+
+static int padding_len(int len)
+{
+ return padded_len(len) - len;
+}
+
+/* preamble + control segment */
+static int head_onwire_len(int ctrl_len, bool secure)
+{
+ int head_len;
+ int rem_len;
+
+ if (secure) {
+ head_len = CEPH_PREAMBLE_SECURE_LEN;
+ if (ctrl_len > CEPH_PREAMBLE_INLINE_LEN) {
+ rem_len = ctrl_len - CEPH_PREAMBLE_INLINE_LEN;
+ head_len += padded_len(rem_len) + CEPH_GCM_TAG_LEN;
+ }
+ } else {
+ head_len = CEPH_PREAMBLE_PLAIN_LEN;
+ if (ctrl_len)
+ head_len += ctrl_len + CEPH_CRC_LEN;
+ }
+ return head_len;
+}
+
+/* front, middle and data segments + epilogue */
+static int __tail_onwire_len(int front_len, int middle_len, int data_len,
+ bool secure)
+{
+ if (!front_len && !middle_len && !data_len)
+ return 0;
+
+ if (!secure)
+ return front_len + middle_len + data_len +
+ CEPH_EPILOGUE_PLAIN_LEN;
+
+ return padded_len(front_len) + padded_len(middle_len) +
+ padded_len(data_len) + CEPH_EPILOGUE_SECURE_LEN;
+}
+
+static int tail_onwire_len(const struct ceph_msg *msg, bool secure)
+{
+ return __tail_onwire_len(front_len(msg), middle_len(msg),
+ data_len(msg), secure);
+}
+
+/* head_onwire_len(sizeof(struct ceph_msg_header2), false) */
+#define MESSAGE_HEAD_PLAIN_LEN (CEPH_PREAMBLE_PLAIN_LEN + \
+ sizeof(struct ceph_msg_header2) + \
+ CEPH_CRC_LEN)
+
+static const int frame_aligns[] = {
+ sizeof(void *),
+ sizeof(void *),
+ sizeof(void *),
+ PAGE_SIZE
+};
+
+/*
+ * Discards trailing empty segments, unless there is just one segment.
+ * A frame always has at least one (possibly empty) segment.
+ */
+static int calc_segment_count(const int *lens, int len_cnt)
+{
+ int i;
+
+ for (i = len_cnt - 1; i >= 0; i--) {
+ if (lens[i])
+ return i + 1;
+ }
+
+ return 1;
+}
+
+static void init_frame_desc(struct ceph_frame_desc *desc, int tag,
+ const int *lens, int len_cnt)
+{
+ int i;
+
+ memset(desc, 0, sizeof(*desc));
+
+ desc->fd_tag = tag;
+ desc->fd_seg_cnt = calc_segment_count(lens, len_cnt);
+ BUG_ON(desc->fd_seg_cnt > CEPH_FRAME_MAX_SEGMENT_COUNT);
+ for (i = 0; i < desc->fd_seg_cnt; i++) {
+ desc->fd_lens[i] = lens[i];
+ desc->fd_aligns[i] = frame_aligns[i];
+ }
+}
+
+/*
+ * Preamble crc covers everything up to itself (28 bytes) and
+ * is calculated and verified irrespective of the connection mode
+ * (i.e. even if the frame is encrypted).
+ */
+static void encode_preamble(const struct ceph_frame_desc *desc, void *p)
+{
+ void *crcp = p + CEPH_PREAMBLE_LEN - CEPH_CRC_LEN;
+ void *start = p;
+ int i;
+
+ memset(p, 0, CEPH_PREAMBLE_LEN);
+
+ ceph_encode_8(&p, desc->fd_tag);
+ ceph_encode_8(&p, desc->fd_seg_cnt);
+ for (i = 0; i < desc->fd_seg_cnt; i++) {
+ ceph_encode_32(&p, desc->fd_lens[i]);
+ ceph_encode_16(&p, desc->fd_aligns[i]);
+ }
+
+ put_unaligned_le32(crc32c(0, start, crcp - start), crcp);
+}
+
+static int decode_preamble(void *p, struct ceph_frame_desc *desc)
+{
+ void *crcp = p + CEPH_PREAMBLE_LEN - CEPH_CRC_LEN;
+ u32 crc, expected_crc;
+ int i;
+
+ crc = crc32c(0, p, crcp - p);
+ expected_crc = get_unaligned_le32(crcp);
+ if (crc != expected_crc) {
+ pr_err("bad preamble crc, calculated %u, expected %u\n",
+ crc, expected_crc);
+ return -EBADMSG;
+ }
+
+ memset(desc, 0, sizeof(*desc));
+
+ desc->fd_tag = ceph_decode_8(&p);
+ desc->fd_seg_cnt = ceph_decode_8(&p);
+ if (desc->fd_seg_cnt < 1 ||
+ desc->fd_seg_cnt > CEPH_FRAME_MAX_SEGMENT_COUNT) {
+ pr_err("bad segment count %d\n", desc->fd_seg_cnt);
+ return -EINVAL;
+ }
+ for (i = 0; i < desc->fd_seg_cnt; i++) {
+ desc->fd_lens[i] = ceph_decode_32(&p);
+ desc->fd_aligns[i] = ceph_decode_16(&p);
+ }
+
+ /*
+ * This would fire for FRAME_TAG_WAIT (it has one empty
+ * segment), but we should never get it as client.
+ */
+ if (!desc->fd_lens[desc->fd_seg_cnt - 1]) {
+ pr_err("last segment empty\n");
+ return -EINVAL;
+ }
+
+ if (desc->fd_lens[0] > CEPH_MSG_MAX_CONTROL_LEN) {
+ pr_err("control segment too big %d\n", desc->fd_lens[0]);
+ return -EINVAL;
+ }
+ if (desc->fd_lens[1] > CEPH_MSG_MAX_FRONT_LEN) {
+ pr_err("front segment too big %d\n", desc->fd_lens[1]);
+ return -EINVAL;
+ }
+ if (desc->fd_lens[2] > CEPH_MSG_MAX_MIDDLE_LEN) {
+ pr_err("middle segment too big %d\n", desc->fd_lens[2]);
+ return -EINVAL;
+ }
+ if (desc->fd_lens[3] > CEPH_MSG_MAX_DATA_LEN) {
+ pr_err("data segment too big %d\n", desc->fd_lens[3]);
+ return -EINVAL;
+ }
+
+ return 0;
+}
+
+static void encode_epilogue_plain(struct ceph_connection *con, bool aborted)
+{
+ con->v2.out_epil.late_status = aborted ? FRAME_LATE_STATUS_ABORTED :
+ FRAME_LATE_STATUS_COMPLETE;
+ cpu_to_le32s(&con->v2.out_epil.front_crc);
+ cpu_to_le32s(&con->v2.out_epil.middle_crc);
+ cpu_to_le32s(&con->v2.out_epil.data_crc);
+}
+
+static void encode_epilogue_secure(struct ceph_connection *con, bool aborted)
+{
+ memset(&con->v2.out_epil, 0, sizeof(con->v2.out_epil));
+ con->v2.out_epil.late_status = aborted ? FRAME_LATE_STATUS_ABORTED :
+ FRAME_LATE_STATUS_COMPLETE;
+}
+
+static int decode_epilogue(void *p, u32 *front_crc, u32 *middle_crc,
+ u32 *data_crc)
+{
+ u8 late_status;
+
+ late_status = ceph_decode_8(&p);
+ if ((late_status & FRAME_LATE_STATUS_ABORTED_MASK) !=
+ FRAME_LATE_STATUS_COMPLETE) {
+ /* we should never get an aborted message as client */
+ pr_err("bad late_status 0x%x\n", late_status);
+ return -EINVAL;
+ }
+
+ if (front_crc && middle_crc && data_crc) {
+ *front_crc = ceph_decode_32(&p);
+ *middle_crc = ceph_decode_32(&p);
+ *data_crc = ceph_decode_32(&p);
+ }
+
+ return 0;
+}
+
+static void fill_header(struct ceph_msg_header *hdr,
+ const struct ceph_msg_header2 *hdr2,
+ int front_len, int middle_len, int data_len,
+ const struct ceph_entity_name *peer_name)
+{
+ hdr->seq = hdr2->seq;
+ hdr->tid = hdr2->tid;
+ hdr->type = hdr2->type;
+ hdr->priority = hdr2->priority;
+ hdr->version = hdr2->version;
+ hdr->front_len = cpu_to_le32(front_len);
+ hdr->middle_len = cpu_to_le32(middle_len);
+ hdr->data_len = cpu_to_le32(data_len);
+ hdr->data_off = hdr2->data_off;
+ hdr->src = *peer_name;
+ hdr->compat_version = hdr2->compat_version;
+ hdr->reserved = 0;
+ hdr->crc = 0;
+}
+
+static void fill_header2(struct ceph_msg_header2 *hdr2,
+ const struct ceph_msg_header *hdr, u64 ack_seq)
+{
+ hdr2->seq = hdr->seq;
+ hdr2->tid = hdr->tid;
+ hdr2->type = hdr->type;
+ hdr2->priority = hdr->priority;
+ hdr2->version = hdr->version;
+ hdr2->data_pre_padding_len = 0;
+ hdr2->data_off = hdr->data_off;
+ hdr2->ack_seq = cpu_to_le64(ack_seq);
+ hdr2->flags = 0;
+ hdr2->compat_version = hdr->compat_version;
+ hdr2->reserved = 0;
+}
+
+static int verify_control_crc(struct ceph_connection *con)
+{
+ int ctrl_len = con->v2.in_desc.fd_lens[0];
+ u32 crc, expected_crc;
+
+ WARN_ON(con->v2.in_kvecs[0].iov_len != ctrl_len);
+ WARN_ON(con->v2.in_kvecs[1].iov_len != CEPH_CRC_LEN);
+
+ crc = crc32c(-1, con->v2.in_kvecs[0].iov_base, ctrl_len);
+ expected_crc = get_unaligned_le32(con->v2.in_kvecs[1].iov_base);
+ if (crc != expected_crc) {
+ pr_err("bad control crc, calculated %u, expected %u\n",
+ crc, expected_crc);
+ return -EBADMSG;
+ }
+
+ return 0;
+}
+
+static int verify_epilogue_crcs(struct ceph_connection *con, u32 front_crc,
+ u32 middle_crc, u32 data_crc)
+{
+ if (front_len(con->in_msg)) {
+ con->in_front_crc = crc32c(-1, con->in_msg->front.iov_base,
+ front_len(con->in_msg));
+ } else {
+ WARN_ON(!middle_len(con->in_msg) && !data_len(con->in_msg));
+ con->in_front_crc = -1;
+ }
+
+ if (middle_len(con->in_msg))
+ con->in_middle_crc = crc32c(-1,
+ con->in_msg->middle->vec.iov_base,
+ middle_len(con->in_msg));
+ else if (data_len(con->in_msg))
+ con->in_middle_crc = -1;
+ else
+ con->in_middle_crc = 0;
+
+ if (!data_len(con->in_msg))
+ con->in_data_crc = 0;
+
+ dout("%s con %p msg %p crcs %u %u %u\n", __func__, con, con->in_msg,
+ con->in_front_crc, con->in_middle_crc, con->in_data_crc);
+
+ if (con->in_front_crc != front_crc) {
+ pr_err("bad front crc, calculated %u, expected %u\n",
+ con->in_front_crc, front_crc);
+ return -EBADMSG;
+ }
+ if (con->in_middle_crc != middle_crc) {
+ pr_err("bad middle crc, calculated %u, expected %u\n",
+ con->in_middle_crc, middle_crc);
+ return -EBADMSG;
+ }
+ if (con->in_data_crc != data_crc) {
+ pr_err("bad data crc, calculated %u, expected %u\n",
+ con->in_data_crc, data_crc);
+ return -EBADMSG;
+ }
+
+ return 0;
+}
+
+static int setup_crypto(struct ceph_connection *con,
+ u8 *session_key, int session_key_len,
+ u8 *con_secret, int con_secret_len)
+{
+ unsigned int noio_flag;
+ void *p;
+ int ret;
+
+ dout("%s con %p con_mode %d session_key_len %d con_secret_len %d\n",
+ __func__, con, con->v2.con_mode, session_key_len, con_secret_len);
+ WARN_ON(con->v2.hmac_tfm || con->v2.gcm_tfm || con->v2.gcm_req);
+
+ if (con->v2.con_mode != CEPH_CON_MODE_CRC &&
+ con->v2.con_mode != CEPH_CON_MODE_SECURE) {
+ pr_err("bad con_mode %d\n", con->v2.con_mode);
+ return -EINVAL;
+ }
+
+ if (!session_key_len) {
+ WARN_ON(con->v2.con_mode != CEPH_CON_MODE_CRC);
+ WARN_ON(con_secret_len);
+ return 0; /* auth_none */
+ }
+
+ noio_flag = memalloc_noio_save();
+ con->v2.hmac_tfm = crypto_alloc_shash("hmac(sha256)", 0, 0);
+ memalloc_noio_restore(noio_flag);
+ if (IS_ERR(con->v2.hmac_tfm)) {
+ ret = PTR_ERR(con->v2.hmac_tfm);
+ con->v2.hmac_tfm = NULL;
+ pr_err("failed to allocate hmac tfm context: %d\n", ret);
+ return ret;
+ }
+
+ WARN_ON((unsigned long)session_key &
+ crypto_shash_alignmask(con->v2.hmac_tfm));
+ ret = crypto_shash_setkey(con->v2.hmac_tfm, session_key,
+ session_key_len);
+ if (ret) {
+ pr_err("failed to set hmac key: %d\n", ret);
+ return ret;
+ }
+
+ if (con->v2.con_mode == CEPH_CON_MODE_CRC) {
+ WARN_ON(con_secret_len);
+ return 0; /* auth_x, plain mode */
+ }
+
+ if (con_secret_len < CEPH_GCM_KEY_LEN + 2 * CEPH_GCM_IV_LEN) {
+ pr_err("con_secret too small %d\n", con_secret_len);
+ return -EINVAL;
+ }
+
+ noio_flag = memalloc_noio_save();
+ con->v2.gcm_tfm = crypto_alloc_aead("gcm(aes)", 0, 0);
+ memalloc_noio_restore(noio_flag);
+ if (IS_ERR(con->v2.gcm_tfm)) {
+ ret = PTR_ERR(con->v2.gcm_tfm);
+ con->v2.gcm_tfm = NULL;
+ pr_err("failed to allocate gcm tfm context: %d\n", ret);
+ return ret;
+ }
+
+ p = con_secret;
+ WARN_ON((unsigned long)p & crypto_aead_alignmask(con->v2.gcm_tfm));
+ ret = crypto_aead_setkey(con->v2.gcm_tfm, p, CEPH_GCM_KEY_LEN);
+ if (ret) {
+ pr_err("failed to set gcm key: %d\n", ret);
+ return ret;
+ }
+
+ p += CEPH_GCM_KEY_LEN;
+ WARN_ON(crypto_aead_ivsize(con->v2.gcm_tfm) != CEPH_GCM_IV_LEN);
+ ret = crypto_aead_setauthsize(con->v2.gcm_tfm, CEPH_GCM_TAG_LEN);
+ if (ret) {
+ pr_err("failed to set gcm tag size: %d\n", ret);
+ return ret;
+ }
+
+ con->v2.gcm_req = aead_request_alloc(con->v2.gcm_tfm, GFP_NOIO);
+ if (!con->v2.gcm_req) {
+ pr_err("failed to allocate gcm request\n");
+ return -ENOMEM;
+ }
+
+ crypto_init_wait(&con->v2.gcm_wait);
+ aead_request_set_callback(con->v2.gcm_req, CRYPTO_TFM_REQ_MAY_BACKLOG,
+ crypto_req_done, &con->v2.gcm_wait);
+
+ memcpy(&con->v2.in_gcm_nonce, p, CEPH_GCM_IV_LEN);
+ memcpy(&con->v2.out_gcm_nonce, p + CEPH_GCM_IV_LEN, CEPH_GCM_IV_LEN);
+ return 0; /* auth_x, secure mode */
+}
+
+static int hmac_sha256(struct ceph_connection *con, const struct kvec *kvecs,
+ int kvec_cnt, u8 *hmac)
+{
+ SHASH_DESC_ON_STACK(desc, con->v2.hmac_tfm); /* tfm arg is ignored */
+ int ret;
+ int i;
+
+ dout("%s con %p hmac_tfm %p kvec_cnt %d\n", __func__, con,
+ con->v2.hmac_tfm, kvec_cnt);
+
+ if (!con->v2.hmac_tfm) {
+ memset(hmac, 0, SHA256_DIGEST_SIZE);
+ return 0; /* auth_none */
+ }
+
+ desc->tfm = con->v2.hmac_tfm;
+ ret = crypto_shash_init(desc);
+ if (ret)
+ return ret;
+
+ for (i = 0; i < kvec_cnt; i++) {
+ WARN_ON((unsigned long)kvecs[i].iov_base &
+ crypto_shash_alignmask(con->v2.hmac_tfm));
+ ret = crypto_shash_update(desc, kvecs[i].iov_base,
+ kvecs[i].iov_len);
+ if (ret)
+ return ret;
+ }
+
+ ret = crypto_shash_final(desc, hmac);
+ if (ret)
+ return ret;
+
+ shash_desc_zero(desc);
+ return 0; /* auth_x, both plain and secure modes */
+}
+
+static void gcm_inc_nonce(struct ceph_gcm_nonce *nonce)
+{
+ u64 counter;
+
+ counter = le64_to_cpu(nonce->counter);
+ nonce->counter = cpu_to_le64(counter + 1);
+}
+
+static int gcm_crypt(struct ceph_connection *con, bool encrypt,
+ struct scatterlist *src, struct scatterlist *dst,
+ int src_len)
+{
+ struct ceph_gcm_nonce *nonce;
+ int ret;
+
+ nonce = encrypt ? &con->v2.out_gcm_nonce : &con->v2.in_gcm_nonce;
+
+ aead_request_set_ad(con->v2.gcm_req, 0); /* no AAD */
+ aead_request_set_crypt(con->v2.gcm_req, src, dst, src_len, (u8 *)nonce);
+ ret = crypto_wait_req(encrypt ? crypto_aead_encrypt(con->v2.gcm_req) :
+ crypto_aead_decrypt(con->v2.gcm_req),
+ &con->v2.gcm_wait);
+ if (ret)
+ return ret;
+
+ gcm_inc_nonce(nonce);
+ return 0;
+}
+
+static void get_bvec_at(struct ceph_msg_data_cursor *cursor,
+ struct bio_vec *bv)
+{
+ struct page *page;
+ size_t off, len;
+
+ WARN_ON(!cursor->total_resid);
+
+ /* skip zero-length data items */
+ while (!cursor->resid)
+ ceph_msg_data_advance(cursor, 0);
+
+ /* get a piece of data, cursor isn't advanced */
+ page = ceph_msg_data_next(cursor, &off, &len, NULL);
+
+ bv->bv_page = page;
+ bv->bv_offset = off;
+ bv->bv_len = len;
+}
+
+static int calc_sg_cnt(void *buf, int buf_len)
+{
+ int sg_cnt;
+
+ if (!buf_len)
+ return 0;
+
+ sg_cnt = need_padding(buf_len) ? 1 : 0;
+ if (is_vmalloc_addr(buf)) {
+ WARN_ON(offset_in_page(buf));
+ sg_cnt += PAGE_ALIGN(buf_len) >> PAGE_SHIFT;
+ } else {
+ sg_cnt++;
+ }
+
+ return sg_cnt;
+}
+
+static int calc_sg_cnt_cursor(struct ceph_msg_data_cursor *cursor)
+{
+ int data_len = cursor->total_resid;
+ struct bio_vec bv;
+ int sg_cnt;
+
+ if (!data_len)
+ return 0;
+
+ sg_cnt = need_padding(data_len) ? 1 : 0;
+ do {
+ get_bvec_at(cursor, &bv);
+ sg_cnt++;
+
+ ceph_msg_data_advance(cursor, bv.bv_len);
+ } while (cursor->total_resid);
+
+ return sg_cnt;
+}
+
+static void init_sgs(struct scatterlist **sg, void *buf, int buf_len, u8 *pad)
+{
+ void *end = buf + buf_len;
+ struct page *page;
+ int len;
+ void *p;
+
+ if (!buf_len)
+ return;
+
+ if (is_vmalloc_addr(buf)) {
+ p = buf;
+ do {
+ page = vmalloc_to_page(p);
+ len = min_t(int, end - p, PAGE_SIZE);
+ WARN_ON(!page || !len || offset_in_page(p));
+ sg_set_page(*sg, page, len, 0);
+ *sg = sg_next(*sg);
+ p += len;
+ } while (p != end);
+ } else {
+ sg_set_buf(*sg, buf, buf_len);
+ *sg = sg_next(*sg);
+ }
+
+ if (need_padding(buf_len)) {
+ sg_set_buf(*sg, pad, padding_len(buf_len));
+ *sg = sg_next(*sg);
+ }
+}
+
+static void init_sgs_cursor(struct scatterlist **sg,
+ struct ceph_msg_data_cursor *cursor, u8 *pad)
+{
+ int data_len = cursor->total_resid;
+ struct bio_vec bv;
+
+ if (!data_len)
+ return;
+
+ do {
+ get_bvec_at(cursor, &bv);
+ sg_set_page(*sg, bv.bv_page, bv.bv_len, bv.bv_offset);
+ *sg = sg_next(*sg);
+
+ ceph_msg_data_advance(cursor, bv.bv_len);
+ } while (cursor->total_resid);
+
+ if (need_padding(data_len)) {
+ sg_set_buf(*sg, pad, padding_len(data_len));
+ *sg = sg_next(*sg);
+ }
+}
+
+static int setup_message_sgs(struct sg_table *sgt, struct ceph_msg *msg,
+ u8 *front_pad, u8 *middle_pad, u8 *data_pad,
+ void *epilogue, bool add_tag)
+{
+ struct ceph_msg_data_cursor cursor;
+ struct scatterlist *cur_sg;
+ int sg_cnt;
+ int ret;
+
+ if (!front_len(msg) && !middle_len(msg) && !data_len(msg))
+ return 0;
+
+ sg_cnt = 1; /* epilogue + [auth tag] */
+ if (front_len(msg))
+ sg_cnt += calc_sg_cnt(msg->front.iov_base,
+ front_len(msg));
+ if (middle_len(msg))
+ sg_cnt += calc_sg_cnt(msg->middle->vec.iov_base,
+ middle_len(msg));
+ if (data_len(msg)) {
+ ceph_msg_data_cursor_init(&cursor, msg, data_len(msg));
+ sg_cnt += calc_sg_cnt_cursor(&cursor);
+ }
+
+ ret = sg_alloc_table(sgt, sg_cnt, GFP_NOIO);
+ if (ret)
+ return ret;
+
+ cur_sg = sgt->sgl;
+ if (front_len(msg))
+ init_sgs(&cur_sg, msg->front.iov_base, front_len(msg),
+ front_pad);
+ if (middle_len(msg))
+ init_sgs(&cur_sg, msg->middle->vec.iov_base, middle_len(msg),
+ middle_pad);
+ if (data_len(msg)) {
+ ceph_msg_data_cursor_init(&cursor, msg, data_len(msg));
+ init_sgs_cursor(&cur_sg, &cursor, data_pad);
+ }
+
+ WARN_ON(!sg_is_last(cur_sg));
+ sg_set_buf(cur_sg, epilogue,
+ CEPH_GCM_BLOCK_LEN + (add_tag ? CEPH_GCM_TAG_LEN : 0));
+ return 0;
+}
+
+static int decrypt_preamble(struct ceph_connection *con)
+{
+ struct scatterlist sg;
+
+ sg_init_one(&sg, con->v2.in_buf, CEPH_PREAMBLE_SECURE_LEN);
+ return gcm_crypt(con, false, &sg, &sg, CEPH_PREAMBLE_SECURE_LEN);
+}
+
+static int decrypt_control_remainder(struct ceph_connection *con)
+{
+ int ctrl_len = con->v2.in_desc.fd_lens[0];
+ int rem_len = ctrl_len - CEPH_PREAMBLE_INLINE_LEN;
+ int pt_len = padding_len(rem_len) + CEPH_GCM_TAG_LEN;
+ struct scatterlist sgs[2];
+
+ WARN_ON(con->v2.in_kvecs[0].iov_len != rem_len);
+ WARN_ON(con->v2.in_kvecs[1].iov_len != pt_len);
+
+ sg_init_table(sgs, 2);
+ sg_set_buf(&sgs[0], con->v2.in_kvecs[0].iov_base, rem_len);
+ sg_set_buf(&sgs[1], con->v2.in_buf, pt_len);
+
+ return gcm_crypt(con, false, sgs, sgs,
+ padded_len(rem_len) + CEPH_GCM_TAG_LEN);
+}
+
+static int decrypt_message(struct ceph_connection *con)
+{
+ struct sg_table sgt = {};
+ int ret;
+
+ ret = setup_message_sgs(&sgt, con->in_msg, FRONT_PAD(con->v2.in_buf),
+ MIDDLE_PAD(con->v2.in_buf), DATA_PAD(con->v2.in_buf),
+ con->v2.in_buf, true);
+ if (ret)
+ goto out;
+
+ ret = gcm_crypt(con, false, sgt.sgl, sgt.sgl,
+ tail_onwire_len(con->in_msg, true));
+
+out:
+ sg_free_table(&sgt);
+ return ret;
+}
+
+static int prepare_banner(struct ceph_connection *con)
+{
+ int buf_len = CEPH_BANNER_V2_LEN + 2 + 8 + 8;
+ void *buf, *p;
+
+ buf = alloc_conn_buf(con, buf_len);
+ if (!buf)
+ return -ENOMEM;
+
+ p = buf;
+ ceph_encode_copy(&p, CEPH_BANNER_V2, CEPH_BANNER_V2_LEN);
+ ceph_encode_16(&p, sizeof(u64) + sizeof(u64));
+ ceph_encode_64(&p, CEPH_MSGR2_SUPPORTED_FEATURES);
+ ceph_encode_64(&p, CEPH_MSGR2_REQUIRED_FEATURES);
+ WARN_ON(p != buf + buf_len);
+
+ add_out_kvec(con, buf, buf_len);
+ add_out_sign_kvec(con, buf, buf_len);
+ ceph_con_flag_set(con, CEPH_CON_F_WRITE_PENDING);
+ return 0;
+}
+
+/*
+ * base:
+ * preamble
+ * control body (ctrl_len bytes)
+ * space for control crc
+ *
+ * extdata (optional):
+ * control body (extdata_len bytes)
+ *
+ * Compute control crc and gather base and extdata into:
+ *
+ * preamble
+ * control body (ctrl_len + extdata_len bytes)
+ * control crc
+ *
+ * Preamble should already be encoded at the start of base.
+ */
+static void prepare_head_plain(struct ceph_connection *con, void *base,
+ int ctrl_len, void *extdata, int extdata_len,
+ bool to_be_signed)
+{
+ int base_len = CEPH_PREAMBLE_LEN + ctrl_len + CEPH_CRC_LEN;
+ void *crcp = base + base_len - CEPH_CRC_LEN;
+ u32 crc;
+
+ crc = crc32c(-1, CTRL_BODY(base), ctrl_len);
+ if (extdata_len)
+ crc = crc32c(crc, extdata, extdata_len);
+ put_unaligned_le32(crc, crcp);
+
+ if (!extdata_len) {
+ add_out_kvec(con, base, base_len);
+ if (to_be_signed)
+ add_out_sign_kvec(con, base, base_len);
+ return;
+ }
+
+ add_out_kvec(con, base, crcp - base);
+ add_out_kvec(con, extdata, extdata_len);
+ add_out_kvec(con, crcp, CEPH_CRC_LEN);
+ if (to_be_signed) {
+ add_out_sign_kvec(con, base, crcp - base);
+ add_out_sign_kvec(con, extdata, extdata_len);
+ add_out_sign_kvec(con, crcp, CEPH_CRC_LEN);
+ }
+}
+
+static int prepare_head_secure_small(struct ceph_connection *con,
+ void *base, int ctrl_len)
+{
+ struct scatterlist sg;
+ int ret;
+
+ /* inline buffer padding? */
+ if (ctrl_len < CEPH_PREAMBLE_INLINE_LEN)
+ memset(CTRL_BODY(base) + ctrl_len, 0,
+ CEPH_PREAMBLE_INLINE_LEN - ctrl_len);
+
+ sg_init_one(&sg, base, CEPH_PREAMBLE_SECURE_LEN);
+ ret = gcm_crypt(con, true, &sg, &sg,
+ CEPH_PREAMBLE_SECURE_LEN - CEPH_GCM_TAG_LEN);
+ if (ret)
+ return ret;
+
+ add_out_kvec(con, base, CEPH_PREAMBLE_SECURE_LEN);
+ return 0;
+}
+
+/*
+ * base:
+ * preamble
+ * control body (ctrl_len bytes)
+ * space for padding, if needed
+ * space for control remainder auth tag
+ * space for preamble auth tag
+ *
+ * Encrypt preamble and the inline portion, then encrypt the remainder
+ * and gather into:
+ *
+ * preamble
+ * control body (48 bytes)
+ * preamble auth tag
+ * control body (ctrl_len - 48 bytes)
+ * zero padding, if needed
+ * control remainder auth tag
+ *
+ * Preamble should already be encoded at the start of base.
+ */
+static int prepare_head_secure_big(struct ceph_connection *con,
+ void *base, int ctrl_len)
+{
+ int rem_len = ctrl_len - CEPH_PREAMBLE_INLINE_LEN;
+ void *rem = CTRL_BODY(base) + CEPH_PREAMBLE_INLINE_LEN;
+ void *rem_tag = rem + padded_len(rem_len);
+ void *pmbl_tag = rem_tag + CEPH_GCM_TAG_LEN;
+ struct scatterlist sgs[2];
+ int ret;
+
+ sg_init_table(sgs, 2);
+ sg_set_buf(&sgs[0], base, rem - base);
+ sg_set_buf(&sgs[1], pmbl_tag, CEPH_GCM_TAG_LEN);
+ ret = gcm_crypt(con, true, sgs, sgs, rem - base);
+ if (ret)
+ return ret;
+
+ /* control remainder padding? */
+ if (need_padding(rem_len))
+ memset(rem + rem_len, 0, padding_len(rem_len));
+
+ sg_init_one(&sgs[0], rem, pmbl_tag - rem);
+ ret = gcm_crypt(con, true, sgs, sgs, rem_tag - rem);
+ if (ret)
+ return ret;
+
+ add_out_kvec(con, base, rem - base);
+ add_out_kvec(con, pmbl_tag, CEPH_GCM_TAG_LEN);
+ add_out_kvec(con, rem, pmbl_tag - rem);
+ return 0;
+}
+
+static int __prepare_control(struct ceph_connection *con, int tag,
+ void *base, int ctrl_len, void *extdata,
+ int extdata_len, bool to_be_signed)
+{
+ int total_len = ctrl_len + extdata_len;
+ struct ceph_frame_desc desc;
+ int ret;
+
+ dout("%s con %p tag %d len %d (%d+%d)\n", __func__, con, tag,
+ total_len, ctrl_len, extdata_len);
+
+ /* extdata may be vmalloc'ed but not base */
+ if (WARN_ON(is_vmalloc_addr(base) || !ctrl_len))
+ return -EINVAL;
+
+ init_frame_desc(&desc, tag, &total_len, 1);
+ encode_preamble(&desc, base);
+
+ if (con_secure(con)) {
+ if (WARN_ON(extdata_len || to_be_signed))
+ return -EINVAL;
+
+ if (ctrl_len <= CEPH_PREAMBLE_INLINE_LEN)
+ /* fully inlined, inline buffer may need padding */
+ ret = prepare_head_secure_small(con, base, ctrl_len);
+ else
+ /* partially inlined, inline buffer is full */
+ ret = prepare_head_secure_big(con, base, ctrl_len);
+ if (ret)
+ return ret;
+ } else {
+ prepare_head_plain(con, base, ctrl_len, extdata, extdata_len,
+ to_be_signed);
+ }
+
+ ceph_con_flag_set(con, CEPH_CON_F_WRITE_PENDING);
+ return 0;
+}
+
+static int prepare_control(struct ceph_connection *con, int tag,
+ void *base, int ctrl_len)
+{
+ return __prepare_control(con, tag, base, ctrl_len, NULL, 0, false);
+}
+
+static int prepare_hello(struct ceph_connection *con)
+{
+ void *buf, *p;
+ int ctrl_len;
+
+ ctrl_len = 1 + ceph_entity_addr_encoding_len(&con->peer_addr);
+ buf = alloc_conn_buf(con, head_onwire_len(ctrl_len, false));
+ if (!buf)
+ return -ENOMEM;
+
+ p = CTRL_BODY(buf);
+ ceph_encode_8(&p, CEPH_ENTITY_TYPE_CLIENT);
+ ceph_encode_entity_addr(&p, &con->peer_addr);
+ WARN_ON(p != CTRL_BODY(buf) + ctrl_len);
+
+ return __prepare_control(con, FRAME_TAG_HELLO, buf, ctrl_len,
+ NULL, 0, true);
+}
+
+/* so that head_onwire_len(AUTH_BUF_LEN, false) is 512 */
+#define AUTH_BUF_LEN (512 - CEPH_CRC_LEN - CEPH_PREAMBLE_PLAIN_LEN)
+
+static int prepare_auth_request(struct ceph_connection *con)
+{
+ void *authorizer, *authorizer_copy;
+ int ctrl_len, authorizer_len;
+ void *buf;
+ int ret;
+
+ ctrl_len = AUTH_BUF_LEN;
+ buf = alloc_conn_buf(con, head_onwire_len(ctrl_len, false));
+ if (!buf)
+ return -ENOMEM;
+
+ mutex_unlock(&con->mutex);
+ ret = con->ops->get_auth_request(con, CTRL_BODY(buf), &ctrl_len,
+ &authorizer, &authorizer_len);
+ mutex_lock(&con->mutex);
+ if (con->state != CEPH_CON_S_V2_HELLO) {
+ dout("%s con %p state changed to %d\n", __func__, con,
+ con->state);
+ return -EAGAIN;
+ }
+
+ dout("%s con %p get_auth_request ret %d\n", __func__, con, ret);
+ if (ret)
+ return ret;
+
+ authorizer_copy = alloc_conn_buf(con, authorizer_len);
+ if (!authorizer_copy)
+ return -ENOMEM;
+
+ memcpy(authorizer_copy, authorizer, authorizer_len);
+
+ return __prepare_control(con, FRAME_TAG_AUTH_REQUEST, buf, ctrl_len,
+ authorizer_copy, authorizer_len, true);
+}
+
+static int prepare_auth_request_more(struct ceph_connection *con,
+ void *reply, int reply_len)
+{
+ int ctrl_len, authorizer_len;
+ void *authorizer;
+ void *buf;
+ int ret;
+
+ ctrl_len = AUTH_BUF_LEN;
+ buf = alloc_conn_buf(con, head_onwire_len(ctrl_len, false));
+ if (!buf)
+ return -ENOMEM;
+
+ mutex_unlock(&con->mutex);
+ ret = con->ops->handle_auth_reply_more(con, reply, reply_len,
+ CTRL_BODY(buf), &ctrl_len,
+ &authorizer, &authorizer_len);
+ mutex_lock(&con->mutex);
+ if (con->state != CEPH_CON_S_V2_AUTH) {
+ dout("%s con %p state changed to %d\n", __func__, con,
+ con->state);
+ return -EAGAIN;
+ }
+
+ dout("%s con %p handle_auth_reply_more ret %d\n", __func__, con, ret);
+ if (ret)
+ return ret;
+
+ return __prepare_control(con, FRAME_TAG_AUTH_REQUEST_MORE, buf,
+ ctrl_len, authorizer, authorizer_len, true);
+}
+
+static int prepare_auth_signature(struct ceph_connection *con)
+{
+ void *buf;
+ int ret;
+
+ buf = alloc_conn_buf(con, head_onwire_len(SHA256_DIGEST_SIZE, false));
+ if (!buf)
+ return -ENOMEM;
+
+ ret = hmac_sha256(con, con->v2.in_sign_kvecs, con->v2.in_sign_kvec_cnt,
+ CTRL_BODY(buf));
+ if (ret)
+ return ret;
+
+ return prepare_control(con, FRAME_TAG_AUTH_SIGNATURE, buf,
+ SHA256_DIGEST_SIZE);
+}
+
+static int prepare_client_ident(struct ceph_connection *con)
+{
+ struct ceph_entity_addr *my_addr = &con->msgr->inst.addr;
+ struct ceph_client *client = from_msgr(con->msgr);
+ u64 global_id = ceph_client_gid(client);
+ void *buf, *p;
+ int ctrl_len;
+
+ WARN_ON(con->v2.server_cookie);
+ WARN_ON(con->v2.connect_seq);
+ WARN_ON(con->v2.peer_global_seq);
+
+ if (!con->v2.client_cookie) {
+ do {
+ get_random_bytes(&con->v2.client_cookie,
+ sizeof(con->v2.client_cookie));
+ } while (!con->v2.client_cookie);
+ dout("%s con %p generated cookie 0x%llx\n", __func__, con,
+ con->v2.client_cookie);
+ } else {
+ dout("%s con %p cookie already set 0x%llx\n", __func__, con,
+ con->v2.client_cookie);
+ }
+
+ dout("%s con %p my_addr %s/%u peer_addr %s/%u global_id %llu global_seq %llu features 0x%llx required_features 0x%llx cookie 0x%llx\n",
+ __func__, con, ceph_pr_addr(my_addr), le32_to_cpu(my_addr->nonce),
+ ceph_pr_addr(&con->peer_addr), le32_to_cpu(con->peer_addr.nonce),
+ global_id, con->v2.global_seq, client->supported_features,
+ client->required_features, con->v2.client_cookie);
+
+ ctrl_len = 1 + 4 + ceph_entity_addr_encoding_len(my_addr) +
+ ceph_entity_addr_encoding_len(&con->peer_addr) + 6 * 8;
+ buf = alloc_conn_buf(con, head_onwire_len(ctrl_len, con_secure(con)));
+ if (!buf)
+ return -ENOMEM;
+
+ p = CTRL_BODY(buf);
+ ceph_encode_8(&p, 2); /* addrvec marker */
+ ceph_encode_32(&p, 1); /* addr_cnt */
+ ceph_encode_entity_addr(&p, my_addr);
+ ceph_encode_entity_addr(&p, &con->peer_addr);
+ ceph_encode_64(&p, global_id);
+ ceph_encode_64(&p, con->v2.global_seq);
+ ceph_encode_64(&p, client->supported_features);
+ ceph_encode_64(&p, client->required_features);
+ ceph_encode_64(&p, 0); /* flags */
+ ceph_encode_64(&p, con->v2.client_cookie);
+ WARN_ON(p != CTRL_BODY(buf) + ctrl_len);
+
+ return prepare_control(con, FRAME_TAG_CLIENT_IDENT, buf, ctrl_len);
+}
+
+static int prepare_session_reconnect(struct ceph_connection *con)
+{
+ struct ceph_entity_addr *my_addr = &con->msgr->inst.addr;
+ void *buf, *p;
+ int ctrl_len;
+
+ WARN_ON(!con->v2.client_cookie);
+ WARN_ON(!con->v2.server_cookie);
+ WARN_ON(!con->v2.connect_seq);
+ WARN_ON(!con->v2.peer_global_seq);
+
+ dout("%s con %p my_addr %s/%u client_cookie 0x%llx server_cookie 0x%llx global_seq %llu connect_seq %llu in_seq %llu\n",
+ __func__, con, ceph_pr_addr(my_addr), le32_to_cpu(my_addr->nonce),
+ con->v2.client_cookie, con->v2.server_cookie, con->v2.global_seq,
+ con->v2.connect_seq, con->in_seq);
+
+ ctrl_len = 1 + 4 + ceph_entity_addr_encoding_len(my_addr) + 5 * 8;
+ buf = alloc_conn_buf(con, head_onwire_len(ctrl_len, con_secure(con)));
+ if (!buf)
+ return -ENOMEM;
+
+ p = CTRL_BODY(buf);
+ ceph_encode_8(&p, 2); /* entity_addrvec_t marker */
+ ceph_encode_32(&p, 1); /* my_addrs len */
+ ceph_encode_entity_addr(&p, my_addr);
+ ceph_encode_64(&p, con->v2.client_cookie);
+ ceph_encode_64(&p, con->v2.server_cookie);
+ ceph_encode_64(&p, con->v2.global_seq);
+ ceph_encode_64(&p, con->v2.connect_seq);
+ ceph_encode_64(&p, con->in_seq);
+ WARN_ON(p != CTRL_BODY(buf) + ctrl_len);
+
+ return prepare_control(con, FRAME_TAG_SESSION_RECONNECT, buf, ctrl_len);
+}
+
+static int prepare_keepalive2(struct ceph_connection *con)
+{
+ struct ceph_timespec *ts = CTRL_BODY(con->v2.out_buf);
+ struct timespec64 now;
+
+ ktime_get_real_ts64(&now);
+ dout("%s con %p timestamp %lld.%09ld\n", __func__, con, now.tv_sec,
+ now.tv_nsec);
+
+ ceph_encode_timespec64(ts, &now);
+
+ reset_out_kvecs(con);
+ return prepare_control(con, FRAME_TAG_KEEPALIVE2, con->v2.out_buf,
+ sizeof(struct ceph_timespec));
+}
+
+static int prepare_ack(struct ceph_connection *con)
+{
+ void *p;
+
+ dout("%s con %p in_seq_acked %llu -> %llu\n", __func__, con,
+ con->in_seq_acked, con->in_seq);
+ con->in_seq_acked = con->in_seq;
+
+ p = CTRL_BODY(con->v2.out_buf);
+ ceph_encode_64(&p, con->in_seq_acked);
+
+ reset_out_kvecs(con);
+ return prepare_control(con, FRAME_TAG_ACK, con->v2.out_buf, 8);
+}
+
+static void prepare_epilogue_plain(struct ceph_connection *con, bool aborted)
+{
+ dout("%s con %p msg %p aborted %d crcs %u %u %u\n", __func__, con,
+ con->out_msg, aborted, con->v2.out_epil.front_crc,
+ con->v2.out_epil.middle_crc, con->v2.out_epil.data_crc);
+
+ encode_epilogue_plain(con, aborted);
+ add_out_kvec(con, &con->v2.out_epil, CEPH_EPILOGUE_PLAIN_LEN);
+}
+
+/*
+ * For "used" empty segments, crc is -1. For unused (trailing)
+ * segments, crc is 0.
+ */
+static void prepare_message_plain(struct ceph_connection *con)
+{
+ struct ceph_msg *msg = con->out_msg;
+
+ prepare_head_plain(con, con->v2.out_buf,
+ sizeof(struct ceph_msg_header2), NULL, 0, false);
+
+ if (!front_len(msg) && !middle_len(msg)) {
+ if (!data_len(msg)) {
+ /*
+ * Empty message: once the head is written,
+ * we are done -- there is no epilogue.
+ */
+ con->v2.out_state = OUT_S_FINISH_MESSAGE;
+ return;
+ }
+
+ con->v2.out_epil.front_crc = -1;
+ con->v2.out_epil.middle_crc = -1;
+ con->v2.out_state = OUT_S_QUEUE_DATA;
+ return;
+ }
+
+ if (front_len(msg)) {
+ con->v2.out_epil.front_crc = crc32c(-1, msg->front.iov_base,
+ front_len(msg));
+ add_out_kvec(con, msg->front.iov_base, front_len(msg));
+ } else {
+ /* middle (at least) is there, checked above */
+ con->v2.out_epil.front_crc = -1;
+ }
+
+ if (middle_len(msg)) {
+ con->v2.out_epil.middle_crc =
+ crc32c(-1, msg->middle->vec.iov_base, middle_len(msg));
+ add_out_kvec(con, msg->middle->vec.iov_base, middle_len(msg));
+ } else {
+ con->v2.out_epil.middle_crc = data_len(msg) ? -1 : 0;
+ }
+
+ if (data_len(msg)) {
+ con->v2.out_state = OUT_S_QUEUE_DATA;
+ } else {
+ con->v2.out_epil.data_crc = 0;
+ prepare_epilogue_plain(con, false);
+ con->v2.out_state = OUT_S_FINISH_MESSAGE;
+ }
+}
+
+/*
+ * Unfortunately the kernel crypto API doesn't support streaming
+ * (piecewise) operation for AEAD algorithms, so we can't get away
+ * with a fixed size buffer and a couple sgs. Instead, we have to
+ * allocate pages for the entire tail of the message (currently up
+ * to ~32M) and two sgs arrays (up to ~256K each)...
+ */
+static int prepare_message_secure(struct ceph_connection *con)
+{
+ void *zerop = page_address(ceph_zero_page);
+ struct sg_table enc_sgt = {};
+ struct sg_table sgt = {};
+ struct page **enc_pages;
+ int enc_page_cnt;
+ int tail_len;
+ int ret;
+
+ ret = prepare_head_secure_small(con, con->v2.out_buf,
+ sizeof(struct ceph_msg_header2));
+ if (ret)
+ return ret;
+
+ tail_len = tail_onwire_len(con->out_msg, true);
+ if (!tail_len) {
+ /*
+ * Empty message: once the head is written,
+ * we are done -- there is no epilogue.
+ */
+ con->v2.out_state = OUT_S_FINISH_MESSAGE;
+ return 0;
+ }
+
+ encode_epilogue_secure(con, false);
+ ret = setup_message_sgs(&sgt, con->out_msg, zerop, zerop, zerop,
+ &con->v2.out_epil, false);
+ if (ret)
+ goto out;
+
+ enc_page_cnt = calc_pages_for(0, tail_len);
+ enc_pages = ceph_alloc_page_vector(enc_page_cnt, GFP_NOIO);
+ if (IS_ERR(enc_pages)) {
+ ret = PTR_ERR(enc_pages);
+ goto out;
+ }
+
+ WARN_ON(con->v2.out_enc_pages || con->v2.out_enc_page_cnt);
+ con->v2.out_enc_pages = enc_pages;
+ con->v2.out_enc_page_cnt = enc_page_cnt;
+ con->v2.out_enc_resid = tail_len;
+ con->v2.out_enc_i = 0;
+
+ ret = sg_alloc_table_from_pages(&enc_sgt, enc_pages, enc_page_cnt,
+ 0, tail_len, GFP_NOIO);
+ if (ret)
+ goto out;
+
+ ret = gcm_crypt(con, true, sgt.sgl, enc_sgt.sgl,
+ tail_len - CEPH_GCM_TAG_LEN);
+ if (ret)
+ goto out;
+
+ dout("%s con %p msg %p sg_cnt %d enc_page_cnt %d\n", __func__, con,
+ con->out_msg, sgt.orig_nents, enc_page_cnt);
+ con->v2.out_state = OUT_S_QUEUE_ENC_PAGE;
+
+out:
+ sg_free_table(&sgt);
+ sg_free_table(&enc_sgt);
+ return ret;
+}
+
+static int prepare_message(struct ceph_connection *con)
+{
+ int lens[] = {
+ sizeof(struct ceph_msg_header2),
+ front_len(con->out_msg),
+ middle_len(con->out_msg),
+ data_len(con->out_msg)
+ };
+ struct ceph_frame_desc desc;
+ int ret;
+
+ dout("%s con %p msg %p logical %d+%d+%d+%d\n", __func__, con,
+ con->out_msg, lens[0], lens[1], lens[2], lens[3]);
+
+ if (con->in_seq > con->in_seq_acked) {
+ dout("%s con %p in_seq_acked %llu -> %llu\n", __func__, con,
+ con->in_seq_acked, con->in_seq);
+ con->in_seq_acked = con->in_seq;
+ }
+
+ reset_out_kvecs(con);
+ init_frame_desc(&desc, FRAME_TAG_MESSAGE, lens, 4);
+ encode_preamble(&desc, con->v2.out_buf);
+ fill_header2(CTRL_BODY(con->v2.out_buf), &con->out_msg->hdr,
+ con->in_seq_acked);
+
+ if (con_secure(con)) {
+ ret = prepare_message_secure(con);
+ if (ret)
+ return ret;
+ } else {
+ prepare_message_plain(con);
+ }
+
+ ceph_con_flag_set(con, CEPH_CON_F_WRITE_PENDING);
+ return 0;
+}
+
+static int prepare_read_banner_prefix(struct ceph_connection *con)
+{
+ void *buf;
+
+ buf = alloc_conn_buf(con, CEPH_BANNER_V2_PREFIX_LEN);
+ if (!buf)
+ return -ENOMEM;
+
+ reset_in_kvecs(con);
+ add_in_kvec(con, buf, CEPH_BANNER_V2_PREFIX_LEN);
+ add_in_sign_kvec(con, buf, CEPH_BANNER_V2_PREFIX_LEN);
+ con->state = CEPH_CON_S_V2_BANNER_PREFIX;
+ return 0;
+}
+
+static int prepare_read_banner_payload(struct ceph_connection *con,
+ int payload_len)
+{
+ void *buf;
+
+ buf = alloc_conn_buf(con, payload_len);
+ if (!buf)
+ return -ENOMEM;
+
+ reset_in_kvecs(con);
+ add_in_kvec(con, buf, payload_len);
+ add_in_sign_kvec(con, buf, payload_len);
+ con->state = CEPH_CON_S_V2_BANNER_PAYLOAD;
+ return 0;
+}
+
+static void prepare_read_preamble(struct ceph_connection *con)
+{
+ reset_in_kvecs(con);
+ add_in_kvec(con, con->v2.in_buf,
+ con_secure(con) ? CEPH_PREAMBLE_SECURE_LEN :
+ CEPH_PREAMBLE_PLAIN_LEN);
+ con->v2.in_state = IN_S_HANDLE_PREAMBLE;
+}
+
+static int prepare_read_control(struct ceph_connection *con)
+{
+ int ctrl_len = con->v2.in_desc.fd_lens[0];
+ int head_len;
+ void *buf;
+
+ reset_in_kvecs(con);
+ if (con->state == CEPH_CON_S_V2_HELLO ||
+ con->state == CEPH_CON_S_V2_AUTH) {
+ head_len = head_onwire_len(ctrl_len, false);
+ buf = alloc_conn_buf(con, head_len);
+ if (!buf)
+ return -ENOMEM;
+
+ /* preserve preamble */
+ memcpy(buf, con->v2.in_buf, CEPH_PREAMBLE_LEN);
+
+ add_in_kvec(con, CTRL_BODY(buf), ctrl_len);
+ add_in_kvec(con, CTRL_BODY(buf) + ctrl_len, CEPH_CRC_LEN);
+ add_in_sign_kvec(con, buf, head_len);
+ } else {
+ if (ctrl_len > CEPH_PREAMBLE_INLINE_LEN) {
+ buf = alloc_conn_buf(con, ctrl_len);
+ if (!buf)
+ return -ENOMEM;
+
+ add_in_kvec(con, buf, ctrl_len);
+ } else {
+ add_in_kvec(con, CTRL_BODY(con->v2.in_buf), ctrl_len);
+ }
+ add_in_kvec(con, con->v2.in_buf, CEPH_CRC_LEN);
+ }
+ con->v2.in_state = IN_S_HANDLE_CONTROL;
+ return 0;
+}
+
+static int prepare_read_control_remainder(struct ceph_connection *con)
+{
+ int ctrl_len = con->v2.in_desc.fd_lens[0];
+ int rem_len = ctrl_len - CEPH_PREAMBLE_INLINE_LEN;
+ void *buf;
+
+ buf = alloc_conn_buf(con, ctrl_len);
+ if (!buf)
+ return -ENOMEM;
+
+ memcpy(buf, CTRL_BODY(con->v2.in_buf), CEPH_PREAMBLE_INLINE_LEN);
+
+ reset_in_kvecs(con);
+ add_in_kvec(con, buf + CEPH_PREAMBLE_INLINE_LEN, rem_len);
+ add_in_kvec(con, con->v2.in_buf,
+ padding_len(rem_len) + CEPH_GCM_TAG_LEN);
+ con->v2.in_state = IN_S_HANDLE_CONTROL_REMAINDER;
+ return 0;
+}
+
+static void prepare_read_data(struct ceph_connection *con)
+{
+ struct bio_vec bv;
+
+ if (!con_secure(con))
+ con->in_data_crc = -1;
+ ceph_msg_data_cursor_init(&con->v2.in_cursor, con->in_msg,
+ data_len(con->in_msg));
+
+ get_bvec_at(&con->v2.in_cursor, &bv);
+ set_in_bvec(con, &bv);
+ con->v2.in_state = IN_S_PREPARE_READ_DATA_CONT;
+}
+
+static void prepare_read_data_cont(struct ceph_connection *con)
+{
+ struct bio_vec bv;
+
+ if (!con_secure(con))
+ con->in_data_crc = ceph_crc32c_page(con->in_data_crc,
+ con->v2.in_bvec.bv_page,
+ con->v2.in_bvec.bv_offset,
+ con->v2.in_bvec.bv_len);
+
+ ceph_msg_data_advance(&con->v2.in_cursor, con->v2.in_bvec.bv_len);
+ if (con->v2.in_cursor.total_resid) {
+ get_bvec_at(&con->v2.in_cursor, &bv);
+ set_in_bvec(con, &bv);
+ WARN_ON(con->v2.in_state != IN_S_PREPARE_READ_DATA_CONT);
+ return;
+ }
+
+ /*
+ * We've read all data. Prepare to read data padding (if any)
+ * and epilogue.
+ */
+ reset_in_kvecs(con);
+ if (con_secure(con)) {
+ if (need_padding(data_len(con->in_msg)))
+ add_in_kvec(con, DATA_PAD(con->v2.in_buf),
+ padding_len(data_len(con->in_msg)));
+ add_in_kvec(con, con->v2.in_buf, CEPH_EPILOGUE_SECURE_LEN);
+ } else {
+ add_in_kvec(con, con->v2.in_buf, CEPH_EPILOGUE_PLAIN_LEN);
+ }
+ con->v2.in_state = IN_S_HANDLE_EPILOGUE;
+}
+
+static void __finish_skip(struct ceph_connection *con)
+{
+ con->in_seq++;
+ prepare_read_preamble(con);
+}
+
+static void prepare_skip_message(struct ceph_connection *con)
+{
+ struct ceph_frame_desc *desc = &con->v2.in_desc;
+ int tail_len;
+
+ dout("%s con %p %d+%d+%d\n", __func__, con, desc->fd_lens[1],
+ desc->fd_lens[2], desc->fd_lens[3]);
+
+ tail_len = __tail_onwire_len(desc->fd_lens[1], desc->fd_lens[2],
+ desc->fd_lens[3], con_secure(con));
+ if (!tail_len) {
+ __finish_skip(con);
+ } else {
+ set_in_skip(con, tail_len);
+ con->v2.in_state = IN_S_FINISH_SKIP;
+ }
+}
+
+static int process_banner_prefix(struct ceph_connection *con)
+{
+ int payload_len;
+ void *p;
+
+ WARN_ON(con->v2.in_kvecs[0].iov_len != CEPH_BANNER_V2_PREFIX_LEN);
+
+ p = con->v2.in_kvecs[0].iov_base;
+ if (memcmp(p, CEPH_BANNER_V2, CEPH_BANNER_V2_LEN)) {
+ if (!memcmp(p, CEPH_BANNER, CEPH_BANNER_LEN))
+ con->error_msg = "server is speaking msgr1 protocol";
+ else
+ con->error_msg = "protocol error, bad banner";
+ return -EINVAL;
+ }
+
+ p += CEPH_BANNER_V2_LEN;
+ payload_len = ceph_decode_16(&p);
+ dout("%s con %p payload_len %d\n", __func__, con, payload_len);
+
+ return prepare_read_banner_payload(con, payload_len);
+}
+
+static int process_banner_payload(struct ceph_connection *con)
+{
+ void *end = con->v2.in_kvecs[0].iov_base + con->v2.in_kvecs[0].iov_len;
+ u64 feat = CEPH_MSGR2_SUPPORTED_FEATURES;
+ u64 req_feat = CEPH_MSGR2_REQUIRED_FEATURES;
+ u64 server_feat, server_req_feat;
+ void *p;
+ int ret;
+
+ p = con->v2.in_kvecs[0].iov_base;
+ ceph_decode_64_safe(&p, end, server_feat, bad);
+ ceph_decode_64_safe(&p, end, server_req_feat, bad);
+
+ dout("%s con %p server_feat 0x%llx server_req_feat 0x%llx\n",
+ __func__, con, server_feat, server_req_feat);
+
+ if (req_feat & ~server_feat) {
+ pr_err("msgr2 feature set mismatch: my required > server's supported 0x%llx, need 0x%llx\n",
+ server_feat, req_feat & ~server_feat);
+ con->error_msg = "missing required protocol features";
+ return -EINVAL;
+ }
+ if (server_req_feat & ~feat) {
+ pr_err("msgr2 feature set mismatch: server's required > my supported 0x%llx, missing 0x%llx\n",
+ feat, server_req_feat & ~feat);
+ con->error_msg = "missing required protocol features";
+ return -EINVAL;
+ }
+
+ /* no reset_out_kvecs() as our banner may still be pending */
+ ret = prepare_hello(con);
+ if (ret) {
+ pr_err("prepare_hello failed: %d\n", ret);
+ return ret;
+ }
+
+ con->state = CEPH_CON_S_V2_HELLO;
+ prepare_read_preamble(con);
+ return 0;
+
+bad:
+ pr_err("failed to decode banner payload\n");
+ return -EINVAL;
+}
+
+static int process_hello(struct ceph_connection *con, void *p, void *end)
+{
+ struct ceph_entity_addr *my_addr = &con->msgr->inst.addr;
+ struct ceph_entity_addr addr_for_me;
+ u8 entity_type;
+ int ret;
+
+ if (con->state != CEPH_CON_S_V2_HELLO) {
+ con->error_msg = "protocol error, unexpected hello";
+ return -EINVAL;
+ }
+
+ ceph_decode_8_safe(&p, end, entity_type, bad);
+ ret = ceph_decode_entity_addr(&p, end, &addr_for_me);
+ if (ret) {
+ pr_err("failed to decode addr_for_me: %d\n", ret);
+ return ret;
+ }
+
+ dout("%s con %p entity_type %d addr_for_me %s\n", __func__, con,
+ entity_type, ceph_pr_addr(&addr_for_me));
+
+ if (entity_type != con->peer_name.type) {
+ pr_err("bad peer type, want %d, got %d\n",
+ con->peer_name.type, entity_type);
+ con->error_msg = "wrong peer at address";
+ return -EINVAL;
+ }
+
+ /*
+ * Set our address to the address our first peer (i.e. monitor)
+ * sees that we are connecting from. If we are behind some sort
+ * of NAT and want to be identified by some private (not NATed)
+ * address, ip option should be used.
+ */
+ if (ceph_addr_is_blank(my_addr)) {
+ memcpy(&my_addr->in_addr, &addr_for_me.in_addr,
+ sizeof(my_addr->in_addr));
+ ceph_addr_set_port(my_addr, 0);
+ dout("%s con %p set my addr %s, as seen by peer %s\n",
+ __func__, con, ceph_pr_addr(my_addr),
+ ceph_pr_addr(&con->peer_addr));
+ } else {
+ dout("%s con %p my addr already set %s\n",
+ __func__, con, ceph_pr_addr(my_addr));
+ }
+
+ WARN_ON(ceph_addr_is_blank(my_addr) || ceph_addr_port(my_addr));
+ WARN_ON(my_addr->type != CEPH_ENTITY_ADDR_TYPE_ANY);
+ WARN_ON(!my_addr->nonce);
+
+ /* no reset_out_kvecs() as our hello may still be pending */
+ ret = prepare_auth_request(con);
+ if (ret) {
+ if (ret != -EAGAIN)
+ pr_err("prepare_auth_request failed: %d\n", ret);
+ return ret;
+ }
+
+ con->state = CEPH_CON_S_V2_AUTH;
+ return 0;
+
+bad:
+ pr_err("failed to decode hello\n");
+ return -EINVAL;
+}
+
+static int process_auth_bad_method(struct ceph_connection *con,
+ void *p, void *end)
+{
+ int allowed_protos[8], allowed_modes[8];
+ int allowed_proto_cnt, allowed_mode_cnt;
+ int used_proto, result;
+ int ret;
+ int i;
+
+ if (con->state != CEPH_CON_S_V2_AUTH) {
+ con->error_msg = "protocol error, unexpected auth_bad_method";
+ return -EINVAL;
+ }
+
+ ceph_decode_32_safe(&p, end, used_proto, bad);
+ ceph_decode_32_safe(&p, end, result, bad);
+ dout("%s con %p used_proto %d result %d\n", __func__, con, used_proto,
+ result);
+
+ ceph_decode_32_safe(&p, end, allowed_proto_cnt, bad);
+ if (allowed_proto_cnt > ARRAY_SIZE(allowed_protos)) {
+ pr_err("allowed_protos too big %d\n", allowed_proto_cnt);
+ return -EINVAL;
+ }
+ for (i = 0; i < allowed_proto_cnt; i++) {
+ ceph_decode_32_safe(&p, end, allowed_protos[i], bad);
+ dout("%s con %p allowed_protos[%d] %d\n", __func__, con,
+ i, allowed_protos[i]);
+ }
+
+ ceph_decode_32_safe(&p, end, allowed_mode_cnt, bad);
+ if (allowed_mode_cnt > ARRAY_SIZE(allowed_modes)) {
+ pr_err("allowed_modes too big %d\n", allowed_mode_cnt);
+ return -EINVAL;
+ }
+ for (i = 0; i < allowed_mode_cnt; i++) {
+ ceph_decode_32_safe(&p, end, allowed_modes[i], bad);
+ dout("%s con %p allowed_modes[%d] %d\n", __func__, con,
+ i, allowed_modes[i]);
+ }
+
+ mutex_unlock(&con->mutex);
+ ret = con->ops->handle_auth_bad_method(con, used_proto, result,
+ allowed_protos,
+ allowed_proto_cnt,
+ allowed_modes,
+ allowed_mode_cnt);
+ mutex_lock(&con->mutex);
+ if (con->state != CEPH_CON_S_V2_AUTH) {
+ dout("%s con %p state changed to %d\n", __func__, con,
+ con->state);
+ return -EAGAIN;
+ }
+
+ dout("%s con %p handle_auth_bad_method ret %d\n", __func__, con, ret);
+ return ret;
+
+bad:
+ pr_err("failed to decode auth_bad_method\n");
+ return -EINVAL;
+}
+
+static int process_auth_reply_more(struct ceph_connection *con,
+ void *p, void *end)
+{
+ int payload_len;
+ int ret;
+
+ if (con->state != CEPH_CON_S_V2_AUTH) {
+ con->error_msg = "protocol error, unexpected auth_reply_more";
+ return -EINVAL;
+ }
+
+ ceph_decode_32_safe(&p, end, payload_len, bad);
+ ceph_decode_need(&p, end, payload_len, bad);
+
+ dout("%s con %p payload_len %d\n", __func__, con, payload_len);
+
+ reset_out_kvecs(con);
+ ret = prepare_auth_request_more(con, p, payload_len);
+ if (ret) {
+ if (ret != -EAGAIN)
+ pr_err("prepare_auth_request_more failed: %d\n", ret);
+ return ret;
+ }
+
+ return 0;
+
+bad:
+ pr_err("failed to decode auth_reply_more\n");
+ return -EINVAL;
+}
+
+static int process_auth_done(struct ceph_connection *con, void *p, void *end)
+{
+ u8 session_key[CEPH_KEY_LEN];
+ u8 con_secret[CEPH_MAX_CON_SECRET_LEN];
+ int session_key_len, con_secret_len;
+ int payload_len;
+ u64 global_id;
+ int ret;
+
+ if (con->state != CEPH_CON_S_V2_AUTH) {
+ con->error_msg = "protocol error, unexpected auth_done";
+ return -EINVAL;
+ }
+
+ ceph_decode_64_safe(&p, end, global_id, bad);
+ ceph_decode_32_safe(&p, end, con->v2.con_mode, bad);
+ ceph_decode_32_safe(&p, end, payload_len, bad);
+
+ dout("%s con %p global_id %llu con_mode %d payload_len %d\n",
+ __func__, con, global_id, con->v2.con_mode, payload_len);
+
+ mutex_unlock(&con->mutex);
+ session_key_len = 0;
+ con_secret_len = 0;
+ ret = con->ops->handle_auth_done(con, global_id, p, payload_len,
+ session_key, &session_key_len,
+ con_secret, &con_secret_len);
+ mutex_lock(&con->mutex);
+ if (con->state != CEPH_CON_S_V2_AUTH) {
+ dout("%s con %p state changed to %d\n", __func__, con,
+ con->state);
+ return -EAGAIN;
+ }
+
+ dout("%s con %p handle_auth_done ret %d\n", __func__, con, ret);
+ if (ret)
+ return ret;
+
+ ret = setup_crypto(con, session_key, session_key_len, con_secret,
+ con_secret_len);
+ if (ret)
+ return ret;
+
+ reset_out_kvecs(con);
+ ret = prepare_auth_signature(con);
+ if (ret) {
+ pr_err("prepare_auth_signature failed: %d\n", ret);
+ return ret;
+ }
+
+ con->state = CEPH_CON_S_V2_AUTH_SIGNATURE;
+ return 0;
+
+bad:
+ pr_err("failed to decode auth_done\n");
+ return -EINVAL;
+}
+
+static int process_auth_signature(struct ceph_connection *con,
+ void *p, void *end)
+{
+ u8 hmac[SHA256_DIGEST_SIZE];
+ int ret;
+
+ if (con->state != CEPH_CON_S_V2_AUTH_SIGNATURE) {
+ con->error_msg = "protocol error, unexpected auth_signature";
+ return -EINVAL;
+ }
+
+ ret = hmac_sha256(con, con->v2.out_sign_kvecs,
+ con->v2.out_sign_kvec_cnt, hmac);
+ if (ret)
+ return ret;
+
+ ceph_decode_need(&p, end, SHA256_DIGEST_SIZE, bad);
+ if (crypto_memneq(p, hmac, SHA256_DIGEST_SIZE)) {
+ con->error_msg = "integrity error, bad auth signature";
+ return -EBADMSG;
+ }
+
+ dout("%s con %p auth signature ok\n", __func__, con);
+
+ /* no reset_out_kvecs() as our auth_signature may still be pending */
+ if (!con->v2.server_cookie) {
+ ret = prepare_client_ident(con);
+ if (ret) {
+ pr_err("prepare_client_ident failed: %d\n", ret);
+ return ret;
+ }
+
+ con->state = CEPH_CON_S_V2_SESSION_CONNECT;
+ } else {
+ ret = prepare_session_reconnect(con);
+ if (ret) {
+ pr_err("prepare_session_reconnect failed: %d\n", ret);
+ return ret;
+ }
+
+ con->state = CEPH_CON_S_V2_SESSION_RECONNECT;
+ }
+
+ return 0;
+
+bad:
+ pr_err("failed to decode auth_signature\n");
+ return -EINVAL;
+}
+
+static int process_server_ident(struct ceph_connection *con,
+ void *p, void *end)
+{
+ struct ceph_client *client = from_msgr(con->msgr);
+ u64 features, required_features;
+ struct ceph_entity_addr addr;
+ u64 global_seq;
+ u64 global_id;
+ u64 cookie;
+ u64 flags;
+ int ret;
+
+ if (con->state != CEPH_CON_S_V2_SESSION_CONNECT) {
+ con->error_msg = "protocol error, unexpected server_ident";
+ return -EINVAL;
+ }
+
+ ret = ceph_decode_entity_addrvec(&p, end, true, &addr);
+ if (ret) {
+ pr_err("failed to decode server addrs: %d\n", ret);
+ return ret;
+ }
+
+ ceph_decode_64_safe(&p, end, global_id, bad);
+ ceph_decode_64_safe(&p, end, global_seq, bad);
+ ceph_decode_64_safe(&p, end, features, bad);
+ ceph_decode_64_safe(&p, end, required_features, bad);
+ ceph_decode_64_safe(&p, end, flags, bad);
+ ceph_decode_64_safe(&p, end, cookie, bad);
+
+ dout("%s con %p addr %s/%u global_id %llu global_seq %llu features 0x%llx required_features 0x%llx flags 0x%llx cookie 0x%llx\n",
+ __func__, con, ceph_pr_addr(&addr), le32_to_cpu(addr.nonce),
+ global_id, global_seq, features, required_features, flags, cookie);
+
+ /* is this who we intended to talk to? */
+ if (memcmp(&addr, &con->peer_addr, sizeof(con->peer_addr))) {
+ pr_err("bad peer addr/nonce, want %s/%u, got %s/%u\n",
+ ceph_pr_addr(&con->peer_addr),
+ le32_to_cpu(con->peer_addr.nonce),
+ ceph_pr_addr(&addr), le32_to_cpu(addr.nonce));
+ con->error_msg = "wrong peer at address";
+ return -EINVAL;
+ }
+
+ if (client->required_features & ~features) {
+ pr_err("RADOS feature set mismatch: my required > server's supported 0x%llx, need 0x%llx\n",
+ features, client->required_features & ~features);
+ con->error_msg = "missing required protocol features";
+ return -EINVAL;
+ }
+
+ /*
+ * Both name->type and name->num are set in ceph_con_open() but
+ * name->num may be bogus in the initial monmap. name->type is
+ * verified in handle_hello().
+ */
+ WARN_ON(!con->peer_name.type);
+ con->peer_name.num = cpu_to_le64(global_id);
+ con->v2.peer_global_seq = global_seq;
+ con->peer_features = features;
+ WARN_ON(required_features & ~client->supported_features);
+ con->v2.server_cookie = cookie;
+
+ if (flags & CEPH_MSG_CONNECT_LOSSY) {
+ ceph_con_flag_set(con, CEPH_CON_F_LOSSYTX);
+ WARN_ON(con->v2.server_cookie);
+ } else {
+ WARN_ON(!con->v2.server_cookie);
+ }
+
+ clear_in_sign_kvecs(con);
+ clear_out_sign_kvecs(con);
+ free_conn_bufs(con);
+ con->delay = 0; /* reset backoff memory */
+
+ con->state = CEPH_CON_S_OPEN;
+ con->v2.out_state = OUT_S_GET_NEXT;
+ return 0;
+
+bad:
+ pr_err("failed to decode server_ident\n");
+ return -EINVAL;
+}
+
+static int process_ident_missing_features(struct ceph_connection *con,
+ void *p, void *end)
+{
+ struct ceph_client *client = from_msgr(con->msgr);
+ u64 missing_features;
+
+ if (con->state != CEPH_CON_S_V2_SESSION_CONNECT) {
+ con->error_msg = "protocol error, unexpected ident_missing_features";
+ return -EINVAL;
+ }
+
+ ceph_decode_64_safe(&p, end, missing_features, bad);
+ pr_err("RADOS feature set mismatch: server's required > my supported 0x%llx, missing 0x%llx\n",
+ client->supported_features, missing_features);
+ con->error_msg = "missing required protocol features";
+ return -EINVAL;
+
+bad:
+ pr_err("failed to decode ident_missing_features\n");
+ return -EINVAL;
+}
+
+static int process_session_reconnect_ok(struct ceph_connection *con,
+ void *p, void *end)
+{
+ u64 seq;
+
+ if (con->state != CEPH_CON_S_V2_SESSION_RECONNECT) {
+ con->error_msg = "protocol error, unexpected session_reconnect_ok";
+ return -EINVAL;
+ }
+
+ ceph_decode_64_safe(&p, end, seq, bad);
+
+ dout("%s con %p seq %llu\n", __func__, con, seq);
+ ceph_con_discard_requeued(con, seq);
+
+ clear_in_sign_kvecs(con);
+ clear_out_sign_kvecs(con);
+ free_conn_bufs(con);
+ con->delay = 0; /* reset backoff memory */
+
+ con->state = CEPH_CON_S_OPEN;
+ con->v2.out_state = OUT_S_GET_NEXT;
+ return 0;
+
+bad:
+ pr_err("failed to decode session_reconnect_ok\n");
+ return -EINVAL;
+}
+
+static int process_session_retry(struct ceph_connection *con,
+ void *p, void *end)
+{
+ u64 connect_seq;
+ int ret;
+
+ if (con->state != CEPH_CON_S_V2_SESSION_RECONNECT) {
+ con->error_msg = "protocol error, unexpected session_retry";
+ return -EINVAL;
+ }
+
+ ceph_decode_64_safe(&p, end, connect_seq, bad);
+
+ dout("%s con %p connect_seq %llu\n", __func__, con, connect_seq);
+ WARN_ON(connect_seq <= con->v2.connect_seq);
+ con->v2.connect_seq = connect_seq + 1;
+
+ free_conn_bufs(con);
+
+ reset_out_kvecs(con);
+ ret = prepare_session_reconnect(con);
+ if (ret) {
+ pr_err("prepare_session_reconnect (cseq) failed: %d\n", ret);
+ return ret;
+ }
+
+ return 0;
+
+bad:
+ pr_err("failed to decode session_retry\n");
+ return -EINVAL;
+}
+
+static int process_session_retry_global(struct ceph_connection *con,
+ void *p, void *end)
+{
+ u64 global_seq;
+ int ret;
+
+ if (con->state != CEPH_CON_S_V2_SESSION_RECONNECT) {
+ con->error_msg = "protocol error, unexpected session_retry_global";
+ return -EINVAL;
+ }
+
+ ceph_decode_64_safe(&p, end, global_seq, bad);
+
+ dout("%s con %p global_seq %llu\n", __func__, con, global_seq);
+ WARN_ON(global_seq <= con->v2.global_seq);
+ con->v2.global_seq = ceph_get_global_seq(con->msgr, global_seq);
+
+ free_conn_bufs(con);
+
+ reset_out_kvecs(con);
+ ret = prepare_session_reconnect(con);
+ if (ret) {
+ pr_err("prepare_session_reconnect (gseq) failed: %d\n", ret);
+ return ret;
+ }
+
+ return 0;
+
+bad:
+ pr_err("failed to decode session_retry_global\n");
+ return -EINVAL;
+}
+
+static int process_session_reset(struct ceph_connection *con,
+ void *p, void *end)
+{
+ bool full;
+ int ret;
+
+ if (con->state != CEPH_CON_S_V2_SESSION_RECONNECT) {
+ con->error_msg = "protocol error, unexpected session_reset";
+ return -EINVAL;
+ }
+
+ ceph_decode_8_safe(&p, end, full, bad);
+ if (!full) {
+ con->error_msg = "protocol error, bad session_reset";
+ return -EINVAL;
+ }
+
+ pr_info("%s%lld %s session reset\n", ENTITY_NAME(con->peer_name),
+ ceph_pr_addr(&con->peer_addr));
+ ceph_con_reset_session(con);
+
+ mutex_unlock(&con->mutex);
+ if (con->ops->peer_reset)
+ con->ops->peer_reset(con);
+ mutex_lock(&con->mutex);
+ if (con->state != CEPH_CON_S_V2_SESSION_RECONNECT) {
+ dout("%s con %p state changed to %d\n", __func__, con,
+ con->state);
+ return -EAGAIN;
+ }
+
+ free_conn_bufs(con);
+
+ reset_out_kvecs(con);
+ ret = prepare_client_ident(con);
+ if (ret) {
+ pr_err("prepare_client_ident (rst) failed: %d\n", ret);
+ return ret;
+ }
+
+ con->state = CEPH_CON_S_V2_SESSION_CONNECT;
+ return 0;
+
+bad:
+ pr_err("failed to decode session_reset\n");
+ return -EINVAL;
+}
+
+static int process_keepalive2_ack(struct ceph_connection *con,
+ void *p, void *end)
+{
+ if (con->state != CEPH_CON_S_OPEN) {
+ con->error_msg = "protocol error, unexpected keepalive2_ack";
+ return -EINVAL;
+ }
+
+ ceph_decode_need(&p, end, sizeof(struct ceph_timespec), bad);
+ ceph_decode_timespec64(&con->last_keepalive_ack, p);
+
+ dout("%s con %p timestamp %lld.%09ld\n", __func__, con,
+ con->last_keepalive_ack.tv_sec, con->last_keepalive_ack.tv_nsec);
+
+ return 0;
+
+bad:
+ pr_err("failed to decode keepalive2_ack\n");
+ return -EINVAL;
+}
+
+static int process_ack(struct ceph_connection *con, void *p, void *end)
+{
+ u64 seq;
+
+ if (con->state != CEPH_CON_S_OPEN) {
+ con->error_msg = "protocol error, unexpected ack";
+ return -EINVAL;
+ }
+
+ ceph_decode_64_safe(&p, end, seq, bad);
+
+ dout("%s con %p seq %llu\n", __func__, con, seq);
+ ceph_con_discard_sent(con, seq);
+ return 0;
+
+bad:
+ pr_err("failed to decode ack\n");
+ return -EINVAL;
+}
+
+static int process_control(struct ceph_connection *con, void *p, void *end)
+{
+ int tag = con->v2.in_desc.fd_tag;
+ int ret;
+
+ dout("%s con %p tag %d len %d\n", __func__, con, tag, (int)(end - p));
+
+ switch (tag) {
+ case FRAME_TAG_HELLO:
+ ret = process_hello(con, p, end);
+ break;
+ case FRAME_TAG_AUTH_BAD_METHOD:
+ ret = process_auth_bad_method(con, p, end);
+ break;
+ case FRAME_TAG_AUTH_REPLY_MORE:
+ ret = process_auth_reply_more(con, p, end);
+ break;
+ case FRAME_TAG_AUTH_DONE:
+ ret = process_auth_done(con, p, end);
+ break;
+ case FRAME_TAG_AUTH_SIGNATURE:
+ ret = process_auth_signature(con, p, end);
+ break;
+ case FRAME_TAG_SERVER_IDENT:
+ ret = process_server_ident(con, p, end);
+ break;
+ case FRAME_TAG_IDENT_MISSING_FEATURES:
+ ret = process_ident_missing_features(con, p, end);
+ break;
+ case FRAME_TAG_SESSION_RECONNECT_OK:
+ ret = process_session_reconnect_ok(con, p, end);
+ break;
+ case FRAME_TAG_SESSION_RETRY:
+ ret = process_session_retry(con, p, end);
+ break;
+ case FRAME_TAG_SESSION_RETRY_GLOBAL:
+ ret = process_session_retry_global(con, p, end);
+ break;
+ case FRAME_TAG_SESSION_RESET:
+ ret = process_session_reset(con, p, end);
+ break;
+ case FRAME_TAG_KEEPALIVE2_ACK:
+ ret = process_keepalive2_ack(con, p, end);
+ break;
+ case FRAME_TAG_ACK:
+ ret = process_ack(con, p, end);
+ break;
+ default:
+ pr_err("bad tag %d\n", tag);
+ con->error_msg = "protocol error, bad tag";
+ return -EINVAL;
+ }
+ if (ret) {
+ dout("%s con %p error %d\n", __func__, con, ret);
+ return ret;
+ }
+
+ prepare_read_preamble(con);
+ return 0;
+}
+
+/*
+ * Return:
+ * 1 - con->in_msg set, read message
+ * 0 - skip message
+ * <0 - error
+ */
+static int process_message_header(struct ceph_connection *con,
+ void *p, void *end)
+{
+ struct ceph_frame_desc *desc = &con->v2.in_desc;
+ struct ceph_msg_header2 *hdr2 = p;
+ struct ceph_msg_header hdr;
+ int skip;
+ int ret;
+ u64 seq;
+
+ /* verify seq# */
+ seq = le64_to_cpu(hdr2->seq);
+ if ((s64)seq - (s64)con->in_seq < 1) {
+ pr_info("%s%lld %s skipping old message: seq %llu, expected %llu\n",
+ ENTITY_NAME(con->peer_name),
+ ceph_pr_addr(&con->peer_addr),
+ seq, con->in_seq + 1);
+ return 0;
+ }
+ if ((s64)seq - (s64)con->in_seq > 1) {
+ pr_err("bad seq %llu, expected %llu\n", seq, con->in_seq + 1);
+ con->error_msg = "bad message sequence # for incoming message";
+ return -EBADE;
+ }
+
+ ceph_con_discard_sent(con, le64_to_cpu(hdr2->ack_seq));
+
+ fill_header(&hdr, hdr2, desc->fd_lens[1], desc->fd_lens[2],
+ desc->fd_lens[3], &con->peer_name);
+ ret = ceph_con_in_msg_alloc(con, &hdr, &skip);
+ if (ret)
+ return ret;
+
+ WARN_ON(!con->in_msg ^ skip);
+ if (skip)
+ return 0;
+
+ WARN_ON(!con->in_msg);
+ WARN_ON(con->in_msg->con != con);
+ return 1;
+}
+
+static int process_message(struct ceph_connection *con)
+{
+ ceph_con_process_message(con);
+
+ /*
+ * We could have been closed by ceph_con_close() because
+ * ceph_con_process_message() temporarily drops con->mutex.
+ */
+ if (con->state != CEPH_CON_S_OPEN) {
+ dout("%s con %p state changed to %d\n", __func__, con,
+ con->state);
+ return -EAGAIN;
+ }
+
+ prepare_read_preamble(con);
+ return 0;
+}
+
+static int __handle_control(struct ceph_connection *con, void *p)
+{
+ void *end = p + con->v2.in_desc.fd_lens[0];
+ struct ceph_msg *msg;
+ int ret;
+
+ if (con->v2.in_desc.fd_tag != FRAME_TAG_MESSAGE)
+ return process_control(con, p, end);
+
+ ret = process_message_header(con, p, end);
+ if (ret < 0)
+ return ret;
+ if (ret == 0) {
+ prepare_skip_message(con);
+ return 0;
+ }
+
+ msg = con->in_msg; /* set in process_message_header() */
+ if (!front_len(msg) && !middle_len(msg)) {
+ if (!data_len(msg))
+ return process_message(con);
+
+ prepare_read_data(con);
+ return 0;
+ }
+
+ reset_in_kvecs(con);
+ if (front_len(msg)) {
+ WARN_ON(front_len(msg) > msg->front_alloc_len);
+ add_in_kvec(con, msg->front.iov_base, front_len(msg));
+ msg->front.iov_len = front_len(msg);
+
+ if (con_secure(con) && need_padding(front_len(msg)))
+ add_in_kvec(con, FRONT_PAD(con->v2.in_buf),
+ padding_len(front_len(msg)));
+ } else {
+ msg->front.iov_len = 0;
+ }
+ if (middle_len(msg)) {
+ WARN_ON(middle_len(msg) > msg->middle->alloc_len);
+ add_in_kvec(con, msg->middle->vec.iov_base, middle_len(msg));
+ msg->middle->vec.iov_len = middle_len(msg);
+
+ if (con_secure(con) && need_padding(middle_len(msg)))
+ add_in_kvec(con, MIDDLE_PAD(con->v2.in_buf),
+ padding_len(middle_len(msg)));
+ } else if (msg->middle) {
+ msg->middle->vec.iov_len = 0;
+ }
+
+ if (data_len(msg)) {
+ con->v2.in_state = IN_S_PREPARE_READ_DATA;
+ } else {
+ add_in_kvec(con, con->v2.in_buf,
+ con_secure(con) ? CEPH_EPILOGUE_SECURE_LEN :
+ CEPH_EPILOGUE_PLAIN_LEN);
+ con->v2.in_state = IN_S_HANDLE_EPILOGUE;
+ }
+ return 0;
+}
+
+static int handle_preamble(struct ceph_connection *con)
+{
+ struct ceph_frame_desc *desc = &con->v2.in_desc;
+ int ret;
+
+ if (con_secure(con)) {
+ ret = decrypt_preamble(con);
+ if (ret) {
+ if (ret == -EBADMSG)
+ con->error_msg = "integrity error, bad preamble auth tag";
+ return ret;
+ }
+ }
+
+ ret = decode_preamble(con->v2.in_buf, desc);
+ if (ret) {
+ if (ret == -EBADMSG)
+ con->error_msg = "integrity error, bad crc";
+ else
+ con->error_msg = "protocol error, bad preamble";
+ return ret;
+ }
+
+ dout("%s con %p tag %d seg_cnt %d %d+%d+%d+%d\n", __func__,
+ con, desc->fd_tag, desc->fd_seg_cnt, desc->fd_lens[0],
+ desc->fd_lens[1], desc->fd_lens[2], desc->fd_lens[3]);
+
+ if (!con_secure(con))
+ return prepare_read_control(con);
+
+ if (desc->fd_lens[0] > CEPH_PREAMBLE_INLINE_LEN)
+ return prepare_read_control_remainder(con);
+
+ return __handle_control(con, CTRL_BODY(con->v2.in_buf));
+}
+
+static int handle_control(struct ceph_connection *con)
+{
+ int ctrl_len = con->v2.in_desc.fd_lens[0];
+ void *buf;
+ int ret;
+
+ WARN_ON(con_secure(con));
+
+ ret = verify_control_crc(con);
+ if (ret) {
+ con->error_msg = "integrity error, bad crc";
+ return ret;
+ }
+
+ if (con->state == CEPH_CON_S_V2_AUTH) {
+ buf = alloc_conn_buf(con, ctrl_len);
+ if (!buf)
+ return -ENOMEM;
+
+ memcpy(buf, con->v2.in_kvecs[0].iov_base, ctrl_len);
+ return __handle_control(con, buf);
+ }
+
+ return __handle_control(con, con->v2.in_kvecs[0].iov_base);
+}
+
+static int handle_control_remainder(struct ceph_connection *con)
+{
+ int ret;
+
+ WARN_ON(!con_secure(con));
+
+ ret = decrypt_control_remainder(con);
+ if (ret) {
+ if (ret == -EBADMSG)
+ con->error_msg = "integrity error, bad control remainder auth tag";
+ return ret;
+ }
+
+ return __handle_control(con, con->v2.in_kvecs[0].iov_base -
+ CEPH_PREAMBLE_INLINE_LEN);
+}
+
+static int handle_epilogue(struct ceph_connection *con)
+{
+ u32 front_crc, middle_crc, data_crc;
+ int ret;
+
+ if (con_secure(con)) {
+ ret = decrypt_message(con);
+ if (ret) {
+ if (ret == -EBADMSG)
+ con->error_msg = "integrity error, bad epilogue auth tag";
+ return ret;
+ }
+
+ /* just late_status */
+ ret = decode_epilogue(con->v2.in_buf, NULL, NULL, NULL);
+ if (ret) {
+ con->error_msg = "protocol error, bad epilogue";
+ return ret;
+ }
+ } else {
+ ret = decode_epilogue(con->v2.in_buf, &front_crc,
+ &middle_crc, &data_crc);
+ if (ret) {
+ con->error_msg = "protocol error, bad epilogue";
+ return ret;
+ }
+
+ ret = verify_epilogue_crcs(con, front_crc, middle_crc,
+ data_crc);
+ if (ret) {
+ con->error_msg = "integrity error, bad crc";
+ return ret;
+ }
+ }
+
+ return process_message(con);
+}
+
+static void finish_skip(struct ceph_connection *con)
+{
+ dout("%s con %p\n", __func__, con);
+
+ if (con_secure(con))
+ gcm_inc_nonce(&con->v2.in_gcm_nonce);
+
+ __finish_skip(con);
+}
+
+static int populate_in_iter(struct ceph_connection *con)
+{
+ int ret;
+
+ dout("%s con %p state %d in_state %d\n", __func__, con, con->state,
+ con->v2.in_state);
+ WARN_ON(iov_iter_count(&con->v2.in_iter));
+
+ if (con->state == CEPH_CON_S_V2_BANNER_PREFIX) {
+ ret = process_banner_prefix(con);
+ } else if (con->state == CEPH_CON_S_V2_BANNER_PAYLOAD) {
+ ret = process_banner_payload(con);
+ } else if ((con->state >= CEPH_CON_S_V2_HELLO &&
+ con->state <= CEPH_CON_S_V2_SESSION_RECONNECT) ||
+ con->state == CEPH_CON_S_OPEN) {
+ switch (con->v2.in_state) {
+ case IN_S_HANDLE_PREAMBLE:
+ ret = handle_preamble(con);
+ break;
+ case IN_S_HANDLE_CONTROL:
+ ret = handle_control(con);
+ break;
+ case IN_S_HANDLE_CONTROL_REMAINDER:
+ ret = handle_control_remainder(con);
+ break;
+ case IN_S_PREPARE_READ_DATA:
+ prepare_read_data(con);
+ ret = 0;
+ break;
+ case IN_S_PREPARE_READ_DATA_CONT:
+ prepare_read_data_cont(con);
+ ret = 0;
+ break;
+ case IN_S_HANDLE_EPILOGUE:
+ ret = handle_epilogue(con);
+ break;
+ case IN_S_FINISH_SKIP:
+ finish_skip(con);
+ ret = 0;
+ break;
+ default:
+ WARN(1, "bad in_state %d", con->v2.in_state);
+ return -EINVAL;
+ }
+ } else {
+ WARN(1, "bad state %d", con->state);
+ return -EINVAL;
+ }
+ if (ret) {
+ dout("%s con %p error %d\n", __func__, con, ret);
+ return ret;
+ }
+
+ if (WARN_ON(!iov_iter_count(&con->v2.in_iter)))
+ return -ENODATA;
+ dout("%s con %p populated %zu\n", __func__, con,
+ iov_iter_count(&con->v2.in_iter));
+ return 1;
+}
+
+int ceph_con_v2_try_read(struct ceph_connection *con)
+{
+ int ret;
+
+ dout("%s con %p state %d need %zu\n", __func__, con, con->state,
+ iov_iter_count(&con->v2.in_iter));
+
+ if (con->state == CEPH_CON_S_PREOPEN)
+ return 0;
+
+ /*
+ * We should always have something pending here. If not,
+ * avoid calling populate_in_iter() as if we read something
+ * (ceph_tcp_recv() would immediately return 1).
+ */
+ if (WARN_ON(!iov_iter_count(&con->v2.in_iter)))
+ return -ENODATA;
+
+ for (;;) {
+ ret = ceph_tcp_recv(con);
+ if (ret <= 0)
+ return ret;
+
+ ret = populate_in_iter(con);
+ if (ret <= 0) {
+ if (ret && ret != -EAGAIN && !con->error_msg)
+ con->error_msg = "read processing error";
+ return ret;
+ }
+ }
+}
+
+static void queue_data(struct ceph_connection *con)
+{
+ struct bio_vec bv;
+
+ con->v2.out_epil.data_crc = -1;
+ ceph_msg_data_cursor_init(&con->v2.out_cursor, con->out_msg,
+ data_len(con->out_msg));
+
+ get_bvec_at(&con->v2.out_cursor, &bv);
+ set_out_bvec(con, &bv, true);
+ con->v2.out_state = OUT_S_QUEUE_DATA_CONT;
+}
+
+static void queue_data_cont(struct ceph_connection *con)
+{
+ struct bio_vec bv;
+
+ con->v2.out_epil.data_crc = ceph_crc32c_page(
+ con->v2.out_epil.data_crc, con->v2.out_bvec.bv_page,
+ con->v2.out_bvec.bv_offset, con->v2.out_bvec.bv_len);
+
+ ceph_msg_data_advance(&con->v2.out_cursor, con->v2.out_bvec.bv_len);
+ if (con->v2.out_cursor.total_resid) {
+ get_bvec_at(&con->v2.out_cursor, &bv);
+ set_out_bvec(con, &bv, true);
+ WARN_ON(con->v2.out_state != OUT_S_QUEUE_DATA_CONT);
+ return;
+ }
+
+ /*
+ * We've written all data. Queue epilogue. Once it's written,
+ * we are done.
+ */
+ reset_out_kvecs(con);
+ prepare_epilogue_plain(con, false);
+ con->v2.out_state = OUT_S_FINISH_MESSAGE;
+}
+
+static void queue_enc_page(struct ceph_connection *con)
+{
+ struct bio_vec bv;
+
+ dout("%s con %p i %d resid %d\n", __func__, con, con->v2.out_enc_i,
+ con->v2.out_enc_resid);
+ WARN_ON(!con->v2.out_enc_resid);
+
+ bv.bv_page = con->v2.out_enc_pages[con->v2.out_enc_i];
+ bv.bv_offset = 0;
+ bv.bv_len = min(con->v2.out_enc_resid, (int)PAGE_SIZE);
+
+ set_out_bvec(con, &bv, false);
+ con->v2.out_enc_i++;
+ con->v2.out_enc_resid -= bv.bv_len;
+
+ if (con->v2.out_enc_resid) {
+ WARN_ON(con->v2.out_state != OUT_S_QUEUE_ENC_PAGE);
+ return;
+ }
+
+ /*
+ * We've queued the last piece of ciphertext (ending with
+ * epilogue) + auth tag. Once it's written, we are done.
+ */
+ WARN_ON(con->v2.out_enc_i != con->v2.out_enc_page_cnt);
+ con->v2.out_state = OUT_S_FINISH_MESSAGE;
+}
+
+static void queue_zeros(struct ceph_connection *con)
+{
+ dout("%s con %p out_zero %d\n", __func__, con, con->v2.out_zero);
+
+ if (con->v2.out_zero) {
+ set_out_bvec_zero(con);
+ con->v2.out_zero -= con->v2.out_bvec.bv_len;
+ con->v2.out_state = OUT_S_QUEUE_ZEROS;
+ return;
+ }
+
+ /*
+ * We've zero-filled everything up to epilogue. Queue epilogue
+ * with late_status set to ABORTED and crcs adjusted for zeros.
+ * Once it's written, we are done patching up for the revoke.
+ */
+ reset_out_kvecs(con);
+ prepare_epilogue_plain(con, true);
+ con->v2.out_state = OUT_S_FINISH_MESSAGE;
+}
+
+static void finish_message(struct ceph_connection *con)
+{
+ dout("%s con %p msg %p\n", __func__, con, con->out_msg);
+
+ /* we end up here both plain and secure modes */
+ if (con->v2.out_enc_pages) {
+ WARN_ON(!con->v2.out_enc_page_cnt);
+ ceph_release_page_vector(con->v2.out_enc_pages,
+ con->v2.out_enc_page_cnt);
+ con->v2.out_enc_pages = NULL;
+ con->v2.out_enc_page_cnt = 0;
+ }
+ /* message may have been revoked */
+ if (con->out_msg) {
+ ceph_msg_put(con->out_msg);
+ con->out_msg = NULL;
+ }
+
+ con->v2.out_state = OUT_S_GET_NEXT;
+}
+
+static int populate_out_iter(struct ceph_connection *con)
+{
+ int ret;
+
+ dout("%s con %p state %d out_state %d\n", __func__, con, con->state,
+ con->v2.out_state);
+ WARN_ON(iov_iter_count(&con->v2.out_iter));
+
+ if (con->state != CEPH_CON_S_OPEN) {
+ WARN_ON(con->state < CEPH_CON_S_V2_BANNER_PREFIX ||
+ con->state > CEPH_CON_S_V2_SESSION_RECONNECT);
+ goto nothing_pending;
+ }
+
+ switch (con->v2.out_state) {
+ case OUT_S_QUEUE_DATA:
+ WARN_ON(!con->out_msg);
+ queue_data(con);
+ goto populated;
+ case OUT_S_QUEUE_DATA_CONT:
+ WARN_ON(!con->out_msg);
+ queue_data_cont(con);
+ goto populated;
+ case OUT_S_QUEUE_ENC_PAGE:
+ queue_enc_page(con);
+ goto populated;
+ case OUT_S_QUEUE_ZEROS:
+ WARN_ON(con->out_msg); /* revoked */
+ queue_zeros(con);
+ goto populated;
+ case OUT_S_FINISH_MESSAGE:
+ finish_message(con);
+ break;
+ case OUT_S_GET_NEXT:
+ break;
+ default:
+ WARN(1, "bad out_state %d", con->v2.out_state);
+ return -EINVAL;
+ }
+
+ WARN_ON(con->v2.out_state != OUT_S_GET_NEXT);
+ if (ceph_con_flag_test_and_clear(con, CEPH_CON_F_KEEPALIVE_PENDING)) {
+ ret = prepare_keepalive2(con);
+ if (ret) {
+ pr_err("prepare_keepalive2 failed: %d\n", ret);
+ return ret;
+ }
+ } else if (!list_empty(&con->out_queue)) {
+ ceph_con_get_out_msg(con);
+ ret = prepare_message(con);
+ if (ret) {
+ pr_err("prepare_message failed: %d\n", ret);
+ return ret;
+ }
+ } else if (con->in_seq > con->in_seq_acked) {
+ ret = prepare_ack(con);
+ if (ret) {
+ pr_err("prepare_ack failed: %d\n", ret);
+ return ret;
+ }
+ } else {
+ goto nothing_pending;
+ }
+
+populated:
+ if (WARN_ON(!iov_iter_count(&con->v2.out_iter)))
+ return -ENODATA;
+ dout("%s con %p populated %zu\n", __func__, con,
+ iov_iter_count(&con->v2.out_iter));
+ return 1;
+
+nothing_pending:
+ WARN_ON(iov_iter_count(&con->v2.out_iter));
+ dout("%s con %p nothing pending\n", __func__, con);
+ ceph_con_flag_clear(con, CEPH_CON_F_WRITE_PENDING);
+ return 0;
+}
+
+int ceph_con_v2_try_write(struct ceph_connection *con)
+{
+ int ret;
+
+ dout("%s con %p state %d have %zu\n", __func__, con, con->state,
+ iov_iter_count(&con->v2.out_iter));
+
+ /* open the socket first? */
+ if (con->state == CEPH_CON_S_PREOPEN) {
+ WARN_ON(con->peer_addr.type != CEPH_ENTITY_ADDR_TYPE_MSGR2);
+
+ /*
+ * Always bump global_seq. Bump connect_seq only if
+ * there is a session (i.e. we are reconnecting and will
+ * send session_reconnect instead of client_ident).
+ */
+ con->v2.global_seq = ceph_get_global_seq(con->msgr, 0);
+ if (con->v2.server_cookie)
+ con->v2.connect_seq++;
+
+ ret = prepare_read_banner_prefix(con);
+ if (ret) {
+ pr_err("prepare_read_banner_prefix failed: %d\n", ret);
+ con->error_msg = "connect error";
+ return ret;
+ }
+
+ reset_out_kvecs(con);
+ ret = prepare_banner(con);
+ if (ret) {
+ pr_err("prepare_banner failed: %d\n", ret);
+ con->error_msg = "connect error";
+ return ret;
+ }
+
+ ret = ceph_tcp_connect(con);
+ if (ret) {
+ pr_err("ceph_tcp_connect failed: %d\n", ret);
+ con->error_msg = "connect error";
+ return ret;
+ }
+ }
+
+ if (!iov_iter_count(&con->v2.out_iter)) {
+ ret = populate_out_iter(con);
+ if (ret <= 0) {
+ if (ret && ret != -EAGAIN && !con->error_msg)
+ con->error_msg = "write processing error";
+ return ret;
+ }
+ }
+
+ tcp_sock_set_cork(con->sock->sk, true);
+ for (;;) {
+ ret = ceph_tcp_send(con);
+ if (ret <= 0)
+ break;
+
+ ret = populate_out_iter(con);
+ if (ret <= 0) {
+ if (ret && ret != -EAGAIN && !con->error_msg)
+ con->error_msg = "write processing error";
+ break;
+ }
+ }
+
+ tcp_sock_set_cork(con->sock->sk, false);
+ return ret;
+}
+
+static u32 crc32c_zeros(u32 crc, int zero_len)
+{
+ int len;
+
+ while (zero_len) {
+ len = min(zero_len, (int)PAGE_SIZE);
+ crc = crc32c(crc, page_address(ceph_zero_page), len);
+ zero_len -= len;
+ }
+
+ return crc;
+}
+
+static void prepare_zero_front(struct ceph_connection *con, int resid)
+{
+ int sent;
+
+ WARN_ON(!resid || resid > front_len(con->out_msg));
+ sent = front_len(con->out_msg) - resid;
+ dout("%s con %p sent %d resid %d\n", __func__, con, sent, resid);
+
+ if (sent) {
+ con->v2.out_epil.front_crc =
+ crc32c(-1, con->out_msg->front.iov_base, sent);
+ con->v2.out_epil.front_crc =
+ crc32c_zeros(con->v2.out_epil.front_crc, resid);
+ } else {
+ con->v2.out_epil.front_crc = crc32c_zeros(-1, resid);
+ }
+
+ con->v2.out_iter.count -= resid;
+ out_zero_add(con, resid);
+}
+
+static void prepare_zero_middle(struct ceph_connection *con, int resid)
+{
+ int sent;
+
+ WARN_ON(!resid || resid > middle_len(con->out_msg));
+ sent = middle_len(con->out_msg) - resid;
+ dout("%s con %p sent %d resid %d\n", __func__, con, sent, resid);
+
+ if (sent) {
+ con->v2.out_epil.middle_crc =
+ crc32c(-1, con->out_msg->middle->vec.iov_base, sent);
+ con->v2.out_epil.middle_crc =
+ crc32c_zeros(con->v2.out_epil.middle_crc, resid);
+ } else {
+ con->v2.out_epil.middle_crc = crc32c_zeros(-1, resid);
+ }
+
+ con->v2.out_iter.count -= resid;
+ out_zero_add(con, resid);
+}
+
+static void prepare_zero_data(struct ceph_connection *con)
+{
+ dout("%s con %p\n", __func__, con);
+ con->v2.out_epil.data_crc = crc32c_zeros(-1, data_len(con->out_msg));
+ out_zero_add(con, data_len(con->out_msg));
+}
+
+static void revoke_at_queue_data(struct ceph_connection *con)
+{
+ int boundary;
+ int resid;
+
+ WARN_ON(!data_len(con->out_msg));
+ WARN_ON(!iov_iter_is_kvec(&con->v2.out_iter));
+ resid = iov_iter_count(&con->v2.out_iter);
+
+ boundary = front_len(con->out_msg) + middle_len(con->out_msg);
+ if (resid > boundary) {
+ resid -= boundary;
+ WARN_ON(resid > MESSAGE_HEAD_PLAIN_LEN);
+ dout("%s con %p was sending head\n", __func__, con);
+ if (front_len(con->out_msg))
+ prepare_zero_front(con, front_len(con->out_msg));
+ if (middle_len(con->out_msg))
+ prepare_zero_middle(con, middle_len(con->out_msg));
+ prepare_zero_data(con);
+ WARN_ON(iov_iter_count(&con->v2.out_iter) != resid);
+ con->v2.out_state = OUT_S_QUEUE_ZEROS;
+ return;
+ }
+
+ boundary = middle_len(con->out_msg);
+ if (resid > boundary) {
+ resid -= boundary;
+ dout("%s con %p was sending front\n", __func__, con);
+ prepare_zero_front(con, resid);
+ if (middle_len(con->out_msg))
+ prepare_zero_middle(con, middle_len(con->out_msg));
+ prepare_zero_data(con);
+ queue_zeros(con);
+ return;
+ }
+
+ WARN_ON(!resid);
+ dout("%s con %p was sending middle\n", __func__, con);
+ prepare_zero_middle(con, resid);
+ prepare_zero_data(con);
+ queue_zeros(con);
+}
+
+static void revoke_at_queue_data_cont(struct ceph_connection *con)
+{
+ int sent, resid; /* current piece of data */
+
+ WARN_ON(!data_len(con->out_msg));
+ WARN_ON(!iov_iter_is_bvec(&con->v2.out_iter));
+ resid = iov_iter_count(&con->v2.out_iter);
+ WARN_ON(!resid || resid > con->v2.out_bvec.bv_len);
+ sent = con->v2.out_bvec.bv_len - resid;
+ dout("%s con %p sent %d resid %d\n", __func__, con, sent, resid);
+
+ if (sent) {
+ con->v2.out_epil.data_crc = ceph_crc32c_page(
+ con->v2.out_epil.data_crc, con->v2.out_bvec.bv_page,
+ con->v2.out_bvec.bv_offset, sent);
+ ceph_msg_data_advance(&con->v2.out_cursor, sent);
+ }
+ WARN_ON(resid > con->v2.out_cursor.total_resid);
+ con->v2.out_epil.data_crc = crc32c_zeros(con->v2.out_epil.data_crc,
+ con->v2.out_cursor.total_resid);
+
+ con->v2.out_iter.count -= resid;
+ out_zero_add(con, con->v2.out_cursor.total_resid);
+ queue_zeros(con);
+}
+
+static void revoke_at_finish_message(struct ceph_connection *con)
+{
+ int boundary;
+ int resid;
+
+ WARN_ON(!iov_iter_is_kvec(&con->v2.out_iter));
+ resid = iov_iter_count(&con->v2.out_iter);
+
+ if (!front_len(con->out_msg) && !middle_len(con->out_msg) &&
+ !data_len(con->out_msg)) {
+ WARN_ON(!resid || resid > MESSAGE_HEAD_PLAIN_LEN);
+ dout("%s con %p was sending head (empty message) - noop\n",
+ __func__, con);
+ return;
+ }
+
+ boundary = front_len(con->out_msg) + middle_len(con->out_msg) +
+ CEPH_EPILOGUE_PLAIN_LEN;
+ if (resid > boundary) {
+ resid -= boundary;
+ WARN_ON(resid > MESSAGE_HEAD_PLAIN_LEN);
+ dout("%s con %p was sending head\n", __func__, con);
+ if (front_len(con->out_msg))
+ prepare_zero_front(con, front_len(con->out_msg));
+ if (middle_len(con->out_msg))
+ prepare_zero_middle(con, middle_len(con->out_msg));
+ con->v2.out_iter.count -= CEPH_EPILOGUE_PLAIN_LEN;
+ WARN_ON(iov_iter_count(&con->v2.out_iter) != resid);
+ con->v2.out_state = OUT_S_QUEUE_ZEROS;
+ return;
+ }
+
+ boundary = middle_len(con->out_msg) + CEPH_EPILOGUE_PLAIN_LEN;
+ if (resid > boundary) {
+ resid -= boundary;
+ dout("%s con %p was sending front\n", __func__, con);
+ prepare_zero_front(con, resid);
+ if (middle_len(con->out_msg))
+ prepare_zero_middle(con, middle_len(con->out_msg));
+ con->v2.out_iter.count -= CEPH_EPILOGUE_PLAIN_LEN;
+ queue_zeros(con);
+ return;
+ }
+
+ boundary = CEPH_EPILOGUE_PLAIN_LEN;
+ if (resid > boundary) {
+ resid -= boundary;
+ dout("%s con %p was sending middle\n", __func__, con);
+ prepare_zero_middle(con, resid);
+ con->v2.out_iter.count -= CEPH_EPILOGUE_PLAIN_LEN;
+ queue_zeros(con);
+ return;
+ }
+
+ WARN_ON(!resid);
+ dout("%s con %p was sending epilogue - noop\n", __func__, con);
+}
+
+void ceph_con_v2_revoke(struct ceph_connection *con)
+{
+ WARN_ON(con->v2.out_zero);
+
+ if (con_secure(con)) {
+ WARN_ON(con->v2.out_state != OUT_S_QUEUE_ENC_PAGE &&
+ con->v2.out_state != OUT_S_FINISH_MESSAGE);
+ dout("%s con %p secure - noop\n", __func__, con);
+ return;
+ }
+
+ switch (con->v2.out_state) {
+ case OUT_S_QUEUE_DATA:
+ revoke_at_queue_data(con);
+ break;
+ case OUT_S_QUEUE_DATA_CONT:
+ revoke_at_queue_data_cont(con);
+ break;
+ case OUT_S_FINISH_MESSAGE:
+ revoke_at_finish_message(con);
+ break;
+ default:
+ WARN(1, "bad out_state %d", con->v2.out_state);
+ break;
+ }
+}
+
+static void revoke_at_prepare_read_data(struct ceph_connection *con)
+{
+ int remaining; /* data + [data padding] + epilogue */
+ int resid;
+
+ WARN_ON(!data_len(con->in_msg));
+ WARN_ON(!iov_iter_is_kvec(&con->v2.in_iter));
+ resid = iov_iter_count(&con->v2.in_iter);
+ WARN_ON(!resid);
+
+ if (con_secure(con))
+ remaining = padded_len(data_len(con->in_msg)) +
+ CEPH_EPILOGUE_SECURE_LEN;
+ else
+ remaining = data_len(con->in_msg) + CEPH_EPILOGUE_PLAIN_LEN;
+
+ dout("%s con %p resid %d remaining %d\n", __func__, con, resid,
+ remaining);
+ con->v2.in_iter.count -= resid;
+ set_in_skip(con, resid + remaining);
+ con->v2.in_state = IN_S_FINISH_SKIP;
+}
+
+static void revoke_at_prepare_read_data_cont(struct ceph_connection *con)
+{
+ int recved, resid; /* current piece of data */
+ int remaining; /* [data padding] + epilogue */
+
+ WARN_ON(!data_len(con->in_msg));
+ WARN_ON(!iov_iter_is_bvec(&con->v2.in_iter));
+ resid = iov_iter_count(&con->v2.in_iter);
+ WARN_ON(!resid || resid > con->v2.in_bvec.bv_len);
+ recved = con->v2.in_bvec.bv_len - resid;
+ dout("%s con %p recved %d resid %d\n", __func__, con, recved, resid);
+
+ if (recved)
+ ceph_msg_data_advance(&con->v2.in_cursor, recved);
+ WARN_ON(resid > con->v2.in_cursor.total_resid);
+
+ if (con_secure(con))
+ remaining = padding_len(data_len(con->in_msg)) +
+ CEPH_EPILOGUE_SECURE_LEN;
+ else
+ remaining = CEPH_EPILOGUE_PLAIN_LEN;
+
+ dout("%s con %p total_resid %zu remaining %d\n", __func__, con,
+ con->v2.in_cursor.total_resid, remaining);
+ con->v2.in_iter.count -= resid;
+ set_in_skip(con, con->v2.in_cursor.total_resid + remaining);
+ con->v2.in_state = IN_S_FINISH_SKIP;
+}
+
+static void revoke_at_handle_epilogue(struct ceph_connection *con)
+{
+ int resid;
+
+ WARN_ON(!iov_iter_is_kvec(&con->v2.in_iter));
+ resid = iov_iter_count(&con->v2.in_iter);
+ WARN_ON(!resid);
+
+ dout("%s con %p resid %d\n", __func__, con, resid);
+ con->v2.in_iter.count -= resid;
+ set_in_skip(con, resid);
+ con->v2.in_state = IN_S_FINISH_SKIP;
+}
+
+void ceph_con_v2_revoke_incoming(struct ceph_connection *con)
+{
+ switch (con->v2.in_state) {
+ case IN_S_PREPARE_READ_DATA:
+ revoke_at_prepare_read_data(con);
+ break;
+ case IN_S_PREPARE_READ_DATA_CONT:
+ revoke_at_prepare_read_data_cont(con);
+ break;
+ case IN_S_HANDLE_EPILOGUE:
+ revoke_at_handle_epilogue(con);
+ break;
+ default:
+ WARN(1, "bad in_state %d", con->v2.in_state);
+ break;
+ }
+}
+
+bool ceph_con_v2_opened(struct ceph_connection *con)
+{
+ return con->v2.peer_global_seq;
+}
+
+void ceph_con_v2_reset_session(struct ceph_connection *con)
+{
+ con->v2.client_cookie = 0;
+ con->v2.server_cookie = 0;
+ con->v2.global_seq = 0;
+ con->v2.connect_seq = 0;
+ con->v2.peer_global_seq = 0;
+}
+
+void ceph_con_v2_reset_protocol(struct ceph_connection *con)
+{
+ iov_iter_truncate(&con->v2.in_iter, 0);
+ iov_iter_truncate(&con->v2.out_iter, 0);
+ con->v2.out_zero = 0;
+
+ clear_in_sign_kvecs(con);
+ clear_out_sign_kvecs(con);
+ free_conn_bufs(con);
+
+ if (con->v2.out_enc_pages) {
+ WARN_ON(!con->v2.out_enc_page_cnt);
+ ceph_release_page_vector(con->v2.out_enc_pages,
+ con->v2.out_enc_page_cnt);
+ con->v2.out_enc_pages = NULL;
+ con->v2.out_enc_page_cnt = 0;
+ }
+
+ con->v2.con_mode = CEPH_CON_MODE_UNKNOWN;
+
+ if (con->v2.hmac_tfm) {
+ crypto_free_shash(con->v2.hmac_tfm);
+ con->v2.hmac_tfm = NULL;
+ }
+ if (con->v2.gcm_req) {
+ aead_request_free(con->v2.gcm_req);
+ con->v2.gcm_req = NULL;
+ }
+ if (con->v2.gcm_tfm) {
+ crypto_free_aead(con->v2.gcm_tfm);
+ con->v2.gcm_tfm = NULL;
+ }
+}
diff --git a/net/ceph/mon_client.c b/net/ceph/mon_client.c
index c4cf2529d08b..b9d54ed9f338 100644
--- a/net/ceph/mon_client.c
+++ b/net/ceph/mon_client.c
@@ -36,57 +36,122 @@ static const struct ceph_connection_operations mon_con_ops;
static int __validate_auth(struct ceph_mon_client *monc);
+static int decode_mon_info(void **p, void *end, bool msgr2,
+ struct ceph_entity_addr *addr)
+{
+ void *mon_info_end;
+ u32 struct_len;
+ u8 struct_v;
+ int ret;
+
+ ret = ceph_start_decoding(p, end, 1, "mon_info_t", &struct_v,
+ &struct_len);
+ if (ret)
+ return ret;
+
+ mon_info_end = *p + struct_len;
+ ceph_decode_skip_string(p, end, e_inval); /* skip mon name */
+ ret = ceph_decode_entity_addrvec(p, end, msgr2, addr);
+ if (ret)
+ return ret;
+
+ *p = mon_info_end;
+ return 0;
+
+e_inval:
+ return -EINVAL;
+}
+
/*
* Decode a monmap blob (e.g., during mount).
+ *
+ * Assume MonMap v3 (i.e. encoding with MONNAMES and MONENC).
*/
-static struct ceph_monmap *ceph_monmap_decode(void *p, void *end)
+static struct ceph_monmap *ceph_monmap_decode(void **p, void *end, bool msgr2)
{
- struct ceph_monmap *m = NULL;
- int i, err = -EINVAL;
+ struct ceph_monmap *monmap = NULL;
struct ceph_fsid fsid;
- u32 epoch, num_mon;
- u32 len;
+ u32 struct_len;
+ int blob_len;
+ int num_mon;
+ u8 struct_v;
+ u32 epoch;
+ int ret;
+ int i;
+
+ ceph_decode_32_safe(p, end, blob_len, e_inval);
+ ceph_decode_need(p, end, blob_len, e_inval);
+
+ ret = ceph_start_decoding(p, end, 6, "monmap", &struct_v, &struct_len);
+ if (ret)
+ goto fail;
+
+ dout("%s struct_v %d\n", __func__, struct_v);
+ ceph_decode_copy_safe(p, end, &fsid, sizeof(fsid), e_inval);
+ ceph_decode_32_safe(p, end, epoch, e_inval);
+ if (struct_v >= 6) {
+ u32 feat_struct_len;
+ u8 feat_struct_v;
+
+ *p += sizeof(struct ceph_timespec); /* skip last_changed */
+ *p += sizeof(struct ceph_timespec); /* skip created */
- ceph_decode_32_safe(&p, end, len, bad);
- ceph_decode_need(&p, end, len, bad);
+ ret = ceph_start_decoding(p, end, 1, "mon_feature_t",
+ &feat_struct_v, &feat_struct_len);
+ if (ret)
+ goto fail;
- dout("monmap_decode %p %p len %d (%d)\n", p, end, len, (int)(end-p));
- p += sizeof(u16); /* skip version */
+ *p += feat_struct_len; /* skip persistent_features */
- ceph_decode_need(&p, end, sizeof(fsid) + 2*sizeof(u32), bad);
- ceph_decode_copy(&p, &fsid, sizeof(fsid));
- epoch = ceph_decode_32(&p);
+ ret = ceph_start_decoding(p, end, 1, "mon_feature_t",
+ &feat_struct_v, &feat_struct_len);
+ if (ret)
+ goto fail;
- num_mon = ceph_decode_32(&p);
+ *p += feat_struct_len; /* skip optional_features */
+ }
+ ceph_decode_32_safe(p, end, num_mon, e_inval);
+ dout("%s fsid %pU epoch %u num_mon %d\n", __func__, &fsid, epoch,
+ num_mon);
if (num_mon > CEPH_MAX_MON)
- goto bad;
- m = kmalloc(struct_size(m, mon_inst, num_mon), GFP_NOFS);
- if (m == NULL)
- return ERR_PTR(-ENOMEM);
- m->fsid = fsid;
- m->epoch = epoch;
- m->num_mon = num_mon;
- for (i = 0; i < num_mon; ++i) {
- struct ceph_entity_inst *inst = &m->mon_inst[i];
-
- /* copy name portion */
- ceph_decode_copy_safe(&p, end, &inst->name,
- sizeof(inst->name), bad);
- err = ceph_decode_entity_addr(&p, end, &inst->addr);
- if (err)
- goto bad;
+ goto e_inval;
+
+ monmap = kmalloc(struct_size(monmap, mon_inst, num_mon), GFP_NOIO);
+ if (!monmap) {
+ ret = -ENOMEM;
+ goto fail;
}
- dout("monmap_decode epoch %d, num_mon %d\n", m->epoch,
- m->num_mon);
- for (i = 0; i < m->num_mon; i++)
- dout("monmap_decode mon%d is %s\n", i,
- ceph_pr_addr(&m->mon_inst[i].addr));
- return m;
-bad:
- dout("monmap_decode failed with %d\n", err);
- kfree(m);
- return ERR_PTR(err);
+ monmap->fsid = fsid;
+ monmap->epoch = epoch;
+ monmap->num_mon = num_mon;
+
+ /* legacy_mon_addr map or mon_info map */
+ for (i = 0; i < num_mon; i++) {
+ struct ceph_entity_inst *inst = &monmap->mon_inst[i];
+
+ ceph_decode_skip_string(p, end, e_inval); /* skip mon name */
+ inst->name.type = CEPH_ENTITY_TYPE_MON;
+ inst->name.num = cpu_to_le64(i);
+
+ if (struct_v >= 6)
+ ret = decode_mon_info(p, end, msgr2, &inst->addr);
+ else
+ ret = ceph_decode_entity_addr(p, end, &inst->addr);
+ if (ret)
+ goto fail;
+
+ dout("%s mon%d addr %s\n", __func__, i,
+ ceph_pr_addr(&inst->addr));
+ }
+
+ return monmap;
+
+e_inval:
+ ret = -EINVAL;
+fail:
+ kfree(monmap);
+ return ERR_PTR(ret);
}
/*
@@ -96,9 +161,11 @@ int ceph_monmap_contains(struct ceph_monmap *m, struct ceph_entity_addr *addr)
{
int i;
- for (i = 0; i < m->num_mon; i++)
- if (memcmp(addr, &m->mon_inst[i].addr, sizeof(*addr)) == 0)
+ for (i = 0; i < m->num_mon; i++) {
+ if (ceph_addr_equal_no_type(addr, &m->mon_inst[i].addr))
return 1;
+ }
+
return 0;
}
@@ -190,10 +257,16 @@ static void __open_session(struct ceph_mon_client *monc)
&monc->monmap->mon_inst[monc->cur_mon].addr);
/*
- * send an initial keepalive to ensure our timestamp is valid
- * by the time we are in an OPENED state
+ * Queue a keepalive to ensure that in case of an early fault
+ * the messenger doesn't put us into STANDBY state and instead
+ * retries. This also ensures that our timestamp is valid by
+ * the time we finish hunting and delayed_work() checks it.
*/
ceph_con_keepalive(&monc->con);
+ if (ceph_msgr2(monc->client)) {
+ monc->pending_auth = 1;
+ return;
+ }
/* initiate authentication handshake */
ret = ceph_auth_build_hello(monc->auth,
@@ -476,7 +549,7 @@ static void ceph_monc_handle_map(struct ceph_mon_client *monc,
p = msg->front.iov_base;
end = p + msg->front.iov_len;
- monmap = ceph_monmap_decode(p, end);
+ monmap = ceph_monmap_decode(&p, end, ceph_msgr2(client));
if (IS_ERR(monmap)) {
pr_err("problem decoding monmap, %d\n",
(int)PTR_ERR(monmap));
@@ -1052,8 +1125,9 @@ static void delayed_work(struct work_struct *work)
*/
static int build_initial_monmap(struct ceph_mon_client *monc)
{
+ __le32 my_type = ceph_msgr2(monc->client) ?
+ CEPH_ENTITY_ADDR_TYPE_MSGR2 : CEPH_ENTITY_ADDR_TYPE_LEGACY;
struct ceph_options *opt = monc->client->options;
- struct ceph_entity_addr *mon_addr = opt->mon_addr;
int num_mon = opt->num_mon;
int i;
@@ -1062,12 +1136,16 @@ static int build_initial_monmap(struct ceph_mon_client *monc)
GFP_KERNEL);
if (!monc->monmap)
return -ENOMEM;
+
for (i = 0; i < num_mon; i++) {
- monc->monmap->mon_inst[i].addr = mon_addr[i];
- monc->monmap->mon_inst[i].addr.nonce = 0;
- monc->monmap->mon_inst[i].name.type =
- CEPH_ENTITY_TYPE_MON;
- monc->monmap->mon_inst[i].name.num = cpu_to_le64(i);
+ struct ceph_entity_inst *inst = &monc->monmap->mon_inst[i];
+
+ memcpy(&inst->addr.in_addr, &opt->mon_addr[i].in_addr,
+ sizeof(inst->addr.in_addr));
+ inst->addr.type = my_type;
+ inst->addr.nonce = 0;
+ inst->name.type = CEPH_ENTITY_TYPE_MON;
+ inst->name.num = cpu_to_le64(i);
}
monc->monmap->num_mon = num_mon;
return 0;
@@ -1089,8 +1167,8 @@ int ceph_monc_init(struct ceph_mon_client *monc, struct ceph_client *cl)
/* connection */
/* authentication */
- monc->auth = ceph_auth_init(cl->options->name,
- cl->options->key);
+ monc->auth = ceph_auth_init(cl->options->name, cl->options->key,
+ cl->options->con_modes);
if (IS_ERR(monc->auth)) {
err = PTR_ERR(monc->auth);
goto out_monmap;
@@ -1194,30 +1272,22 @@ static void finish_hunting(struct ceph_mon_client *monc)
}
}
-static void handle_auth_reply(struct ceph_mon_client *monc,
- struct ceph_msg *msg)
+static void finish_auth(struct ceph_mon_client *monc, int auth_err,
+ bool was_authed)
{
- int ret;
- int was_auth = 0;
+ dout("%s auth_err %d was_authed %d\n", __func__, auth_err, was_authed);
+ WARN_ON(auth_err > 0);
- mutex_lock(&monc->mutex);
- was_auth = ceph_auth_is_authenticated(monc->auth);
monc->pending_auth = 0;
- ret = ceph_handle_auth_reply(monc->auth, msg->front.iov_base,
- msg->front.iov_len,
- monc->m_auth->front.iov_base,
- monc->m_auth->front_alloc_len);
- if (ret > 0) {
- __send_prepared_auth_request(monc, ret);
- goto out;
+ if (auth_err) {
+ monc->client->auth_err = auth_err;
+ wake_up_all(&monc->client->auth_wq);
+ return;
}
- finish_hunting(monc);
-
- if (ret < 0) {
- monc->client->auth_err = ret;
- } else if (!was_auth && ceph_auth_is_authenticated(monc->auth)) {
- dout("authenticated, starting session\n");
+ if (!was_authed && ceph_auth_is_authenticated(monc->auth)) {
+ dout("%s authenticated, starting session global_id %llu\n",
+ __func__, monc->auth->global_id);
monc->client->msgr.inst.name.type = CEPH_ENTITY_TYPE_CLIENT;
monc->client->msgr.inst.name.num =
@@ -1229,11 +1299,27 @@ static void handle_auth_reply(struct ceph_mon_client *monc,
pr_info("mon%d %s session established\n", monc->cur_mon,
ceph_pr_addr(&monc->con.peer_addr));
}
+}
-out:
+static void handle_auth_reply(struct ceph_mon_client *monc,
+ struct ceph_msg *msg)
+{
+ bool was_authed;
+ int ret;
+
+ mutex_lock(&monc->mutex);
+ was_authed = ceph_auth_is_authenticated(monc->auth);
+ ret = ceph_handle_auth_reply(monc->auth, msg->front.iov_base,
+ msg->front.iov_len,
+ monc->m_auth->front.iov_base,
+ monc->m_auth->front_alloc_len);
+ if (ret > 0) {
+ __send_prepared_auth_request(monc, ret);
+ } else {
+ finish_auth(monc, ret, was_authed);
+ finish_hunting(monc);
+ }
mutex_unlock(&monc->mutex);
- if (monc->client->auth_err < 0)
- wake_up_all(&monc->client->auth_wq);
}
static int __validate_auth(struct ceph_mon_client *monc)
@@ -1262,6 +1348,88 @@ int ceph_monc_validate_auth(struct ceph_mon_client *monc)
}
EXPORT_SYMBOL(ceph_monc_validate_auth);
+static int mon_get_auth_request(struct ceph_connection *con,
+ void *buf, int *buf_len,
+ void **authorizer, int *authorizer_len)
+{
+ struct ceph_mon_client *monc = con->private;
+ int ret;
+
+ mutex_lock(&monc->mutex);
+ ret = ceph_auth_get_request(monc->auth, buf, *buf_len);
+ mutex_unlock(&monc->mutex);
+ if (ret < 0)
+ return ret;
+
+ *buf_len = ret;
+ *authorizer = NULL;
+ *authorizer_len = 0;
+ return 0;
+}
+
+static int mon_handle_auth_reply_more(struct ceph_connection *con,
+ void *reply, int reply_len,
+ void *buf, int *buf_len,
+ void **authorizer, int *authorizer_len)
+{
+ struct ceph_mon_client *monc = con->private;
+ int ret;
+
+ mutex_lock(&monc->mutex);
+ ret = ceph_auth_handle_reply_more(monc->auth, reply, reply_len,
+ buf, *buf_len);
+ mutex_unlock(&monc->mutex);
+ if (ret < 0)
+ return ret;
+
+ *buf_len = ret;
+ *authorizer = NULL;
+ *authorizer_len = 0;
+ return 0;
+}
+
+static int mon_handle_auth_done(struct ceph_connection *con,
+ u64 global_id, void *reply, int reply_len,
+ u8 *session_key, int *session_key_len,
+ u8 *con_secret, int *con_secret_len)
+{
+ struct ceph_mon_client *monc = con->private;
+ bool was_authed;
+ int ret;
+
+ mutex_lock(&monc->mutex);
+ WARN_ON(!monc->hunting);
+ was_authed = ceph_auth_is_authenticated(monc->auth);
+ ret = ceph_auth_handle_reply_done(monc->auth, global_id,
+ reply, reply_len,
+ session_key, session_key_len,
+ con_secret, con_secret_len);
+ finish_auth(monc, ret, was_authed);
+ if (!ret)
+ finish_hunting(monc);
+ mutex_unlock(&monc->mutex);
+ return 0;
+}
+
+static int mon_handle_auth_bad_method(struct ceph_connection *con,
+ int used_proto, int result,
+ const int *allowed_protos, int proto_cnt,
+ const int *allowed_modes, int mode_cnt)
+{
+ struct ceph_mon_client *monc = con->private;
+ bool was_authed;
+
+ mutex_lock(&monc->mutex);
+ WARN_ON(!monc->hunting);
+ was_authed = ceph_auth_is_authenticated(monc->auth);
+ ceph_auth_handle_bad_method(monc->auth, used_proto, result,
+ allowed_protos, proto_cnt,
+ allowed_modes, mode_cnt);
+ finish_auth(monc, -EACCES, was_authed);
+ mutex_unlock(&monc->mutex);
+ return 0;
+}
+
/*
* handle incoming message
*/
@@ -1412,4 +1580,8 @@ static const struct ceph_connection_operations mon_con_ops = {
.dispatch = dispatch,
.fault = mon_fault,
.alloc_msg = mon_alloc_msg,
+ .get_auth_request = mon_get_auth_request,
+ .handle_auth_reply_more = mon_handle_auth_reply_more,
+ .handle_auth_done = mon_handle_auth_done,
+ .handle_auth_bad_method = mon_handle_auth_bad_method,
};
diff --git a/net/ceph/osd_client.c b/net/ceph/osd_client.c
index 7901ab6c79fd..61229c5e22cb 100644
--- a/net/ceph/osd_client.c
+++ b/net/ceph/osd_client.c
@@ -3918,9 +3918,11 @@ static int handle_one_map(struct ceph_osd_client *osdc,
set_pool_was_full(osdc);
if (incremental)
- newmap = osdmap_apply_incremental(&p, end, osdc->osdmap);
+ newmap = osdmap_apply_incremental(&p, end,
+ ceph_msgr2(osdc->client),
+ osdc->osdmap);
else
- newmap = ceph_osdmap_decode(&p, end);
+ newmap = ceph_osdmap_decode(&p, end, ceph_msgr2(osdc->client));
if (IS_ERR(newmap))
return PTR_ERR(newmap);
@@ -5575,6 +5577,7 @@ static void put_osd_con(struct ceph_connection *con)
/*
* authentication
*/
+
/*
* Note: returned pointer is the address of a structure that's
* managed separately. Caller must *not* attempt to free it.
@@ -5586,23 +5589,12 @@ static struct ceph_auth_handshake *get_authorizer(struct ceph_connection *con,
struct ceph_osd_client *osdc = o->o_osdc;
struct ceph_auth_client *ac = osdc->client->monc.auth;
struct ceph_auth_handshake *auth = &o->o_auth;
+ int ret;
- if (force_new && auth->authorizer) {
- ceph_auth_destroy_authorizer(auth->authorizer);
- auth->authorizer = NULL;
- }
- if (!auth->authorizer) {
- int ret = ceph_auth_create_authorizer(ac, CEPH_ENTITY_TYPE_OSD,
- auth);
- if (ret)
- return ERR_PTR(ret);
- } else {
- int ret = ceph_auth_update_authorizer(ac, CEPH_ENTITY_TYPE_OSD,
- auth);
- if (ret)
- return ERR_PTR(ret);
- }
- *proto = ac->protocol;
+ ret = __ceph_auth_get_authorizer(ac, auth, CEPH_ENTITY_TYPE_OSD,
+ force_new, proto, NULL, NULL);
+ if (ret)
+ return ERR_PTR(ret);
return auth;
}
@@ -5623,8 +5615,11 @@ static int verify_authorizer_reply(struct ceph_connection *con)
struct ceph_osd *o = con->private;
struct ceph_osd_client *osdc = o->o_osdc;
struct ceph_auth_client *ac = osdc->client->monc.auth;
+ struct ceph_auth_handshake *auth = &o->o_auth;
- return ceph_auth_verify_authorizer_reply(ac, o->o_auth.authorizer);
+ return ceph_auth_verify_authorizer_reply(ac, auth->authorizer,
+ auth->authorizer_reply_buf, auth->authorizer_reply_buf_len,
+ NULL, NULL, NULL, NULL);
}
static int invalidate_authorizer(struct ceph_connection *con)
@@ -5637,6 +5632,80 @@ static int invalidate_authorizer(struct ceph_connection *con)
return ceph_monc_validate_auth(&osdc->client->monc);
}
+static int osd_get_auth_request(struct ceph_connection *con,
+ void *buf, int *buf_len,
+ void **authorizer, int *authorizer_len)
+{
+ struct ceph_osd *o = con->private;
+ struct ceph_auth_client *ac = o->o_osdc->client->monc.auth;
+ struct ceph_auth_handshake *auth = &o->o_auth;
+ int ret;
+
+ ret = ceph_auth_get_authorizer(ac, auth, CEPH_ENTITY_TYPE_OSD,
+ buf, buf_len);
+ if (ret)
+ return ret;
+
+ *authorizer = auth->authorizer_buf;
+ *authorizer_len = auth->authorizer_buf_len;
+ return 0;
+}
+
+static int osd_handle_auth_reply_more(struct ceph_connection *con,
+ void *reply, int reply_len,
+ void *buf, int *buf_len,
+ void **authorizer, int *authorizer_len)
+{
+ struct ceph_osd *o = con->private;
+ struct ceph_auth_client *ac = o->o_osdc->client->monc.auth;
+ struct ceph_auth_handshake *auth = &o->o_auth;
+ int ret;
+
+ ret = ceph_auth_handle_svc_reply_more(ac, auth, reply, reply_len,
+ buf, buf_len);
+ if (ret)
+ return ret;
+
+ *authorizer = auth->authorizer_buf;
+ *authorizer_len = auth->authorizer_buf_len;
+ return 0;
+}
+
+static int osd_handle_auth_done(struct ceph_connection *con,
+ u64 global_id, void *reply, int reply_len,
+ u8 *session_key, int *session_key_len,
+ u8 *con_secret, int *con_secret_len)
+{
+ struct ceph_osd *o = con->private;
+ struct ceph_auth_client *ac = o->o_osdc->client->monc.auth;
+ struct ceph_auth_handshake *auth = &o->o_auth;
+
+ return ceph_auth_handle_svc_reply_done(ac, auth, reply, reply_len,
+ session_key, session_key_len,
+ con_secret, con_secret_len);
+}
+
+static int osd_handle_auth_bad_method(struct ceph_connection *con,
+ int used_proto, int result,
+ const int *allowed_protos, int proto_cnt,
+ const int *allowed_modes, int mode_cnt)
+{
+ struct ceph_osd *o = con->private;
+ struct ceph_mon_client *monc = &o->o_osdc->client->monc;
+ int ret;
+
+ if (ceph_auth_handle_bad_authorizer(monc->auth, CEPH_ENTITY_TYPE_OSD,
+ used_proto, result,
+ allowed_protos, proto_cnt,
+ allowed_modes, mode_cnt)) {
+ ret = ceph_monc_validate_auth(monc);
+ if (ret)
+ return ret;
+ }
+
+ return -EACCES;
+}
+
static void osd_reencode_message(struct ceph_msg *msg)
{
int type = le16_to_cpu(msg->hdr.type);
@@ -5674,4 +5743,8 @@ static const struct ceph_connection_operations osd_con_ops = {
.sign_message = osd_sign_message,
.check_message_signature = osd_check_message_signature,
.fault = osd_fault,
+ .get_auth_request = osd_get_auth_request,
+ .handle_auth_reply_more = osd_handle_auth_reply_more,
+ .handle_auth_done = osd_handle_auth_done,
+ .handle_auth_bad_method = osd_handle_auth_bad_method,
};
diff --git a/net/ceph/osdmap.c b/net/ceph/osdmap.c
index fa08c15be0c0..2b1dd252f231 100644
--- a/net/ceph/osdmap.c
+++ b/net/ceph/osdmap.c
@@ -1647,7 +1647,8 @@ static int decode_old_pg_upmap_items(void **p, void *end,
/*
* decode a full map.
*/
-static int osdmap_decode(void **p, void *end, struct ceph_osdmap *map)
+static int osdmap_decode(void **p, void *end, bool msgr2,
+ struct ceph_osdmap *map)
{
u8 struct_v;
u32 epoch = 0;
@@ -1718,9 +1719,16 @@ static int osdmap_decode(void **p, void *end, struct ceph_osdmap *map)
goto e_inval;
for (i = 0; i < map->max_osd; i++) {
- err = ceph_decode_entity_addr(p, end, &map->osd_addr[i]);
+ struct ceph_entity_addr *addr = &map->osd_addr[i];
+
+ if (struct_v >= 8)
+ err = ceph_decode_entity_addrvec(p, end, msgr2, addr);
+ else
+ err = ceph_decode_entity_addr(p, end, addr);
if (err)
goto bad;
+
+ dout("%s osd%d addr %s\n", __func__, i, ceph_pr_addr(addr));
}
/* pg_temp */
@@ -1790,7 +1798,7 @@ bad:
/*
* Allocate and decode a full map.
*/
-struct ceph_osdmap *ceph_osdmap_decode(void **p, void *end)
+struct ceph_osdmap *ceph_osdmap_decode(void **p, void *end, bool msgr2)
{
struct ceph_osdmap *map;
int ret;
@@ -1799,7 +1807,7 @@ struct ceph_osdmap *ceph_osdmap_decode(void **p, void *end)
if (!map)
return ERR_PTR(-ENOMEM);
- ret = osdmap_decode(p, end, map);
+ ret = osdmap_decode(p, end, msgr2, map);
if (ret) {
ceph_osdmap_destroy(map);
return ERR_PTR(ret);
@@ -1817,12 +1825,13 @@ struct ceph_osdmap *ceph_osdmap_decode(void **p, void *end)
* new_state: { osd=6, xorstate=EXISTS } # clear osd_state
*/
static int decode_new_up_state_weight(void **p, void *end, u8 struct_v,
- struct ceph_osdmap *map)
+ bool msgr2, struct ceph_osdmap *map)
{
void *new_up_client;
void *new_state;
void *new_weight_end;
u32 len;
+ int ret;
int i;
new_up_client = *p;
@@ -1831,8 +1840,12 @@ static int decode_new_up_state_weight(void **p, void *end, u8 struct_v,
struct ceph_entity_addr addr;
ceph_decode_skip_32(p, end, e_inval);
- if (ceph_decode_entity_addr(p, end, &addr))
- goto e_inval;
+ if (struct_v >= 7)
+ ret = ceph_decode_entity_addrvec(p, end, msgr2, &addr);
+ else
+ ret = ceph_decode_entity_addr(p, end, &addr);
+ if (ret)
+ return ret;
}
new_state = *p;
@@ -1874,7 +1887,6 @@ static int decode_new_up_state_weight(void **p, void *end, u8 struct_v,
while (len--) {
s32 osd;
u32 xorstate;
- int ret;
osd = ceph_decode_32(p);
if (struct_v >= 5)
@@ -1910,8 +1922,15 @@ static int decode_new_up_state_weight(void **p, void *end, u8 struct_v,
osd = ceph_decode_32(p);
BUG_ON(osd >= map->max_osd);
- if (ceph_decode_entity_addr(p, end, &addr))
- goto e_inval;
+ if (struct_v >= 7)
+ ret = ceph_decode_entity_addrvec(p, end, msgr2, &addr);
+ else
+ ret = ceph_decode_entity_addr(p, end, &addr);
+ if (ret)
+ return ret;
+
+ dout("%s osd%d addr %s\n", __func__, osd, ceph_pr_addr(&addr));
+
pr_info("osd%d up\n", osd);
map->osd_state[osd] |= CEPH_OSD_EXISTS | CEPH_OSD_UP;
map->osd_addr[osd] = addr;
@@ -1927,7 +1946,7 @@ e_inval:
/*
* decode and apply an incremental map update.
*/
-struct ceph_osdmap *osdmap_apply_incremental(void **p, void *end,
+struct ceph_osdmap *osdmap_apply_incremental(void **p, void *end, bool msgr2,
struct ceph_osdmap *map)
{
struct ceph_fsid fsid;
@@ -1962,7 +1981,7 @@ struct ceph_osdmap *osdmap_apply_incremental(void **p, void *end,
if (len > 0) {
dout("apply_incremental full map len %d, %p to %p\n",
len, *p, end);
- return ceph_osdmap_decode(p, min(*p+len, end));
+ return ceph_osdmap_decode(p, min(*p+len, end), msgr2);
}
/* new crush? */
@@ -2014,7 +2033,7 @@ struct ceph_osdmap *osdmap_apply_incremental(void **p, void *end,
}
/* new_up_client, new_state, new_weight */
- err = decode_new_up_state_weight(p, end, struct_v, map);
+ err = decode_new_up_state_weight(p, end, struct_v, msgr2, map);
if (err)
goto bad;
diff --git a/net/dccp/ipv4.c b/net/dccp/ipv4.c
index b0b6e6a4784e..2455b0c0e486 100644
--- a/net/dccp/ipv4.c
+++ b/net/dccp/ipv4.c
@@ -464,7 +464,7 @@ static struct dst_entry* dccp_v4_route_skb(struct net *net, struct sock *sk,
.fl4_dport = dccp_hdr(skb)->dccph_sport,
};
- security_skb_classify_flow(skb, flowi4_to_flowi(&fl4));
+ security_skb_classify_flow(skb, flowi4_to_flowi_common(&fl4));
rt = ip_route_output_flow(net, &fl4, sk);
if (IS_ERR(rt)) {
IP_INC_STATS(net, IPSTATS_MIB_OUTNOROUTES);
diff --git a/net/dccp/ipv6.c b/net/dccp/ipv6.c
index 78ee1b5acf1f..1f73603913f5 100644
--- a/net/dccp/ipv6.c
+++ b/net/dccp/ipv6.c
@@ -203,7 +203,7 @@ static int dccp_v6_send_response(const struct sock *sk, struct request_sock *req
fl6.flowi6_oif = ireq->ir_iif;
fl6.fl6_dport = ireq->ir_rmt_port;
fl6.fl6_sport = htons(ireq->ir_num);
- security_req_classify_flow(req, flowi6_to_flowi(&fl6));
+ security_req_classify_flow(req, flowi6_to_flowi_common(&fl6));
rcu_read_lock();
@@ -279,7 +279,7 @@ static void dccp_v6_ctl_send_reset(const struct sock *sk, struct sk_buff *rxskb)
fl6.flowi6_oif = inet6_iif(rxskb);
fl6.fl6_dport = dccp_hdr(skb)->dccph_dport;
fl6.fl6_sport = dccp_hdr(skb)->dccph_sport;
- security_skb_classify_flow(rxskb, flowi6_to_flowi(&fl6));
+ security_skb_classify_flow(rxskb, flowi6_to_flowi_common(&fl6));
/* sk = NULL, but it is safe for now. RST socket required. */
dst = ip6_dst_lookup_flow(sock_net(ctl_sk), ctl_sk, &fl6, NULL);
@@ -907,7 +907,7 @@ static int dccp_v6_connect(struct sock *sk, struct sockaddr *uaddr,
fl6.flowi6_oif = sk->sk_bound_dev_if;
fl6.fl6_dport = usin->sin6_port;
fl6.fl6_sport = inet->inet_sport;
- security_sk_classify_flow(sk, flowi6_to_flowi(&fl6));
+ security_sk_classify_flow(sk, flowi6_to_flowi_common(&fl6));
opt = rcu_dereference_protected(np->opt, lockdep_sock_is_held(sk));
final_p = fl6_update_dst(&fl6, opt, &final);
diff --git a/net/ipv4/icmp.c b/net/ipv4/icmp.c
index 005faea415a4..396b492c804f 100644
--- a/net/ipv4/icmp.c
+++ b/net/ipv4/icmp.c
@@ -447,7 +447,7 @@ static void icmp_reply(struct icmp_bxm *icmp_param, struct sk_buff *skb)
fl4.flowi4_tos = RT_TOS(ip_hdr(skb)->tos);
fl4.flowi4_proto = IPPROTO_ICMP;
fl4.flowi4_oif = l3mdev_master_ifindex(skb->dev);
- security_skb_classify_flow(skb, flowi4_to_flowi(&fl4));
+ security_skb_classify_flow(skb, flowi4_to_flowi_common(&fl4));
rt = ip_route_output_key(net, &fl4);
if (IS_ERR(rt))
goto out_unlock;
@@ -503,7 +503,7 @@ static struct rtable *icmp_route_lookup(struct net *net,
route_lookup_dev = icmp_get_route_lookup_dev(skb_in);
fl4->flowi4_oif = l3mdev_master_ifindex(route_lookup_dev);
- security_skb_classify_flow(skb_in, flowi4_to_flowi(fl4));
+ security_skb_classify_flow(skb_in, flowi4_to_flowi_common(fl4));
rt = ip_route_output_key_hash(net, fl4, skb_in);
if (IS_ERR(rt))
return rt;
diff --git a/net/ipv4/inet_connection_sock.c b/net/ipv4/inet_connection_sock.c
index f60869acbef0..fd8b8800a2c3 100644
--- a/net/ipv4/inet_connection_sock.c
+++ b/net/ipv4/inet_connection_sock.c
@@ -602,7 +602,7 @@ struct dst_entry *inet_csk_route_req(const struct sock *sk,
(opt && opt->opt.srr) ? opt->opt.faddr : ireq->ir_rmt_addr,
ireq->ir_loc_addr, ireq->ir_rmt_port,
htons(ireq->ir_num), sk->sk_uid);
- security_req_classify_flow(req, flowi4_to_flowi(fl4));
+ security_req_classify_flow(req, flowi4_to_flowi_common(fl4));
rt = ip_route_output_flow(net, fl4, sk);
if (IS_ERR(rt))
goto no_route;
@@ -640,7 +640,7 @@ struct dst_entry *inet_csk_route_child_sock(const struct sock *sk,
(opt && opt->opt.srr) ? opt->opt.faddr : ireq->ir_rmt_addr,
ireq->ir_loc_addr, ireq->ir_rmt_port,
htons(ireq->ir_num), sk->sk_uid);
- security_req_classify_flow(req, flowi4_to_flowi(fl4));
+ security_req_classify_flow(req, flowi4_to_flowi_common(fl4));
rt = ip_route_output_flow(net, fl4, sk);
if (IS_ERR(rt))
goto no_route;
diff --git a/net/ipv4/ip_output.c b/net/ipv4/ip_output.c
index 879b76ae4435..89fff5f59eea 100644
--- a/net/ipv4/ip_output.c
+++ b/net/ipv4/ip_output.c
@@ -1700,7 +1700,7 @@ void ip_send_unicast_reply(struct sock *sk, struct sk_buff *skb,
daddr, saddr,
tcp_hdr(skb)->source, tcp_hdr(skb)->dest,
arg->uid);
- security_skb_classify_flow(skb, flowi4_to_flowi(&fl4));
+ security_skb_classify_flow(skb, flowi4_to_flowi_common(&fl4));
rt = ip_route_output_key(net, &fl4);
if (IS_ERR(rt))
return;
diff --git a/net/ipv4/ping.c b/net/ipv4/ping.c
index 248856b301c4..8b943f85fff9 100644
--- a/net/ipv4/ping.c
+++ b/net/ipv4/ping.c
@@ -778,7 +778,7 @@ static int ping_v4_sendmsg(struct sock *sk, struct msghdr *msg, size_t len)
fl4.fl4_icmp_type = user_icmph.type;
fl4.fl4_icmp_code = user_icmph.code;
- security_sk_classify_flow(sk, flowi4_to_flowi(&fl4));
+ security_sk_classify_flow(sk, flowi4_to_flowi_common(&fl4));
rt = ip_route_output_flow(net, &fl4, sk);
if (IS_ERR(rt)) {
err = PTR_ERR(rt);
diff --git a/net/ipv4/raw.c b/net/ipv4/raw.c
index 7d26e0f8bdae..50a73178d63a 100644
--- a/net/ipv4/raw.c
+++ b/net/ipv4/raw.c
@@ -640,7 +640,7 @@ static int raw_sendmsg(struct sock *sk, struct msghdr *msg, size_t len)
goto done;
}
- security_sk_classify_flow(sk, flowi4_to_flowi(&fl4));
+ security_sk_classify_flow(sk, flowi4_to_flowi_common(&fl4));
rt = ip_route_output_flow(net, &fl4, sk);
if (IS_ERR(rt)) {
err = PTR_ERR(rt);
diff --git a/net/ipv4/syncookies.c b/net/ipv4/syncookies.c
index 00dc3f943c80..33792cf55a79 100644
--- a/net/ipv4/syncookies.c
+++ b/net/ipv4/syncookies.c
@@ -418,7 +418,7 @@ struct sock *cookie_v4_check(struct sock *sk, struct sk_buff *skb)
inet_sk_flowi_flags(sk),
opt->srr ? opt->faddr : ireq->ir_rmt_addr,
ireq->ir_loc_addr, th->source, th->dest, sk->sk_uid);
- security_req_classify_flow(req, flowi4_to_flowi(&fl4));
+ security_req_classify_flow(req, flowi4_to_flowi_common(&fl4));
rt = ip_route_output_key(sock_net(sk), &fl4);
if (IS_ERR(rt)) {
reqsk_free(req);
diff --git a/net/ipv4/udp.c b/net/ipv4/udp.c
index dece195f212c..7103b0a89756 100644
--- a/net/ipv4/udp.c
+++ b/net/ipv4/udp.c
@@ -1196,7 +1196,7 @@ int udp_sendmsg(struct sock *sk, struct msghdr *msg, size_t len)
faddr, saddr, dport, inet->inet_sport,
sk->sk_uid);
- security_sk_classify_flow(sk, flowi4_to_flowi(fl4));
+ security_sk_classify_flow(sk, flowi4_to_flowi_common(fl4));
rt = ip_route_output_flow(net, fl4, sk);
if (IS_ERR(rt)) {
err = PTR_ERR(rt);
diff --git a/net/ipv6/af_inet6.c b/net/ipv6/af_inet6.c
index a7e3d170af51..8e9c3e9ea36e 100644
--- a/net/ipv6/af_inet6.c
+++ b/net/ipv6/af_inet6.c
@@ -819,7 +819,7 @@ int inet6_sk_rebuild_header(struct sock *sk)
fl6.fl6_dport = inet->inet_dport;
fl6.fl6_sport = inet->inet_sport;
fl6.flowi6_uid = sk->sk_uid;
- security_sk_classify_flow(sk, flowi6_to_flowi(&fl6));
+ security_sk_classify_flow(sk, flowi6_to_flowi_common(&fl6));
rcu_read_lock();
final_p = fl6_update_dst(&fl6, rcu_dereference(np->opt),
diff --git a/net/ipv6/datagram.c b/net/ipv6/datagram.c
index cc8ad7ddecda..206f66310a88 100644
--- a/net/ipv6/datagram.c
+++ b/net/ipv6/datagram.c
@@ -60,7 +60,7 @@ static void ip6_datagram_flow_key_init(struct flowi6 *fl6, struct sock *sk)
if (!fl6->flowi6_oif && ipv6_addr_is_multicast(&fl6->daddr))
fl6->flowi6_oif = np->mcast_oif;
- security_sk_classify_flow(sk, flowi6_to_flowi(fl6));
+ security_sk_classify_flow(sk, flowi6_to_flowi_common(fl6));
}
int ip6_datagram_dst_update(struct sock *sk, bool fix_sk_saddr)
diff --git a/net/ipv6/icmp.c b/net/ipv6/icmp.c
index 8956144ea65e..f3d05866692e 100644
--- a/net/ipv6/icmp.c
+++ b/net/ipv6/icmp.c
@@ -573,7 +573,7 @@ void icmp6_send(struct sk_buff *skb, u8 type, u8 code, __u32 info,
fl6.fl6_icmp_code = code;
fl6.flowi6_uid = sock_net_uid(net, NULL);
fl6.mp_hash = rt6_multipath_hash(net, &fl6, skb, NULL);
- security_skb_classify_flow(skb, flowi6_to_flowi(&fl6));
+ security_skb_classify_flow(skb, flowi6_to_flowi_common(&fl6));
np = inet6_sk(sk);
@@ -755,7 +755,7 @@ static void icmpv6_echo_reply(struct sk_buff *skb)
fl6.fl6_icmp_type = ICMPV6_ECHO_REPLY;
fl6.flowi6_mark = mark;
fl6.flowi6_uid = sock_net_uid(net, NULL);
- security_skb_classify_flow(skb, flowi6_to_flowi(&fl6));
+ security_skb_classify_flow(skb, flowi6_to_flowi_common(&fl6));
local_bh_disable();
sk = icmpv6_xmit_lock(net);
@@ -1008,7 +1008,7 @@ void icmpv6_flow_init(struct sock *sk, struct flowi6 *fl6,
fl6->fl6_icmp_type = type;
fl6->fl6_icmp_code = 0;
fl6->flowi6_oif = oif;
- security_sk_classify_flow(sk, flowi6_to_flowi(fl6));
+ security_sk_classify_flow(sk, flowi6_to_flowi_common(fl6));
}
static void __net_exit icmpv6_sk_exit(struct net *net)
diff --git a/net/ipv6/inet6_connection_sock.c b/net/ipv6/inet6_connection_sock.c
index e315526fa244..5a9f4d722f35 100644
--- a/net/ipv6/inet6_connection_sock.c
+++ b/net/ipv6/inet6_connection_sock.c
@@ -46,7 +46,7 @@ struct dst_entry *inet6_csk_route_req(const struct sock *sk,
fl6->fl6_dport = ireq->ir_rmt_port;
fl6->fl6_sport = htons(ireq->ir_num);
fl6->flowi6_uid = sk->sk_uid;
- security_req_classify_flow(req, flowi6_to_flowi(fl6));
+ security_req_classify_flow(req, flowi6_to_flowi_common(fl6));
dst = ip6_dst_lookup_flow(sock_net(sk), sk, fl6, final_p);
if (IS_ERR(dst))
@@ -95,7 +95,7 @@ static struct dst_entry *inet6_csk_route_socket(struct sock *sk,
fl6->fl6_sport = inet->inet_sport;
fl6->fl6_dport = inet->inet_dport;
fl6->flowi6_uid = sk->sk_uid;
- security_sk_classify_flow(sk, flowi6_to_flowi(fl6));
+ security_sk_classify_flow(sk, flowi6_to_flowi_common(fl6));
rcu_read_lock();
final_p = fl6_update_dst(fl6, rcu_dereference(np->opt), &final);
diff --git a/net/ipv6/netfilter/nf_reject_ipv6.c b/net/ipv6/netfilter/nf_reject_ipv6.c
index 570d1d76c44d..dffeaaaadcde 100644
--- a/net/ipv6/netfilter/nf_reject_ipv6.c
+++ b/net/ipv6/netfilter/nf_reject_ipv6.c
@@ -314,7 +314,7 @@ void nf_send_reset6(struct net *net, struct sock *sk, struct sk_buff *oldskb,
fl6.flowi6_oif = l3mdev_master_ifindex(skb_dst(oldskb)->dev);
fl6.flowi6_mark = IP6_REPLY_MARK(net, oldskb->mark);
- security_skb_classify_flow(oldskb, flowi6_to_flowi(&fl6));
+ security_skb_classify_flow(oldskb, flowi6_to_flowi_common(&fl6));
dst = ip6_route_output(net, NULL, &fl6);
if (dst->error) {
dst_release(dst);
diff --git a/net/ipv6/ping.c b/net/ipv6/ping.c
index 6caa062f68e7..6ac88fe24a8e 100644
--- a/net/ipv6/ping.c
+++ b/net/ipv6/ping.c
@@ -111,7 +111,7 @@ static int ping_v6_sendmsg(struct sock *sk, struct msghdr *msg, size_t len)
fl6.flowi6_uid = sk->sk_uid;
fl6.fl6_icmp_type = user_icmph.icmp6_type;
fl6.fl6_icmp_code = user_icmph.icmp6_code;
- security_sk_classify_flow(sk, flowi6_to_flowi(&fl6));
+ security_sk_classify_flow(sk, flowi6_to_flowi_common(&fl6));
ipcm6_init_sk(&ipc6, np);
ipc6.sockc.mark = sk->sk_mark;
diff --git a/net/ipv6/raw.c b/net/ipv6/raw.c
index 6e4ab80a3b94..1f56d9aae589 100644
--- a/net/ipv6/raw.c
+++ b/net/ipv6/raw.c
@@ -915,7 +915,7 @@ static int rawv6_sendmsg(struct sock *sk, struct msghdr *msg, size_t len)
fl6.flowi6_oif = np->mcast_oif;
else if (!fl6.flowi6_oif)
fl6.flowi6_oif = np->ucast_oif;
- security_sk_classify_flow(sk, flowi6_to_flowi(&fl6));
+ security_sk_classify_flow(sk, flowi6_to_flowi_common(&fl6));
if (hdrincl)
fl6.flowi6_flags |= FLOWI_FLAG_KNOWN_NH;
diff --git a/net/ipv6/syncookies.c b/net/ipv6/syncookies.c
index 9b6cae1e49d9..e8cfb9e997bf 100644
--- a/net/ipv6/syncookies.c
+++ b/net/ipv6/syncookies.c
@@ -233,7 +233,7 @@ struct sock *cookie_v6_check(struct sock *sk, struct sk_buff *skb)
fl6.fl6_dport = ireq->ir_rmt_port;
fl6.fl6_sport = inet_sk(sk)->inet_sport;
fl6.flowi6_uid = sk->sk_uid;
- security_req_classify_flow(req, flowi6_to_flowi(&fl6));
+ security_req_classify_flow(req, flowi6_to_flowi_common(&fl6));
dst = ip6_dst_lookup_flow(sock_net(sk), sk, &fl6, final_p);
if (IS_ERR(dst))
diff --git a/net/ipv6/tcp_ipv6.c b/net/ipv6/tcp_ipv6.c
index e254569a3005..0e1509b02cb3 100644
--- a/net/ipv6/tcp_ipv6.c
+++ b/net/ipv6/tcp_ipv6.c
@@ -278,7 +278,7 @@ static int tcp_v6_connect(struct sock *sk, struct sockaddr *uaddr,
opt = rcu_dereference_protected(np->opt, lockdep_sock_is_held(sk));
final_p = fl6_update_dst(&fl6, opt, &final);
- security_sk_classify_flow(sk, flowi6_to_flowi(&fl6));
+ security_sk_classify_flow(sk, flowi6_to_flowi_common(&fl6));
dst = ip6_dst_lookup_flow(sock_net(sk), sk, &fl6, final_p);
if (IS_ERR(dst)) {
@@ -965,7 +965,7 @@ static void tcp_v6_send_response(const struct sock *sk, struct sk_buff *skb, u32
fl6.fl6_dport = t1->dest;
fl6.fl6_sport = t1->source;
fl6.flowi6_uid = sock_net_uid(net, sk && sk_fullsock(sk) ? sk : NULL);
- security_skb_classify_flow(skb, flowi6_to_flowi(&fl6));
+ security_skb_classify_flow(skb, flowi6_to_flowi_common(&fl6));
/* Pass a socket to ip6_dst_lookup either it is for RST
* Underlying function will use this to retrieve the network
diff --git a/net/ipv6/udp.c b/net/ipv6/udp.c
index 9008f5796ad4..b9f3dfdd2383 100644
--- a/net/ipv6/udp.c
+++ b/net/ipv6/udp.c
@@ -1498,7 +1498,7 @@ do_udp_sendmsg:
} else if (!fl6.flowi6_oif)
fl6.flowi6_oif = np->ucast_oif;
- security_sk_classify_flow(sk, flowi6_to_flowi(&fl6));
+ security_sk_classify_flow(sk, flowi6_to_flowi_common(&fl6));
if (ipc6.tclass < 0)
ipc6.tclass = np->tclass;
diff --git a/net/l2tp/l2tp_ip6.c b/net/l2tp/l2tp_ip6.c
index e5e5036257b0..96f975777438 100644
--- a/net/l2tp/l2tp_ip6.c
+++ b/net/l2tp/l2tp_ip6.c
@@ -606,7 +606,7 @@ static int l2tp_ip6_sendmsg(struct sock *sk, struct msghdr *msg, size_t len)
else if (!fl6.flowi6_oif)
fl6.flowi6_oif = np->ucast_oif;
- security_sk_classify_flow(sk, flowi6_to_flowi(&fl6));
+ security_sk_classify_flow(sk, flowi6_to_flowi_common(&fl6));
if (ipc6.tclass < 0)
ipc6.tclass = np->tclass;
diff --git a/net/netfilter/nf_synproxy_core.c b/net/netfilter/nf_synproxy_core.c
index d7d34a62d3bf..b100c04a0e43 100644
--- a/net/netfilter/nf_synproxy_core.c
+++ b/net/netfilter/nf_synproxy_core.c
@@ -849,7 +849,7 @@ synproxy_send_tcp_ipv6(struct net *net,
fl6.fl6_sport = nth->source;
fl6.fl6_dport = nth->dest;
security_skb_classify_flow((struct sk_buff *)skb,
- flowi6_to_flowi(&fl6));
+ flowi6_to_flowi_common(&fl6));
err = nf_ip6_route(net, &dst, flowi6_to_flowi(&fl6), false);
if (err) {
goto free_nskb;
diff --git a/net/rds/ib.c b/net/rds/ib.c
index deecbdcdae84..24c9a9005a6f 100644
--- a/net/rds/ib.c
+++ b/net/rds/ib.c
@@ -30,7 +30,6 @@
* SOFTWARE.
*
*/
-#include <linux/dmapool.h>
#include <linux/kernel.h>
#include <linux/in.h>
#include <linux/if.h>
@@ -108,7 +107,6 @@ static void rds_ib_dev_free(struct work_struct *work)
rds_ib_destroy_mr_pool(rds_ibdev->mr_1m_pool);
if (rds_ibdev->pd)
ib_dealloc_pd(rds_ibdev->pd);
- dma_pool_destroy(rds_ibdev->rid_hdrs_pool);
list_for_each_entry_safe(i_ipaddr, i_next, &rds_ibdev->ipaddr_list, list) {
list_del(&i_ipaddr->list);
@@ -191,14 +189,6 @@ static int rds_ib_add_one(struct ib_device *device)
rds_ibdev->pd = NULL;
goto put_dev;
}
- rds_ibdev->rid_hdrs_pool = dma_pool_create(device->name,
- device->dma_device,
- sizeof(struct rds_header),
- L1_CACHE_BYTES, 0);
- if (!rds_ibdev->rid_hdrs_pool) {
- ret = -ENOMEM;
- goto put_dev;
- }
rds_ibdev->mr_1m_pool =
rds_ib_create_mr_pool(rds_ibdev, RDS_IB_MR_1M_POOL);
diff --git a/net/rds/ib.h b/net/rds/ib.h
index 8dfff43cf07f..2ba71102b1f1 100644
--- a/net/rds/ib.h
+++ b/net/rds/ib.h
@@ -246,7 +246,6 @@ struct rds_ib_device {
struct list_head conn_list;
struct ib_device *dev;
struct ib_pd *pd;
- struct dma_pool *rid_hdrs_pool; /* RDS headers DMA pool */
u8 odp_capable:1;
unsigned int max_mrs;
@@ -264,13 +263,6 @@ struct rds_ib_device {
int *vector_load;
};
-static inline int ibdev_to_node(struct ib_device *ibdev)
-{
- struct device *parent;
-
- parent = ibdev->dev.parent;
- return parent ? dev_to_node(parent) : NUMA_NO_NODE;
-}
#define rdsibdev_to_node(rdsibdev) ibdev_to_node(rdsibdev->dev)
/* bits for i_ack_flags */
@@ -387,11 +379,6 @@ int rds_ib_cm_handle_connect(struct rdma_cm_id *cm_id,
int rds_ib_cm_initiate_connect(struct rdma_cm_id *cm_id, bool isv6);
void rds_ib_cm_connect_complete(struct rds_connection *conn,
struct rdma_cm_event *event);
-struct rds_header **rds_dma_hdrs_alloc(struct ib_device *ibdev,
- struct dma_pool *pool,
- dma_addr_t **dma_addrs, u32 num_hdrs);
-void rds_dma_hdrs_free(struct dma_pool *pool, struct rds_header **hdrs,
- dma_addr_t *dma_addrs, u32 num_hdrs);
#define rds_ib_conn_error(conn, fmt...) \
__rds_ib_conn_error(conn, KERN_WARNING "RDS/IB: " fmt)
diff --git a/net/rds/ib_cm.c b/net/rds/ib_cm.c
index b36b60668b1d..f5cbe963cd8f 100644
--- a/net/rds/ib_cm.c
+++ b/net/rds/ib_cm.c
@@ -30,7 +30,6 @@
* SOFTWARE.
*
*/
-#include <linux/dmapool.h>
#include <linux/kernel.h>
#include <linux/in.h>
#include <linux/slab.h>
@@ -441,42 +440,87 @@ static inline void ibdev_put_vector(struct rds_ib_device *rds_ibdev, int index)
rds_ibdev->vector_load[index]--;
}
+static void rds_dma_hdr_free(struct ib_device *dev, struct rds_header *hdr,
+ dma_addr_t dma_addr, enum dma_data_direction dir)
+{
+ ib_dma_unmap_single(dev, dma_addr, sizeof(*hdr), dir);
+ kfree(hdr);
+}
+
+static struct rds_header *rds_dma_hdr_alloc(struct ib_device *dev,
+ dma_addr_t *dma_addr, enum dma_data_direction dir)
+{
+ struct rds_header *hdr;
+
+ hdr = kzalloc_node(sizeof(*hdr), GFP_KERNEL, ibdev_to_node(dev));
+ if (!hdr)
+ return NULL;
+
+ *dma_addr = ib_dma_map_single(dev, hdr, sizeof(*hdr),
+ DMA_BIDIRECTIONAL);
+ if (ib_dma_mapping_error(dev, *dma_addr)) {
+ kfree(hdr);
+ return NULL;
+ }
+
+ return hdr;
+}
+
+/* Free the DMA memory used to store struct rds_header.
+ *
+ * @dev: the RDS IB device
+ * @hdrs: pointer to the array storing DMA memory pointers
+ * @dma_addrs: pointer to the array storing DMA addresses
+ * @num_hdars: number of headers to free.
+ */
+static void rds_dma_hdrs_free(struct rds_ib_device *dev,
+ struct rds_header **hdrs, dma_addr_t *dma_addrs, u32 num_hdrs,
+ enum dma_data_direction dir)
+{
+ u32 i;
+
+ for (i = 0; i < num_hdrs; i++)
+ rds_dma_hdr_free(dev->dev, hdrs[i], dma_addrs[i], dir);
+ kvfree(hdrs);
+ kvfree(dma_addrs);
+}
+
+
/* Allocate DMA coherent memory to be used to store struct rds_header for
* sending/receiving packets. The pointers to the DMA memory and the
* associated DMA addresses are stored in two arrays.
*
- * @ibdev: the IB device
- * @pool: the DMA memory pool
+ * @dev: the RDS IB device
* @dma_addrs: pointer to the array for storing DMA addresses
* @num_hdrs: number of headers to allocate
*
* It returns the pointer to the array storing the DMA memory pointers. On
* error, NULL pointer is returned.
*/
-struct rds_header **rds_dma_hdrs_alloc(struct ib_device *ibdev,
- struct dma_pool *pool,
- dma_addr_t **dma_addrs, u32 num_hdrs)
+static struct rds_header **rds_dma_hdrs_alloc(struct rds_ib_device *dev,
+ dma_addr_t **dma_addrs, u32 num_hdrs,
+ enum dma_data_direction dir)
{
struct rds_header **hdrs;
dma_addr_t *hdr_daddrs;
u32 i;
hdrs = kvmalloc_node(sizeof(*hdrs) * num_hdrs, GFP_KERNEL,
- ibdev_to_node(ibdev));
+ ibdev_to_node(dev->dev));
if (!hdrs)
return NULL;
hdr_daddrs = kvmalloc_node(sizeof(*hdr_daddrs) * num_hdrs, GFP_KERNEL,
- ibdev_to_node(ibdev));
+ ibdev_to_node(dev->dev));
if (!hdr_daddrs) {
kvfree(hdrs);
return NULL;
}
for (i = 0; i < num_hdrs; i++) {
- hdrs[i] = dma_pool_zalloc(pool, GFP_KERNEL, &hdr_daddrs[i]);
+ hdrs[i] = rds_dma_hdr_alloc(dev->dev, &hdr_daddrs[i], dir);
if (!hdrs[i]) {
- rds_dma_hdrs_free(pool, hdrs, hdr_daddrs, i);
+ rds_dma_hdrs_free(dev, hdrs, hdr_daddrs, i, dir);
return NULL;
}
}
@@ -485,24 +529,6 @@ struct rds_header **rds_dma_hdrs_alloc(struct ib_device *ibdev,
return hdrs;
}
-/* Free the DMA memory used to store struct rds_header.
- *
- * @pool: the DMA memory pool
- * @hdrs: pointer to the array storing DMA memory pointers
- * @dma_addrs: pointer to the array storing DMA addresses
- * @num_hdars: number of headers to free.
- */
-void rds_dma_hdrs_free(struct dma_pool *pool, struct rds_header **hdrs,
- dma_addr_t *dma_addrs, u32 num_hdrs)
-{
- u32 i;
-
- for (i = 0; i < num_hdrs; i++)
- dma_pool_free(pool, hdrs[i], dma_addrs[i]);
- kvfree(hdrs);
- kvfree(dma_addrs);
-}
-
/*
* This needs to be very careful to not leave IS_ERR pointers around for
* cleanup to trip over.
@@ -516,7 +542,6 @@ static int rds_ib_setup_qp(struct rds_connection *conn)
struct rds_ib_device *rds_ibdev;
unsigned long max_wrs;
int ret, fr_queue_space;
- struct dma_pool *pool;
/*
* It's normal to see a null device if an incoming connection races
@@ -612,25 +637,26 @@ static int rds_ib_setup_qp(struct rds_connection *conn)
goto recv_cq_out;
}
- pool = rds_ibdev->rid_hdrs_pool;
- ic->i_send_hdrs = rds_dma_hdrs_alloc(dev, pool, &ic->i_send_hdrs_dma,
- ic->i_send_ring.w_nr);
+ ic->i_send_hdrs = rds_dma_hdrs_alloc(rds_ibdev, &ic->i_send_hdrs_dma,
+ ic->i_send_ring.w_nr,
+ DMA_TO_DEVICE);
if (!ic->i_send_hdrs) {
ret = -ENOMEM;
rdsdebug("DMA send hdrs alloc failed\n");
goto qp_out;
}
- ic->i_recv_hdrs = rds_dma_hdrs_alloc(dev, pool, &ic->i_recv_hdrs_dma,
- ic->i_recv_ring.w_nr);
+ ic->i_recv_hdrs = rds_dma_hdrs_alloc(rds_ibdev, &ic->i_recv_hdrs_dma,
+ ic->i_recv_ring.w_nr,
+ DMA_FROM_DEVICE);
if (!ic->i_recv_hdrs) {
ret = -ENOMEM;
rdsdebug("DMA recv hdrs alloc failed\n");
goto send_hdrs_dma_out;
}
- ic->i_ack = dma_pool_zalloc(pool, GFP_KERNEL,
- &ic->i_ack_dma);
+ ic->i_ack = rds_dma_hdr_alloc(rds_ibdev->dev, &ic->i_ack_dma,
+ DMA_TO_DEVICE);
if (!ic->i_ack) {
ret = -ENOMEM;
rdsdebug("DMA ack header alloc failed\n");
@@ -666,18 +692,19 @@ sends_out:
vfree(ic->i_sends);
ack_dma_out:
- dma_pool_free(pool, ic->i_ack, ic->i_ack_dma);
+ rds_dma_hdr_free(rds_ibdev->dev, ic->i_ack, ic->i_ack_dma,
+ DMA_TO_DEVICE);
ic->i_ack = NULL;
recv_hdrs_dma_out:
- rds_dma_hdrs_free(pool, ic->i_recv_hdrs, ic->i_recv_hdrs_dma,
- ic->i_recv_ring.w_nr);
+ rds_dma_hdrs_free(rds_ibdev, ic->i_recv_hdrs, ic->i_recv_hdrs_dma,
+ ic->i_recv_ring.w_nr, DMA_FROM_DEVICE);
ic->i_recv_hdrs = NULL;
ic->i_recv_hdrs_dma = NULL;
send_hdrs_dma_out:
- rds_dma_hdrs_free(pool, ic->i_send_hdrs, ic->i_send_hdrs_dma,
- ic->i_send_ring.w_nr);
+ rds_dma_hdrs_free(rds_ibdev, ic->i_send_hdrs, ic->i_send_hdrs_dma,
+ ic->i_send_ring.w_nr, DMA_TO_DEVICE);
ic->i_send_hdrs = NULL;
ic->i_send_hdrs_dma = NULL;
@@ -1110,29 +1137,30 @@ void rds_ib_conn_path_shutdown(struct rds_conn_path *cp)
}
if (ic->rds_ibdev) {
- struct dma_pool *pool;
-
- pool = ic->rds_ibdev->rid_hdrs_pool;
-
/* then free the resources that ib callbacks use */
if (ic->i_send_hdrs) {
- rds_dma_hdrs_free(pool, ic->i_send_hdrs,
+ rds_dma_hdrs_free(ic->rds_ibdev,
+ ic->i_send_hdrs,
ic->i_send_hdrs_dma,
- ic->i_send_ring.w_nr);
+ ic->i_send_ring.w_nr,
+ DMA_TO_DEVICE);
ic->i_send_hdrs = NULL;
ic->i_send_hdrs_dma = NULL;
}
if (ic->i_recv_hdrs) {
- rds_dma_hdrs_free(pool, ic->i_recv_hdrs,
+ rds_dma_hdrs_free(ic->rds_ibdev,
+ ic->i_recv_hdrs,
ic->i_recv_hdrs_dma,
- ic->i_recv_ring.w_nr);
+ ic->i_recv_ring.w_nr,
+ DMA_FROM_DEVICE);
ic->i_recv_hdrs = NULL;
ic->i_recv_hdrs_dma = NULL;
}
if (ic->i_ack) {
- dma_pool_free(pool, ic->i_ack, ic->i_ack_dma);
+ rds_dma_hdr_free(ic->rds_ibdev->dev, ic->i_ack,
+ ic->i_ack_dma, DMA_TO_DEVICE);
ic->i_ack = NULL;
}
} else {
diff --git a/net/rds/ib_recv.c b/net/rds/ib_recv.c
index 3cffcec5fb37..6fdedd9dbbc2 100644
--- a/net/rds/ib_recv.c
+++ b/net/rds/ib_recv.c
@@ -662,10 +662,16 @@ static void rds_ib_send_ack(struct rds_ib_connection *ic, unsigned int adv_credi
seq = rds_ib_get_ack(ic);
rdsdebug("send_ack: ic %p ack %llu\n", ic, (unsigned long long) seq);
+
+ ib_dma_sync_single_for_cpu(ic->rds_ibdev->dev, ic->i_ack_dma,
+ sizeof(*hdr), DMA_TO_DEVICE);
rds_message_populate_header(hdr, 0, 0, 0);
hdr->h_ack = cpu_to_be64(seq);
hdr->h_credit = adv_credits;
rds_message_make_checksum(hdr);
+ ib_dma_sync_single_for_device(ic->rds_ibdev->dev, ic->i_ack_dma,
+ sizeof(*hdr), DMA_TO_DEVICE);
+
ic->i_ack_queued = jiffies;
ret = ib_post_send(ic->i_cm_id->qp, &ic->i_ack_wr, NULL);
@@ -845,6 +851,7 @@ static void rds_ib_process_recv(struct rds_connection *conn,
struct rds_ib_connection *ic = conn->c_transport_data;
struct rds_ib_incoming *ibinc = ic->i_ibinc;
struct rds_header *ihdr, *hdr;
+ dma_addr_t dma_addr = ic->i_recv_hdrs_dma[recv - ic->i_recvs];
/* XXX shut down the connection if port 0,0 are seen? */
@@ -863,6 +870,8 @@ static void rds_ib_process_recv(struct rds_connection *conn,
ihdr = ic->i_recv_hdrs[recv - ic->i_recvs];
+ ib_dma_sync_single_for_cpu(ic->rds_ibdev->dev, dma_addr,
+ sizeof(*ihdr), DMA_FROM_DEVICE);
/* Validate the checksum. */
if (!rds_message_verify_checksum(ihdr)) {
rds_ib_conn_error(conn, "incoming message "
@@ -870,7 +879,7 @@ static void rds_ib_process_recv(struct rds_connection *conn,
"forcing a reconnect\n",
&conn->c_faddr);
rds_stats_inc(s_recv_drop_bad_checksum);
- return;
+ goto done;
}
/* Process the ACK sequence which comes with every packet */
@@ -899,7 +908,7 @@ static void rds_ib_process_recv(struct rds_connection *conn,
*/
rds_ib_frag_free(ic, recv->r_frag);
recv->r_frag = NULL;
- return;
+ goto done;
}
/*
@@ -933,7 +942,7 @@ static void rds_ib_process_recv(struct rds_connection *conn,
hdr->h_dport != ihdr->h_dport) {
rds_ib_conn_error(conn,
"fragment header mismatch; forcing reconnect\n");
- return;
+ goto done;
}
}
@@ -965,6 +974,9 @@ static void rds_ib_process_recv(struct rds_connection *conn,
rds_inc_put(&ibinc->ii_inc);
}
+done:
+ ib_dma_sync_single_for_device(ic->rds_ibdev->dev, dma_addr,
+ sizeof(*ihdr), DMA_FROM_DEVICE);
}
void rds_ib_recv_cqe_handler(struct rds_ib_connection *ic,
diff --git a/net/rds/ib_send.c b/net/rds/ib_send.c
index dfe778220657..92b4a8689aae 100644
--- a/net/rds/ib_send.c
+++ b/net/rds/ib_send.c
@@ -638,6 +638,10 @@ int rds_ib_xmit(struct rds_connection *conn, struct rds_message *rm,
send->s_sge[0].length = sizeof(struct rds_header);
send->s_sge[0].lkey = ic->i_pd->local_dma_lkey;
+ ib_dma_sync_single_for_cpu(ic->rds_ibdev->dev,
+ ic->i_send_hdrs_dma[pos],
+ sizeof(struct rds_header),
+ DMA_TO_DEVICE);
memcpy(ic->i_send_hdrs[pos], &rm->m_inc.i_hdr,
sizeof(struct rds_header));
@@ -688,6 +692,10 @@ int rds_ib_xmit(struct rds_connection *conn, struct rds_message *rm,
adv_credits = 0;
rds_ib_stats_inc(s_ib_tx_credit_updates);
}
+ ib_dma_sync_single_for_device(ic->rds_ibdev->dev,
+ ic->i_send_hdrs_dma[pos],
+ sizeof(struct rds_header),
+ DMA_TO_DEVICE);
if (prev)
prev->s_wr.next = &send->s_wr;
diff --git a/net/socket.c b/net/socket.c
index 9a240b45bdf3..33e8b6c4e1d3 100644
--- a/net/socket.c
+++ b/net/socket.c
@@ -2175,6 +2175,17 @@ SYSCALL_DEFINE5(getsockopt, int, fd, int, level, int, optname,
* Shutdown a socket.
*/
+int __sys_shutdown_sock(struct socket *sock, int how)
+{
+ int err;
+
+ err = security_socket_shutdown(sock, how);
+ if (!err)
+ err = sock->ops->shutdown(sock, how);
+
+ return err;
+}
+
int __sys_shutdown(int fd, int how)
{
int err, fput_needed;
@@ -2182,9 +2193,7 @@ int __sys_shutdown(int fd, int how)
sock = sockfd_lookup_light(fd, &err, &fput_needed);
if (sock != NULL) {
- err = security_socket_shutdown(sock, how);
- if (!err)
- err = sock->ops->shutdown(sock, how);
+ err = __sys_shutdown_sock(sock, how);
fput_light(sock->file, fput_needed);
}
return err;
diff --git a/net/sunrpc/auth_gss/gss_rpc_upcall.c b/net/sunrpc/auth_gss/gss_rpc_upcall.c
index af9c7f43859c..d1c003a25b0f 100644
--- a/net/sunrpc/auth_gss/gss_rpc_upcall.c
+++ b/net/sunrpc/auth_gss/gss_rpc_upcall.c
@@ -200,7 +200,7 @@ static int gssp_call(struct net *net, struct rpc_message *msg)
static void gssp_free_receive_pages(struct gssx_arg_accept_sec_context *arg)
{
- int i;
+ unsigned int i;
for (i = 0; i < arg->npages && arg->pages[i]; i++)
__free_page(arg->pages[i]);
@@ -210,14 +210,19 @@ static void gssp_free_receive_pages(struct gssx_arg_accept_sec_context *arg)
static int gssp_alloc_receive_pages(struct gssx_arg_accept_sec_context *arg)
{
+ unsigned int i;
+
arg->npages = DIV_ROUND_UP(NGROUPS_MAX * 4, PAGE_SIZE);
arg->pages = kcalloc(arg->npages, sizeof(struct page *), GFP_KERNEL);
- /*
- * XXX: actual pages are allocated by xdr layer in
- * xdr_partial_copy_from_skb.
- */
if (!arg->pages)
return -ENOMEM;
+ for (i = 0; i < arg->npages; i++) {
+ arg->pages[i] = alloc_page(GFP_KERNEL);
+ if (!arg->pages[i]) {
+ gssp_free_receive_pages(arg);
+ return -ENOMEM;
+ }
+ }
return 0;
}
diff --git a/net/sunrpc/auth_gss/gss_rpc_xdr.c b/net/sunrpc/auth_gss/gss_rpc_xdr.c
index 2ff7b7083eba..d79f12c2550a 100644
--- a/net/sunrpc/auth_gss/gss_rpc_xdr.c
+++ b/net/sunrpc/auth_gss/gss_rpc_xdr.c
@@ -771,7 +771,6 @@ void gssx_enc_accept_sec_context(struct rpc_rqst *req,
xdr_inline_pages(&req->rq_rcv_buf,
PAGE_SIZE/2 /* pretty arbitrary */,
arg->pages, 0 /* page base */, arg->npages * PAGE_SIZE);
- req->rq_rcv_buf.flags |= XDRBUF_SPARSE_PAGES;
done:
if (err)
dprintk("RPC: gssx_enc_accept_sec_context: %d\n", err);
@@ -789,7 +788,7 @@ int gssx_dec_accept_sec_context(struct rpc_rqst *rqstp,
scratch = alloc_page(GFP_KERNEL);
if (!scratch)
return -ENOMEM;
- xdr_set_scratch_buffer(xdr, page_address(scratch), PAGE_SIZE);
+ xdr_set_scratch_page(xdr, scratch);
/* res->status */
err = gssx_dec_status(xdr, &res->status);
diff --git a/net/sunrpc/cache.c b/net/sunrpc/cache.c
index 20c93b68505e..1a2c1c44bb00 100644
--- a/net/sunrpc/cache.c
+++ b/net/sunrpc/cache.c
@@ -778,7 +778,6 @@ void cache_clean_deferred(void *owner)
*/
static DEFINE_SPINLOCK(queue_lock);
-static DEFINE_MUTEX(queue_io_mutex);
struct cache_queue {
struct list_head list;
@@ -906,44 +905,26 @@ static ssize_t cache_do_downcall(char *kaddr, const char __user *buf,
return ret;
}
-static ssize_t cache_slow_downcall(const char __user *buf,
- size_t count, struct cache_detail *cd)
-{
- static char write_buf[32768]; /* protected by queue_io_mutex */
- ssize_t ret = -EINVAL;
-
- if (count >= sizeof(write_buf))
- goto out;
- mutex_lock(&queue_io_mutex);
- ret = cache_do_downcall(write_buf, buf, count, cd);
- mutex_unlock(&queue_io_mutex);
-out:
- return ret;
-}
-
static ssize_t cache_downcall(struct address_space *mapping,
const char __user *buf,
size_t count, struct cache_detail *cd)
{
- struct page *page;
- char *kaddr;
+ char *write_buf;
ssize_t ret = -ENOMEM;
- if (count >= PAGE_SIZE)
- goto out_slow;
+ if (count >= 32768) { /* 32k is max userland buffer, lets check anyway */
+ ret = -EINVAL;
+ goto out;
+ }
- page = find_or_create_page(mapping, 0, GFP_KERNEL);
- if (!page)
- goto out_slow;
+ write_buf = kvmalloc(count + 1, GFP_KERNEL);
+ if (!write_buf)
+ goto out;
- kaddr = kmap(page);
- ret = cache_do_downcall(kaddr, buf, count, cd);
- kunmap(page);
- unlock_page(page);
- put_page(page);
+ ret = cache_do_downcall(write_buf, buf, count, cd);
+ kvfree(write_buf);
+out:
return ret;
-out_slow:
- return cache_slow_downcall(buf, count, cd);
}
static ssize_t cache_write(struct file *filp, const char __user *buf,
diff --git a/net/sunrpc/clnt.c b/net/sunrpc/clnt.c
index 3259120462ed..612f0a641f4c 100644
--- a/net/sunrpc/clnt.c
+++ b/net/sunrpc/clnt.c
@@ -1251,10 +1251,7 @@ void rpc_prepare_reply_pages(struct rpc_rqst *req, struct page **pages,
unsigned int base, unsigned int len,
unsigned int hdrsize)
{
- /* Subtract one to force an extra word of buffer space for the
- * payload's XDR pad to fall into the rcv_buf's tail iovec.
- */
- hdrsize += RPC_REPHDRSIZE + req->rq_cred->cr_auth->au_ralign - 1;
+ hdrsize += RPC_REPHDRSIZE + req->rq_cred->cr_auth->au_ralign;
xdr_inline_pages(&req->rq_rcv_buf, hdrsize << 2, pages, base, len);
trace_rpc_xdr_reply_pages(req->rq_task, &req->rq_rcv_buf);
diff --git a/net/sunrpc/debugfs.c b/net/sunrpc/debugfs.c
index fd9bca242724..56029e3af6ff 100644
--- a/net/sunrpc/debugfs.c
+++ b/net/sunrpc/debugfs.c
@@ -128,13 +128,13 @@ static int do_xprt_debugfs(struct rpc_clnt *clnt, struct rpc_xprt *xprt, void *n
return 0;
len = snprintf(name, sizeof(name), "../../rpc_xprt/%s",
xprt->debugfs->d_name.name);
- if (len > sizeof(name))
+ if (len >= sizeof(name))
return -1;
if (*nump == 0)
strcpy(link, "xprt");
else {
len = snprintf(link, sizeof(link), "xprt%d", *nump);
- if (len > sizeof(link))
+ if (len >= sizeof(link))
return -1;
}
debugfs_create_symlink(link, clnt->cl_debugfs, name);
diff --git a/net/sunrpc/sched.c b/net/sunrpc/sched.c
index f06d7c315017..cf702a5f7fe5 100644
--- a/net/sunrpc/sched.c
+++ b/net/sunrpc/sched.c
@@ -676,6 +676,23 @@ struct rpc_task *rpc_wake_up_next(struct rpc_wait_queue *queue)
EXPORT_SYMBOL_GPL(rpc_wake_up_next);
/**
+ * rpc_wake_up_locked - wake up all rpc_tasks
+ * @queue: rpc_wait_queue on which the tasks are sleeping
+ *
+ */
+static void rpc_wake_up_locked(struct rpc_wait_queue *queue)
+{
+ struct rpc_task *task;
+
+ for (;;) {
+ task = __rpc_find_next_queued(queue);
+ if (task == NULL)
+ break;
+ rpc_wake_up_task_queue_locked(queue, task);
+ }
+}
+
+/**
* rpc_wake_up - wake up all rpc_tasks
* @queue: rpc_wait_queue on which the tasks are sleeping
*
@@ -683,25 +700,28 @@ EXPORT_SYMBOL_GPL(rpc_wake_up_next);
*/
void rpc_wake_up(struct rpc_wait_queue *queue)
{
- struct list_head *head;
-
spin_lock(&queue->lock);
- head = &queue->tasks[queue->maxpriority];
+ rpc_wake_up_locked(queue);
+ spin_unlock(&queue->lock);
+}
+EXPORT_SYMBOL_GPL(rpc_wake_up);
+
+/**
+ * rpc_wake_up_status_locked - wake up all rpc_tasks and set their status value.
+ * @queue: rpc_wait_queue on which the tasks are sleeping
+ * @status: status value to set
+ */
+static void rpc_wake_up_status_locked(struct rpc_wait_queue *queue, int status)
+{
+ struct rpc_task *task;
+
for (;;) {
- while (!list_empty(head)) {
- struct rpc_task *task;
- task = list_first_entry(head,
- struct rpc_task,
- u.tk_wait.list);
- rpc_wake_up_task_queue_locked(queue, task);
- }
- if (head == &queue->tasks[0])
+ task = __rpc_find_next_queued(queue);
+ if (task == NULL)
break;
- head--;
+ rpc_wake_up_task_queue_set_status_locked(queue, task, status);
}
- spin_unlock(&queue->lock);
}
-EXPORT_SYMBOL_GPL(rpc_wake_up);
/**
* rpc_wake_up_status - wake up all rpc_tasks and set their status value.
@@ -712,23 +732,8 @@ EXPORT_SYMBOL_GPL(rpc_wake_up);
*/
void rpc_wake_up_status(struct rpc_wait_queue *queue, int status)
{
- struct list_head *head;
-
spin_lock(&queue->lock);
- head = &queue->tasks[queue->maxpriority];
- for (;;) {
- while (!list_empty(head)) {
- struct rpc_task *task;
- task = list_first_entry(head,
- struct rpc_task,
- u.tk_wait.list);
- task->tk_status = status;
- rpc_wake_up_task_queue_locked(queue, task);
- }
- if (head == &queue->tasks[0])
- break;
- head--;
- }
+ rpc_wake_up_status_locked(queue, status);
spin_unlock(&queue->lock);
}
EXPORT_SYMBOL_GPL(rpc_wake_up_status);
diff --git a/net/sunrpc/svc.c b/net/sunrpc/svc.c
index c211b607239e..4187745887f0 100644
--- a/net/sunrpc/svc.c
+++ b/net/sunrpc/svc.c
@@ -614,6 +614,10 @@ svc_rqst_alloc(struct svc_serv *serv, struct svc_pool *pool, int node)
rqstp->rq_server = serv;
rqstp->rq_pool = pool;
+ rqstp->rq_scratch_page = alloc_pages_node(node, GFP_KERNEL, 0);
+ if (!rqstp->rq_scratch_page)
+ goto out_enomem;
+
rqstp->rq_argp = kmalloc_node(serv->sv_xdrsize, GFP_KERNEL, node);
if (!rqstp->rq_argp)
goto out_enomem;
@@ -842,6 +846,7 @@ void
svc_rqst_free(struct svc_rqst *rqstp)
{
svc_release_buffer(rqstp);
+ put_page(rqstp->rq_scratch_page);
kfree(rqstp->rq_resp);
kfree(rqstp->rq_argp);
kfree(rqstp->rq_auth_data);
@@ -1622,7 +1627,7 @@ u32 svc_max_payload(const struct svc_rqst *rqstp)
EXPORT_SYMBOL_GPL(svc_max_payload);
/**
- * svc_encode_read_payload - mark a range of bytes as a READ payload
+ * svc_encode_result_payload - mark a range of bytes as a result payload
* @rqstp: svc_rqst to operate on
* @offset: payload's byte offset in rqstp->rq_res
* @length: size of payload, in bytes
@@ -1630,12 +1635,13 @@ EXPORT_SYMBOL_GPL(svc_max_payload);
* Returns zero on success, or a negative errno if a permanent
* error occurred.
*/
-int svc_encode_read_payload(struct svc_rqst *rqstp, unsigned int offset,
- unsigned int length)
+int svc_encode_result_payload(struct svc_rqst *rqstp, unsigned int offset,
+ unsigned int length)
{
- return rqstp->rq_xprt->xpt_ops->xpo_read_payload(rqstp, offset, length);
+ return rqstp->rq_xprt->xpt_ops->xpo_result_payload(rqstp, offset,
+ length);
}
-EXPORT_SYMBOL_GPL(svc_encode_read_payload);
+EXPORT_SYMBOL_GPL(svc_encode_result_payload);
/**
* svc_fill_write_vector - Construct data argument for VFS write call
diff --git a/net/sunrpc/svc_xprt.c b/net/sunrpc/svc_xprt.c
index 43cf8dbde898..5fb9164aa690 100644
--- a/net/sunrpc/svc_xprt.c
+++ b/net/sunrpc/svc_xprt.c
@@ -813,8 +813,6 @@ static int svc_handle_xprt(struct svc_rqst *rqstp, struct svc_xprt *xprt)
len = svc_deferred_recv(rqstp);
else
len = xprt->xpt_ops->xpo_recvfrom(rqstp);
- if (len > 0)
- trace_svc_xdr_recvfrom(rqstp, &rqstp->rq_arg);
rqstp->rq_stime = ktime_get();
rqstp->rq_reserved = serv->sv_max_mesg;
atomic_add(rqstp->rq_reserved, &xprt->xpt_reserved);
@@ -868,7 +866,7 @@ int svc_recv(struct svc_rqst *rqstp, long timeout)
if (serv->sv_stats)
serv->sv_stats->netcnt++;
- trace_svc_recv(rqstp, len);
+ trace_svc_xdr_recvfrom(rqstp, &rqstp->rq_arg);
return len;
out_release:
rqstp->rq_res.len = 0;
diff --git a/net/sunrpc/svcsock.c b/net/sunrpc/svcsock.c
index c2752e2b9ce3..b248f2349437 100644
--- a/net/sunrpc/svcsock.c
+++ b/net/sunrpc/svcsock.c
@@ -181,8 +181,8 @@ static void svc_set_cmsg_data(struct svc_rqst *rqstp, struct cmsghdr *cmh)
}
}
-static int svc_sock_read_payload(struct svc_rqst *rqstp, unsigned int offset,
- unsigned int length)
+static int svc_sock_result_payload(struct svc_rqst *rqstp, unsigned int offset,
+ unsigned int length)
{
return 0;
}
@@ -635,7 +635,7 @@ static const struct svc_xprt_ops svc_udp_ops = {
.xpo_create = svc_udp_create,
.xpo_recvfrom = svc_udp_recvfrom,
.xpo_sendto = svc_udp_sendto,
- .xpo_read_payload = svc_sock_read_payload,
+ .xpo_result_payload = svc_sock_result_payload,
.xpo_release_rqst = svc_udp_release_rqst,
.xpo_detach = svc_sock_detach,
.xpo_free = svc_sock_free,
@@ -1123,7 +1123,7 @@ static const struct svc_xprt_ops svc_tcp_ops = {
.xpo_create = svc_tcp_create,
.xpo_recvfrom = svc_tcp_recvfrom,
.xpo_sendto = svc_tcp_sendto,
- .xpo_read_payload = svc_sock_read_payload,
+ .xpo_result_payload = svc_sock_result_payload,
.xpo_release_rqst = svc_tcp_release_rqst,
.xpo_detach = svc_tcp_sock_detach,
.xpo_free = svc_sock_free,
diff --git a/net/sunrpc/xdr.c b/net/sunrpc/xdr.c
index 71e03b930b70..3964ff74ee51 100644
--- a/net/sunrpc/xdr.c
+++ b/net/sunrpc/xdr.c
@@ -123,8 +123,7 @@ EXPORT_SYMBOL_GPL(xdr_decode_string_inplace);
* @len: length of string, in bytes
*
*/
-void
-xdr_terminate_string(struct xdr_buf *buf, const u32 len)
+void xdr_terminate_string(const struct xdr_buf *buf, const u32 len)
{
char *kaddr;
@@ -134,8 +133,7 @@ xdr_terminate_string(struct xdr_buf *buf, const u32 len)
}
EXPORT_SYMBOL_GPL(xdr_terminate_string);
-size_t
-xdr_buf_pagecount(struct xdr_buf *buf)
+size_t xdr_buf_pagecount(const struct xdr_buf *buf)
{
if (!buf->page_len)
return 0;
@@ -193,9 +191,6 @@ xdr_inline_pages(struct xdr_buf *xdr, unsigned int offset,
tail->iov_base = buf + offset;
tail->iov_len = buflen - offset;
- if ((xdr->page_len & 3) == 0)
- tail->iov_len -= sizeof(__be32);
-
xdr->buflen += len;
}
EXPORT_SYMBOL_GPL(xdr_inline_pages);
@@ -228,6 +223,9 @@ _shift_data_left_pages(struct page **pages, size_t pgto_base,
BUG_ON(pgfrom_base <= pgto_base);
+ if (!len)
+ return;
+
pgto = pages + (pgto_base >> PAGE_SHIFT);
pgfrom = pages + (pgfrom_base >> PAGE_SHIFT);
@@ -266,26 +264,6 @@ _shift_data_left_pages(struct page **pages, size_t pgto_base,
} while ((len -= copy) != 0);
}
-static void
-_shift_data_left_tail(struct xdr_buf *buf, unsigned int pgto, size_t len)
-{
- struct kvec *tail = buf->tail;
-
- if (len > tail->iov_len)
- len = tail->iov_len;
-
- _copy_to_pages(buf->pages,
- buf->page_base + pgto,
- (char *)tail->iov_base,
- len);
- tail->iov_len -= len;
-
- if (tail->iov_len > 0)
- memmove((char *)tail->iov_base,
- tail->iov_base + len,
- tail->iov_len);
-}
-
/**
* _shift_data_right_pages
* @pages: vector of pages containing both the source and dest memory area.
@@ -310,6 +288,9 @@ _shift_data_right_pages(struct page **pages, size_t pgto_base,
BUG_ON(pgto_base <= pgfrom_base);
+ if (!len)
+ return;
+
pgto_base += len;
pgfrom_base += len;
@@ -351,46 +332,6 @@ _shift_data_right_pages(struct page **pages, size_t pgto_base,
} while ((len -= copy) != 0);
}
-static unsigned int
-_shift_data_right_tail(struct xdr_buf *buf, unsigned int pgfrom, size_t len)
-{
- struct kvec *tail = buf->tail;
- unsigned int tailbuf_len;
- unsigned int result = 0;
- size_t copy;
-
- tailbuf_len = buf->buflen - buf->head->iov_len - buf->page_len;
-
- /* Shift the tail first */
- if (tailbuf_len != 0) {
- unsigned int free_space = tailbuf_len - tail->iov_len;
-
- if (len < free_space)
- free_space = len;
- if (len > free_space)
- len = free_space;
-
- tail->iov_len += free_space;
- copy = len;
-
- if (tail->iov_len > len) {
- char *p = (char *)tail->iov_base + len;
- memmove(p, tail->iov_base, tail->iov_len - free_space);
- result += tail->iov_len - free_space;
- } else
- copy = tail->iov_len;
-
- /* Copy from the inlined pages into the tail */
- _copy_from_pages((char *)tail->iov_base,
- buf->pages,
- buf->page_base + pgfrom,
- copy);
- result += copy;
- }
-
- return result;
-}
-
/**
* _copy_to_pages
* @pages: array of pages
@@ -408,6 +349,9 @@ _copy_to_pages(struct page **pages, size_t pgbase, const char *p, size_t len)
char *vto;
size_t copy;
+ if (!len)
+ return;
+
pgto = pages + (pgbase >> PAGE_SHIFT);
pgbase &= ~PAGE_MASK;
@@ -452,6 +396,9 @@ _copy_from_pages(char *p, struct page **pages, size_t pgbase, size_t len)
char *vfrom;
size_t copy;
+ if (!len)
+ return;
+
pgfrom = pages + (pgbase >> PAGE_SHIFT);
pgbase &= ~PAGE_MASK;
@@ -475,18 +422,42 @@ _copy_from_pages(char *p, struct page **pages, size_t pgbase, size_t len)
}
EXPORT_SYMBOL_GPL(_copy_from_pages);
+static void xdr_buf_iov_zero(const struct kvec *iov, unsigned int base,
+ unsigned int len)
+{
+ if (base >= iov->iov_len)
+ return;
+ if (len > iov->iov_len - base)
+ len = iov->iov_len - base;
+ memset(iov->iov_base + base, 0, len);
+}
+
/**
- * _zero_pages
- * @pages: array of pages
- * @pgbase: beginning page vector address
+ * xdr_buf_pages_zero
+ * @buf: xdr_buf
+ * @pgbase: beginning offset
* @len: length
*/
-static void
-_zero_pages(struct page **pages, size_t pgbase, size_t len)
+static void xdr_buf_pages_zero(const struct xdr_buf *buf, unsigned int pgbase,
+ unsigned int len)
{
+ struct page **pages = buf->pages;
struct page **page;
char *vpage;
- size_t zero;
+ unsigned int zero;
+
+ if (!len)
+ return;
+ if (pgbase >= buf->page_len) {
+ xdr_buf_iov_zero(buf->tail, pgbase - buf->page_len, len);
+ return;
+ }
+ if (pgbase + len > buf->page_len) {
+ xdr_buf_iov_zero(buf->tail, 0, pgbase + len - buf->page_len);
+ len = buf->page_len - pgbase;
+ }
+
+ pgbase += buf->page_base;
page = pages + (pgbase >> PAGE_SHIFT);
pgbase &= ~PAGE_MASK;
@@ -507,122 +478,367 @@ _zero_pages(struct page **pages, size_t pgbase, size_t len)
} while ((len -= zero) != 0);
}
+static unsigned int xdr_buf_pages_fill_sparse(const struct xdr_buf *buf,
+ unsigned int buflen, gfp_t gfp)
+{
+ unsigned int i, npages, pagelen;
+
+ if (!(buf->flags & XDRBUF_SPARSE_PAGES))
+ return buflen;
+ if (buflen <= buf->head->iov_len)
+ return buflen;
+ pagelen = buflen - buf->head->iov_len;
+ if (pagelen > buf->page_len)
+ pagelen = buf->page_len;
+ npages = (pagelen + buf->page_base + PAGE_SIZE - 1) >> PAGE_SHIFT;
+ for (i = 0; i < npages; i++) {
+ if (!buf->pages[i])
+ continue;
+ buf->pages[i] = alloc_page(gfp);
+ if (likely(buf->pages[i]))
+ continue;
+ buflen -= pagelen;
+ pagelen = i << PAGE_SHIFT;
+ if (pagelen > buf->page_base)
+ buflen += pagelen - buf->page_base;
+ break;
+ }
+ return buflen;
+}
+
+static void xdr_buf_try_expand(struct xdr_buf *buf, unsigned int len)
+{
+ struct kvec *head = buf->head;
+ struct kvec *tail = buf->tail;
+ unsigned int sum = head->iov_len + buf->page_len + tail->iov_len;
+ unsigned int free_space, newlen;
+
+ if (sum > buf->len) {
+ free_space = min_t(unsigned int, sum - buf->len, len);
+ newlen = xdr_buf_pages_fill_sparse(buf, buf->len + free_space,
+ GFP_KERNEL);
+ free_space = newlen - buf->len;
+ buf->len = newlen;
+ len -= free_space;
+ if (!len)
+ return;
+ }
+
+ if (buf->buflen > sum) {
+ /* Expand the tail buffer */
+ free_space = min_t(unsigned int, buf->buflen - sum, len);
+ tail->iov_len += free_space;
+ buf->len += free_space;
+ }
+}
+
+static void xdr_buf_tail_copy_right(const struct xdr_buf *buf,
+ unsigned int base, unsigned int len,
+ unsigned int shift)
+{
+ const struct kvec *tail = buf->tail;
+ unsigned int to = base + shift;
+
+ if (to >= tail->iov_len)
+ return;
+ if (len + to > tail->iov_len)
+ len = tail->iov_len - to;
+ memmove(tail->iov_base + to, tail->iov_base + base, len);
+}
+
+static void xdr_buf_pages_copy_right(const struct xdr_buf *buf,
+ unsigned int base, unsigned int len,
+ unsigned int shift)
+{
+ const struct kvec *tail = buf->tail;
+ unsigned int to = base + shift;
+ unsigned int pglen = 0;
+ unsigned int talen = 0, tato = 0;
+
+ if (base >= buf->page_len)
+ return;
+ if (len > buf->page_len - base)
+ len = buf->page_len - base;
+ if (to >= buf->page_len) {
+ tato = to - buf->page_len;
+ if (tail->iov_len >= len + tato)
+ talen = len;
+ else if (tail->iov_len > tato)
+ talen = tail->iov_len - tato;
+ } else if (len + to >= buf->page_len) {
+ pglen = buf->page_len - to;
+ talen = len - pglen;
+ if (talen > tail->iov_len)
+ talen = tail->iov_len;
+ } else
+ pglen = len;
+
+ _copy_from_pages(tail->iov_base + tato, buf->pages,
+ buf->page_base + base + pglen, talen);
+ _shift_data_right_pages(buf->pages, buf->page_base + to,
+ buf->page_base + base, pglen);
+}
+
+static void xdr_buf_head_copy_right(const struct xdr_buf *buf,
+ unsigned int base, unsigned int len,
+ unsigned int shift)
+{
+ const struct kvec *head = buf->head;
+ const struct kvec *tail = buf->tail;
+ unsigned int to = base + shift;
+ unsigned int pglen = 0, pgto = 0;
+ unsigned int talen = 0, tato = 0;
+
+ if (base >= head->iov_len)
+ return;
+ if (len > head->iov_len - base)
+ len = head->iov_len - base;
+ if (to >= buf->page_len + head->iov_len) {
+ tato = to - buf->page_len - head->iov_len;
+ talen = len;
+ } else if (to >= head->iov_len) {
+ pgto = to - head->iov_len;
+ pglen = len;
+ if (pgto + pglen > buf->page_len) {
+ talen = pgto + pglen - buf->page_len;
+ pglen -= talen;
+ }
+ } else {
+ pglen = len - to;
+ if (pglen > buf->page_len) {
+ talen = pglen - buf->page_len;
+ pglen = buf->page_len;
+ }
+ }
+
+ len -= talen;
+ base += len;
+ if (talen + tato > tail->iov_len)
+ talen = tail->iov_len > tato ? tail->iov_len - tato : 0;
+ memcpy(tail->iov_base + tato, head->iov_base + base, talen);
+
+ len -= pglen;
+ base -= pglen;
+ _copy_to_pages(buf->pages, buf->page_base + pgto, head->iov_base + base,
+ pglen);
+
+ base -= len;
+ memmove(head->iov_base + to, head->iov_base + base, len);
+}
+
+static void xdr_buf_tail_shift_right(const struct xdr_buf *buf,
+ unsigned int base, unsigned int len,
+ unsigned int shift)
+{
+ const struct kvec *tail = buf->tail;
+
+ if (base >= tail->iov_len || !shift || !len)
+ return;
+ xdr_buf_tail_copy_right(buf, base, len, shift);
+}
+
+static void xdr_buf_pages_shift_right(const struct xdr_buf *buf,
+ unsigned int base, unsigned int len,
+ unsigned int shift)
+{
+ if (!shift || !len)
+ return;
+ if (base >= buf->page_len) {
+ xdr_buf_tail_shift_right(buf, base - buf->page_len, len, shift);
+ return;
+ }
+ if (base + len > buf->page_len)
+ xdr_buf_tail_shift_right(buf, 0, base + len - buf->page_len,
+ shift);
+ xdr_buf_pages_copy_right(buf, base, len, shift);
+}
+
+static void xdr_buf_head_shift_right(const struct xdr_buf *buf,
+ unsigned int base, unsigned int len,
+ unsigned int shift)
+{
+ const struct kvec *head = buf->head;
+
+ if (!shift)
+ return;
+ if (base >= head->iov_len) {
+ xdr_buf_pages_shift_right(buf, head->iov_len - base, len,
+ shift);
+ return;
+ }
+ if (base + len > head->iov_len)
+ xdr_buf_pages_shift_right(buf, 0, base + len - head->iov_len,
+ shift);
+ xdr_buf_head_copy_right(buf, base, len, shift);
+}
+
+static void xdr_buf_tail_copy_left(const struct xdr_buf *buf, unsigned int base,
+ unsigned int len, unsigned int shift)
+{
+ const struct kvec *tail = buf->tail;
+
+ if (base >= tail->iov_len)
+ return;
+ if (len > tail->iov_len - base)
+ len = tail->iov_len - base;
+ /* Shift data into head */
+ if (shift > buf->page_len + base) {
+ const struct kvec *head = buf->head;
+ unsigned int hdto =
+ head->iov_len + buf->page_len + base - shift;
+ unsigned int hdlen = len;
+
+ if (WARN_ONCE(shift > head->iov_len + buf->page_len + base,
+ "SUNRPC: Misaligned data.\n"))
+ return;
+ if (hdto + hdlen > head->iov_len)
+ hdlen = head->iov_len - hdto;
+ memcpy(head->iov_base + hdto, tail->iov_base + base, hdlen);
+ base += hdlen;
+ len -= hdlen;
+ if (!len)
+ return;
+ }
+ /* Shift data into pages */
+ if (shift > base) {
+ unsigned int pgto = buf->page_len + base - shift;
+ unsigned int pglen = len;
+
+ if (pgto + pglen > buf->page_len)
+ pglen = buf->page_len - pgto;
+ _copy_to_pages(buf->pages, buf->page_base + pgto,
+ tail->iov_base + base, pglen);
+ base += pglen;
+ len -= pglen;
+ if (!len)
+ return;
+ }
+ memmove(tail->iov_base + base - shift, tail->iov_base + base, len);
+}
+
+static void xdr_buf_pages_copy_left(const struct xdr_buf *buf,
+ unsigned int base, unsigned int len,
+ unsigned int shift)
+{
+ unsigned int pgto;
+
+ if (base >= buf->page_len)
+ return;
+ if (len > buf->page_len - base)
+ len = buf->page_len - base;
+ /* Shift data into head */
+ if (shift > base) {
+ const struct kvec *head = buf->head;
+ unsigned int hdto = head->iov_len + base - shift;
+ unsigned int hdlen = len;
+
+ if (WARN_ONCE(shift > head->iov_len + base,
+ "SUNRPC: Misaligned data.\n"))
+ return;
+ if (hdto + hdlen > head->iov_len)
+ hdlen = head->iov_len - hdto;
+ _copy_from_pages(head->iov_base + hdto, buf->pages,
+ buf->page_base + base, hdlen);
+ base += hdlen;
+ len -= hdlen;
+ if (!len)
+ return;
+ }
+ pgto = base - shift;
+ _shift_data_left_pages(buf->pages, buf->page_base + pgto,
+ buf->page_base + base, len);
+}
+
+static void xdr_buf_tail_shift_left(const struct xdr_buf *buf,
+ unsigned int base, unsigned int len,
+ unsigned int shift)
+{
+ if (!shift || !len)
+ return;
+ xdr_buf_tail_copy_left(buf, base, len, shift);
+}
+
+static void xdr_buf_pages_shift_left(const struct xdr_buf *buf,
+ unsigned int base, unsigned int len,
+ unsigned int shift)
+{
+ if (!shift || !len)
+ return;
+ if (base >= buf->page_len) {
+ xdr_buf_tail_shift_left(buf, base - buf->page_len, len, shift);
+ return;
+ }
+ xdr_buf_pages_copy_left(buf, base, len, shift);
+ len += base;
+ if (len <= buf->page_len)
+ return;
+ xdr_buf_tail_copy_left(buf, 0, len - buf->page_len, shift);
+}
+
/**
* xdr_shrink_bufhead
* @buf: xdr_buf
- * @len: bytes to remove from buf->head[0]
+ * @len: new length of buf->head[0]
*
- * Shrinks XDR buffer's header kvec buf->head[0] by
+ * Shrinks XDR buffer's header kvec buf->head[0], setting it to
* 'len' bytes. The extra data is not lost, but is instead
* moved into the inlined pages and/or the tail.
*/
-static unsigned int
-xdr_shrink_bufhead(struct xdr_buf *buf, size_t len)
+static unsigned int xdr_shrink_bufhead(struct xdr_buf *buf, unsigned int len)
{
- struct kvec *head, *tail;
- size_t copy, offs;
- unsigned int pglen = buf->page_len;
- unsigned int result;
-
- result = 0;
- tail = buf->tail;
- head = buf->head;
+ struct kvec *head = buf->head;
+ unsigned int shift, buflen = max(buf->len, len);
WARN_ON_ONCE(len > head->iov_len);
- if (len > head->iov_len)
- len = head->iov_len;
-
- /* Shift the tail first */
- if (tail->iov_len != 0) {
- if (tail->iov_len > len) {
- copy = tail->iov_len - len;
- memmove((char *)tail->iov_base + len,
- tail->iov_base, copy);
- result += copy;
- }
- /* Copy from the inlined pages into the tail */
- copy = len;
- if (copy > pglen)
- copy = pglen;
- offs = len - copy;
- if (offs >= tail->iov_len)
- copy = 0;
- else if (copy > tail->iov_len - offs)
- copy = tail->iov_len - offs;
- if (copy != 0) {
- _copy_from_pages((char *)tail->iov_base + offs,
- buf->pages,
- buf->page_base + pglen + offs - len,
- copy);
- result += copy;
- }
- /* Do we also need to copy data from the head into the tail ? */
- if (len > pglen) {
- offs = copy = len - pglen;
- if (copy > tail->iov_len)
- copy = tail->iov_len;
- memcpy(tail->iov_base,
- (char *)head->iov_base +
- head->iov_len - offs,
- copy);
- result += copy;
- }
+ if (head->iov_len > buflen) {
+ buf->buflen -= head->iov_len - buflen;
+ head->iov_len = buflen;
}
- /* Now handle pages */
- if (pglen != 0) {
- if (pglen > len)
- _shift_data_right_pages(buf->pages,
- buf->page_base + len,
- buf->page_base,
- pglen - len);
- copy = len;
- if (len > pglen)
- copy = pglen;
- _copy_to_pages(buf->pages, buf->page_base,
- (char *)head->iov_base + head->iov_len - len,
- copy);
- result += copy;
- }
- head->iov_len -= len;
- buf->buflen -= len;
- /* Have we truncated the message? */
- if (buf->len > buf->buflen)
- buf->len = buf->buflen;
-
- return result;
+ if (len >= head->iov_len)
+ return 0;
+ shift = head->iov_len - len;
+ xdr_buf_try_expand(buf, shift);
+ xdr_buf_head_shift_right(buf, len, buflen - len, shift);
+ head->iov_len = len;
+ buf->buflen -= shift;
+ buf->len -= shift;
+ return shift;
}
/**
- * xdr_shrink_pagelen - shrinks buf->pages by up to @len bytes
+ * xdr_shrink_pagelen - shrinks buf->pages to @len bytes
* @buf: xdr_buf
- * @len: bytes to remove from buf->pages
+ * @len: new page buffer length
*
* The extra data is not lost, but is instead moved into buf->tail.
* Returns the actual number of bytes moved.
*/
-static unsigned int
-xdr_shrink_pagelen(struct xdr_buf *buf, size_t len)
+static unsigned int xdr_shrink_pagelen(struct xdr_buf *buf, unsigned int len)
{
- unsigned int pglen = buf->page_len;
- unsigned int result;
-
- if (len > buf->page_len)
- len = buf-> page_len;
+ unsigned int shift, buflen = buf->len - buf->head->iov_len;
- result = _shift_data_right_tail(buf, pglen - len, len);
- buf->page_len -= len;
- buf->buflen -= len;
- /* Have we truncated the message? */
- if (buf->len > buf->buflen)
- buf->len = buf->buflen;
-
- return result;
+ WARN_ON_ONCE(len > buf->page_len);
+ if (buf->head->iov_len >= buf->len || len > buflen)
+ buflen = len;
+ if (buf->page_len > buflen) {
+ buf->buflen -= buf->page_len - buflen;
+ buf->page_len = buflen;
+ }
+ if (len >= buf->page_len)
+ return 0;
+ shift = buf->page_len - len;
+ xdr_buf_try_expand(buf, shift);
+ xdr_buf_pages_shift_right(buf, len, buflen - len, shift);
+ buf->page_len = len;
+ buf->len -= shift;
+ buf->buflen -= shift;
+ return shift;
}
void
xdr_shift_buf(struct xdr_buf *buf, size_t len)
{
- xdr_shrink_bufhead(buf, len);
+ xdr_shrink_bufhead(buf, buf->head->iov_len - len);
}
EXPORT_SYMBOL_GPL(xdr_shift_buf);
@@ -636,6 +852,18 @@ unsigned int xdr_stream_pos(const struct xdr_stream *xdr)
}
EXPORT_SYMBOL_GPL(xdr_stream_pos);
+static void xdr_stream_set_pos(struct xdr_stream *xdr, unsigned int pos)
+{
+ unsigned int blen = xdr->buf->len;
+
+ xdr->nwords = blen > pos ? XDR_QUADLEN(blen) - XDR_QUADLEN(pos) : 0;
+}
+
+static void xdr_stream_page_set_pos(struct xdr_stream *xdr, unsigned int pos)
+{
+ xdr_stream_set_pos(xdr, pos + xdr->buf->head[0].iov_len);
+}
+
/**
* xdr_page_pos - Return the current offset from the start of the xdr pages
* @xdr: pointer to struct xdr_stream
@@ -669,7 +897,7 @@ void xdr_init_encode(struct xdr_stream *xdr, struct xdr_buf *buf, __be32 *p,
struct kvec *iov = buf->head;
int scratch_len = buf->buflen - buf->page_len - buf->tail[0].iov_len;
- xdr_set_scratch_buffer(xdr, NULL, 0);
+ xdr_reset_scratch_buffer(xdr);
BUG_ON(scratch_len < 0);
xdr->buf = buf;
xdr->iov = iov;
@@ -713,7 +941,7 @@ inline void xdr_commit_encode(struct xdr_stream *xdr)
page = page_address(*xdr->page_ptr);
memcpy(xdr->scratch.iov_base, page, shift);
memmove(page, page + shift, (void *)xdr->p - page);
- xdr->scratch.iov_len = 0;
+ xdr_reset_scratch_buffer(xdr);
}
EXPORT_SYMBOL_GPL(xdr_commit_encode);
@@ -743,8 +971,7 @@ static __be32 *xdr_get_next_encode_buffer(struct xdr_stream *xdr,
* the "scratch" iov to track any temporarily unused fragment of
* space at the end of the previous buffer:
*/
- xdr->scratch.iov_base = xdr->p;
- xdr->scratch.iov_len = frag1bytes;
+ xdr_set_scratch_buffer(xdr, xdr->p, frag1bytes);
p = page_address(*xdr->page_ptr);
/*
* Note this is where the next encode will start after we've
@@ -970,19 +1197,31 @@ void xdr_write_pages(struct xdr_stream *xdr, struct page **pages, unsigned int b
}
EXPORT_SYMBOL_GPL(xdr_write_pages);
-static void xdr_set_iov(struct xdr_stream *xdr, struct kvec *iov,
- unsigned int len)
+static unsigned int xdr_set_iov(struct xdr_stream *xdr, struct kvec *iov,
+ unsigned int base, unsigned int len)
{
if (len > iov->iov_len)
len = iov->iov_len;
- xdr->p = (__be32*)iov->iov_base;
+ if (unlikely(base > len))
+ base = len;
+ xdr->p = (__be32*)(iov->iov_base + base);
xdr->end = (__be32*)(iov->iov_base + len);
xdr->iov = iov;
xdr->page_ptr = NULL;
+ return len - base;
+}
+
+static unsigned int xdr_set_tail_base(struct xdr_stream *xdr,
+ unsigned int base, unsigned int len)
+{
+ struct xdr_buf *buf = xdr->buf;
+
+ xdr_stream_set_pos(xdr, base + buf->page_len + buf->head->iov_len);
+ return xdr_set_iov(xdr, buf->tail, base, len);
}
-static int xdr_set_page_base(struct xdr_stream *xdr,
- unsigned int base, unsigned int len)
+static unsigned int xdr_set_page_base(struct xdr_stream *xdr,
+ unsigned int base, unsigned int len)
{
unsigned int pgnr;
unsigned int maxlen;
@@ -991,12 +1230,15 @@ static int xdr_set_page_base(struct xdr_stream *xdr,
void *kaddr;
maxlen = xdr->buf->page_len;
- if (base >= maxlen)
- return -EINVAL;
- maxlen -= base;
+ if (base >= maxlen) {
+ base = maxlen;
+ maxlen = 0;
+ } else
+ maxlen -= base;
if (len > maxlen)
len = maxlen;
+ xdr_stream_page_set_pos(xdr, base);
base += xdr->buf->page_base;
pgnr = base >> PAGE_SHIFT;
@@ -1011,14 +1253,16 @@ static int xdr_set_page_base(struct xdr_stream *xdr,
pgend = PAGE_SIZE;
xdr->end = (__be32*)(kaddr + pgend);
xdr->iov = NULL;
- return 0;
+ return len;
}
static void xdr_set_page(struct xdr_stream *xdr, unsigned int base,
unsigned int len)
{
- if (xdr_set_page_base(xdr, base, len) < 0)
- xdr_set_iov(xdr, xdr->buf->tail, xdr->nwords << 2);
+ if (xdr_set_page_base(xdr, base, len) == 0) {
+ base -= xdr->buf->page_len;
+ xdr_set_tail_base(xdr, base, len);
+ }
}
static void xdr_set_next_page(struct xdr_stream *xdr)
@@ -1027,17 +1271,18 @@ static void xdr_set_next_page(struct xdr_stream *xdr)
newbase = (1 + xdr->page_ptr - xdr->buf->pages) << PAGE_SHIFT;
newbase -= xdr->buf->page_base;
-
- xdr_set_page(xdr, newbase, PAGE_SIZE);
+ if (newbase < xdr->buf->page_len)
+ xdr_set_page_base(xdr, newbase, xdr_stream_remaining(xdr));
+ else
+ xdr_set_tail_base(xdr, 0, xdr_stream_remaining(xdr));
}
static bool xdr_set_next_buffer(struct xdr_stream *xdr)
{
if (xdr->page_ptr != NULL)
xdr_set_next_page(xdr);
- else if (xdr->iov == xdr->buf->head) {
- xdr_set_page(xdr, 0, PAGE_SIZE);
- }
+ else if (xdr->iov == xdr->buf->head)
+ xdr_set_page(xdr, 0, xdr_stream_remaining(xdr));
return xdr->p != xdr->end;
}
@@ -1052,15 +1297,11 @@ void xdr_init_decode(struct xdr_stream *xdr, struct xdr_buf *buf, __be32 *p,
struct rpc_rqst *rqst)
{
xdr->buf = buf;
- xdr->scratch.iov_base = NULL;
- xdr->scratch.iov_len = 0;
+ xdr_reset_scratch_buffer(xdr);
xdr->nwords = XDR_QUADLEN(buf->len);
- if (buf->head[0].iov_len != 0)
- xdr_set_iov(xdr, buf->head, buf->len);
- else if (buf->page_len != 0)
- xdr_set_page_base(xdr, 0, buf->len);
- else
- xdr_set_iov(xdr, buf->head, buf->len);
+ if (xdr_set_iov(xdr, buf->head, 0, buf->len) == 0 &&
+ xdr_set_page_base(xdr, 0, buf->len) == 0)
+ xdr_set_iov(xdr, buf->tail, 0, buf->len);
if (p != NULL && p > xdr->p && xdr->end >= p) {
xdr->nwords -= p - xdr->p;
xdr->p = p;
@@ -1101,24 +1342,6 @@ static __be32 * __xdr_inline_decode(struct xdr_stream *xdr, size_t nbytes)
return p;
}
-/**
- * xdr_set_scratch_buffer - Attach a scratch buffer for decoding data.
- * @xdr: pointer to xdr_stream struct
- * @buf: pointer to an empty buffer
- * @buflen: size of 'buf'
- *
- * The scratch buffer is used when decoding from an array of pages.
- * If an xdr_inline_decode() call spans across page boundaries, then
- * we copy the data into the scratch buffer in order to allow linear
- * access.
- */
-void xdr_set_scratch_buffer(struct xdr_stream *xdr, void *buf, size_t buflen)
-{
- xdr->scratch.iov_base = buf;
- xdr->scratch.iov_len = buflen;
-}
-EXPORT_SYMBOL_GPL(xdr_set_scratch_buffer);
-
static __be32 *xdr_copy_to_scratch(struct xdr_stream *xdr, size_t nbytes)
{
__be32 *p;
@@ -1178,14 +1401,13 @@ static void xdr_realign_pages(struct xdr_stream *xdr)
struct xdr_buf *buf = xdr->buf;
struct kvec *iov = buf->head;
unsigned int cur = xdr_stream_pos(xdr);
- unsigned int copied, offset;
+ unsigned int copied;
/* Realign pages to current pointer position */
if (iov->iov_len > cur) {
- offset = iov->iov_len - cur;
- copied = xdr_shrink_bufhead(buf, offset);
- trace_rpc_xdr_alignment(xdr, offset, copied);
- xdr->nwords = XDR_QUADLEN(buf->len - cur);
+ copied = xdr_shrink_bufhead(buf, cur);
+ trace_rpc_xdr_alignment(xdr, cur, copied);
+ xdr_set_page(xdr, 0, buf->page_len);
}
}
@@ -1193,8 +1415,7 @@ static unsigned int xdr_align_pages(struct xdr_stream *xdr, unsigned int len)
{
struct xdr_buf *buf = xdr->buf;
unsigned int nwords = XDR_QUADLEN(len);
- unsigned int cur = xdr_stream_pos(xdr);
- unsigned int copied, offset;
+ unsigned int copied;
if (xdr->nwords == 0)
return 0;
@@ -1208,125 +1429,103 @@ static unsigned int xdr_align_pages(struct xdr_stream *xdr, unsigned int len)
len = buf->page_len;
else if (nwords < xdr->nwords) {
/* Truncate page data and move it into the tail */
- offset = buf->page_len - len;
- copied = xdr_shrink_pagelen(buf, offset);
- trace_rpc_xdr_alignment(xdr, offset, copied);
- xdr->nwords = XDR_QUADLEN(buf->len - cur);
+ copied = xdr_shrink_pagelen(buf, len);
+ trace_rpc_xdr_alignment(xdr, len, copied);
}
return len;
}
/**
- * xdr_read_pages - Ensure page-based XDR data to decode is aligned at current pointer position
+ * xdr_read_pages - align page-based XDR data to current pointer position
* @xdr: pointer to xdr_stream struct
* @len: number of bytes of page data
*
* Moves data beyond the current pointer position from the XDR head[] buffer
- * into the page list. Any data that lies beyond current position + "len"
- * bytes is moved into the XDR tail[].
+ * into the page list. Any data that lies beyond current position + @len
+ * bytes is moved into the XDR tail[]. The xdr_stream current position is
+ * then advanced past that data to align to the next XDR object in the tail.
*
* Returns the number of XDR encoded bytes now contained in the pages
*/
unsigned int xdr_read_pages(struct xdr_stream *xdr, unsigned int len)
{
- struct xdr_buf *buf = xdr->buf;
- struct kvec *iov;
- unsigned int nwords;
- unsigned int end;
- unsigned int padding;
+ unsigned int nwords = XDR_QUADLEN(len);
+ unsigned int base, end, pglen;
- len = xdr_align_pages(xdr, len);
- if (len == 0)
+ pglen = xdr_align_pages(xdr, nwords << 2);
+ if (pglen == 0)
return 0;
- nwords = XDR_QUADLEN(len);
- padding = (nwords << 2) - len;
- xdr->iov = iov = buf->tail;
- /* Compute remaining message length. */
- end = ((xdr->nwords - nwords) << 2) + padding;
- if (end > iov->iov_len)
- end = iov->iov_len;
- /*
- * Position current pointer at beginning of tail, and
- * set remaining message length.
- */
- xdr->p = (__be32 *)((char *)iov->iov_base + padding);
- xdr->end = (__be32 *)((char *)iov->iov_base + end);
- xdr->page_ptr = NULL;
- xdr->nwords = XDR_QUADLEN(end - padding);
- return len;
+ base = (nwords << 2) - pglen;
+ end = xdr_stream_remaining(xdr) - pglen;
+
+ xdr_set_tail_base(xdr, base, end);
+ return len <= pglen ? len : pglen;
}
EXPORT_SYMBOL_GPL(xdr_read_pages);
-uint64_t xdr_align_data(struct xdr_stream *xdr, uint64_t offset, uint32_t length)
+unsigned int xdr_align_data(struct xdr_stream *xdr, unsigned int offset,
+ unsigned int length)
{
struct xdr_buf *buf = xdr->buf;
- unsigned int from, bytes;
- unsigned int shift = 0;
-
- if ((offset + length) < offset ||
- (offset + length) > buf->page_len)
- length = buf->page_len - offset;
+ unsigned int from, bytes, len;
+ unsigned int shift;
xdr_realign_pages(xdr);
from = xdr_page_pos(xdr);
- bytes = xdr->nwords << 2;
- if (length < bytes)
- bytes = length;
+
+ if (from >= buf->page_len + buf->tail->iov_len)
+ return 0;
+ if (from + buf->head->iov_len >= buf->len)
+ return 0;
+
+ len = buf->len - buf->head->iov_len;
+
+ /* We only shift data left! */
+ if (WARN_ONCE(from < offset, "SUNRPC: misaligned data src=%u dst=%u\n",
+ from, offset))
+ return 0;
+ if (WARN_ONCE(offset > buf->page_len,
+ "SUNRPC: buffer overflow. offset=%u, page_len=%u\n",
+ offset, buf->page_len))
+ return 0;
/* Move page data to the left */
- if (from > offset) {
- shift = min_t(unsigned int, bytes, buf->page_len - from);
- _shift_data_left_pages(buf->pages,
- buf->page_base + offset,
- buf->page_base + from,
- shift);
- bytes -= shift;
-
- /* Move tail data into the pages, if necessary */
- if (bytes > 0)
- _shift_data_left_tail(buf, offset + shift, bytes);
- }
+ shift = from - offset;
+ xdr_buf_pages_shift_left(buf, from, len, shift);
+
+ bytes = xdr_stream_remaining(xdr);
+ if (length > bytes)
+ length = bytes;
+ bytes -= length;
- xdr->nwords -= XDR_QUADLEN(length);
- xdr_set_page(xdr, from + length, PAGE_SIZE);
+ xdr->buf->len -= shift;
+ xdr_set_page(xdr, offset + length, bytes);
return length;
}
EXPORT_SYMBOL_GPL(xdr_align_data);
-uint64_t xdr_expand_hole(struct xdr_stream *xdr, uint64_t offset, uint64_t length)
+unsigned int xdr_expand_hole(struct xdr_stream *xdr, unsigned int offset,
+ unsigned int length)
{
struct xdr_buf *buf = xdr->buf;
- unsigned int bytes;
- unsigned int from;
- unsigned int truncated = 0;
-
- if ((offset + length) < offset ||
- (offset + length) > buf->page_len)
- length = buf->page_len - offset;
+ unsigned int from, to, shift;
xdr_realign_pages(xdr);
from = xdr_page_pos(xdr);
- bytes = xdr->nwords << 2;
-
- if (offset + length + bytes > buf->page_len) {
- unsigned int shift = (offset + length + bytes) - buf->page_len;
- unsigned int res = _shift_data_right_tail(buf, from + bytes - shift, shift);
- truncated = shift - res;
- xdr->nwords -= XDR_QUADLEN(truncated);
- bytes -= shift;
- }
-
- /* Now move the page data over and zero pages */
- if (bytes > 0)
- _shift_data_right_pages(buf->pages,
- buf->page_base + offset + length,
- buf->page_base + from,
- bytes);
- _zero_pages(buf->pages, buf->page_base + offset, length);
+ to = xdr_align_size(offset + length);
+
+ /* Could the hole be behind us? */
+ if (to > from) {
+ unsigned int buflen = buf->len - buf->head->iov_len;
+ shift = to - from;
+ xdr_buf_try_expand(buf, shift);
+ xdr_buf_pages_shift_right(buf, from, buflen, shift);
+ xdr_set_page(xdr, to, xdr_stream_remaining(xdr));
+ } else if (to != from)
+ xdr_align_data(xdr, to, 0);
+ xdr_buf_pages_zero(buf, offset, length);
- buf->len += length - (from - offset) - truncated;
- xdr_set_page(xdr, offset + length, PAGE_SIZE);
return length;
}
EXPORT_SYMBOL_GPL(xdr_expand_hole);
@@ -1355,8 +1554,7 @@ EXPORT_SYMBOL_GPL(xdr_enter_page);
static const struct kvec empty_iov = {.iov_base = NULL, .iov_len = 0};
-void
-xdr_buf_from_iov(struct kvec *iov, struct xdr_buf *buf)
+void xdr_buf_from_iov(const struct kvec *iov, struct xdr_buf *buf)
{
buf->head[0] = *iov;
buf->tail[0] = empty_iov;
@@ -1379,9 +1577,8 @@ EXPORT_SYMBOL_GPL(xdr_buf_from_iov);
*
* Returns -1 if base of length are out of bounds.
*/
-int
-xdr_buf_subsegment(struct xdr_buf *buf, struct xdr_buf *subbuf,
- unsigned int base, unsigned int len)
+int xdr_buf_subsegment(const struct xdr_buf *buf, struct xdr_buf *subbuf,
+ unsigned int base, unsigned int len)
{
subbuf->buflen = subbuf->len = len;
if (base < buf->head[0].iov_len) {
@@ -1429,6 +1626,51 @@ xdr_buf_subsegment(struct xdr_buf *buf, struct xdr_buf *subbuf,
EXPORT_SYMBOL_GPL(xdr_buf_subsegment);
/**
+ * xdr_stream_subsegment - set @subbuf to a portion of @xdr
+ * @xdr: an xdr_stream set up for decoding
+ * @subbuf: the result buffer
+ * @nbytes: length of @xdr to extract, in bytes
+ *
+ * Sets up @subbuf to represent a portion of @xdr. The portion
+ * starts at the current offset in @xdr, and extends for a length
+ * of @nbytes. If this is successful, @xdr is advanced to the next
+ * position following that portion.
+ *
+ * Return values:
+ * %true: @subbuf has been initialized, and @xdr has been advanced.
+ * %false: a bounds error has occurred
+ */
+bool xdr_stream_subsegment(struct xdr_stream *xdr, struct xdr_buf *subbuf,
+ unsigned int nbytes)
+{
+ unsigned int remaining, offset, len;
+
+ if (xdr_buf_subsegment(xdr->buf, subbuf, xdr_stream_pos(xdr), nbytes))
+ return false;
+
+ if (subbuf->head[0].iov_len)
+ if (!__xdr_inline_decode(xdr, subbuf->head[0].iov_len))
+ return false;
+
+ remaining = subbuf->page_len;
+ offset = subbuf->page_base;
+ while (remaining) {
+ len = min_t(unsigned int, remaining, PAGE_SIZE) - offset;
+
+ if (xdr->p == xdr->end && !xdr_set_next_buffer(xdr))
+ return false;
+ if (!__xdr_inline_decode(xdr, len))
+ return false;
+
+ remaining -= len;
+ offset = 0;
+ }
+
+ return true;
+}
+EXPORT_SYMBOL_GPL(xdr_stream_subsegment);
+
+/**
* xdr_buf_trim - lop at most "len" bytes off the end of "buf"
* @buf: buf to be trimmed
* @len: number of bytes to reduce "buf" by
@@ -1469,7 +1711,8 @@ fix_len:
}
EXPORT_SYMBOL_GPL(xdr_buf_trim);
-static void __read_bytes_from_xdr_buf(struct xdr_buf *subbuf, void *obj, unsigned int len)
+static void __read_bytes_from_xdr_buf(const struct xdr_buf *subbuf,
+ void *obj, unsigned int len)
{
unsigned int this_len;
@@ -1478,8 +1721,7 @@ static void __read_bytes_from_xdr_buf(struct xdr_buf *subbuf, void *obj, unsigne
len -= this_len;
obj += this_len;
this_len = min_t(unsigned int, len, subbuf->page_len);
- if (this_len)
- _copy_from_pages(obj, subbuf->pages, subbuf->page_base, this_len);
+ _copy_from_pages(obj, subbuf->pages, subbuf->page_base, this_len);
len -= this_len;
obj += this_len;
this_len = min_t(unsigned int, len, subbuf->tail[0].iov_len);
@@ -1487,7 +1729,8 @@ static void __read_bytes_from_xdr_buf(struct xdr_buf *subbuf, void *obj, unsigne
}
/* obj is assumed to point to allocated memory of size at least len: */
-int read_bytes_from_xdr_buf(struct xdr_buf *buf, unsigned int base, void *obj, unsigned int len)
+int read_bytes_from_xdr_buf(const struct xdr_buf *buf, unsigned int base,
+ void *obj, unsigned int len)
{
struct xdr_buf subbuf;
int status;
@@ -1500,7 +1743,8 @@ int read_bytes_from_xdr_buf(struct xdr_buf *buf, unsigned int base, void *obj, u
}
EXPORT_SYMBOL_GPL(read_bytes_from_xdr_buf);
-static void __write_bytes_to_xdr_buf(struct xdr_buf *subbuf, void *obj, unsigned int len)
+static void __write_bytes_to_xdr_buf(const struct xdr_buf *subbuf,
+ void *obj, unsigned int len)
{
unsigned int this_len;
@@ -1509,8 +1753,7 @@ static void __write_bytes_to_xdr_buf(struct xdr_buf *subbuf, void *obj, unsigned
len -= this_len;
obj += this_len;
this_len = min_t(unsigned int, len, subbuf->page_len);
- if (this_len)
- _copy_to_pages(subbuf->pages, subbuf->page_base, obj, this_len);
+ _copy_to_pages(subbuf->pages, subbuf->page_base, obj, this_len);
len -= this_len;
obj += this_len;
this_len = min_t(unsigned int, len, subbuf->tail[0].iov_len);
@@ -1518,7 +1761,8 @@ static void __write_bytes_to_xdr_buf(struct xdr_buf *subbuf, void *obj, unsigned
}
/* obj is assumed to point to allocated memory of size at least len: */
-int write_bytes_to_xdr_buf(struct xdr_buf *buf, unsigned int base, void *obj, unsigned int len)
+int write_bytes_to_xdr_buf(const struct xdr_buf *buf, unsigned int base,
+ void *obj, unsigned int len)
{
struct xdr_buf subbuf;
int status;
@@ -1531,8 +1775,7 @@ int write_bytes_to_xdr_buf(struct xdr_buf *buf, unsigned int base, void *obj, un
}
EXPORT_SYMBOL_GPL(write_bytes_to_xdr_buf);
-int
-xdr_decode_word(struct xdr_buf *buf, unsigned int base, u32 *obj)
+int xdr_decode_word(const struct xdr_buf *buf, unsigned int base, u32 *obj)
{
__be32 raw;
int status;
@@ -1545,8 +1788,7 @@ xdr_decode_word(struct xdr_buf *buf, unsigned int base, u32 *obj)
}
EXPORT_SYMBOL_GPL(xdr_decode_word);
-int
-xdr_encode_word(struct xdr_buf *buf, unsigned int base, u32 obj)
+int xdr_encode_word(const struct xdr_buf *buf, unsigned int base, u32 obj)
{
__be32 raw = cpu_to_be32(obj);
@@ -1555,9 +1797,8 @@ xdr_encode_word(struct xdr_buf *buf, unsigned int base, u32 obj)
EXPORT_SYMBOL_GPL(xdr_encode_word);
/* Returns 0 on success, or else a negative error code. */
-static int
-xdr_xcode_array2(struct xdr_buf *buf, unsigned int base,
- struct xdr_array2_desc *desc, int encode)
+static int xdr_xcode_array2(const struct xdr_buf *buf, unsigned int base,
+ struct xdr_array2_desc *desc, int encode)
{
char *elem = NULL, *c;
unsigned int copied = 0, todo, avail_here;
@@ -1749,9 +1990,8 @@ out:
return err;
}
-int
-xdr_decode_array2(struct xdr_buf *buf, unsigned int base,
- struct xdr_array2_desc *desc)
+int xdr_decode_array2(const struct xdr_buf *buf, unsigned int base,
+ struct xdr_array2_desc *desc)
{
if (base >= buf->len)
return -EINVAL;
@@ -1760,9 +2000,8 @@ xdr_decode_array2(struct xdr_buf *buf, unsigned int base,
}
EXPORT_SYMBOL_GPL(xdr_decode_array2);
-int
-xdr_encode_array2(struct xdr_buf *buf, unsigned int base,
- struct xdr_array2_desc *desc)
+int xdr_encode_array2(const struct xdr_buf *buf, unsigned int base,
+ struct xdr_array2_desc *desc)
{
if ((unsigned long) base + 4 + desc->array_len * desc->elem_size >
buf->head->iov_len + buf->page_len + buf->tail->iov_len)
@@ -1772,9 +2011,9 @@ xdr_encode_array2(struct xdr_buf *buf, unsigned int base,
}
EXPORT_SYMBOL_GPL(xdr_encode_array2);
-int
-xdr_process_buf(struct xdr_buf *buf, unsigned int offset, unsigned int len,
- int (*actor)(struct scatterlist *, void *), void *data)
+int xdr_process_buf(const struct xdr_buf *buf, unsigned int offset,
+ unsigned int len,
+ int (*actor)(struct scatterlist *, void *), void *data)
{
int i, ret = 0;
unsigned int page_len, thislen, page_offset;
@@ -1942,10 +2181,8 @@ ssize_t xdr_stream_decode_string_dup(struct xdr_stream *xdr, char **str,
ret = xdr_stream_decode_opaque_inline(xdr, &p, maxlen);
if (ret > 0) {
- char *s = kmalloc(ret + 1, gfp_flags);
+ char *s = kmemdup_nul(p, ret, gfp_flags);
if (s != NULL) {
- memcpy(s, p, ret);
- s[ret] = '\0';
*str = s;
return strlen(s);
}
diff --git a/net/sunrpc/xprt.c b/net/sunrpc/xprt.c
index f6c17e75f20e..691ccf8049a4 100644
--- a/net/sunrpc/xprt.c
+++ b/net/sunrpc/xprt.c
@@ -151,33 +151,94 @@ out:
}
EXPORT_SYMBOL_GPL(xprt_unregister_transport);
-/**
- * xprt_load_transport - load a transport implementation
- * @transport_name: transport to load
- *
- * Returns:
- * 0: transport successfully loaded
- * -ENOENT: transport module not available
- */
-int xprt_load_transport(const char *transport_name)
+static void
+xprt_class_release(const struct xprt_class *t)
{
- struct xprt_class *t;
- int result;
+ module_put(t->owner);
+}
+
+static const struct xprt_class *
+xprt_class_find_by_ident_locked(int ident)
+{
+ const struct xprt_class *t;
+
+ list_for_each_entry(t, &xprt_list, list) {
+ if (t->ident != ident)
+ continue;
+ if (!try_module_get(t->owner))
+ continue;
+ return t;
+ }
+ return NULL;
+}
+
+static const struct xprt_class *
+xprt_class_find_by_ident(int ident)
+{
+ const struct xprt_class *t;
- result = 0;
spin_lock(&xprt_list_lock);
+ t = xprt_class_find_by_ident_locked(ident);
+ spin_unlock(&xprt_list_lock);
+ return t;
+}
+
+static const struct xprt_class *
+xprt_class_find_by_netid_locked(const char *netid)
+{
+ const struct xprt_class *t;
+ unsigned int i;
+
list_for_each_entry(t, &xprt_list, list) {
- if (strcmp(t->name, transport_name) == 0) {
- spin_unlock(&xprt_list_lock);
- goto out;
+ for (i = 0; t->netid[i][0] != '\0'; i++) {
+ if (strcmp(t->netid[i], netid) != 0)
+ continue;
+ if (!try_module_get(t->owner))
+ continue;
+ return t;
}
}
+ return NULL;
+}
+
+static const struct xprt_class *
+xprt_class_find_by_netid(const char *netid)
+{
+ const struct xprt_class *t;
+
+ spin_lock(&xprt_list_lock);
+ t = xprt_class_find_by_netid_locked(netid);
+ if (!t) {
+ spin_unlock(&xprt_list_lock);
+ request_module("rpc%s", netid);
+ spin_lock(&xprt_list_lock);
+ t = xprt_class_find_by_netid_locked(netid);
+ }
spin_unlock(&xprt_list_lock);
- result = request_module("xprt%s", transport_name);
-out:
- return result;
+ return t;
+}
+
+/**
+ * xprt_find_transport_ident - convert a netid into a transport identifier
+ * @netid: transport to load
+ *
+ * Returns:
+ * > 0: transport identifier
+ * -ENOENT: transport module not available
+ */
+int xprt_find_transport_ident(const char *netid)
+{
+ const struct xprt_class *t;
+ int ret;
+
+ t = xprt_class_find_by_netid(netid);
+ if (!t)
+ return -ENOENT;
+ ret = t->ident;
+ xprt_class_release(t);
+ return ret;
}
-EXPORT_SYMBOL_GPL(xprt_load_transport);
+EXPORT_SYMBOL_GPL(xprt_find_transport_ident);
static void xprt_clear_locked(struct rpc_xprt *xprt)
{
@@ -1896,21 +1957,17 @@ static void xprt_init(struct rpc_xprt *xprt, struct net *net)
struct rpc_xprt *xprt_create_transport(struct xprt_create *args)
{
struct rpc_xprt *xprt;
- struct xprt_class *t;
+ const struct xprt_class *t;
- spin_lock(&xprt_list_lock);
- list_for_each_entry(t, &xprt_list, list) {
- if (t->ident == args->ident) {
- spin_unlock(&xprt_list_lock);
- goto found;
- }
+ t = xprt_class_find_by_ident(args->ident);
+ if (!t) {
+ dprintk("RPC: transport (%d) not supported\n", args->ident);
+ return ERR_PTR(-EIO);
}
- spin_unlock(&xprt_list_lock);
- dprintk("RPC: transport (%d) not supported\n", args->ident);
- return ERR_PTR(-EIO);
-found:
xprt = t->setup(args);
+ xprt_class_release(t);
+
if (IS_ERR(xprt))
goto out;
if (args->flags & XPRT_CREATE_NO_IDLE_TIMEOUT)
diff --git a/net/sunrpc/xprtrdma/Makefile b/net/sunrpc/xprtrdma/Makefile
index 8ed0377d7a18..55b21bae866d 100644
--- a/net/sunrpc/xprtrdma/Makefile
+++ b/net/sunrpc/xprtrdma/Makefile
@@ -4,5 +4,5 @@ obj-$(CONFIG_SUNRPC_XPRT_RDMA) += rpcrdma.o
rpcrdma-y := transport.o rpc_rdma.o verbs.o frwr_ops.o \
svc_rdma.o svc_rdma_backchannel.o svc_rdma_transport.o \
svc_rdma_sendto.o svc_rdma_recvfrom.o svc_rdma_rw.o \
- module.o
+ svc_rdma_pcl.o module.o
rpcrdma-$(CONFIG_SUNRPC_BACKCHANNEL) += backchannel.o
diff --git a/net/sunrpc/xprtrdma/backchannel.c b/net/sunrpc/xprtrdma/backchannel.c
index c92c1aac270a..946edf2db646 100644
--- a/net/sunrpc/xprtrdma/backchannel.c
+++ b/net/sunrpc/xprtrdma/backchannel.c
@@ -1,6 +1,6 @@
// SPDX-License-Identifier: GPL-2.0
/*
- * Copyright (c) 2015 Oracle. All rights reserved.
+ * Copyright (c) 2015-2020, Oracle and/or its affiliates.
*
* Support for backward direction RPCs on RPC/RDMA.
*/
@@ -82,7 +82,7 @@ static int rpcrdma_bc_marshal_reply(struct rpc_rqst *rqst)
&rqst->rq_snd_buf, rpcrdma_noch_pullup))
return -EIO;
- trace_xprtrdma_cb_reply(rqst);
+ trace_xprtrdma_cb_reply(r_xprt, rqst);
return 0;
}
@@ -260,7 +260,7 @@ void rpcrdma_bc_receive_call(struct rpcrdma_xprt *r_xprt,
*/
req = rpcr_to_rdmar(rqst);
req->rl_reply = rep;
- trace_xprtrdma_cb_call(rqst);
+ trace_xprtrdma_cb_call(r_xprt, rqst);
/* Queue rqst for ULP's callback service */
bc_serv = xprt->bc_serv;
diff --git a/net/sunrpc/xprtrdma/frwr_ops.c b/net/sunrpc/xprtrdma/frwr_ops.c
index 44888f5badef..baca49fe83af 100644
--- a/net/sunrpc/xprtrdma/frwr_ops.c
+++ b/net/sunrpc/xprtrdma/frwr_ops.c
@@ -65,18 +65,23 @@ void frwr_release_mr(struct rpcrdma_mr *mr)
kfree(mr);
}
+static void frwr_mr_unmap(struct rpcrdma_xprt *r_xprt, struct rpcrdma_mr *mr)
+{
+ if (mr->mr_device) {
+ trace_xprtrdma_mr_unmap(mr);
+ ib_dma_unmap_sg(mr->mr_device, mr->mr_sg, mr->mr_nents,
+ mr->mr_dir);
+ mr->mr_device = NULL;
+ }
+}
+
static void frwr_mr_recycle(struct rpcrdma_mr *mr)
{
struct rpcrdma_xprt *r_xprt = mr->mr_xprt;
trace_xprtrdma_mr_recycle(mr);
- if (mr->mr_dir != DMA_NONE) {
- trace_xprtrdma_mr_unmap(mr);
- ib_dma_unmap_sg(r_xprt->rx_ep->re_id->device,
- mr->mr_sg, mr->mr_nents, mr->mr_dir);
- mr->mr_dir = DMA_NONE;
- }
+ frwr_mr_unmap(r_xprt, mr);
spin_lock(&r_xprt->rx_buf.rb_lock);
list_del(&mr->mr_all);
@@ -86,6 +91,16 @@ static void frwr_mr_recycle(struct rpcrdma_mr *mr)
frwr_release_mr(mr);
}
+static void frwr_mr_put(struct rpcrdma_mr *mr)
+{
+ frwr_mr_unmap(mr->mr_xprt, mr);
+
+ /* The MR is returned to the req's MR free list instead
+ * of to the xprt's MR free list. No spinlock is needed.
+ */
+ rpcrdma_mr_push(mr, &mr->mr_req->rl_free_mrs);
+}
+
/* frwr_reset - Place MRs back on the free list
* @req: request to reset
*
@@ -101,7 +116,7 @@ void frwr_reset(struct rpcrdma_req *req)
struct rpcrdma_mr *mr;
while ((mr = rpcrdma_mr_pop(&req->rl_registered)))
- rpcrdma_mr_put(mr);
+ frwr_mr_put(mr);
}
/**
@@ -130,7 +145,7 @@ int frwr_mr_init(struct rpcrdma_xprt *r_xprt, struct rpcrdma_mr *mr)
mr->mr_xprt = r_xprt;
mr->frwr.fr_mr = frmr;
- mr->mr_dir = DMA_NONE;
+ mr->mr_device = NULL;
INIT_LIST_HEAD(&mr->mr_list);
init_completion(&mr->frwr.fr_linv_done);
@@ -315,6 +330,7 @@ struct rpcrdma_mr_seg *frwr_map(struct rpcrdma_xprt *r_xprt,
mr->mr_dir);
if (!dma_nents)
goto out_dmamap_err;
+ mr->mr_device = ep->re_id->device;
ibmr = mr->frwr.fr_mr;
n = ib_map_mr_sg(ibmr, mr->mr_sg, dma_nents, NULL, PAGE_SIZE);
@@ -341,7 +357,6 @@ struct rpcrdma_mr_seg *frwr_map(struct rpcrdma_xprt *r_xprt,
return seg;
out_dmamap_err:
- mr->mr_dir = DMA_NONE;
trace_xprtrdma_frwr_sgerr(mr, i);
return ERR_PTR(-EIO);
@@ -363,12 +378,21 @@ static void frwr_wc_fastreg(struct ib_cq *cq, struct ib_wc *wc)
container_of(cqe, struct rpcrdma_frwr, fr_cqe);
/* WARNING: Only wr_cqe and status are reliable at this point */
- trace_xprtrdma_wc_fastreg(wc, frwr);
+ trace_xprtrdma_wc_fastreg(wc, &frwr->fr_cid);
/* The MR will get recycled when the associated req is retransmitted */
rpcrdma_flush_disconnect(cq->cq_context, wc);
}
+static void frwr_cid_init(struct rpcrdma_ep *ep,
+ struct rpcrdma_frwr *frwr)
+{
+ struct rpc_rdma_cid *cid = &frwr->fr_cid;
+
+ cid->ci_queue_id = ep->re_attr.send_cq->res.id;
+ cid->ci_completion_id = frwr->fr_mr->res.id;
+}
+
/**
* frwr_send - post Send WRs containing the RPC Call message
* @r_xprt: controlling transport instance
@@ -385,6 +409,7 @@ static void frwr_wc_fastreg(struct ib_cq *cq, struct ib_wc *wc)
*/
int frwr_send(struct rpcrdma_xprt *r_xprt, struct rpcrdma_req *req)
{
+ struct rpcrdma_ep *ep = r_xprt->rx_ep;
struct ib_send_wr *post_wr;
struct rpcrdma_mr *mr;
@@ -395,6 +420,7 @@ int frwr_send(struct rpcrdma_xprt *r_xprt, struct rpcrdma_req *req)
frwr = &mr->frwr;
frwr->fr_cqe.done = frwr_wc_fastreg;
+ frwr_cid_init(ep, frwr);
frwr->fr_regwr.wr.next = post_wr;
frwr->fr_regwr.wr.wr_cqe = &frwr->fr_cqe;
frwr->fr_regwr.wr.num_sge = 0;
@@ -404,7 +430,7 @@ int frwr_send(struct rpcrdma_xprt *r_xprt, struct rpcrdma_req *req)
post_wr = &frwr->fr_regwr.wr;
}
- return ib_post_send(r_xprt->rx_ep->re_id->qp, post_wr, NULL);
+ return ib_post_send(ep->re_id->qp, post_wr, NULL);
}
/**
@@ -420,18 +446,17 @@ void frwr_reminv(struct rpcrdma_rep *rep, struct list_head *mrs)
list_for_each_entry(mr, mrs, mr_list)
if (mr->mr_handle == rep->rr_inv_rkey) {
list_del_init(&mr->mr_list);
- trace_xprtrdma_mr_reminv(mr);
- rpcrdma_mr_put(mr);
+ frwr_mr_put(mr);
break; /* only one invalidated MR per RPC */
}
}
-static void __frwr_release_mr(struct ib_wc *wc, struct rpcrdma_mr *mr)
+static void frwr_mr_done(struct ib_wc *wc, struct rpcrdma_mr *mr)
{
if (wc->status != IB_WC_SUCCESS)
frwr_mr_recycle(mr);
else
- rpcrdma_mr_put(mr);
+ frwr_mr_put(mr);
}
/**
@@ -448,8 +473,8 @@ static void frwr_wc_localinv(struct ib_cq *cq, struct ib_wc *wc)
struct rpcrdma_mr *mr = container_of(frwr, struct rpcrdma_mr, frwr);
/* WARNING: Only wr_cqe and status are reliable at this point */
- trace_xprtrdma_wc_li(wc, frwr);
- __frwr_release_mr(wc, mr);
+ trace_xprtrdma_wc_li(wc, &frwr->fr_cid);
+ frwr_mr_done(wc, mr);
rpcrdma_flush_disconnect(cq->cq_context, wc);
}
@@ -469,8 +494,8 @@ static void frwr_wc_localinv_wake(struct ib_cq *cq, struct ib_wc *wc)
struct rpcrdma_mr *mr = container_of(frwr, struct rpcrdma_mr, frwr);
/* WARNING: Only wr_cqe and status are reliable at this point */
- trace_xprtrdma_wc_li_wake(wc, frwr);
- __frwr_release_mr(wc, mr);
+ trace_xprtrdma_wc_li_wake(wc, &frwr->fr_cid);
+ frwr_mr_done(wc, mr);
complete(&frwr->fr_linv_done);
rpcrdma_flush_disconnect(cq->cq_context, wc);
@@ -490,6 +515,7 @@ static void frwr_wc_localinv_wake(struct ib_cq *cq, struct ib_wc *wc)
void frwr_unmap_sync(struct rpcrdma_xprt *r_xprt, struct rpcrdma_req *req)
{
struct ib_send_wr *first, **prev, *last;
+ struct rpcrdma_ep *ep = r_xprt->rx_ep;
const struct ib_send_wr *bad_wr;
struct rpcrdma_frwr *frwr;
struct rpcrdma_mr *mr;
@@ -509,6 +535,7 @@ void frwr_unmap_sync(struct rpcrdma_xprt *r_xprt, struct rpcrdma_req *req)
frwr = &mr->frwr;
frwr->fr_cqe.done = frwr_wc_localinv;
+ frwr_cid_init(ep, frwr);
last = &frwr->fr_invwr;
last->next = NULL;
last->wr_cqe = &frwr->fr_cqe;
@@ -534,7 +561,7 @@ void frwr_unmap_sync(struct rpcrdma_xprt *r_xprt, struct rpcrdma_req *req)
* unless re_id->qp is a valid pointer.
*/
bad_wr = NULL;
- rc = ib_post_send(r_xprt->rx_ep->re_id->qp, first, &bad_wr);
+ rc = ib_post_send(ep->re_id->qp, first, &bad_wr);
/* The final LOCAL_INV WR in the chain is supposed to
* do the wake. If it was never posted, the wake will
@@ -547,7 +574,7 @@ void frwr_unmap_sync(struct rpcrdma_xprt *r_xprt, struct rpcrdma_req *req)
/* Recycle MRs in the LOCAL_INV chain that did not get posted.
*/
- trace_xprtrdma_post_linv(req, rc);
+ trace_xprtrdma_post_linv_err(req, rc);
while (bad_wr) {
frwr = container_of(bad_wr, struct rpcrdma_frwr,
fr_invwr);
@@ -574,10 +601,10 @@ static void frwr_wc_localinv_done(struct ib_cq *cq, struct ib_wc *wc)
struct rpcrdma_rep *rep = mr->mr_req->rl_reply;
/* WARNING: Only wr_cqe and status are reliable at this point */
- trace_xprtrdma_wc_li_done(wc, frwr);
- __frwr_release_mr(wc, mr);
+ trace_xprtrdma_wc_li_done(wc, &frwr->fr_cid);
+ frwr_mr_done(wc, mr);
- /* Ensure @rep is generated before __frwr_release_mr */
+ /* Ensure @rep is generated before frwr_mr_done */
smp_rmb();
rpcrdma_complete_rqst(rep);
@@ -597,6 +624,7 @@ static void frwr_wc_localinv_done(struct ib_cq *cq, struct ib_wc *wc)
void frwr_unmap_async(struct rpcrdma_xprt *r_xprt, struct rpcrdma_req *req)
{
struct ib_send_wr *first, *last, **prev;
+ struct rpcrdma_ep *ep = r_xprt->rx_ep;
const struct ib_send_wr *bad_wr;
struct rpcrdma_frwr *frwr;
struct rpcrdma_mr *mr;
@@ -614,6 +642,7 @@ void frwr_unmap_async(struct rpcrdma_xprt *r_xprt, struct rpcrdma_req *req)
frwr = &mr->frwr;
frwr->fr_cqe.done = frwr_wc_localinv;
+ frwr_cid_init(ep, frwr);
last = &frwr->fr_invwr;
last->next = NULL;
last->wr_cqe = &frwr->fr_cqe;
@@ -639,13 +668,13 @@ void frwr_unmap_async(struct rpcrdma_xprt *r_xprt, struct rpcrdma_req *req)
* unless re_id->qp is a valid pointer.
*/
bad_wr = NULL;
- rc = ib_post_send(r_xprt->rx_ep->re_id->qp, first, &bad_wr);
+ rc = ib_post_send(ep->re_id->qp, first, &bad_wr);
if (!rc)
return;
/* Recycle MRs in the LOCAL_INV chain that did not get posted.
*/
- trace_xprtrdma_post_linv(req, rc);
+ trace_xprtrdma_post_linv_err(req, rc);
while (bad_wr) {
frwr = container_of(bad_wr, struct rpcrdma_frwr, fr_invwr);
mr = container_of(frwr, struct rpcrdma_mr, frwr);
diff --git a/net/sunrpc/xprtrdma/module.c b/net/sunrpc/xprtrdma/module.c
index 620327c01302..45c5b41ac8dc 100644
--- a/net/sunrpc/xprtrdma/module.c
+++ b/net/sunrpc/xprtrdma/module.c
@@ -24,6 +24,7 @@ MODULE_DESCRIPTION("RPC/RDMA Transport");
MODULE_LICENSE("Dual BSD/GPL");
MODULE_ALIAS("svcrdma");
MODULE_ALIAS("xprtrdma");
+MODULE_ALIAS("rpcrdma6");
static void __exit rpc_rdma_cleanup(void)
{
diff --git a/net/sunrpc/xprtrdma/rpc_rdma.c b/net/sunrpc/xprtrdma/rpc_rdma.c
index 0f5120c7668f..8f5d0cb68360 100644
--- a/net/sunrpc/xprtrdma/rpc_rdma.c
+++ b/net/sunrpc/xprtrdma/rpc_rdma.c
@@ -1,6 +1,6 @@
// SPDX-License-Identifier: GPL-2.0 OR BSD-3-Clause
/*
- * Copyright (c) 2014-2017 Oracle. All rights reserved.
+ * Copyright (c) 2014-2020, Oracle and/or its affiliates.
* Copyright (c) 2003-2007 Network Appliance, Inc. All rights reserved.
*
* This software is available to you under a choice of one of two
@@ -179,6 +179,31 @@ rpcrdma_nonpayload_inline(const struct rpcrdma_xprt *r_xprt,
r_xprt->rx_ep->re_max_inline_recv;
}
+/* ACL likes to be lazy in allocating pages. For TCP, these
+ * pages can be allocated during receive processing. Not true
+ * for RDMA, which must always provision receive buffers
+ * up front.
+ */
+static noinline int
+rpcrdma_alloc_sparse_pages(struct xdr_buf *buf)
+{
+ struct page **ppages;
+ int len;
+
+ len = buf->page_len;
+ ppages = buf->pages + (buf->page_base >> PAGE_SHIFT);
+ while (len > 0) {
+ if (!*ppages)
+ *ppages = alloc_page(GFP_NOWAIT | __GFP_NOWARN);
+ if (!*ppages)
+ return -ENOBUFS;
+ ppages++;
+ len -= PAGE_SIZE;
+ }
+
+ return 0;
+}
+
/* Split @vec on page boundaries into SGEs. FMR registers pages, not
* a byte range. Other modes coalesce these SGEs into a single MR
* when they can.
@@ -233,15 +258,6 @@ rpcrdma_convert_iovs(struct rpcrdma_xprt *r_xprt, struct xdr_buf *xdrbuf,
ppages = xdrbuf->pages + (xdrbuf->page_base >> PAGE_SHIFT);
page_base = offset_in_page(xdrbuf->page_base);
while (len) {
- /* ACL likes to be lazy in allocating pages - ACLs
- * are small by default but can get huge.
- */
- if (unlikely(xdrbuf->flags & XDRBUF_SPARSE_PAGES)) {
- if (!*ppages)
- *ppages = alloc_page(GFP_NOWAIT | __GFP_NOWARN);
- if (!*ppages)
- return -ENOBUFS;
- }
seg->mr_page = *ppages;
seg->mr_offset = (char *)page_base;
seg->mr_len = min_t(u32, PAGE_SIZE - page_base, len);
@@ -315,7 +331,6 @@ static struct rpcrdma_mr_seg *rpcrdma_mr_prepare(struct rpcrdma_xprt *r_xprt,
*mr = rpcrdma_mr_get(r_xprt);
if (!*mr)
goto out_getmr_err;
- trace_xprtrdma_mr_get(req);
(*mr)->mr_req = req;
}
@@ -323,7 +338,7 @@ static struct rpcrdma_mr_seg *rpcrdma_mr_prepare(struct rpcrdma_xprt *r_xprt,
return frwr_map(r_xprt, seg, nsegs, writing, req->rl_slot.rq_xid, *mr);
out_getmr_err:
- trace_xprtrdma_nomrs(req);
+ trace_xprtrdma_nomrs_err(r_xprt, req);
xprt_wait_for_buffer_space(&r_xprt->rx_xprt);
rpcrdma_mrs_refresh(r_xprt);
return ERR_PTR(-EAGAIN);
@@ -867,6 +882,12 @@ rpcrdma_marshal_req(struct rpcrdma_xprt *r_xprt, struct rpc_rqst *rqst)
__be32 *p;
int ret;
+ if (unlikely(rqst->rq_rcv_buf.flags & XDRBUF_SPARSE_PAGES)) {
+ ret = rpcrdma_alloc_sparse_pages(&rqst->rq_rcv_buf);
+ if (ret)
+ return ret;
+ }
+
rpcrdma_set_xdrlen(&req->rl_hdrbuf, 0);
xdr_init_encode(xdr, &req->rl_hdrbuf, rdmab_data(req->rl_rdmabuf),
rqst);
@@ -1322,20 +1343,13 @@ rpcrdma_decode_error(struct rpcrdma_xprt *r_xprt, struct rpcrdma_rep *rep,
p = xdr_inline_decode(xdr, 2 * sizeof(*p));
if (!p)
break;
- dprintk("RPC: %s: server reports "
- "version error (%u-%u), xid %08x\n", __func__,
- be32_to_cpup(p), be32_to_cpu(*(p + 1)),
- be32_to_cpu(rep->rr_xid));
+ trace_xprtrdma_err_vers(rqst, p, p + 1);
break;
case err_chunk:
- dprintk("RPC: %s: server reports "
- "header decoding error, xid %08x\n", __func__,
- be32_to_cpu(rep->rr_xid));
+ trace_xprtrdma_err_chunk(rqst);
break;
default:
- dprintk("RPC: %s: server reports "
- "unrecognized error %d, xid %08x\n", __func__,
- be32_to_cpup(p), be32_to_cpu(rep->rr_xid));
+ trace_xprtrdma_err_unrecognized(rqst, p);
}
return -EIO;
@@ -1376,7 +1390,7 @@ out:
return;
out_badheader:
- trace_xprtrdma_reply_hdr(rep);
+ trace_xprtrdma_reply_hdr_err(rep);
r_xprt->rx_stats.bad_reply_count++;
rqst->rq_task->tk_status = status;
status = 0;
@@ -1450,14 +1464,12 @@ void rpcrdma_reply_handler(struct rpcrdma_rep *rep)
rpcrdma_post_recvs(r_xprt, false);
req = rpcr_to_rdmar(rqst);
- if (req->rl_reply) {
- trace_xprtrdma_leaked_rep(rqst, req->rl_reply);
+ if (unlikely(req->rl_reply))
rpcrdma_recv_buffer_put(req->rl_reply);
- }
req->rl_reply = rep;
rep->rr_rqst = rqst;
- trace_xprtrdma_reply(rqst->rq_task, rep, req, credits);
+ trace_xprtrdma_reply(rqst->rq_task, rep, credits);
if (rep->rr_wc_flags & IB_WC_WITH_INVALIDATE)
frwr_reminv(rep, &req->rl_registered);
@@ -1469,16 +1481,16 @@ void rpcrdma_reply_handler(struct rpcrdma_rep *rep)
return;
out_badversion:
- trace_xprtrdma_reply_vers(rep);
+ trace_xprtrdma_reply_vers_err(rep);
goto out;
out_norqst:
spin_unlock(&xprt->queue_lock);
- trace_xprtrdma_reply_rqst(rep);
+ trace_xprtrdma_reply_rqst_err(rep);
goto out;
out_shortreply:
- trace_xprtrdma_reply_short(rep);
+ trace_xprtrdma_reply_short_err(rep);
out:
rpcrdma_recv_buffer_put(rep);
diff --git a/net/sunrpc/xprtrdma/svc_rdma_backchannel.c b/net/sunrpc/xprtrdma/svc_rdma_backchannel.c
index 5e7c4ba9e147..63f8be974df2 100644
--- a/net/sunrpc/xprtrdma/svc_rdma_backchannel.c
+++ b/net/sunrpc/xprtrdma/svc_rdma_backchannel.c
@@ -74,11 +74,17 @@ out_unlock:
*/
static int svc_rdma_bc_sendto(struct svcxprt_rdma *rdma,
struct rpc_rqst *rqst,
- struct svc_rdma_send_ctxt *ctxt)
+ struct svc_rdma_send_ctxt *sctxt)
{
+ struct svc_rdma_recv_ctxt *rctxt;
int ret;
- ret = svc_rdma_map_reply_msg(rdma, ctxt, NULL, &rqst->rq_snd_buf);
+ rctxt = svc_rdma_recv_ctxt_get(rdma);
+ if (!rctxt)
+ return -EIO;
+
+ ret = svc_rdma_map_reply_msg(rdma, sctxt, rctxt, &rqst->rq_snd_buf);
+ svc_rdma_recv_ctxt_put(rdma, rctxt);
if (ret < 0)
return -EIO;
@@ -86,8 +92,8 @@ static int svc_rdma_bc_sendto(struct svcxprt_rdma *rdma,
* the rq_buffer before all retransmits are complete.
*/
get_page(virt_to_page(rqst->rq_buffer));
- ctxt->sc_send_wr.opcode = IB_WR_SEND;
- return svc_rdma_send(rdma, ctxt);
+ sctxt->sc_send_wr.opcode = IB_WR_SEND;
+ return svc_rdma_send(rdma, sctxt);
}
/* Server-side transport endpoint wants a whole page for its send
diff --git a/net/sunrpc/xprtrdma/svc_rdma_pcl.c b/net/sunrpc/xprtrdma/svc_rdma_pcl.c
new file mode 100644
index 000000000000..b63cfeaa2923
--- /dev/null
+++ b/net/sunrpc/xprtrdma/svc_rdma_pcl.c
@@ -0,0 +1,306 @@
+// SPDX-License-Identifier: GPL-2.0
+/*
+ * Copyright (c) 2020 Oracle. All rights reserved.
+ */
+
+#include <linux/sunrpc/svc_rdma.h>
+#include <linux/sunrpc/rpc_rdma.h>
+
+#include "xprt_rdma.h"
+#include <trace/events/rpcrdma.h>
+
+/**
+ * pcl_free - Release all memory associated with a parsed chunk list
+ * @pcl: parsed chunk list
+ *
+ */
+void pcl_free(struct svc_rdma_pcl *pcl)
+{
+ while (!list_empty(&pcl->cl_chunks)) {
+ struct svc_rdma_chunk *chunk;
+
+ chunk = pcl_first_chunk(pcl);
+ list_del(&chunk->ch_list);
+ kfree(chunk);
+ }
+}
+
+static struct svc_rdma_chunk *pcl_alloc_chunk(u32 segcount, u32 position)
+{
+ struct svc_rdma_chunk *chunk;
+
+ chunk = kmalloc(struct_size(chunk, ch_segments, segcount), GFP_KERNEL);
+ if (!chunk)
+ return NULL;
+
+ chunk->ch_position = position;
+ chunk->ch_length = 0;
+ chunk->ch_payload_length = 0;
+ chunk->ch_segcount = 0;
+ return chunk;
+}
+
+static struct svc_rdma_chunk *
+pcl_lookup_position(struct svc_rdma_pcl *pcl, u32 position)
+{
+ struct svc_rdma_chunk *pos;
+
+ pcl_for_each_chunk(pos, pcl) {
+ if (pos->ch_position == position)
+ return pos;
+ }
+ return NULL;
+}
+
+static void pcl_insert_position(struct svc_rdma_pcl *pcl,
+ struct svc_rdma_chunk *chunk)
+{
+ struct svc_rdma_chunk *pos;
+
+ pcl_for_each_chunk(pos, pcl) {
+ if (pos->ch_position > chunk->ch_position)
+ break;
+ }
+ __list_add(&chunk->ch_list, pos->ch_list.prev, &pos->ch_list);
+ pcl->cl_count++;
+}
+
+static void pcl_set_read_segment(const struct svc_rdma_recv_ctxt *rctxt,
+ struct svc_rdma_chunk *chunk,
+ u32 handle, u32 length, u64 offset)
+{
+ struct svc_rdma_segment *segment;
+
+ segment = &chunk->ch_segments[chunk->ch_segcount];
+ segment->rs_handle = handle;
+ segment->rs_length = length;
+ segment->rs_offset = offset;
+
+ trace_svcrdma_decode_rseg(&rctxt->rc_cid, chunk, segment);
+
+ chunk->ch_length += length;
+ chunk->ch_segcount++;
+}
+
+/**
+ * pcl_alloc_call - Construct a parsed chunk list for the Call body
+ * @rctxt: Ingress receive context
+ * @p: Start of an un-decoded Read list
+ *
+ * Assumptions:
+ * - The incoming Read list has already been sanity checked.
+ * - cl_count is already set to the number of segments in
+ * the un-decoded list.
+ * - The list might not be in order by position.
+ *
+ * Return values:
+ * %true: Parsed chunk list was successfully constructed, and
+ * cl_count is updated to be the number of chunks (ie.
+ * unique positions) in the Read list.
+ * %false: Memory allocation failed.
+ */
+bool pcl_alloc_call(struct svc_rdma_recv_ctxt *rctxt, __be32 *p)
+{
+ struct svc_rdma_pcl *pcl = &rctxt->rc_call_pcl;
+ unsigned int i, segcount = pcl->cl_count;
+
+ pcl->cl_count = 0;
+ for (i = 0; i < segcount; i++) {
+ struct svc_rdma_chunk *chunk;
+ u32 position, handle, length;
+ u64 offset;
+
+ p++; /* skip the list discriminator */
+ p = xdr_decode_read_segment(p, &position, &handle,
+ &length, &offset);
+ if (position != 0)
+ continue;
+
+ if (pcl_is_empty(pcl)) {
+ chunk = pcl_alloc_chunk(segcount, position);
+ if (!chunk)
+ return false;
+ pcl_insert_position(pcl, chunk);
+ } else {
+ chunk = list_first_entry(&pcl->cl_chunks,
+ struct svc_rdma_chunk,
+ ch_list);
+ }
+
+ pcl_set_read_segment(rctxt, chunk, handle, length, offset);
+ }
+
+ return true;
+}
+
+/**
+ * pcl_alloc_read - Construct a parsed chunk list for normal Read chunks
+ * @rctxt: Ingress receive context
+ * @p: Start of an un-decoded Read list
+ *
+ * Assumptions:
+ * - The incoming Read list has already been sanity checked.
+ * - cl_count is already set to the number of segments in
+ * the un-decoded list.
+ * - The list might not be in order by position.
+ *
+ * Return values:
+ * %true: Parsed chunk list was successfully constructed, and
+ * cl_count is updated to be the number of chunks (ie.
+ * unique position values) in the Read list.
+ * %false: Memory allocation failed.
+ *
+ * TODO:
+ * - Check for chunk range overlaps
+ */
+bool pcl_alloc_read(struct svc_rdma_recv_ctxt *rctxt, __be32 *p)
+{
+ struct svc_rdma_pcl *pcl = &rctxt->rc_read_pcl;
+ unsigned int i, segcount = pcl->cl_count;
+
+ pcl->cl_count = 0;
+ for (i = 0; i < segcount; i++) {
+ struct svc_rdma_chunk *chunk;
+ u32 position, handle, length;
+ u64 offset;
+
+ p++; /* skip the list discriminator */
+ p = xdr_decode_read_segment(p, &position, &handle,
+ &length, &offset);
+ if (position == 0)
+ continue;
+
+ chunk = pcl_lookup_position(pcl, position);
+ if (!chunk) {
+ chunk = pcl_alloc_chunk(segcount, position);
+ if (!chunk)
+ return false;
+ pcl_insert_position(pcl, chunk);
+ }
+
+ pcl_set_read_segment(rctxt, chunk, handle, length, offset);
+ }
+
+ return true;
+}
+
+/**
+ * pcl_alloc_write - Construct a parsed chunk list from a Write list
+ * @rctxt: Ingress receive context
+ * @pcl: Parsed chunk list to populate
+ * @p: Start of an un-decoded Write list
+ *
+ * Assumptions:
+ * - The incoming Write list has already been sanity checked, and
+ * - cl_count is set to the number of chunks in the un-decoded list.
+ *
+ * Return values:
+ * %true: Parsed chunk list was successfully constructed.
+ * %false: Memory allocation failed.
+ */
+bool pcl_alloc_write(struct svc_rdma_recv_ctxt *rctxt,
+ struct svc_rdma_pcl *pcl, __be32 *p)
+{
+ struct svc_rdma_segment *segment;
+ struct svc_rdma_chunk *chunk;
+ unsigned int i, j;
+ u32 segcount;
+
+ for (i = 0; i < pcl->cl_count; i++) {
+ p++; /* skip the list discriminator */
+ segcount = be32_to_cpup(p++);
+
+ chunk = pcl_alloc_chunk(segcount, 0);
+ if (!chunk)
+ return false;
+ list_add_tail(&chunk->ch_list, &pcl->cl_chunks);
+
+ for (j = 0; j < segcount; j++) {
+ segment = &chunk->ch_segments[j];
+ p = xdr_decode_rdma_segment(p, &segment->rs_handle,
+ &segment->rs_length,
+ &segment->rs_offset);
+ trace_svcrdma_decode_wseg(&rctxt->rc_cid, chunk, j);
+
+ chunk->ch_length += segment->rs_length;
+ chunk->ch_segcount++;
+ }
+ }
+ return true;
+}
+
+static int pcl_process_region(const struct xdr_buf *xdr,
+ unsigned int offset, unsigned int length,
+ int (*actor)(const struct xdr_buf *, void *),
+ void *data)
+{
+ struct xdr_buf subbuf;
+
+ if (!length)
+ return 0;
+ if (xdr_buf_subsegment(xdr, &subbuf, offset, length))
+ return -EMSGSIZE;
+ return actor(&subbuf, data);
+}
+
+/**
+ * pcl_process_nonpayloads - Process non-payload regions inside @xdr
+ * @pcl: Chunk list to process
+ * @xdr: xdr_buf to process
+ * @actor: Function to invoke on each non-payload region
+ * @data: Arguments for @actor
+ *
+ * This mechanism must ignore not only result payloads that were already
+ * sent via RDMA Write, but also XDR padding for those payloads that
+ * the upper layer has added.
+ *
+ * Assumptions:
+ * The xdr->len and ch_position fields are aligned to 4-byte multiples.
+ *
+ * Returns:
+ * On success, zero,
+ * %-EMSGSIZE on XDR buffer overflow, or
+ * The return value of @actor
+ */
+int pcl_process_nonpayloads(const struct svc_rdma_pcl *pcl,
+ const struct xdr_buf *xdr,
+ int (*actor)(const struct xdr_buf *, void *),
+ void *data)
+{
+ struct svc_rdma_chunk *chunk, *next;
+ unsigned int start;
+ int ret;
+
+ chunk = pcl_first_chunk(pcl);
+
+ /* No result payloads were generated */
+ if (!chunk || !chunk->ch_payload_length)
+ return actor(xdr, data);
+
+ /* Process the region before the first result payload */
+ ret = pcl_process_region(xdr, 0, chunk->ch_position, actor, data);
+ if (ret < 0)
+ return ret;
+
+ /* Process the regions between each middle result payload */
+ while ((next = pcl_next_chunk(pcl, chunk))) {
+ if (!next->ch_payload_length)
+ break;
+
+ start = pcl_chunk_end_offset(chunk);
+ ret = pcl_process_region(xdr, start, next->ch_position - start,
+ actor, data);
+ if (ret < 0)
+ return ret;
+
+ chunk = next;
+ }
+
+ /* Process the region after the last result payload */
+ start = pcl_chunk_end_offset(chunk);
+ ret = pcl_process_region(xdr, start, xdr->len - start, actor, data);
+ if (ret < 0)
+ return ret;
+
+ return 0;
+}
diff --git a/net/sunrpc/xprtrdma/svc_rdma_recvfrom.c b/net/sunrpc/xprtrdma/svc_rdma_recvfrom.c
index c6ea2903c21a..cbdb71247755 100644
--- a/net/sunrpc/xprtrdma/svc_rdma_recvfrom.c
+++ b/net/sunrpc/xprtrdma/svc_rdma_recvfrom.c
@@ -93,6 +93,7 @@
* (see rdma_read_complete() below).
*/
+#include <linux/slab.h>
#include <linux/spinlock.h>
#include <asm/unaligned.h>
#include <rdma/ib_verbs.h>
@@ -143,6 +144,10 @@ svc_rdma_recv_ctxt_alloc(struct svcxprt_rdma *rdma)
goto fail2;
svc_rdma_recv_cid_init(rdma, &ctxt->rc_cid);
+ pcl_init(&ctxt->rc_call_pcl);
+ pcl_init(&ctxt->rc_read_pcl);
+ pcl_init(&ctxt->rc_write_pcl);
+ pcl_init(&ctxt->rc_reply_pcl);
ctxt->rc_recv_wr.next = NULL;
ctxt->rc_recv_wr.wr_cqe = &ctxt->rc_cqe;
@@ -189,8 +194,13 @@ void svc_rdma_recv_ctxts_destroy(struct svcxprt_rdma *rdma)
}
}
-static struct svc_rdma_recv_ctxt *
-svc_rdma_recv_ctxt_get(struct svcxprt_rdma *rdma)
+/**
+ * svc_rdma_recv_ctxt_get - Allocate a recv_ctxt
+ * @rdma: controlling svcxprt_rdma
+ *
+ * Returns a recv_ctxt or (rarely) NULL if none are available.
+ */
+struct svc_rdma_recv_ctxt *svc_rdma_recv_ctxt_get(struct svcxprt_rdma *rdma)
{
struct svc_rdma_recv_ctxt *ctxt;
struct llist_node *node;
@@ -202,7 +212,6 @@ svc_rdma_recv_ctxt_get(struct svcxprt_rdma *rdma)
out:
ctxt->rc_page_count = 0;
- ctxt->rc_read_payload_length = 0;
return ctxt;
out_empty:
@@ -226,6 +235,11 @@ void svc_rdma_recv_ctxt_put(struct svcxprt_rdma *rdma,
for (i = 0; i < ctxt->rc_page_count; i++)
put_page(ctxt->rc_pages[i]);
+ pcl_free(&ctxt->rc_call_pcl);
+ pcl_free(&ctxt->rc_read_pcl);
+ pcl_free(&ctxt->rc_write_pcl);
+ pcl_free(&ctxt->rc_reply_pcl);
+
if (!ctxt->rc_temp)
llist_add(&ctxt->rc_node, &rdma->sc_recv_ctxts);
else
@@ -385,100 +399,123 @@ static void svc_rdma_build_arg_xdr(struct svc_rqst *rqstp,
arg->len = ctxt->rc_byte_len;
}
-/* This accommodates the largest possible Write chunk.
- */
-#define MAX_BYTES_WRITE_CHUNK ((u32)(RPCSVC_MAXPAGES << PAGE_SHIFT))
-
-/* This accommodates the largest possible Position-Zero
- * Read chunk or Reply chunk.
- */
-#define MAX_BYTES_SPECIAL_CHUNK ((u32)((RPCSVC_MAXPAGES + 2) << PAGE_SHIFT))
-
-/* Sanity check the Read list.
- *
- * Implementation limits:
- * - This implementation supports only one Read chunk.
+/**
+ * xdr_count_read_segments - Count number of Read segments in Read list
+ * @rctxt: Ingress receive context
+ * @p: Start of an un-decoded Read list
*
- * Sanity checks:
- * - Read list does not overflow Receive buffer.
- * - Segment size limited by largest NFS data payload.
+ * Before allocating anything, ensure the ingress Read list is safe
+ * to use.
*
- * The segment count is limited to how many segments can
- * fit in the transport header without overflowing the
- * buffer. That's about 40 Read segments for a 1KB inline
- * threshold.
+ * The segment count is limited to how many segments can fit in the
+ * transport header without overflowing the buffer. That's about 40
+ * Read segments for a 1KB inline threshold.
*
* Return values:
- * %true: Read list is valid. @rctxt's xdr_stream is updated
- * to point to the first byte past the Read list.
- * %false: Read list is corrupt. @rctxt's xdr_stream is left
- * in an unknown state.
+ * %true: Read list is valid. @rctxt's xdr_stream is updated to point
+ * to the first byte past the Read list. rc_read_pcl and
+ * rc_call_pcl cl_count fields are set to the number of
+ * Read segments in the list.
+ * %false: Read list is corrupt. @rctxt's xdr_stream is left in an
+ * unknown state.
*/
-static bool xdr_check_read_list(struct svc_rdma_recv_ctxt *rctxt)
+static bool xdr_count_read_segments(struct svc_rdma_recv_ctxt *rctxt, __be32 *p)
{
- u32 position, len;
- bool first;
- __be32 *p;
-
- p = xdr_inline_decode(&rctxt->rc_stream, sizeof(*p));
- if (!p)
- return false;
-
- len = 0;
- first = true;
+ rctxt->rc_call_pcl.cl_count = 0;
+ rctxt->rc_read_pcl.cl_count = 0;
while (xdr_item_is_present(p)) {
+ u32 position, handle, length;
+ u64 offset;
+
p = xdr_inline_decode(&rctxt->rc_stream,
rpcrdma_readseg_maxsz * sizeof(*p));
if (!p)
return false;
- if (first) {
- position = be32_to_cpup(p);
- first = false;
- } else if (be32_to_cpup(p) != position) {
- return false;
+ xdr_decode_read_segment(p, &position, &handle,
+ &length, &offset);
+ if (position) {
+ if (position & 3)
+ return false;
+ ++rctxt->rc_read_pcl.cl_count;
+ } else {
+ ++rctxt->rc_call_pcl.cl_count;
}
- p += 2;
- len += be32_to_cpup(p);
p = xdr_inline_decode(&rctxt->rc_stream, sizeof(*p));
if (!p)
return false;
}
- return len <= MAX_BYTES_SPECIAL_CHUNK;
+ return true;
}
-/* The segment count is limited to how many segments can
- * fit in the transport header without overflowing the
- * buffer. That's about 60 Write segments for a 1KB inline
- * threshold.
+/* Sanity check the Read list.
+ *
+ * Sanity checks:
+ * - Read list does not overflow Receive buffer.
+ * - Chunk size limited by largest NFS data payload.
+ *
+ * Return values:
+ * %true: Read list is valid. @rctxt's xdr_stream is updated
+ * to point to the first byte past the Read list.
+ * %false: Read list is corrupt. @rctxt's xdr_stream is left
+ * in an unknown state.
*/
-static bool xdr_check_write_chunk(struct svc_rdma_recv_ctxt *rctxt, u32 maxlen)
+static bool xdr_check_read_list(struct svc_rdma_recv_ctxt *rctxt)
{
- u32 i, segcount, total;
__be32 *p;
p = xdr_inline_decode(&rctxt->rc_stream, sizeof(*p));
if (!p)
return false;
- segcount = be32_to_cpup(p);
+ if (!xdr_count_read_segments(rctxt, p))
+ return false;
+ if (!pcl_alloc_call(rctxt, p))
+ return false;
+ return pcl_alloc_read(rctxt, p);
+}
- total = 0;
- for (i = 0; i < segcount; i++) {
- u32 handle, length;
- u64 offset;
+static bool xdr_check_write_chunk(struct svc_rdma_recv_ctxt *rctxt)
+{
+ u32 segcount;
+ __be32 *p;
- p = xdr_inline_decode(&rctxt->rc_stream,
- rpcrdma_segment_maxsz * sizeof(*p));
- if (!p)
- return false;
+ if (xdr_stream_decode_u32(&rctxt->rc_stream, &segcount))
+ return false;
- xdr_decode_rdma_segment(p, &handle, &length, &offset);
- trace_svcrdma_decode_wseg(handle, length, offset);
+ /* A bogus segcount causes this buffer overflow check to fail. */
+ p = xdr_inline_decode(&rctxt->rc_stream,
+ segcount * rpcrdma_segment_maxsz * sizeof(*p));
+ return p != NULL;
+}
- total += length;
+/**
+ * xdr_count_write_chunks - Count number of Write chunks in Write list
+ * @rctxt: Received header and decoding state
+ * @p: start of an un-decoded Write list
+ *
+ * Before allocating anything, ensure the ingress Write list is
+ * safe to use.
+ *
+ * Return values:
+ * %true: Write list is valid. @rctxt's xdr_stream is updated
+ * to point to the first byte past the Write list, and
+ * the number of Write chunks is in rc_write_pcl.cl_count.
+ * %false: Write list is corrupt. @rctxt's xdr_stream is left
+ * in an indeterminate state.
+ */
+static bool xdr_count_write_chunks(struct svc_rdma_recv_ctxt *rctxt, __be32 *p)
+{
+ rctxt->rc_write_pcl.cl_count = 0;
+ while (xdr_item_is_present(p)) {
+ if (!xdr_check_write_chunk(rctxt))
+ return false;
+ ++rctxt->rc_write_pcl.cl_count;
+ p = xdr_inline_decode(&rctxt->rc_stream, sizeof(*p));
+ if (!p)
+ return false;
}
- return total <= maxlen;
+ return true;
}
/* Sanity check the Write list.
@@ -498,24 +535,18 @@ static bool xdr_check_write_chunk(struct svc_rdma_recv_ctxt *rctxt, u32 maxlen)
*/
static bool xdr_check_write_list(struct svc_rdma_recv_ctxt *rctxt)
{
- u32 chcount = 0;
__be32 *p;
p = xdr_inline_decode(&rctxt->rc_stream, sizeof(*p));
if (!p)
return false;
- rctxt->rc_write_list = p;
- while (xdr_item_is_present(p)) {
- if (!xdr_check_write_chunk(rctxt, MAX_BYTES_WRITE_CHUNK))
- return false;
- ++chcount;
- p = xdr_inline_decode(&rctxt->rc_stream, sizeof(*p));
- if (!p)
- return false;
- }
- if (!chcount)
- rctxt->rc_write_list = NULL;
- return chcount < 2;
+ if (!xdr_count_write_chunks(rctxt, p))
+ return false;
+ if (!pcl_alloc_write(rctxt, &rctxt->rc_write_pcl, p))
+ return false;
+
+ rctxt->rc_cur_result_payload = pcl_first_chunk(&rctxt->rc_write_pcl);
+ return true;
}
/* Sanity check the Reply chunk.
@@ -537,13 +568,14 @@ static bool xdr_check_reply_chunk(struct svc_rdma_recv_ctxt *rctxt)
p = xdr_inline_decode(&rctxt->rc_stream, sizeof(*p));
if (!p)
return false;
- rctxt->rc_reply_chunk = NULL;
- if (xdr_item_is_present(p)) {
- if (!xdr_check_write_chunk(rctxt, MAX_BYTES_SPECIAL_CHUNK))
- return false;
- rctxt->rc_reply_chunk = p;
- }
- return true;
+
+ if (!xdr_item_is_present(p))
+ return true;
+ if (!xdr_check_write_chunk(rctxt))
+ return false;
+
+ rctxt->rc_reply_pcl.cl_count = 1;
+ return pcl_alloc_write(rctxt, &rctxt->rc_reply_pcl, p);
}
/* RPC-over-RDMA Version One private extension: Remote Invalidation.
@@ -552,60 +584,53 @@ static bool xdr_check_reply_chunk(struct svc_rdma_recv_ctxt *rctxt)
*
* If there is exactly one distinct R_key in the received transport
* header, set rc_inv_rkey to that R_key. Otherwise, set it to zero.
- *
- * Perform this operation while the received transport header is
- * still in the CPU cache.
*/
static void svc_rdma_get_inv_rkey(struct svcxprt_rdma *rdma,
struct svc_rdma_recv_ctxt *ctxt)
{
- __be32 inv_rkey, *p;
- u32 i, segcount;
+ struct svc_rdma_segment *segment;
+ struct svc_rdma_chunk *chunk;
+ u32 inv_rkey;
ctxt->rc_inv_rkey = 0;
if (!rdma->sc_snd_w_inv)
return;
- inv_rkey = xdr_zero;
- p = ctxt->rc_recv_buf;
- p += rpcrdma_fixed_maxsz;
-
- /* Read list */
- while (xdr_item_is_present(p++)) {
- p++; /* position */
- if (inv_rkey == xdr_zero)
- inv_rkey = *p;
- else if (inv_rkey != *p)
- return;
- p += 4;
+ inv_rkey = 0;
+ pcl_for_each_chunk(chunk, &ctxt->rc_call_pcl) {
+ pcl_for_each_segment(segment, chunk) {
+ if (inv_rkey == 0)
+ inv_rkey = segment->rs_handle;
+ else if (inv_rkey != segment->rs_handle)
+ return;
+ }
}
-
- /* Write list */
- while (xdr_item_is_present(p++)) {
- segcount = be32_to_cpup(p++);
- for (i = 0; i < segcount; i++) {
- if (inv_rkey == xdr_zero)
- inv_rkey = *p;
- else if (inv_rkey != *p)
+ pcl_for_each_chunk(chunk, &ctxt->rc_read_pcl) {
+ pcl_for_each_segment(segment, chunk) {
+ if (inv_rkey == 0)
+ inv_rkey = segment->rs_handle;
+ else if (inv_rkey != segment->rs_handle)
return;
- p += 4;
}
}
-
- /* Reply chunk */
- if (xdr_item_is_present(p++)) {
- segcount = be32_to_cpup(p++);
- for (i = 0; i < segcount; i++) {
- if (inv_rkey == xdr_zero)
- inv_rkey = *p;
- else if (inv_rkey != *p)
+ pcl_for_each_chunk(chunk, &ctxt->rc_write_pcl) {
+ pcl_for_each_segment(segment, chunk) {
+ if (inv_rkey == 0)
+ inv_rkey = segment->rs_handle;
+ else if (inv_rkey != segment->rs_handle)
return;
- p += 4;
}
}
-
- ctxt->rc_inv_rkey = be32_to_cpu(inv_rkey);
+ pcl_for_each_chunk(chunk, &ctxt->rc_reply_pcl) {
+ pcl_for_each_segment(segment, chunk) {
+ if (inv_rkey == 0)
+ inv_rkey = segment->rs_handle;
+ else if (inv_rkey != segment->rs_handle)
+ return;
+ }
+ }
+ ctxt->rc_inv_rkey = inv_rkey;
}
/**
@@ -641,7 +666,8 @@ static int svc_rdma_xdr_decode_req(struct xdr_buf *rq_arg,
if (*p != rpcrdma_version)
goto out_version;
p += 2;
- switch (*p) {
+ rctxt->rc_msgtype = *p;
+ switch (rctxt->rc_msgtype) {
case rdma_msg:
break;
case rdma_nomsg:
@@ -735,30 +761,28 @@ static void svc_rdma_send_error(struct svcxprt_rdma *rdma,
* the RPC/RDMA header small and fixed in size, so it is
* straightforward to check the RPC header's direction field.
*/
-static bool svc_rdma_is_backchannel_reply(struct svc_xprt *xprt,
- __be32 *rdma_resp)
+static bool svc_rdma_is_reverse_direction_reply(struct svc_xprt *xprt,
+ struct svc_rdma_recv_ctxt *rctxt)
{
- __be32 *p;
+ __be32 *p = rctxt->rc_recv_buf;
if (!xprt->xpt_bc_xprt)
return false;
- p = rdma_resp + 3;
- if (*p++ != rdma_msg)
+ if (rctxt->rc_msgtype != rdma_msg)
return false;
- if (*p++ != xdr_zero)
+ if (!pcl_is_empty(&rctxt->rc_call_pcl))
return false;
- if (*p++ != xdr_zero)
+ if (!pcl_is_empty(&rctxt->rc_read_pcl))
return false;
- if (*p++ != xdr_zero)
+ if (!pcl_is_empty(&rctxt->rc_write_pcl))
return false;
-
- /* XID sanity */
- if (*p++ != *rdma_resp)
+ if (!pcl_is_empty(&rctxt->rc_reply_pcl))
return false;
- /* call direction */
- if (*p == cpu_to_be32(RPC_CALL))
+
+ /* RPC call direction */
+ if (*(p + 8) == cpu_to_be32(RPC_CALL))
return false;
return true;
@@ -800,7 +824,6 @@ int svc_rdma_recvfrom(struct svc_rqst *rqstp)
struct svcxprt_rdma *rdma_xprt =
container_of(xprt, struct svcxprt_rdma, sc_xprt);
struct svc_rdma_recv_ctxt *ctxt;
- __be32 *p;
int ret;
rqstp->rq_xprt_ctxt = NULL;
@@ -833,7 +856,6 @@ int svc_rdma_recvfrom(struct svc_rqst *rqstp)
rqstp->rq_respages = rqstp->rq_pages;
rqstp->rq_next_page = rqstp->rq_respages;
- p = (__be32 *)rqstp->rq_arg.head[0].iov_base;
ret = svc_rdma_xdr_decode_req(&rqstp->rq_arg, ctxt);
if (ret < 0)
goto out_err;
@@ -841,14 +863,14 @@ int svc_rdma_recvfrom(struct svc_rqst *rqstp)
goto out_drop;
rqstp->rq_xprt_hlen = ret;
- if (svc_rdma_is_backchannel_reply(xprt, p))
+ if (svc_rdma_is_reverse_direction_reply(xprt, ctxt))
goto out_backchannel;
svc_rdma_get_inv_rkey(rdma_xprt, ctxt);
- p += rpcrdma_fixed_maxsz;
- if (*p != xdr_zero)
- goto out_readchunk;
+ if (!pcl_is_empty(&ctxt->rc_read_pcl) ||
+ !pcl_is_empty(&ctxt->rc_call_pcl))
+ goto out_readlist;
complete:
rqstp->rq_xprt_ctxt = ctxt;
@@ -856,10 +878,10 @@ complete:
svc_xprt_copy_addrs(rqstp, xprt);
return rqstp->rq_arg.len;
-out_readchunk:
- ret = svc_rdma_recv_read_chunk(rdma_xprt, rqstp, ctxt, p);
+out_readlist:
+ ret = svc_rdma_process_read_list(rdma_xprt, rqstp, ctxt);
if (ret < 0)
- goto out_postfail;
+ goto out_readfail;
return 0;
out_err:
@@ -867,7 +889,7 @@ out_err:
svc_rdma_recv_ctxt_put(rdma_xprt, ctxt);
return 0;
-out_postfail:
+out_readfail:
if (ret == -EINVAL)
svc_rdma_send_error(rdma_xprt, ctxt, ret);
svc_rdma_recv_ctxt_put(rdma_xprt, ctxt);
diff --git a/net/sunrpc/xprtrdma/svc_rdma_rw.c b/net/sunrpc/xprtrdma/svc_rdma_rw.c
index 80a0c0e87590..0b63e1321d74 100644
--- a/net/sunrpc/xprtrdma/svc_rdma_rw.c
+++ b/net/sunrpc/xprtrdma/svc_rdma_rw.c
@@ -190,14 +190,14 @@ static void svc_rdma_cc_release(struct svc_rdma_chunk_ctxt *cc,
* - Stores arguments for the SGL constructor functions
*/
struct svc_rdma_write_info {
+ const struct svc_rdma_chunk *wi_chunk;
+
/* write state of this chunk */
unsigned int wi_seg_off;
unsigned int wi_seg_no;
- unsigned int wi_nsegs;
- __be32 *wi_segs;
/* SGL constructor arguments */
- struct xdr_buf *wi_xdr;
+ const struct xdr_buf *wi_xdr;
unsigned char *wi_base;
unsigned int wi_next_off;
@@ -205,7 +205,8 @@ struct svc_rdma_write_info {
};
static struct svc_rdma_write_info *
-svc_rdma_write_info_alloc(struct svcxprt_rdma *rdma, __be32 *chunk)
+svc_rdma_write_info_alloc(struct svcxprt_rdma *rdma,
+ const struct svc_rdma_chunk *chunk)
{
struct svc_rdma_write_info *info;
@@ -213,10 +214,9 @@ svc_rdma_write_info_alloc(struct svcxprt_rdma *rdma, __be32 *chunk)
if (!info)
return info;
+ info->wi_chunk = chunk;
info->wi_seg_off = 0;
info->wi_seg_no = 0;
- info->wi_nsegs = be32_to_cpup(++chunk);
- info->wi_segs = ++chunk;
svc_rdma_cc_init(rdma, &info->wi_cc);
info->wi_cc.cc_cqe.done = svc_rdma_write_done;
return info;
@@ -258,11 +258,11 @@ static void svc_rdma_write_done(struct ib_cq *cq, struct ib_wc *wc)
/* State for pulling a Read chunk.
*/
struct svc_rdma_read_info {
+ struct svc_rqst *ri_rqst;
struct svc_rdma_recv_ctxt *ri_readctxt;
- unsigned int ri_position;
unsigned int ri_pageno;
unsigned int ri_pageoff;
- unsigned int ri_chunklen;
+ unsigned int ri_totalbytes;
struct svc_rdma_chunk_ctxt ri_cc;
};
@@ -358,7 +358,6 @@ static int svc_rdma_post_chunk_ctxt(struct svc_rdma_chunk_ctxt *cc)
do {
if (atomic_sub_return(cc->cc_sqecount,
&rdma->sc_sq_avail) > 0) {
- trace_svcrdma_post_chunk(&cc->cc_cid, cc->cc_sqecount);
ret = ib_post_send(rdma->sc_qp, first_wr, &bad_wr);
if (ret)
break;
@@ -405,7 +404,7 @@ static void svc_rdma_pagelist_to_sg(struct svc_rdma_write_info *info,
struct svc_rdma_rw_ctxt *ctxt)
{
unsigned int sge_no, sge_bytes, page_off, page_no;
- struct xdr_buf *xdr = info->wi_xdr;
+ const struct xdr_buf *xdr = info->wi_xdr;
struct scatterlist *sg;
struct page **page;
@@ -443,40 +442,36 @@ svc_rdma_build_writes(struct svc_rdma_write_info *info,
{
struct svc_rdma_chunk_ctxt *cc = &info->wi_cc;
struct svcxprt_rdma *rdma = cc->cc_rdma;
+ const struct svc_rdma_segment *seg;
struct svc_rdma_rw_ctxt *ctxt;
- __be32 *seg;
int ret;
- seg = info->wi_segs + info->wi_seg_no * rpcrdma_segment_maxsz;
do {
unsigned int write_len;
- u32 handle, length;
u64 offset;
- if (info->wi_seg_no >= info->wi_nsegs)
+ seg = &info->wi_chunk->ch_segments[info->wi_seg_no];
+ if (!seg)
goto out_overflow;
- xdr_decode_rdma_segment(seg, &handle, &length, &offset);
- offset += info->wi_seg_off;
-
- write_len = min(remaining, length - info->wi_seg_off);
+ write_len = min(remaining, seg->rs_length - info->wi_seg_off);
+ if (!write_len)
+ goto out_overflow;
ctxt = svc_rdma_get_rw_ctxt(rdma,
(write_len >> PAGE_SHIFT) + 2);
if (!ctxt)
return -ENOMEM;
constructor(info, write_len, ctxt);
- ret = svc_rdma_rw_ctx_init(rdma, ctxt, offset, handle,
+ offset = seg->rs_offset + info->wi_seg_off;
+ ret = svc_rdma_rw_ctx_init(rdma, ctxt, offset, seg->rs_handle,
DMA_TO_DEVICE);
if (ret < 0)
return -EIO;
- trace_svcrdma_send_wseg(handle, write_len, offset);
-
list_add(&ctxt->rw_list, &cc->cc_rwctxts);
cc->cc_sqecount += ret;
- if (write_len == length - info->wi_seg_off) {
- seg += 4;
+ if (write_len == seg->rs_length - info->wi_seg_off) {
info->wi_seg_no++;
info->wi_seg_off = 0;
} else {
@@ -489,31 +484,46 @@ svc_rdma_build_writes(struct svc_rdma_write_info *info,
out_overflow:
trace_svcrdma_small_wrch_err(rdma, remaining, info->wi_seg_no,
- info->wi_nsegs);
+ info->wi_chunk->ch_segcount);
return -E2BIG;
}
-/* Send one of an xdr_buf's kvecs by itself. To send a Reply
- * chunk, the whole RPC Reply is written back to the client.
- * This function writes either the head or tail of the xdr_buf
- * containing the Reply.
+/**
+ * svc_rdma_iov_write - Construct RDMA Writes from an iov
+ * @info: pointer to write arguments
+ * @iov: kvec to write
+ *
+ * Returns:
+ * On succes, returns zero
+ * %-E2BIG if the client-provided Write chunk is too small
+ * %-ENOMEM if a resource has been exhausted
+ * %-EIO if an rdma-rw error occurred
*/
-static int svc_rdma_send_xdr_kvec(struct svc_rdma_write_info *info,
- struct kvec *vec)
+static int svc_rdma_iov_write(struct svc_rdma_write_info *info,
+ const struct kvec *iov)
{
- info->wi_base = vec->iov_base;
+ info->wi_base = iov->iov_base;
return svc_rdma_build_writes(info, svc_rdma_vec_to_sg,
- vec->iov_len);
+ iov->iov_len);
}
-/* Send an xdr_buf's page list by itself. A Write chunk is just
- * the page list. A Reply chunk is @xdr's head, page list, and
- * tail. This function is shared between the two types of chunk.
+/**
+ * svc_rdma_pages_write - Construct RDMA Writes from pages
+ * @info: pointer to write arguments
+ * @xdr: xdr_buf with pages to write
+ * @offset: offset into the content of @xdr
+ * @length: number of bytes to write
+ *
+ * Returns:
+ * On succes, returns zero
+ * %-E2BIG if the client-provided Write chunk is too small
+ * %-ENOMEM if a resource has been exhausted
+ * %-EIO if an rdma-rw error occurred
*/
-static int svc_rdma_send_xdr_pagelist(struct svc_rdma_write_info *info,
- struct xdr_buf *xdr,
- unsigned int offset,
- unsigned long length)
+static int svc_rdma_pages_write(struct svc_rdma_write_info *info,
+ const struct xdr_buf *xdr,
+ unsigned int offset,
+ unsigned long length)
{
info->wi_xdr = xdr;
info->wi_next_off = offset - xdr->head[0].iov_len;
@@ -522,12 +532,48 @@ static int svc_rdma_send_xdr_pagelist(struct svc_rdma_write_info *info,
}
/**
+ * svc_rdma_xb_write - Construct RDMA Writes to write an xdr_buf
+ * @xdr: xdr_buf to write
+ * @data: pointer to write arguments
+ *
+ * Returns:
+ * On succes, returns zero
+ * %-E2BIG if the client-provided Write chunk is too small
+ * %-ENOMEM if a resource has been exhausted
+ * %-EIO if an rdma-rw error occurred
+ */
+static int svc_rdma_xb_write(const struct xdr_buf *xdr, void *data)
+{
+ struct svc_rdma_write_info *info = data;
+ int ret;
+
+ if (xdr->head[0].iov_len) {
+ ret = svc_rdma_iov_write(info, &xdr->head[0]);
+ if (ret < 0)
+ return ret;
+ }
+
+ if (xdr->page_len) {
+ ret = svc_rdma_pages_write(info, xdr, xdr->head[0].iov_len,
+ xdr->page_len);
+ if (ret < 0)
+ return ret;
+ }
+
+ if (xdr->tail[0].iov_len) {
+ ret = svc_rdma_iov_write(info, &xdr->tail[0]);
+ if (ret < 0)
+ return ret;
+ }
+
+ return xdr->len;
+}
+
+/**
* svc_rdma_send_write_chunk - Write all segments in a Write chunk
* @rdma: controlling RDMA transport
- * @wr_ch: Write chunk provided by client
+ * @chunk: Write chunk provided by the client
* @xdr: xdr_buf containing the data payload
- * @offset: payload's byte offset in @xdr
- * @length: size of payload, in bytes
*
* Returns a non-negative number of bytes the chunk consumed, or
* %-E2BIG if the payload was larger than the Write chunk,
@@ -536,30 +582,28 @@ static int svc_rdma_send_xdr_pagelist(struct svc_rdma_write_info *info,
* %-ENOTCONN if posting failed (connection is lost),
* %-EIO if rdma_rw initialization failed (DMA mapping, etc).
*/
-int svc_rdma_send_write_chunk(struct svcxprt_rdma *rdma, __be32 *wr_ch,
- struct xdr_buf *xdr,
- unsigned int offset, unsigned long length)
+int svc_rdma_send_write_chunk(struct svcxprt_rdma *rdma,
+ const struct svc_rdma_chunk *chunk,
+ const struct xdr_buf *xdr)
{
struct svc_rdma_write_info *info;
+ struct svc_rdma_chunk_ctxt *cc;
int ret;
- if (!length)
- return 0;
-
- info = svc_rdma_write_info_alloc(rdma, wr_ch);
+ info = svc_rdma_write_info_alloc(rdma, chunk);
if (!info)
return -ENOMEM;
+ cc = &info->wi_cc;
- ret = svc_rdma_send_xdr_pagelist(info, xdr, offset, length);
- if (ret < 0)
+ ret = svc_rdma_xb_write(xdr, info);
+ if (ret != xdr->len)
goto out_err;
- ret = svc_rdma_post_chunk_ctxt(&info->wi_cc);
+ trace_svcrdma_post_write_chunk(&cc->cc_cid, cc->cc_sqecount);
+ ret = svc_rdma_post_chunk_ctxt(cc);
if (ret < 0)
goto out_err;
-
- trace_svcrdma_send_write_chunk(xdr->page_len);
- return length;
+ return xdr->len;
out_err:
svc_rdma_write_info_free(info);
@@ -581,62 +625,62 @@ out_err:
*/
int svc_rdma_send_reply_chunk(struct svcxprt_rdma *rdma,
const struct svc_rdma_recv_ctxt *rctxt,
- struct xdr_buf *xdr)
+ const struct xdr_buf *xdr)
{
struct svc_rdma_write_info *info;
- int consumed, ret;
+ struct svc_rdma_chunk_ctxt *cc;
+ struct svc_rdma_chunk *chunk;
+ int ret;
- info = svc_rdma_write_info_alloc(rdma, rctxt->rc_reply_chunk);
+ if (pcl_is_empty(&rctxt->rc_reply_pcl))
+ return 0;
+
+ chunk = pcl_first_chunk(&rctxt->rc_reply_pcl);
+ info = svc_rdma_write_info_alloc(rdma, chunk);
if (!info)
return -ENOMEM;
+ cc = &info->wi_cc;
- ret = svc_rdma_send_xdr_kvec(info, &xdr->head[0]);
+ ret = pcl_process_nonpayloads(&rctxt->rc_write_pcl, xdr,
+ svc_rdma_xb_write, info);
if (ret < 0)
goto out_err;
- consumed = xdr->head[0].iov_len;
-
- /* Send the page list in the Reply chunk only if the
- * client did not provide Write chunks.
- */
- if (!rctxt->rc_write_list && xdr->page_len) {
- ret = svc_rdma_send_xdr_pagelist(info, xdr,
- xdr->head[0].iov_len,
- xdr->page_len);
- if (ret < 0)
- goto out_err;
- consumed += xdr->page_len;
- }
-
- if (xdr->tail[0].iov_len) {
- ret = svc_rdma_send_xdr_kvec(info, &xdr->tail[0]);
- if (ret < 0)
- goto out_err;
- consumed += xdr->tail[0].iov_len;
- }
- ret = svc_rdma_post_chunk_ctxt(&info->wi_cc);
+ trace_svcrdma_post_reply_chunk(&cc->cc_cid, cc->cc_sqecount);
+ ret = svc_rdma_post_chunk_ctxt(cc);
if (ret < 0)
goto out_err;
- trace_svcrdma_send_reply_chunk(consumed);
- return consumed;
+ return xdr->len;
out_err:
svc_rdma_write_info_free(info);
return ret;
}
+/**
+ * svc_rdma_build_read_segment - Build RDMA Read WQEs to pull one RDMA segment
+ * @info: context for ongoing I/O
+ * @segment: co-ordinates of remote memory to be read
+ *
+ * Returns:
+ * %0: the Read WR chain was constructed successfully
+ * %-EINVAL: there were not enough rq_pages to finish
+ * %-ENOMEM: allocating a local resources failed
+ * %-EIO: a DMA mapping error occurred
+ */
static int svc_rdma_build_read_segment(struct svc_rdma_read_info *info,
- struct svc_rqst *rqstp,
- u32 rkey, u32 len, u64 offset)
+ const struct svc_rdma_segment *segment)
{
struct svc_rdma_recv_ctxt *head = info->ri_readctxt;
struct svc_rdma_chunk_ctxt *cc = &info->ri_cc;
+ struct svc_rqst *rqstp = info->ri_rqst;
struct svc_rdma_rw_ctxt *ctxt;
- unsigned int sge_no, seg_len;
+ unsigned int sge_no, seg_len, len;
struct scatterlist *sg;
int ret;
+ len = segment->rs_length;
sge_no = PAGE_ALIGN(info->ri_pageoff + len) >> PAGE_SHIFT;
ctxt = svc_rdma_get_rw_ctxt(cc->cc_rdma, sge_no);
if (!ctxt)
@@ -670,8 +714,8 @@ static int svc_rdma_build_read_segment(struct svc_rdma_read_info *info,
goto out_overrun;
}
- ret = svc_rdma_rw_ctx_init(cc->cc_rdma, ctxt, offset, rkey,
- DMA_FROM_DEVICE);
+ ret = svc_rdma_rw_ctx_init(cc->cc_rdma, ctxt, segment->rs_offset,
+ segment->rs_handle, DMA_FROM_DEVICE);
if (ret < 0)
return -EIO;
@@ -684,54 +728,177 @@ out_overrun:
return -EINVAL;
}
-/* Walk the segments in the Read chunk starting at @p and construct
- * RDMA Read operations to pull the chunk to the server.
+/**
+ * svc_rdma_build_read_chunk - Build RDMA Read WQEs to pull one RDMA chunk
+ * @info: context for ongoing I/O
+ * @chunk: Read chunk to pull
+ *
+ * Return values:
+ * %0: the Read WR chain was constructed successfully
+ * %-EINVAL: there were not enough resources to finish
+ * %-ENOMEM: allocating a local resources failed
+ * %-EIO: a DMA mapping error occurred
*/
-static int svc_rdma_build_read_chunk(struct svc_rqst *rqstp,
- struct svc_rdma_read_info *info,
- __be32 *p)
+static int svc_rdma_build_read_chunk(struct svc_rdma_read_info *info,
+ const struct svc_rdma_chunk *chunk)
{
+ const struct svc_rdma_segment *segment;
int ret;
ret = -EINVAL;
- info->ri_chunklen = 0;
- while (*p++ != xdr_zero && be32_to_cpup(p++) == info->ri_position) {
- u32 handle, length;
- u64 offset;
+ pcl_for_each_segment(segment, chunk) {
+ ret = svc_rdma_build_read_segment(info, segment);
+ if (ret < 0)
+ break;
+ info->ri_totalbytes += segment->rs_length;
+ }
+ return ret;
+}
+
+/**
+ * svc_rdma_copy_inline_range - Copy part of the inline content into pages
+ * @info: context for RDMA Reads
+ * @offset: offset into the Receive buffer of region to copy
+ * @remaining: length of region to copy
+ *
+ * Take a page at a time from rqstp->rq_pages and copy the inline
+ * content from the Receive buffer into that page. Update
+ * info->ri_pageno and info->ri_pageoff so that the next RDMA Read
+ * result will land contiguously with the copied content.
+ *
+ * Return values:
+ * %0: Inline content was successfully copied
+ * %-EINVAL: offset or length was incorrect
+ */
+static int svc_rdma_copy_inline_range(struct svc_rdma_read_info *info,
+ unsigned int offset,
+ unsigned int remaining)
+{
+ struct svc_rdma_recv_ctxt *head = info->ri_readctxt;
+ unsigned char *dst, *src = head->rc_recv_buf;
+ struct svc_rqst *rqstp = info->ri_rqst;
+ unsigned int page_no, numpages;
+
+ numpages = PAGE_ALIGN(info->ri_pageoff + remaining) >> PAGE_SHIFT;
+ for (page_no = 0; page_no < numpages; page_no++) {
+ unsigned int page_len;
+
+ page_len = min_t(unsigned int, remaining,
+ PAGE_SIZE - info->ri_pageoff);
+
+ head->rc_arg.pages[info->ri_pageno] =
+ rqstp->rq_pages[info->ri_pageno];
+ if (!info->ri_pageoff)
+ head->rc_page_count++;
+
+ dst = page_address(head->rc_arg.pages[info->ri_pageno]);
+ memcpy(dst + info->ri_pageno, src + offset, page_len);
+
+ info->ri_totalbytes += page_len;
+ info->ri_pageoff += page_len;
+ if (info->ri_pageoff == PAGE_SIZE) {
+ info->ri_pageno++;
+ info->ri_pageoff = 0;
+ }
+ remaining -= page_len;
+ offset += page_len;
+ }
+
+ return -EINVAL;
+}
+
+/**
+ * svc_rdma_read_multiple_chunks - Construct RDMA Reads to pull data item Read chunks
+ * @info: context for RDMA Reads
+ *
+ * The chunk data lands in head->rc_arg as a series of contiguous pages,
+ * like an incoming TCP call.
+ *
+ * Return values:
+ * %0: RDMA Read WQEs were successfully built
+ * %-EINVAL: client provided too many chunks or segments,
+ * %-ENOMEM: rdma_rw context pool was exhausted,
+ * %-ENOTCONN: posting failed (connection is lost),
+ * %-EIO: rdma_rw initialization failed (DMA mapping, etc).
+ */
+static noinline int svc_rdma_read_multiple_chunks(struct svc_rdma_read_info *info)
+{
+ struct svc_rdma_recv_ctxt *head = info->ri_readctxt;
+ const struct svc_rdma_pcl *pcl = &head->rc_read_pcl;
+ struct svc_rdma_chunk *chunk, *next;
+ struct xdr_buf *buf = &head->rc_arg;
+ unsigned int start, length;
+ int ret;
- p = xdr_decode_rdma_segment(p, &handle, &length, &offset);
- ret = svc_rdma_build_read_segment(info, rqstp, handle, length,
- offset);
+ start = 0;
+ chunk = pcl_first_chunk(pcl);
+ length = chunk->ch_position;
+ ret = svc_rdma_copy_inline_range(info, start, length);
+ if (ret < 0)
+ return ret;
+
+ pcl_for_each_chunk(chunk, pcl) {
+ ret = svc_rdma_build_read_chunk(info, chunk);
if (ret < 0)
+ return ret;
+
+ next = pcl_next_chunk(pcl, chunk);
+ if (!next)
break;
- trace_svcrdma_send_rseg(handle, length, offset);
- info->ri_chunklen += length;
+ start += length;
+ length = next->ch_position - info->ri_totalbytes;
+ ret = svc_rdma_copy_inline_range(info, start, length);
+ if (ret < 0)
+ return ret;
}
- return ret;
+ start += length;
+ length = head->rc_byte_len - start;
+ ret = svc_rdma_copy_inline_range(info, start, length);
+ if (ret < 0)
+ return ret;
+
+ buf->len += info->ri_totalbytes;
+ buf->buflen += info->ri_totalbytes;
+
+ head->rc_hdr_count = 1;
+ buf->head[0].iov_base = page_address(head->rc_pages[0]);
+ buf->head[0].iov_len = min_t(size_t, PAGE_SIZE, info->ri_totalbytes);
+ buf->page_len = info->ri_totalbytes - buf->head[0].iov_len;
+ return 0;
}
-/* Construct RDMA Reads to pull over a normal Read chunk. The chunk
- * data lands in the page list of head->rc_arg.pages.
+/**
+ * svc_rdma_read_data_item - Construct RDMA Reads to pull data item Read chunks
+ * @info: context for RDMA Reads
+ *
+ * The chunk data lands in the page list of head->rc_arg.pages.
*
* Currently NFSD does not look at the head->rc_arg.tail[0] iovec.
* Therefore, XDR round-up of the Read chunk and trailing
* inline content must both be added at the end of the pagelist.
+ *
+ * Return values:
+ * %0: RDMA Read WQEs were successfully built
+ * %-EINVAL: client provided too many chunks or segments,
+ * %-ENOMEM: rdma_rw context pool was exhausted,
+ * %-ENOTCONN: posting failed (connection is lost),
+ * %-EIO: rdma_rw initialization failed (DMA mapping, etc).
*/
-static int svc_rdma_build_normal_read_chunk(struct svc_rqst *rqstp,
- struct svc_rdma_read_info *info,
- __be32 *p)
+static int svc_rdma_read_data_item(struct svc_rdma_read_info *info)
{
struct svc_rdma_recv_ctxt *head = info->ri_readctxt;
+ struct xdr_buf *buf = &head->rc_arg;
+ struct svc_rdma_chunk *chunk;
+ unsigned int length;
int ret;
- ret = svc_rdma_build_read_chunk(rqstp, info, p);
+ chunk = pcl_first_chunk(&head->rc_read_pcl);
+ ret = svc_rdma_build_read_chunk(info, chunk);
if (ret < 0)
goto out;
- trace_svcrdma_send_read_chunk(info->ri_chunklen, info->ri_position);
-
head->rc_hdr_count = 0;
/* Split the Receive buffer between the head and tail
@@ -739,11 +906,9 @@ static int svc_rdma_build_normal_read_chunk(struct svc_rqst *rqstp,
* chunk is not included in either the pagelist or in
* the tail.
*/
- head->rc_arg.tail[0].iov_base =
- head->rc_arg.head[0].iov_base + info->ri_position;
- head->rc_arg.tail[0].iov_len =
- head->rc_arg.head[0].iov_len - info->ri_position;
- head->rc_arg.head[0].iov_len = info->ri_position;
+ buf->tail[0].iov_base = buf->head[0].iov_base + chunk->ch_position;
+ buf->tail[0].iov_len = buf->head[0].iov_len - chunk->ch_position;
+ buf->head[0].iov_len = chunk->ch_position;
/* Read chunk may need XDR roundup (see RFC 8166, s. 3.4.5.2).
*
@@ -754,50 +919,149 @@ static int svc_rdma_build_normal_read_chunk(struct svc_rqst *rqstp,
* Currently these chunks always start at page offset 0,
* thus the rounded-up length never crosses a page boundary.
*/
- info->ri_chunklen = XDR_QUADLEN(info->ri_chunklen) << 2;
-
- head->rc_arg.page_len = info->ri_chunklen;
- head->rc_arg.len += info->ri_chunklen;
- head->rc_arg.buflen += info->ri_chunklen;
+ length = XDR_QUADLEN(info->ri_totalbytes) << 2;
+ buf->page_len = length;
+ buf->len += length;
+ buf->buflen += length;
out:
return ret;
}
-/* Construct RDMA Reads to pull over a Position Zero Read chunk.
- * The start of the data lands in the first page just after
- * the Transport header, and the rest lands in the page list of
+/**
+ * svc_rdma_read_chunk_range - Build RDMA Read WQEs for portion of a chunk
+ * @info: context for RDMA Reads
+ * @chunk: parsed Call chunk to pull
+ * @offset: offset of region to pull
+ * @length: length of region to pull
+ *
+ * Return values:
+ * %0: RDMA Read WQEs were successfully built
+ * %-EINVAL: there were not enough resources to finish
+ * %-ENOMEM: rdma_rw context pool was exhausted,
+ * %-ENOTCONN: posting failed (connection is lost),
+ * %-EIO: rdma_rw initialization failed (DMA mapping, etc).
+ */
+static int svc_rdma_read_chunk_range(struct svc_rdma_read_info *info,
+ const struct svc_rdma_chunk *chunk,
+ unsigned int offset, unsigned int length)
+{
+ const struct svc_rdma_segment *segment;
+ int ret;
+
+ ret = -EINVAL;
+ pcl_for_each_segment(segment, chunk) {
+ struct svc_rdma_segment dummy;
+
+ if (offset > segment->rs_length) {
+ offset -= segment->rs_length;
+ continue;
+ }
+
+ dummy.rs_handle = segment->rs_handle;
+ dummy.rs_length = min_t(u32, length, segment->rs_length) - offset;
+ dummy.rs_offset = segment->rs_offset + offset;
+
+ ret = svc_rdma_build_read_segment(info, &dummy);
+ if (ret < 0)
+ break;
+
+ info->ri_totalbytes += dummy.rs_length;
+ length -= dummy.rs_length;
+ offset = 0;
+ }
+ return ret;
+}
+
+/**
+ * svc_rdma_read_call_chunk - Build RDMA Read WQEs to pull a Long Message
+ * @info: context for RDMA Reads
+ *
+ * Return values:
+ * %0: RDMA Read WQEs were successfully built
+ * %-EINVAL: there were not enough resources to finish
+ * %-ENOMEM: rdma_rw context pool was exhausted,
+ * %-ENOTCONN: posting failed (connection is lost),
+ * %-EIO: rdma_rw initialization failed (DMA mapping, etc).
+ */
+static int svc_rdma_read_call_chunk(struct svc_rdma_read_info *info)
+{
+ struct svc_rdma_recv_ctxt *head = info->ri_readctxt;
+ const struct svc_rdma_chunk *call_chunk =
+ pcl_first_chunk(&head->rc_call_pcl);
+ const struct svc_rdma_pcl *pcl = &head->rc_read_pcl;
+ struct svc_rdma_chunk *chunk, *next;
+ unsigned int start, length;
+ int ret;
+
+ if (pcl_is_empty(pcl))
+ return svc_rdma_build_read_chunk(info, call_chunk);
+
+ start = 0;
+ chunk = pcl_first_chunk(pcl);
+ length = chunk->ch_position;
+ ret = svc_rdma_read_chunk_range(info, call_chunk, start, length);
+ if (ret < 0)
+ return ret;
+
+ pcl_for_each_chunk(chunk, pcl) {
+ ret = svc_rdma_build_read_chunk(info, chunk);
+ if (ret < 0)
+ return ret;
+
+ next = pcl_next_chunk(pcl, chunk);
+ if (!next)
+ break;
+
+ start += length;
+ length = next->ch_position - info->ri_totalbytes;
+ ret = svc_rdma_read_chunk_range(info, call_chunk,
+ start, length);
+ if (ret < 0)
+ return ret;
+ }
+
+ start += length;
+ length = call_chunk->ch_length - start;
+ return svc_rdma_read_chunk_range(info, call_chunk, start, length);
+}
+
+/**
+ * svc_rdma_read_special - Build RDMA Read WQEs to pull a Long Message
+ * @info: context for RDMA Reads
+ *
+ * The start of the data lands in the first page just after the
+ * Transport header, and the rest lands in the page list of
* head->rc_arg.pages.
*
* Assumptions:
- * - A PZRC has an XDR-aligned length (no implicit round-up).
- * - There can be no trailing inline content (IOW, we assume
- * a PZRC is never sent in an RDMA_MSG message, though it's
- * allowed by spec).
+ * - A PZRC is never sent in an RDMA_MSG message, though it's
+ * allowed by spec.
+ *
+ * Return values:
+ * %0: RDMA Read WQEs were successfully built
+ * %-EINVAL: client provided too many chunks or segments,
+ * %-ENOMEM: rdma_rw context pool was exhausted,
+ * %-ENOTCONN: posting failed (connection is lost),
+ * %-EIO: rdma_rw initialization failed (DMA mapping, etc).
*/
-static int svc_rdma_build_pz_read_chunk(struct svc_rqst *rqstp,
- struct svc_rdma_read_info *info,
- __be32 *p)
+static noinline int svc_rdma_read_special(struct svc_rdma_read_info *info)
{
struct svc_rdma_recv_ctxt *head = info->ri_readctxt;
+ struct xdr_buf *buf = &head->rc_arg;
int ret;
- ret = svc_rdma_build_read_chunk(rqstp, info, p);
+ ret = svc_rdma_read_call_chunk(info);
if (ret < 0)
goto out;
- trace_svcrdma_send_pzr(info->ri_chunklen);
-
- head->rc_arg.len += info->ri_chunklen;
- head->rc_arg.buflen += info->ri_chunklen;
+ buf->len += info->ri_totalbytes;
+ buf->buflen += info->ri_totalbytes;
head->rc_hdr_count = 1;
- head->rc_arg.head[0].iov_base = page_address(head->rc_pages[0]);
- head->rc_arg.head[0].iov_len = min_t(size_t, PAGE_SIZE,
- info->ri_chunklen);
-
- head->rc_arg.page_len = info->ri_chunklen -
- head->rc_arg.head[0].iov_len;
+ buf->head[0].iov_base = page_address(head->rc_pages[0]);
+ buf->head[0].iov_len = min_t(size_t, PAGE_SIZE, info->ri_totalbytes);
+ buf->page_len = info->ri_totalbytes - buf->head[0].iov_len;
out:
return ret;
@@ -824,26 +1088,34 @@ static void svc_rdma_save_io_pages(struct svc_rqst *rqstp,
}
/**
- * svc_rdma_recv_read_chunk - Pull a Read chunk from the client
+ * svc_rdma_process_read_list - Pull list of Read chunks from the client
* @rdma: controlling RDMA transport
* @rqstp: set of pages to use as Read sink buffers
* @head: pages under I/O collect here
- * @p: pointer to start of Read chunk
*
- * Returns:
- * %0 if all needed RDMA Reads were posted successfully,
- * %-EINVAL if client provided too many segments,
- * %-ENOMEM if rdma_rw context pool was exhausted,
- * %-ENOTCONN if posting failed (connection is lost),
- * %-EIO if rdma_rw initialization failed (DMA mapping, etc).
+ * The RPC/RDMA protocol assumes that the upper layer's XDR decoders
+ * pull each Read chunk as they decode an incoming RPC message.
*
- * Assumptions:
- * - All Read segments in @p have the same Position value.
+ * On Linux, however, the server needs to have a fully-constructed RPC
+ * message in rqstp->rq_arg when there is a positive return code from
+ * ->xpo_recvfrom. So the Read list is safety-checked immediately when
+ * it is received, then here the whole Read list is pulled all at once.
+ * The ingress RPC message is fully reconstructed once all associated
+ * RDMA Reads have completed.
+ *
+ * Return values:
+ * %1: all needed RDMA Reads were posted successfully,
+ * %-EINVAL: client provided too many chunks or segments,
+ * %-ENOMEM: rdma_rw context pool was exhausted,
+ * %-ENOTCONN: posting failed (connection is lost),
+ * %-EIO: rdma_rw initialization failed (DMA mapping, etc).
*/
-int svc_rdma_recv_read_chunk(struct svcxprt_rdma *rdma, struct svc_rqst *rqstp,
- struct svc_rdma_recv_ctxt *head, __be32 *p)
+int svc_rdma_process_read_list(struct svcxprt_rdma *rdma,
+ struct svc_rqst *rqstp,
+ struct svc_rdma_recv_ctxt *head)
{
struct svc_rdma_read_info *info;
+ struct svc_rdma_chunk_ctxt *cc;
int ret;
/* The request (with page list) is constructed in
@@ -861,23 +1133,29 @@ int svc_rdma_recv_read_chunk(struct svcxprt_rdma *rdma, struct svc_rqst *rqstp,
info = svc_rdma_read_info_alloc(rdma);
if (!info)
return -ENOMEM;
+ cc = &info->ri_cc;
+ info->ri_rqst = rqstp;
info->ri_readctxt = head;
info->ri_pageno = 0;
info->ri_pageoff = 0;
-
- info->ri_position = be32_to_cpup(p + 1);
- if (info->ri_position)
- ret = svc_rdma_build_normal_read_chunk(rqstp, info, p);
- else
- ret = svc_rdma_build_pz_read_chunk(rqstp, info, p);
+ info->ri_totalbytes = 0;
+
+ if (pcl_is_empty(&head->rc_call_pcl)) {
+ if (head->rc_read_pcl.cl_count == 1)
+ ret = svc_rdma_read_data_item(info);
+ else
+ ret = svc_rdma_read_multiple_chunks(info);
+ } else
+ ret = svc_rdma_read_special(info);
if (ret < 0)
goto out_err;
- ret = svc_rdma_post_chunk_ctxt(&info->ri_cc);
+ trace_svcrdma_post_read_chunk(&cc->cc_cid, cc->cc_sqecount);
+ ret = svc_rdma_post_chunk_ctxt(cc);
if (ret < 0)
goto out_err;
svc_rdma_save_io_pages(rqstp, 0, head->rc_page_count);
- return 0;
+ return 1;
out_err:
svc_rdma_read_info_free(info);
diff --git a/net/sunrpc/xprtrdma/svc_rdma_sendto.c b/net/sunrpc/xprtrdma/svc_rdma_sendto.c
index c3d588b149aa..68af79d4f04f 100644
--- a/net/sunrpc/xprtrdma/svc_rdma_sendto.c
+++ b/net/sunrpc/xprtrdma/svc_rdma_sendto.c
@@ -358,49 +358,42 @@ static ssize_t svc_rdma_encode_read_list(struct svc_rdma_send_ctxt *sctxt)
/**
* svc_rdma_encode_write_segment - Encode one Write segment
- * @src: matching Write chunk in the RPC Call header
* @sctxt: Send context for the RPC Reply
+ * @chunk: Write chunk to push
* @remaining: remaining bytes of the payload left in the Write chunk
+ * @segno: which segment in the chunk
*
* Return values:
* On success, returns length in bytes of the Reply XDR buffer
- * that was consumed by the Write segment
+ * that was consumed by the Write segment, and updates @remaining
* %-EMSGSIZE on XDR buffer overflow
*/
-static ssize_t svc_rdma_encode_write_segment(__be32 *src,
- struct svc_rdma_send_ctxt *sctxt,
- unsigned int *remaining)
+static ssize_t svc_rdma_encode_write_segment(struct svc_rdma_send_ctxt *sctxt,
+ const struct svc_rdma_chunk *chunk,
+ u32 *remaining, unsigned int segno)
{
+ const struct svc_rdma_segment *segment = &chunk->ch_segments[segno];
+ const size_t len = rpcrdma_segment_maxsz * sizeof(__be32);
+ u32 length;
__be32 *p;
- const size_t len = rpcrdma_segment_maxsz * sizeof(*p);
- u32 handle, length;
- u64 offset;
p = xdr_reserve_space(&sctxt->sc_stream, len);
if (!p)
return -EMSGSIZE;
- xdr_decode_rdma_segment(src, &handle, &length, &offset);
-
- if (*remaining < length) {
- /* segment only partly filled */
- length = *remaining;
- *remaining = 0;
- } else {
- /* entire segment was consumed */
- *remaining -= length;
- }
- xdr_encode_rdma_segment(p, handle, length, offset);
-
- trace_svcrdma_encode_wseg(handle, length, offset);
+ length = min_t(u32, *remaining, segment->rs_length);
+ *remaining -= length;
+ xdr_encode_rdma_segment(p, segment->rs_handle, length,
+ segment->rs_offset);
+ trace_svcrdma_encode_wseg(sctxt, segno, segment->rs_handle, length,
+ segment->rs_offset);
return len;
}
/**
* svc_rdma_encode_write_chunk - Encode one Write chunk
- * @src: matching Write chunk in the RPC Call header
* @sctxt: Send context for the RPC Reply
- * @remaining: size in bytes of the payload in the Write chunk
+ * @chunk: Write chunk to push
*
* Copy a Write chunk from the Call transport header to the
* Reply transport header. Update each segment's length field
@@ -411,33 +404,28 @@ static ssize_t svc_rdma_encode_write_segment(__be32 *src,
* that was consumed by the Write chunk
* %-EMSGSIZE on XDR buffer overflow
*/
-static ssize_t svc_rdma_encode_write_chunk(__be32 *src,
- struct svc_rdma_send_ctxt *sctxt,
- unsigned int remaining)
+static ssize_t svc_rdma_encode_write_chunk(struct svc_rdma_send_ctxt *sctxt,
+ const struct svc_rdma_chunk *chunk)
{
- unsigned int i, nsegs;
+ u32 remaining = chunk->ch_payload_length;
+ unsigned int segno;
ssize_t len, ret;
len = 0;
- trace_svcrdma_encode_write_chunk(remaining);
-
- src++;
ret = xdr_stream_encode_item_present(&sctxt->sc_stream);
if (ret < 0)
- return -EMSGSIZE;
+ return ret;
len += ret;
- nsegs = be32_to_cpup(src++);
- ret = xdr_stream_encode_u32(&sctxt->sc_stream, nsegs);
+ ret = xdr_stream_encode_u32(&sctxt->sc_stream, chunk->ch_segcount);
if (ret < 0)
- return -EMSGSIZE;
+ return ret;
len += ret;
- for (i = nsegs; i; i--) {
- ret = svc_rdma_encode_write_segment(src, sctxt, &remaining);
+ for (segno = 0; segno < chunk->ch_segcount; segno++) {
+ ret = svc_rdma_encode_write_segment(sctxt, chunk, &remaining, segno);
if (ret < 0)
- return -EMSGSIZE;
- src += rpcrdma_segment_maxsz;
+ return ret;
len += ret;
}
@@ -448,32 +436,25 @@ static ssize_t svc_rdma_encode_write_chunk(__be32 *src,
* svc_rdma_encode_write_list - Encode RPC Reply's Write chunk list
* @rctxt: Reply context with information about the RPC Call
* @sctxt: Send context for the RPC Reply
- * @length: size in bytes of the payload in the first Write chunk
- *
- * The client provides a Write chunk list in the Call message. Fill
- * in the segments in the first Write chunk in the Reply's transport
- * header with the number of bytes consumed in each segment.
- * Remaining chunks are returned unused.
- *
- * Assumptions:
- * - Client has provided only one Write chunk
*
* Return values:
* On success, returns length in bytes of the Reply XDR buffer
* that was consumed by the Reply's Write list
* %-EMSGSIZE on XDR buffer overflow
*/
-static ssize_t
-svc_rdma_encode_write_list(const struct svc_rdma_recv_ctxt *rctxt,
- struct svc_rdma_send_ctxt *sctxt,
- unsigned int length)
+static ssize_t svc_rdma_encode_write_list(struct svc_rdma_recv_ctxt *rctxt,
+ struct svc_rdma_send_ctxt *sctxt)
{
+ struct svc_rdma_chunk *chunk;
ssize_t len, ret;
- ret = svc_rdma_encode_write_chunk(rctxt->rc_write_list, sctxt, length);
- if (ret < 0)
- return ret;
- len = ret;
+ len = 0;
+ pcl_for_each_chunk(chunk, &rctxt->rc_write_pcl) {
+ ret = svc_rdma_encode_write_chunk(sctxt, chunk);
+ if (ret < 0)
+ return ret;
+ len += ret;
+ }
/* Terminate the Write list */
ret = xdr_stream_encode_item_absent(&sctxt->sc_stream);
@@ -489,56 +470,174 @@ svc_rdma_encode_write_list(const struct svc_rdma_recv_ctxt *rctxt,
* @sctxt: Send context for the RPC Reply
* @length: size in bytes of the payload in the Reply chunk
*
- * Assumptions:
- * - Reply can always fit in the client-provided Reply chunk
- *
* Return values:
* On success, returns length in bytes of the Reply XDR buffer
* that was consumed by the Reply's Reply chunk
* %-EMSGSIZE on XDR buffer overflow
+ * %-E2BIG if the RPC message is larger than the Reply chunk
*/
static ssize_t
-svc_rdma_encode_reply_chunk(const struct svc_rdma_recv_ctxt *rctxt,
+svc_rdma_encode_reply_chunk(struct svc_rdma_recv_ctxt *rctxt,
struct svc_rdma_send_ctxt *sctxt,
unsigned int length)
{
- return svc_rdma_encode_write_chunk(rctxt->rc_reply_chunk, sctxt,
- length);
+ struct svc_rdma_chunk *chunk;
+
+ if (pcl_is_empty(&rctxt->rc_reply_pcl))
+ return xdr_stream_encode_item_absent(&sctxt->sc_stream);
+
+ chunk = pcl_first_chunk(&rctxt->rc_reply_pcl);
+ if (length > chunk->ch_length)
+ return -E2BIG;
+
+ chunk->ch_payload_length = length;
+ return svc_rdma_encode_write_chunk(sctxt, chunk);
}
-static int svc_rdma_dma_map_page(struct svcxprt_rdma *rdma,
- struct svc_rdma_send_ctxt *ctxt,
- struct page *page,
- unsigned long offset,
- unsigned int len)
+struct svc_rdma_map_data {
+ struct svcxprt_rdma *md_rdma;
+ struct svc_rdma_send_ctxt *md_ctxt;
+};
+
+/**
+ * svc_rdma_page_dma_map - DMA map one page
+ * @data: pointer to arguments
+ * @page: struct page to DMA map
+ * @offset: offset into the page
+ * @len: number of bytes to map
+ *
+ * Returns:
+ * %0 if DMA mapping was successful
+ * %-EIO if the page cannot be DMA mapped
+ */
+static int svc_rdma_page_dma_map(void *data, struct page *page,
+ unsigned long offset, unsigned int len)
{
+ struct svc_rdma_map_data *args = data;
+ struct svcxprt_rdma *rdma = args->md_rdma;
+ struct svc_rdma_send_ctxt *ctxt = args->md_ctxt;
struct ib_device *dev = rdma->sc_cm_id->device;
dma_addr_t dma_addr;
+ ++ctxt->sc_cur_sge_no;
+
dma_addr = ib_dma_map_page(dev, page, offset, len, DMA_TO_DEVICE);
- trace_svcrdma_dma_map_page(rdma, dma_addr, len);
if (ib_dma_mapping_error(dev, dma_addr))
goto out_maperr;
+ trace_svcrdma_dma_map_page(rdma, dma_addr, len);
ctxt->sc_sges[ctxt->sc_cur_sge_no].addr = dma_addr;
ctxt->sc_sges[ctxt->sc_cur_sge_no].length = len;
ctxt->sc_send_wr.num_sge++;
return 0;
out_maperr:
+ trace_svcrdma_dma_map_err(rdma, dma_addr, len);
return -EIO;
}
-/* ib_dma_map_page() is used here because svc_rdma_dma_unmap()
+/**
+ * svc_rdma_iov_dma_map - DMA map an iovec
+ * @data: pointer to arguments
+ * @iov: kvec to DMA map
+ *
+ * ib_dma_map_page() is used here because svc_rdma_dma_unmap()
* handles DMA-unmap and it uses ib_dma_unmap_page() exclusively.
+ *
+ * Returns:
+ * %0 if DMA mapping was successful
+ * %-EIO if the iovec cannot be DMA mapped
*/
-static int svc_rdma_dma_map_buf(struct svcxprt_rdma *rdma,
- struct svc_rdma_send_ctxt *ctxt,
- unsigned char *base,
- unsigned int len)
+static int svc_rdma_iov_dma_map(void *data, const struct kvec *iov)
{
- return svc_rdma_dma_map_page(rdma, ctxt, virt_to_page(base),
- offset_in_page(base), len);
+ if (!iov->iov_len)
+ return 0;
+ return svc_rdma_page_dma_map(data, virt_to_page(iov->iov_base),
+ offset_in_page(iov->iov_base),
+ iov->iov_len);
+}
+
+/**
+ * svc_rdma_xb_dma_map - DMA map all segments of an xdr_buf
+ * @xdr: xdr_buf containing portion of an RPC message to transmit
+ * @data: pointer to arguments
+ *
+ * Returns:
+ * %0 if DMA mapping was successful
+ * %-EIO if DMA mapping failed
+ *
+ * On failure, any DMA mappings that have been already done must be
+ * unmapped by the caller.
+ */
+static int svc_rdma_xb_dma_map(const struct xdr_buf *xdr, void *data)
+{
+ unsigned int len, remaining;
+ unsigned long pageoff;
+ struct page **ppages;
+ int ret;
+
+ ret = svc_rdma_iov_dma_map(data, &xdr->head[0]);
+ if (ret < 0)
+ return ret;
+
+ ppages = xdr->pages + (xdr->page_base >> PAGE_SHIFT);
+ pageoff = offset_in_page(xdr->page_base);
+ remaining = xdr->page_len;
+ while (remaining) {
+ len = min_t(u32, PAGE_SIZE - pageoff, remaining);
+
+ ret = svc_rdma_page_dma_map(data, *ppages++, pageoff, len);
+ if (ret < 0)
+ return ret;
+
+ remaining -= len;
+ pageoff = 0;
+ }
+
+ ret = svc_rdma_iov_dma_map(data, &xdr->tail[0]);
+ if (ret < 0)
+ return ret;
+
+ return xdr->len;
+}
+
+struct svc_rdma_pullup_data {
+ u8 *pd_dest;
+ unsigned int pd_length;
+ unsigned int pd_num_sges;
+};
+
+/**
+ * svc_rdma_xb_count_sges - Count how many SGEs will be needed
+ * @xdr: xdr_buf containing portion of an RPC message to transmit
+ * @data: pointer to arguments
+ *
+ * Returns:
+ * Number of SGEs needed to Send the contents of @xdr inline
+ */
+static int svc_rdma_xb_count_sges(const struct xdr_buf *xdr,
+ void *data)
+{
+ struct svc_rdma_pullup_data *args = data;
+ unsigned int remaining;
+ unsigned long offset;
+
+ if (xdr->head[0].iov_len)
+ ++args->pd_num_sges;
+
+ offset = offset_in_page(xdr->page_base);
+ remaining = xdr->page_len;
+ while (remaining) {
+ ++args->pd_num_sges;
+ remaining -= min_t(u32, PAGE_SIZE - offset, remaining);
+ offset = 0;
+ }
+
+ if (xdr->tail[0].iov_len)
+ ++args->pd_num_sges;
+
+ args->pd_length += xdr->len;
+ return 0;
}
/**
@@ -549,48 +648,71 @@ static int svc_rdma_dma_map_buf(struct svcxprt_rdma *rdma,
* @xdr: xdr_buf containing RPC message to transmit
*
* Returns:
- * %true if pull-up must be used
- * %false otherwise
+ * %true if pull-up must be used
+ * %false otherwise
*/
-static bool svc_rdma_pull_up_needed(struct svcxprt_rdma *rdma,
- struct svc_rdma_send_ctxt *sctxt,
+static bool svc_rdma_pull_up_needed(const struct svcxprt_rdma *rdma,
+ const struct svc_rdma_send_ctxt *sctxt,
const struct svc_rdma_recv_ctxt *rctxt,
- struct xdr_buf *xdr)
+ const struct xdr_buf *xdr)
{
- int elements;
+ /* Resources needed for the transport header */
+ struct svc_rdma_pullup_data args = {
+ .pd_length = sctxt->sc_hdrbuf.len,
+ .pd_num_sges = 1,
+ };
+ int ret;
- /* For small messages, copying bytes is cheaper than DMA mapping.
- */
- if (sctxt->sc_hdrbuf.len + xdr->len < RPCRDMA_PULLUP_THRESH)
+ ret = pcl_process_nonpayloads(&rctxt->rc_write_pcl, xdr,
+ svc_rdma_xb_count_sges, &args);
+ if (ret < 0)
+ return false;
+
+ if (args.pd_length < RPCRDMA_PULLUP_THRESH)
return true;
+ return args.pd_num_sges >= rdma->sc_max_send_sges;
+}
- /* Check whether the xdr_buf has more elements than can
- * fit in a single RDMA Send.
- */
- /* xdr->head */
- elements = 1;
-
- /* xdr->pages */
- if (!rctxt || !rctxt->rc_write_list) {
- unsigned int remaining;
- unsigned long pageoff;
-
- pageoff = xdr->page_base & ~PAGE_MASK;
- remaining = xdr->page_len;
- while (remaining) {
- ++elements;
- remaining -= min_t(u32, PAGE_SIZE - pageoff,
- remaining);
- pageoff = 0;
- }
+/**
+ * svc_rdma_xb_linearize - Copy region of xdr_buf to flat buffer
+ * @xdr: xdr_buf containing portion of an RPC message to copy
+ * @data: pointer to arguments
+ *
+ * Returns:
+ * Always zero.
+ */
+static int svc_rdma_xb_linearize(const struct xdr_buf *xdr,
+ void *data)
+{
+ struct svc_rdma_pullup_data *args = data;
+ unsigned int len, remaining;
+ unsigned long pageoff;
+ struct page **ppages;
+
+ if (xdr->head[0].iov_len) {
+ memcpy(args->pd_dest, xdr->head[0].iov_base, xdr->head[0].iov_len);
+ args->pd_dest += xdr->head[0].iov_len;
}
- /* xdr->tail */
- if (xdr->tail[0].iov_len)
- ++elements;
+ ppages = xdr->pages + (xdr->page_base >> PAGE_SHIFT);
+ pageoff = offset_in_page(xdr->page_base);
+ remaining = xdr->page_len;
+ while (remaining) {
+ len = min_t(u32, PAGE_SIZE - pageoff, remaining);
+ memcpy(args->pd_dest, page_address(*ppages) + pageoff, len);
+ remaining -= len;
+ args->pd_dest += len;
+ pageoff = 0;
+ ppages++;
+ }
- /* assume 1 SGE is needed for the transport header */
- return elements >= rdma->sc_max_send_sges;
+ if (xdr->tail[0].iov_len) {
+ memcpy(args->pd_dest, xdr->tail[0].iov_base, xdr->tail[0].iov_len);
+ args->pd_dest += xdr->tail[0].iov_len;
+ }
+
+ args->pd_length += xdr->len;
+ return 0;
}
/**
@@ -603,54 +725,30 @@ static bool svc_rdma_pull_up_needed(struct svcxprt_rdma *rdma,
* The device is not capable of sending the reply directly.
* Assemble the elements of @xdr into the transport header buffer.
*
- * Returns zero on success, or a negative errno on failure.
+ * Assumptions:
+ * pull_up_needed has determined that @xdr will fit in the buffer.
+ *
+ * Returns:
+ * %0 if pull-up was successful
+ * %-EMSGSIZE if a buffer manipulation problem occurred
*/
-static int svc_rdma_pull_up_reply_msg(struct svcxprt_rdma *rdma,
+static int svc_rdma_pull_up_reply_msg(const struct svcxprt_rdma *rdma,
struct svc_rdma_send_ctxt *sctxt,
const struct svc_rdma_recv_ctxt *rctxt,
const struct xdr_buf *xdr)
{
- unsigned char *dst, *tailbase;
- unsigned int taillen;
-
- dst = sctxt->sc_xprt_buf + sctxt->sc_hdrbuf.len;
- memcpy(dst, xdr->head[0].iov_base, xdr->head[0].iov_len);
- dst += xdr->head[0].iov_len;
-
- tailbase = xdr->tail[0].iov_base;
- taillen = xdr->tail[0].iov_len;
- if (rctxt && rctxt->rc_write_list) {
- u32 xdrpad;
-
- xdrpad = xdr_pad_size(xdr->page_len);
- if (taillen && xdrpad) {
- tailbase += xdrpad;
- taillen -= xdrpad;
- }
- } else {
- unsigned int len, remaining;
- unsigned long pageoff;
- struct page **ppages;
-
- ppages = xdr->pages + (xdr->page_base >> PAGE_SHIFT);
- pageoff = xdr->page_base & ~PAGE_MASK;
- remaining = xdr->page_len;
- while (remaining) {
- len = min_t(u32, PAGE_SIZE - pageoff, remaining);
-
- memcpy(dst, page_address(*ppages) + pageoff, len);
- remaining -= len;
- dst += len;
- pageoff = 0;
- ppages++;
- }
- }
+ struct svc_rdma_pullup_data args = {
+ .pd_dest = sctxt->sc_xprt_buf + sctxt->sc_hdrbuf.len,
+ };
+ int ret;
- if (taillen)
- memcpy(dst, tailbase, taillen);
+ ret = pcl_process_nonpayloads(&rctxt->rc_write_pcl, xdr,
+ svc_rdma_xb_linearize, &args);
+ if (ret < 0)
+ return ret;
- sctxt->sc_sges[0].length += xdr->len;
- trace_svcrdma_send_pullup(sctxt->sc_sges[0].length);
+ sctxt->sc_sges[0].length = sctxt->sc_hdrbuf.len + args.pd_length;
+ trace_svcrdma_send_pullup(sctxt, args.pd_length);
return 0;
}
@@ -660,22 +758,22 @@ static int svc_rdma_pull_up_reply_msg(struct svcxprt_rdma *rdma,
* @rctxt: Write and Reply chunks provided by client
* @xdr: prepared xdr_buf containing RPC message
*
- * Load the xdr_buf into the ctxt's sge array, and DMA map each
- * element as it is added. The Send WR's num_sge field is set.
+ * Returns:
+ * %0 if DMA mapping was successful.
+ * %-EMSGSIZE if a buffer manipulation problem occurred
+ * %-EIO if DMA mapping failed
*
- * Returns zero on success, or a negative errno on failure.
+ * The Send WR's num_sge field is set in all cases.
*/
int svc_rdma_map_reply_msg(struct svcxprt_rdma *rdma,
struct svc_rdma_send_ctxt *sctxt,
const struct svc_rdma_recv_ctxt *rctxt,
- struct xdr_buf *xdr)
+ const struct xdr_buf *xdr)
{
- unsigned int len, remaining;
- unsigned long page_off;
- struct page **ppages;
- unsigned char *base;
- u32 xdr_pad;
- int ret;
+ struct svc_rdma_map_data args = {
+ .md_rdma = rdma,
+ .md_ctxt = sctxt,
+ };
/* Set up the (persistently-mapped) transport header SGE. */
sctxt->sc_send_wr.num_sge = 1;
@@ -684,7 +782,7 @@ int svc_rdma_map_reply_msg(struct svcxprt_rdma *rdma,
/* If there is a Reply chunk, nothing follows the transport
* header, and we're done here.
*/
- if (rctxt && rctxt->rc_reply_chunk)
+ if (!pcl_is_empty(&rctxt->rc_reply_pcl))
return 0;
/* For pull-up, svc_rdma_send() will sync the transport header.
@@ -693,58 +791,8 @@ int svc_rdma_map_reply_msg(struct svcxprt_rdma *rdma,
if (svc_rdma_pull_up_needed(rdma, sctxt, rctxt, xdr))
return svc_rdma_pull_up_reply_msg(rdma, sctxt, rctxt, xdr);
- ++sctxt->sc_cur_sge_no;
- ret = svc_rdma_dma_map_buf(rdma, sctxt,
- xdr->head[0].iov_base,
- xdr->head[0].iov_len);
- if (ret < 0)
- return ret;
-
- /* If a Write chunk is present, the xdr_buf's page list
- * is not included inline. However the Upper Layer may
- * have added XDR padding in the tail buffer, and that
- * should not be included inline.
- */
- if (rctxt && rctxt->rc_write_list) {
- base = xdr->tail[0].iov_base;
- len = xdr->tail[0].iov_len;
- xdr_pad = xdr_pad_size(xdr->page_len);
-
- if (len && xdr_pad) {
- base += xdr_pad;
- len -= xdr_pad;
- }
-
- goto tail;
- }
-
- ppages = xdr->pages + (xdr->page_base >> PAGE_SHIFT);
- page_off = xdr->page_base & ~PAGE_MASK;
- remaining = xdr->page_len;
- while (remaining) {
- len = min_t(u32, PAGE_SIZE - page_off, remaining);
-
- ++sctxt->sc_cur_sge_no;
- ret = svc_rdma_dma_map_page(rdma, sctxt, *ppages++,
- page_off, len);
- if (ret < 0)
- return ret;
-
- remaining -= len;
- page_off = 0;
- }
-
- base = xdr->tail[0].iov_base;
- len = xdr->tail[0].iov_len;
-tail:
- if (len) {
- ++sctxt->sc_cur_sge_no;
- ret = svc_rdma_dma_map_buf(rdma, sctxt, base, len);
- if (ret < 0)
- return ret;
- }
-
- return 0;
+ return pcl_process_nonpayloads(&rctxt->rc_write_pcl, xdr,
+ svc_rdma_xb_dma_map, &args);
}
/* The svc_rqst and all resources it owns are released as soon as
@@ -894,9 +942,6 @@ int svc_rdma_sendto(struct svc_rqst *rqstp)
container_of(xprt, struct svcxprt_rdma, sc_xprt);
struct svc_rdma_recv_ctxt *rctxt = rqstp->rq_xprt_ctxt;
__be32 *rdma_argp = rctxt->rc_recv_buf;
- __be32 *wr_lst = rctxt->rc_write_list;
- __be32 *rp_ch = rctxt->rc_reply_chunk;
- struct xdr_buf *xdr = &rqstp->rq_res;
struct svc_rdma_send_ctxt *sctxt;
__be32 *p;
int ret;
@@ -914,45 +959,22 @@ int svc_rdma_sendto(struct svc_rqst *rqstp)
rpcrdma_fixed_maxsz * sizeof(*p));
if (!p)
goto err0;
+
+ ret = svc_rdma_send_reply_chunk(rdma, rctxt, &rqstp->rq_res);
+ if (ret < 0)
+ goto err2;
+
*p++ = *rdma_argp;
*p++ = *(rdma_argp + 1);
*p++ = rdma->sc_fc_credits;
- *p = rp_ch ? rdma_nomsg : rdma_msg;
+ *p = pcl_is_empty(&rctxt->rc_reply_pcl) ? rdma_msg : rdma_nomsg;
if (svc_rdma_encode_read_list(sctxt) < 0)
goto err0;
- if (wr_lst) {
- /* XXX: Presume the client sent only one Write chunk */
- unsigned long offset;
- unsigned int length;
-
- if (rctxt->rc_read_payload_length) {
- offset = rctxt->rc_read_payload_offset;
- length = rctxt->rc_read_payload_length;
- } else {
- offset = xdr->head[0].iov_len;
- length = xdr->page_len;
- }
- ret = svc_rdma_send_write_chunk(rdma, wr_lst, xdr, offset,
- length);
- if (ret < 0)
- goto err2;
- if (svc_rdma_encode_write_list(rctxt, sctxt, length) < 0)
- goto err0;
- } else {
- if (xdr_stream_encode_item_absent(&sctxt->sc_stream) < 0)
- goto err0;
- }
- if (rp_ch) {
- ret = svc_rdma_send_reply_chunk(rdma, rctxt, &rqstp->rq_res);
- if (ret < 0)
- goto err2;
- if (svc_rdma_encode_reply_chunk(rctxt, sctxt, ret) < 0)
- goto err0;
- } else {
- if (xdr_stream_encode_item_absent(&sctxt->sc_stream) < 0)
- goto err0;
- }
+ if (svc_rdma_encode_write_list(rctxt, sctxt) < 0)
+ goto err0;
+ if (svc_rdma_encode_reply_chunk(rctxt, sctxt, ret) < 0)
+ goto err0;
ret = svc_rdma_send_reply_msg(rdma, sctxt, rctxt, rqstp);
if (ret < 0)
@@ -979,28 +1001,46 @@ int svc_rdma_sendto(struct svc_rqst *rqstp)
}
/**
- * svc_rdma_read_payload - special processing for a READ payload
+ * svc_rdma_result_payload - special processing for a result payload
* @rqstp: svc_rqst to operate on
* @offset: payload's byte offset in @xdr
* @length: size of payload, in bytes
*
- * Returns zero on success.
- *
- * For the moment, just record the xdr_buf location of the READ
- * payload. svc_rdma_sendto will use that location later when
- * we actually send the payload.
+ * Return values:
+ * %0 if successful or nothing needed to be done
+ * %-EMSGSIZE on XDR buffer overflow
+ * %-E2BIG if the payload was larger than the Write chunk
+ * %-EINVAL if client provided too many segments
+ * %-ENOMEM if rdma_rw context pool was exhausted
+ * %-ENOTCONN if posting failed (connection is lost)
+ * %-EIO if rdma_rw initialization failed (DMA mapping, etc)
*/
-int svc_rdma_read_payload(struct svc_rqst *rqstp, unsigned int offset,
- unsigned int length)
+int svc_rdma_result_payload(struct svc_rqst *rqstp, unsigned int offset,
+ unsigned int length)
{
struct svc_rdma_recv_ctxt *rctxt = rqstp->rq_xprt_ctxt;
+ struct svc_rdma_chunk *chunk;
+ struct svcxprt_rdma *rdma;
+ struct xdr_buf subbuf;
+ int ret;
- /* XXX: Just one READ payload slot for now, since our
- * transport implementation currently supports only one
- * Write chunk.
- */
- rctxt->rc_read_payload_offset = offset;
- rctxt->rc_read_payload_length = length;
+ chunk = rctxt->rc_cur_result_payload;
+ if (!length || !chunk)
+ return 0;
+ rctxt->rc_cur_result_payload =
+ pcl_next_chunk(&rctxt->rc_write_pcl, chunk);
+ if (length > chunk->ch_length)
+ return -E2BIG;
+ chunk->ch_position = offset;
+ chunk->ch_payload_length = length;
+
+ if (xdr_buf_subsegment(&rqstp->rq_res, &subbuf, offset, length))
+ return -EMSGSIZE;
+
+ rdma = container_of(rqstp->rq_xprt, struct svcxprt_rdma, sc_xprt);
+ ret = svc_rdma_send_write_chunk(rdma, chunk, &subbuf);
+ if (ret < 0)
+ return ret;
return 0;
}
diff --git a/net/sunrpc/xprtrdma/svc_rdma_transport.c b/net/sunrpc/xprtrdma/svc_rdma_transport.c
index fb044792b571..afba4e9d5425 100644
--- a/net/sunrpc/xprtrdma/svc_rdma_transport.c
+++ b/net/sunrpc/xprtrdma/svc_rdma_transport.c
@@ -80,7 +80,7 @@ static const struct svc_xprt_ops svc_rdma_ops = {
.xpo_create = svc_rdma_create,
.xpo_recvfrom = svc_rdma_recvfrom,
.xpo_sendto = svc_rdma_sendto,
- .xpo_read_payload = svc_rdma_read_payload,
+ .xpo_result_payload = svc_rdma_result_payload,
.xpo_release_rqst = svc_rdma_release_rqst,
.xpo_detach = svc_rdma_detach,
.xpo_free = svc_rdma_free,
diff --git a/net/sunrpc/xprtrdma/transport.c b/net/sunrpc/xprtrdma/transport.c
index 8915e42240d3..78d29d1bcc20 100644
--- a/net/sunrpc/xprtrdma/transport.c
+++ b/net/sunrpc/xprtrdma/transport.c
@@ -599,11 +599,12 @@ static void
xprt_rdma_free(struct rpc_task *task)
{
struct rpc_rqst *rqst = task->tk_rqstp;
- struct rpcrdma_xprt *r_xprt = rpcx_to_rdmax(rqst->rq_xprt);
struct rpcrdma_req *req = rpcr_to_rdmar(rqst);
- if (!list_empty(&req->rl_registered))
- frwr_unmap_sync(r_xprt, req);
+ if (unlikely(!list_empty(&req->rl_registered))) {
+ trace_xprtrdma_mrs_zap(task);
+ frwr_unmap_sync(rpcx_to_rdmax(rqst->rq_xprt), req);
+ }
/* XXX: If the RPC is completing because of a signal and
* not because a reply was received, we ought to ensure
@@ -768,6 +769,7 @@ static struct xprt_class xprt_rdma = {
.owner = THIS_MODULE,
.ident = XPRT_TRANSPORT_RDMA,
.setup = xprt_setup_rdma,
+ .netid = { "rdma", "rdma6", "" },
};
void xprt_rdma_cleanup(void)
diff --git a/net/sunrpc/xprtrdma/verbs.c b/net/sunrpc/xprtrdma/verbs.c
index ad6e2e4994ce..ec912cf9c618 100644
--- a/net/sunrpc/xprtrdma/verbs.c
+++ b/net/sunrpc/xprtrdma/verbs.c
@@ -167,7 +167,7 @@ static void rpcrdma_wc_send(struct ib_cq *cq, struct ib_wc *wc)
struct rpcrdma_xprt *r_xprt = cq->cq_context;
/* WARNING: Only wr_cqe and status are reliable at this point */
- trace_xprtrdma_wc_send(sc, wc);
+ trace_xprtrdma_wc_send(wc, &sc->sc_cid);
rpcrdma_sendctx_put_locked(r_xprt, sc);
rpcrdma_flush_disconnect(r_xprt, wc);
}
@@ -186,7 +186,7 @@ static void rpcrdma_wc_receive(struct ib_cq *cq, struct ib_wc *wc)
struct rpcrdma_xprt *r_xprt = cq->cq_context;
/* WARNING: Only wr_cqe and status are reliable at this point */
- trace_xprtrdma_wc_receive(wc);
+ trace_xprtrdma_wc_receive(wc, &rep->rr_cid);
--r_xprt->rx_ep->re_receive_count;
if (wc->status != IB_WC_SUCCESS)
goto out_flushed;
@@ -643,6 +643,9 @@ static struct rpcrdma_sendctx *rpcrdma_sendctx_create(struct rpcrdma_ep *ep)
return NULL;
sc->sc_cqe.done = rpcrdma_wc_send;
+ sc->sc_cid.ci_queue_id = ep->re_attr.send_cq->res.id;
+ sc->sc_cid.ci_completion_id =
+ atomic_inc_return(&ep->re_completion_ids);
return sc;
}
@@ -972,6 +975,9 @@ struct rpcrdma_rep *rpcrdma_rep_create(struct rpcrdma_xprt *r_xprt,
if (!rpcrdma_regbuf_dma_map(r_xprt, rep->rr_rdmabuf))
goto out_free_regbuf;
+ rep->rr_cid.ci_completion_id =
+ atomic_inc_return(&r_xprt->rx_ep->re_completion_ids);
+
xdr_buf_init(&rep->rr_hdrbuf, rdmab_data(rep->rr_rdmabuf),
rdmab_length(rep->rr_rdmabuf));
rep->rr_cqe.done = rpcrdma_wc_receive;
@@ -1179,25 +1185,6 @@ rpcrdma_mr_get(struct rpcrdma_xprt *r_xprt)
}
/**
- * rpcrdma_mr_put - DMA unmap an MR and release it
- * @mr: MR to release
- *
- */
-void rpcrdma_mr_put(struct rpcrdma_mr *mr)
-{
- struct rpcrdma_xprt *r_xprt = mr->mr_xprt;
-
- if (mr->mr_dir != DMA_NONE) {
- trace_xprtrdma_mr_unmap(mr);
- ib_dma_unmap_sg(r_xprt->rx_ep->re_id->device,
- mr->mr_sg, mr->mr_nents, mr->mr_dir);
- mr->mr_dir = DMA_NONE;
- }
-
- rpcrdma_mr_push(mr, &mr->mr_req->rl_free_mrs);
-}
-
-/**
* rpcrdma_buffer_get - Get a request buffer
* @buffers: Buffer pool from which to obtain a buffer
*
@@ -1411,6 +1398,7 @@ void rpcrdma_post_recvs(struct rpcrdma_xprt *r_xprt, bool temp)
if (!rep)
break;
+ rep->rr_cid.ci_queue_id = ep->re_attr.recv_cq->res.id;
trace_xprtrdma_post_recv(rep);
rep->rr_recv_wr.next = wr;
wr = &rep->rr_recv_wr;
diff --git a/net/sunrpc/xprtrdma/xprt_rdma.h b/net/sunrpc/xprtrdma/xprt_rdma.h
index 43974ef39a50..94b28657aeeb 100644
--- a/net/sunrpc/xprtrdma/xprt_rdma.h
+++ b/net/sunrpc/xprtrdma/xprt_rdma.h
@@ -53,6 +53,7 @@
#include <rdma/ib_verbs.h> /* RDMA verbs api */
#include <linux/sunrpc/clnt.h> /* rpc_xprt */
+#include <linux/sunrpc/rpc_rdma_cid.h> /* completion IDs */
#include <linux/sunrpc/rpc_rdma.h> /* RPC/RDMA protocol */
#include <linux/sunrpc/xprtrdma.h> /* xprt parameters */
@@ -93,6 +94,8 @@ struct rpcrdma_ep {
unsigned int re_max_requests; /* depends on device */
unsigned int re_inline_send; /* negotiated */
unsigned int re_inline_recv; /* negotiated */
+
+ atomic_t re_completion_ids;
};
/* Pre-allocate extra Work Requests for handling backward receives
@@ -180,6 +183,8 @@ enum {
struct rpcrdma_rep {
struct ib_cqe rr_cqe;
+ struct rpc_rdma_cid rr_cid;
+
__be32 rr_xid;
__be32 rr_vers;
__be32 rr_proc;
@@ -211,6 +216,7 @@ enum {
struct rpcrdma_req;
struct rpcrdma_sendctx {
struct ib_cqe sc_cqe;
+ struct rpc_rdma_cid sc_cid;
struct rpcrdma_req *sc_req;
unsigned int sc_unmap_count;
struct ib_sge sc_sges[];
@@ -225,6 +231,7 @@ struct rpcrdma_sendctx {
struct rpcrdma_frwr {
struct ib_mr *fr_mr;
struct ib_cqe fr_cqe;
+ struct rpc_rdma_cid fr_cid;
struct completion fr_linv_done;
union {
struct ib_reg_wr fr_regwr;
@@ -236,6 +243,7 @@ struct rpcrdma_req;
struct rpcrdma_mr {
struct list_head mr_list;
struct rpcrdma_req *mr_req;
+ struct ib_device *mr_device;
struct scatterlist *mr_sg;
int mr_nents;
enum dma_data_direction mr_dir;
@@ -466,7 +474,6 @@ void rpcrdma_buffer_destroy(struct rpcrdma_buffer *);
struct rpcrdma_sendctx *rpcrdma_sendctx_get_locked(struct rpcrdma_xprt *r_xprt);
struct rpcrdma_mr *rpcrdma_mr_get(struct rpcrdma_xprt *r_xprt);
-void rpcrdma_mr_put(struct rpcrdma_mr *mr);
void rpcrdma_mrs_refresh(struct rpcrdma_xprt *r_xprt);
struct rpcrdma_req *rpcrdma_buffer_get(struct rpcrdma_buffer *);
diff --git a/net/sunrpc/xprtsock.c b/net/sunrpc/xprtsock.c
index 7090bbee0ec5..c56a66cdf4ac 100644
--- a/net/sunrpc/xprtsock.c
+++ b/net/sunrpc/xprtsock.c
@@ -433,7 +433,8 @@ xs_read_xdr_buf(struct socket *sock, struct msghdr *msg, int flags,
if (ret <= 0)
goto sock_err;
xs_flush_bvec(buf->bvec, ret, seek + buf->page_base);
- offset += ret - buf->page_base;
+ ret -= buf->page_base;
+ offset += ret;
if (offset == count || msg->msg_flags & (MSG_EOR|MSG_TRUNC))
goto out;
if (ret != want)
@@ -3059,6 +3060,7 @@ static struct xprt_class xs_local_transport = {
.owner = THIS_MODULE,
.ident = XPRT_TRANSPORT_LOCAL,
.setup = xs_setup_local,
+ .netid = { "" },
};
static struct xprt_class xs_udp_transport = {
@@ -3067,6 +3069,7 @@ static struct xprt_class xs_udp_transport = {
.owner = THIS_MODULE,
.ident = XPRT_TRANSPORT_UDP,
.setup = xs_setup_udp,
+ .netid = { "udp", "udp6", "" },
};
static struct xprt_class xs_tcp_transport = {
@@ -3075,6 +3078,7 @@ static struct xprt_class xs_tcp_transport = {
.owner = THIS_MODULE,
.ident = XPRT_TRANSPORT_TCP,
.setup = xs_setup_tcp,
+ .netid = { "tcp", "tcp6", "" },
};
static struct xprt_class xs_bc_tcp_transport = {
@@ -3083,6 +3087,7 @@ static struct xprt_class xs_bc_tcp_transport = {
.owner = THIS_MODULE,
.ident = XPRT_TRANSPORT_BC_TCP,
.setup = xs_setup_bc_tcp,
+ .netid = { "" },
};
/**
diff --git a/net/xfrm/xfrm_state.c b/net/xfrm/xfrm_state.c
index 2f1517827995..d01ca1a18418 100644
--- a/net/xfrm/xfrm_state.c
+++ b/net/xfrm/xfrm_state.c
@@ -1021,7 +1021,8 @@ static void xfrm_state_look_at(struct xfrm_policy *pol, struct xfrm_state *x,
if ((x->sel.family &&
(x->sel.family != family ||
!xfrm_selector_match(&x->sel, fl, family))) ||
- !security_xfrm_state_pol_flow_match(x, pol, fl))
+ !security_xfrm_state_pol_flow_match(x, pol,
+ &fl->u.__fl_common))
return;
if (!*best ||
@@ -1036,7 +1037,8 @@ static void xfrm_state_look_at(struct xfrm_policy *pol, struct xfrm_state *x,
if ((!x->sel.family ||
(x->sel.family == family &&
xfrm_selector_match(&x->sel, fl, family))) &&
- security_xfrm_state_pol_flow_match(x, pol, fl))
+ security_xfrm_state_pol_flow_match(x, pol,
+ &fl->u.__fl_common))
*error = -ESRCH;
}
}