summaryrefslogtreecommitdiffstats
diff options
context:
space:
mode:
authorDavid S. Miller <davem@davemloft.net>2019-02-06 17:00:15 -0800
committerDavid S. Miller <davem@davemloft.net>2019-02-06 17:00:15 -0800
commit042a41977b7c49dddad7560ea8697b007fbf33bc (patch)
tree44cee956f02c52f9f5fc0f5f3a87a2b5157625e8
parente90b1fd83c94d536375d8b9f4916afd15f4db0ed (diff)
parentfd261ce6a30e01ad67c416e2c67e263024b3a6f9 (diff)
downloadlinux-042a41977b7c49dddad7560ea8697b007fbf33bc.tar.bz2
Merge branch 'for_net-next-5.1/rds-tos-v4' of git://git.kernel.org/pub/scm/linux/kernel/git/ssantosh/linux
Santosh Shilimkar says: ==================== rds: add tos support RDS applications make use of tos to classify database traffic. This feature has been used in shipping products from 2.6.32 based kernels. Its tied with RDS v4.1 protocol version and the compatibility gets negotiated as part of connections setup. Patchset keeps full backward compatibility using existing connection negotiation scheme. Currently the feature is exploited by RDMA transport and for TCP transport the user tos values are mapped to same default class (0). For RDMA transports, RDMA CM service type API is used to set up different SL(service lanes) and the IB fabric is configured for tos mapping using Subnet Manager(SL to VL mappings). Similarly for ROCE fabric, user priority is mapped with different DSCP code points which are associated with different switch queues in the fabric. The original code was developed by Bang Nguyen in downstream kernel back in 2.6.32 kernel days and it has evolved significantly over period of time. Thanks to Yanjun for doing testing with various combinations of host like v3.1<->v4.1, v4.1.<->v3.1, v4.1 upstream to shipping v4.1 etc etc ==================== Signed-off-by: David S. Miller <davem@davemloft.net>
-rw-r--r--include/uapi/linux/rds.h11
-rw-r--r--net/rds/af_rds.c37
-rw-r--r--net/rds/connection.c21
-rw-r--r--net/rds/ib.c11
-rw-r--r--net/rds/ib.h4
-rw-r--r--net/rds/ib_cm.c72
-rw-r--r--net/rds/ib_recv.c4
-rw-r--r--net/rds/ib_send.c5
-rw-r--r--net/rds/rdma_transport.c14
-rw-r--r--net/rds/rdma_transport.h6
-rw-r--r--net/rds/rds.h14
-rw-r--r--net/rds/recv.c1
-rw-r--r--net/rds/send.c7
-rw-r--r--net/rds/tcp.c8
-rw-r--r--net/rds/tcp_listen.c2
-rw-r--r--net/rds/threads.c1
16 files changed, 166 insertions, 52 deletions
diff --git a/include/uapi/linux/rds.h b/include/uapi/linux/rds.h
index 8b73cb603c5f..5d0f76c780e5 100644
--- a/include/uapi/linux/rds.h
+++ b/include/uapi/linux/rds.h
@@ -69,6 +69,12 @@
#define RDS_TRANS_COUNT 3
#define RDS_TRANS_NONE (~0)
+/* IOCTLS commands for SOL_RDS */
+#define SIOCRDSSETTOS (SIOCPROTOPRIVATE)
+#define SIOCRDSGETTOS (SIOCPROTOPRIVATE + 1)
+
+typedef __u8 rds_tos_t;
+
/*
* Control message types for SOL_RDS.
*
@@ -149,6 +155,7 @@ struct rds_info_connection {
__be32 faddr;
__u8 transport[TRANSNAMSIZ]; /* null term ascii */
__u8 flags;
+ __u8 tos;
} __attribute__((packed));
struct rds6_info_connection {
@@ -171,6 +178,7 @@ struct rds_info_message {
__be16 lport;
__be16 fport;
__u8 flags;
+ __u8 tos;
} __attribute__((packed));
struct rds6_info_message {
@@ -214,6 +222,7 @@ struct rds_info_tcp_socket {
__u32 last_sent_nxt;
__u32 last_expected_una;
__u32 last_seen_una;
+ __u8 tos;
} __attribute__((packed));
struct rds6_info_tcp_socket {
@@ -240,6 +249,7 @@ struct rds_info_rdma_connection {
__u32 max_send_sge;
__u32 rdma_mr_max;
__u32 rdma_mr_size;
+ __u8 tos;
};
struct rds6_info_rdma_connection {
@@ -253,6 +263,7 @@ struct rds6_info_rdma_connection {
__u32 max_send_sge;
__u32 rdma_mr_max;
__u32 rdma_mr_size;
+ __u8 tos;
};
/* RDS message Receive Path Latency points */
diff --git a/net/rds/af_rds.c b/net/rds/af_rds.c
index 65571a6273c3..d6cc97fbbbb0 100644
--- a/net/rds/af_rds.c
+++ b/net/rds/af_rds.c
@@ -254,7 +254,40 @@ static __poll_t rds_poll(struct file *file, struct socket *sock,
static int rds_ioctl(struct socket *sock, unsigned int cmd, unsigned long arg)
{
- return -ENOIOCTLCMD;
+ struct rds_sock *rs = rds_sk_to_rs(sock->sk);
+ rds_tos_t utos, tos = 0;
+
+ switch (cmd) {
+ case SIOCRDSSETTOS:
+ if (get_user(utos, (rds_tos_t __user *)arg))
+ return -EFAULT;
+
+ if (rs->rs_transport &&
+ rs->rs_transport->get_tos_map)
+ tos = rs->rs_transport->get_tos_map(utos);
+ else
+ return -ENOIOCTLCMD;
+
+ spin_lock_bh(&rds_sock_lock);
+ if (rs->rs_tos || rs->rs_conn) {
+ spin_unlock_bh(&rds_sock_lock);
+ return -EINVAL;
+ }
+ rs->rs_tos = tos;
+ spin_unlock_bh(&rds_sock_lock);
+ break;
+ case SIOCRDSGETTOS:
+ spin_lock_bh(&rds_sock_lock);
+ tos = rs->rs_tos;
+ spin_unlock_bh(&rds_sock_lock);
+ if (put_user(tos, (rds_tos_t __user *)arg))
+ return -EFAULT;
+ break;
+ default:
+ return -ENOIOCTLCMD;
+ }
+
+ return 0;
}
static int rds_cancel_sent_to(struct rds_sock *rs, char __user *optval,
@@ -650,6 +683,8 @@ static int __rds_create(struct socket *sock, struct sock *sk, int protocol)
spin_lock_init(&rs->rs_rdma_lock);
rs->rs_rdma_keys = RB_ROOT;
rs->rs_rx_traces = 0;
+ rs->rs_tos = 0;
+ rs->rs_conn = NULL;
spin_lock_bh(&rds_sock_lock);
list_add_tail(&rs->rs_item, &rds_sock_list);
diff --git a/net/rds/connection.c b/net/rds/connection.c
index 3bd2f4a5a30d..7ea134f9a825 100644
--- a/net/rds/connection.c
+++ b/net/rds/connection.c
@@ -84,7 +84,7 @@ static struct rds_connection *rds_conn_lookup(struct net *net,
const struct in6_addr *laddr,
const struct in6_addr *faddr,
struct rds_transport *trans,
- int dev_if)
+ u8 tos, int dev_if)
{
struct rds_connection *conn, *ret = NULL;
@@ -92,6 +92,7 @@ static struct rds_connection *rds_conn_lookup(struct net *net,
if (ipv6_addr_equal(&conn->c_faddr, faddr) &&
ipv6_addr_equal(&conn->c_laddr, laddr) &&
conn->c_trans == trans &&
+ conn->c_tos == tos &&
net == rds_conn_net(conn) &&
conn->c_dev_if == dev_if) {
ret = conn;
@@ -139,6 +140,7 @@ static void __rds_conn_path_init(struct rds_connection *conn,
atomic_set(&cp->cp_state, RDS_CONN_DOWN);
cp->cp_send_gen = 0;
cp->cp_reconnect_jiffies = 0;
+ cp->cp_conn->c_proposed_version = RDS_PROTOCOL_VERSION;
INIT_DELAYED_WORK(&cp->cp_send_w, rds_send_worker);
INIT_DELAYED_WORK(&cp->cp_recv_w, rds_recv_worker);
INIT_DELAYED_WORK(&cp->cp_conn_w, rds_connect_worker);
@@ -159,7 +161,7 @@ static struct rds_connection *__rds_conn_create(struct net *net,
const struct in6_addr *laddr,
const struct in6_addr *faddr,
struct rds_transport *trans,
- gfp_t gfp,
+ gfp_t gfp, u8 tos,
int is_outgoing,
int dev_if)
{
@@ -171,7 +173,7 @@ static struct rds_connection *__rds_conn_create(struct net *net,
int npaths = (trans->t_mp_capable ? RDS_MPATH_WORKERS : 1);
rcu_read_lock();
- conn = rds_conn_lookup(net, head, laddr, faddr, trans, dev_if);
+ conn = rds_conn_lookup(net, head, laddr, faddr, trans, tos, dev_if);
if (conn &&
conn->c_loopback &&
conn->c_trans != &rds_loop_transport &&
@@ -205,6 +207,7 @@ static struct rds_connection *__rds_conn_create(struct net *net,
conn->c_isv6 = !ipv6_addr_v4mapped(laddr);
conn->c_faddr = *faddr;
conn->c_dev_if = dev_if;
+ conn->c_tos = tos;
#if IS_ENABLED(CONFIG_IPV6)
/* If the local address is link local, set c_bound_if to be the
@@ -297,7 +300,7 @@ static struct rds_connection *__rds_conn_create(struct net *net,
struct rds_connection *found;
found = rds_conn_lookup(net, head, laddr, faddr, trans,
- dev_if);
+ tos, dev_if);
if (found) {
struct rds_conn_path *cp;
int i;
@@ -332,10 +335,10 @@ out:
struct rds_connection *rds_conn_create(struct net *net,
const struct in6_addr *laddr,
const struct in6_addr *faddr,
- struct rds_transport *trans, gfp_t gfp,
- int dev_if)
+ struct rds_transport *trans, u8 tos,
+ gfp_t gfp, int dev_if)
{
- return __rds_conn_create(net, laddr, faddr, trans, gfp, 0, dev_if);
+ return __rds_conn_create(net, laddr, faddr, trans, gfp, tos, 0, dev_if);
}
EXPORT_SYMBOL_GPL(rds_conn_create);
@@ -343,9 +346,9 @@ struct rds_connection *rds_conn_create_outgoing(struct net *net,
const struct in6_addr *laddr,
const struct in6_addr *faddr,
struct rds_transport *trans,
- gfp_t gfp, int dev_if)
+ u8 tos, gfp_t gfp, int dev_if)
{
- return __rds_conn_create(net, laddr, faddr, trans, gfp, 1, dev_if);
+ return __rds_conn_create(net, laddr, faddr, trans, gfp, tos, 1, dev_if);
}
EXPORT_SYMBOL_GPL(rds_conn_create_outgoing);
diff --git a/net/rds/ib.c b/net/rds/ib.c
index 9d7b7586f240..2da9b75bad16 100644
--- a/net/rds/ib.c
+++ b/net/rds/ib.c
@@ -301,6 +301,7 @@ static int rds_ib_conn_info_visitor(struct rds_connection *conn,
iinfo->src_addr = conn->c_laddr.s6_addr32[3];
iinfo->dst_addr = conn->c_faddr.s6_addr32[3];
+ iinfo->tos = conn->c_tos;
memset(&iinfo->src_gid, 0, sizeof(iinfo->src_gid));
memset(&iinfo->dst_gid, 0, sizeof(iinfo->dst_gid));
@@ -514,6 +515,15 @@ void rds_ib_exit(void)
rds_ib_mr_exit();
}
+static u8 rds_ib_get_tos_map(u8 tos)
+{
+ /* 1:1 user to transport map for RDMA transport.
+ * In future, if custom map is desired, hook can export
+ * user configurable map.
+ */
+ return tos;
+}
+
struct rds_transport rds_ib_transport = {
.laddr_check = rds_ib_laddr_check,
.xmit_path_complete = rds_ib_xmit_path_complete,
@@ -536,6 +546,7 @@ struct rds_transport rds_ib_transport = {
.sync_mr = rds_ib_sync_mr,
.free_mr = rds_ib_free_mr,
.flush_mrs = rds_ib_flush_mrs,
+ .get_tos_map = rds_ib_get_tos_map,
.t_owner = THIS_MODULE,
.t_name = "infiniband",
.t_unloading = rds_ib_is_unloading,
diff --git a/net/rds/ib.h b/net/rds/ib.h
index 71ff356ee702..752f92235a38 100644
--- a/net/rds/ib.h
+++ b/net/rds/ib.h
@@ -67,7 +67,9 @@ struct rds_ib_conn_priv_cmn {
u8 ricpc_protocol_major;
u8 ricpc_protocol_minor;
__be16 ricpc_protocol_minor_mask; /* bitmask */
- __be32 ricpc_reserved1;
+ u8 ricpc_dp_toss;
+ u8 ripc_reserved1;
+ __be16 ripc_reserved2;
__be64 ricpc_ack_seq;
__be32 ricpc_credit; /* non-zero enables flow ctl */
};
diff --git a/net/rds/ib_cm.c b/net/rds/ib_cm.c
index bfbb31f0c7fd..66c6eb56072b 100644
--- a/net/rds/ib_cm.c
+++ b/net/rds/ib_cm.c
@@ -133,23 +133,24 @@ void rds_ib_cm_connect_complete(struct rds_connection *conn, struct rdma_cm_even
rds_ib_set_flow_control(conn, be32_to_cpu(credit));
}
- if (conn->c_version < RDS_PROTOCOL(3, 1)) {
- pr_notice("RDS/IB: Connection <%pI6c,%pI6c> version %u.%u no longer supported\n",
- &conn->c_laddr, &conn->c_faddr,
- RDS_PROTOCOL_MAJOR(conn->c_version),
- RDS_PROTOCOL_MINOR(conn->c_version));
- set_bit(RDS_DESTROY_PENDING, &conn->c_path[0].cp_flags);
- rds_conn_destroy(conn);
- return;
- } else {
- pr_notice("RDS/IB: %s conn connected <%pI6c,%pI6c> version %u.%u%s\n",
- ic->i_active_side ? "Active" : "Passive",
- &conn->c_laddr, &conn->c_faddr,
- RDS_PROTOCOL_MAJOR(conn->c_version),
- RDS_PROTOCOL_MINOR(conn->c_version),
- ic->i_flowctl ? ", flow control" : "");
+ if (conn->c_version < RDS_PROTOCOL_VERSION) {
+ if (conn->c_version != RDS_PROTOCOL_COMPAT_VERSION) {
+ pr_notice("RDS/IB: Connection <%pI6c,%pI6c> version %u.%u no longer supported\n",
+ &conn->c_laddr, &conn->c_faddr,
+ RDS_PROTOCOL_MAJOR(conn->c_version),
+ RDS_PROTOCOL_MINOR(conn->c_version));
+ rds_conn_destroy(conn);
+ return;
+ }
}
+ pr_notice("RDS/IB: %s conn connected <%pI6c,%pI6c,%d> version %u.%u%s\n",
+ ic->i_active_side ? "Active" : "Passive",
+ &conn->c_laddr, &conn->c_faddr, conn->c_tos,
+ RDS_PROTOCOL_MAJOR(conn->c_version),
+ RDS_PROTOCOL_MINOR(conn->c_version),
+ ic->i_flowctl ? ", flow control" : "");
+
atomic_set(&ic->i_cq_quiesce, 0);
/* Init rings and fill recv. this needs to wait until protocol
@@ -184,6 +185,7 @@ void rds_ib_cm_connect_complete(struct rds_connection *conn, struct rdma_cm_even
NULL);
}
+ conn->c_proposed_version = conn->c_version;
rds_connect_complete(conn);
}
@@ -220,6 +222,7 @@ static void rds_ib_cm_fill_conn_param(struct rds_connection *conn,
cpu_to_be16(RDS_IB_SUPPORTED_PROTOCOLS);
dp->ricp_v6.dp_ack_seq =
cpu_to_be64(rds_ib_piggyb_ack(ic));
+ dp->ricp_v6.dp_cmn.ricpc_dp_toss = conn->c_tos;
conn_param->private_data = &dp->ricp_v6;
conn_param->private_data_len = sizeof(dp->ricp_v6);
@@ -234,6 +237,7 @@ static void rds_ib_cm_fill_conn_param(struct rds_connection *conn,
cpu_to_be16(RDS_IB_SUPPORTED_PROTOCOLS);
dp->ricp_v4.dp_ack_seq =
cpu_to_be64(rds_ib_piggyb_ack(ic));
+ dp->ricp_v4.dp_cmn.ricpc_dp_toss = conn->c_tos;
conn_param->private_data = &dp->ricp_v4;
conn_param->private_data_len = sizeof(dp->ricp_v4);
@@ -389,10 +393,9 @@ static void rds_ib_qp_event_handler(struct ib_event *event, void *data)
rdma_notify(ic->i_cm_id, IB_EVENT_COMM_EST);
break;
default:
- rdsdebug("Fatal QP Event %u (%s) "
- "- connection %pI6c->%pI6c, reconnecting\n",
- event->event, ib_event_msg(event->event),
- &conn->c_laddr, &conn->c_faddr);
+ rdsdebug("Fatal QP Event %u (%s) - connection %pI6c->%pI6c, reconnecting\n",
+ event->event, ib_event_msg(event->event),
+ &conn->c_laddr, &conn->c_faddr);
rds_conn_drop(conn);
break;
}
@@ -660,13 +663,16 @@ static u32 rds_ib_protocol_compatible(struct rdma_cm_event *event, bool isv6)
/* Even if len is crap *now* I still want to check it. -ASG */
if (event->param.conn.private_data_len < data_len || major == 0)
- return RDS_PROTOCOL_3_0;
+ return RDS_PROTOCOL_4_0;
common = be16_to_cpu(mask) & RDS_IB_SUPPORTED_PROTOCOLS;
- if (major == 3 && common) {
- version = RDS_PROTOCOL_3_0;
+ if (major == 4 && common) {
+ version = RDS_PROTOCOL_4_0;
while ((common >>= 1) != 0)
version++;
+ } else if (RDS_PROTOCOL_COMPAT_VERSION ==
+ RDS_PROTOCOL(major, minor)) {
+ version = RDS_PROTOCOL_COMPAT_VERSION;
} else {
if (isv6)
printk_ratelimited(KERN_NOTICE "RDS: Connection from %pI6c using incompatible protocol version %u.%u\n",
@@ -729,8 +735,10 @@ int rds_ib_cm_handle_connect(struct rdma_cm_id *cm_id,
/* Check whether the remote protocol version matches ours. */
version = rds_ib_protocol_compatible(event, isv6);
- if (!version)
+ if (!version) {
+ err = RDS_RDMA_REJ_INCOMPAT;
goto out;
+ }
dp = event->param.conn.private_data;
if (isv6) {
@@ -771,15 +779,16 @@ int rds_ib_cm_handle_connect(struct rdma_cm_id *cm_id,
daddr6 = &d_mapped_addr;
}
- rdsdebug("saddr %pI6c daddr %pI6c RDSv%u.%u lguid 0x%llx fguid "
- "0x%llx\n", saddr6, daddr6,
- RDS_PROTOCOL_MAJOR(version), RDS_PROTOCOL_MINOR(version),
+ rdsdebug("saddr %pI6c daddr %pI6c RDSv%u.%u lguid 0x%llx fguid 0x%llx, tos:%d\n",
+ saddr6, daddr6, RDS_PROTOCOL_MAJOR(version),
+ RDS_PROTOCOL_MINOR(version),
(unsigned long long)be64_to_cpu(lguid),
- (unsigned long long)be64_to_cpu(fguid));
+ (unsigned long long)be64_to_cpu(fguid), dp_cmn->ricpc_dp_toss);
/* RDS/IB is not currently netns aware, thus init_net */
conn = rds_conn_create(&init_net, daddr6, saddr6,
- &rds_ib_transport, GFP_KERNEL, ifindex);
+ &rds_ib_transport, dp_cmn->ricpc_dp_toss,
+ GFP_KERNEL, ifindex);
if (IS_ERR(conn)) {
rdsdebug("rds_conn_create failed (%ld)\n", PTR_ERR(conn));
conn = NULL;
@@ -846,7 +855,7 @@ out:
if (conn)
mutex_unlock(&conn->c_cm_lock);
if (err)
- rdma_reject(cm_id, NULL, 0);
+ rdma_reject(cm_id, &err, sizeof(int));
return destroy;
}
@@ -861,7 +870,7 @@ int rds_ib_cm_initiate_connect(struct rdma_cm_id *cm_id, bool isv6)
/* If the peer doesn't do protocol negotiation, we must
* default to RDSv3.0 */
- rds_ib_set_protocol(conn, RDS_PROTOCOL_3_0);
+ rds_ib_set_protocol(conn, RDS_PROTOCOL_4_1);
ic->i_flowctl = rds_ib_sysctl_flow_control; /* advertise flow control */
ret = rds_ib_setup_qp(conn);
@@ -870,7 +879,8 @@ int rds_ib_cm_initiate_connect(struct rdma_cm_id *cm_id, bool isv6)
goto out;
}
- rds_ib_cm_fill_conn_param(conn, &conn_param, &dp, RDS_PROTOCOL_VERSION,
+ rds_ib_cm_fill_conn_param(conn, &conn_param, &dp,
+ conn->c_proposed_version,
UINT_MAX, UINT_MAX, isv6);
ret = rdma_connect(cm_id, &conn_param);
if (ret)
diff --git a/net/rds/ib_recv.c b/net/rds/ib_recv.c
index 2f16146e4ec9..d395eec98959 100644
--- a/net/rds/ib_recv.c
+++ b/net/rds/ib_recv.c
@@ -986,9 +986,9 @@ void rds_ib_recv_cqe_handler(struct rds_ib_connection *ic,
} else {
/* We expect errors as the qp is drained during shutdown */
if (rds_conn_up(conn) || rds_conn_connecting(conn))
- rds_ib_conn_error(conn, "recv completion on <%pI6c,%pI6c> had status %u (%s), disconnecting and reconnecting\n",
+ rds_ib_conn_error(conn, "recv completion on <%pI6c,%pI6c, %d> had status %u (%s), disconnecting and reconnecting\n",
&conn->c_laddr, &conn->c_faddr,
- wc->status,
+ conn->c_tos, wc->status,
ib_wc_status_msg(wc->status));
}
diff --git a/net/rds/ib_send.c b/net/rds/ib_send.c
index 4e0c36acf866..09c46f2e97fa 100644
--- a/net/rds/ib_send.c
+++ b/net/rds/ib_send.c
@@ -305,8 +305,9 @@ void rds_ib_send_cqe_handler(struct rds_ib_connection *ic, struct ib_wc *wc)
/* We expect errors as the qp is drained during shutdown */
if (wc->status != IB_WC_SUCCESS && rds_conn_up(conn)) {
- rds_ib_conn_error(conn, "send completion on <%pI6c,%pI6c> had status %u (%s), disconnecting and reconnecting\n",
- &conn->c_laddr, &conn->c_faddr, wc->status,
+ rds_ib_conn_error(conn, "send completion on <%pI6c,%pI6c,%d> had status %u (%s), disconnecting and reconnecting\n",
+ &conn->c_laddr, &conn->c_faddr,
+ conn->c_tos, wc->status,
ib_wc_status_msg(wc->status));
}
}
diff --git a/net/rds/rdma_transport.c b/net/rds/rdma_transport.c
index 6b0f57c83a2a..46bce8389066 100644
--- a/net/rds/rdma_transport.c
+++ b/net/rds/rdma_transport.c
@@ -51,6 +51,8 @@ static int rds_rdma_cm_event_handler_cmn(struct rdma_cm_id *cm_id,
struct rds_connection *conn = cm_id->context;
struct rds_transport *trans;
int ret = 0;
+ int *err;
+ u8 len;
rdsdebug("conn %p id %p handling event %u (%s)\n", conn, cm_id,
event->event, rdma_event_msg(event->event));
@@ -81,6 +83,7 @@ static int rds_rdma_cm_event_handler_cmn(struct rdma_cm_id *cm_id,
break;
case RDMA_CM_EVENT_ADDR_RESOLVED:
+ rdma_set_service_type(cm_id, conn->c_tos);
/* XXX do we need to clean up if this fails? */
ret = rdma_resolve_route(cm_id,
RDS_RDMA_RESOLVE_TIMEOUT_MS);
@@ -106,8 +109,19 @@ static int rds_rdma_cm_event_handler_cmn(struct rdma_cm_id *cm_id,
break;
case RDMA_CM_EVENT_REJECTED:
+ if (!conn)
+ break;
+ err = (int *)rdma_consumer_reject_data(cm_id, event, &len);
+ if (!err || (err && ((*err) == RDS_RDMA_REJ_INCOMPAT))) {
+ pr_warn("RDS/RDMA: conn <%pI6c, %pI6c> rejected, dropping connection\n",
+ &conn->c_laddr, &conn->c_faddr);
+ conn->c_proposed_version = RDS_PROTOCOL_COMPAT_VERSION;
+ conn->c_tos = 0;
+ rds_conn_drop(conn);
+ }
rdsdebug("Connection rejected: %s\n",
rdma_reject_msg(cm_id, event->status));
+ break;
/* FALLTHROUGH */
case RDMA_CM_EVENT_ADDR_ERROR:
case RDMA_CM_EVENT_ROUTE_ERROR:
diff --git a/net/rds/rdma_transport.h b/net/rds/rdma_transport.h
index 200d3134aaae..bfafd4a6d827 100644
--- a/net/rds/rdma_transport.h
+++ b/net/rds/rdma_transport.h
@@ -11,6 +11,12 @@
#define RDS_RDMA_RESOLVE_TIMEOUT_MS 5000
+/* Below reject reason is for legacy interoperability issue with non-linux
+ * RDS endpoints where older version incompatibility is conveyed via value 1.
+ * For future version(s), proper encoded reject reason should be be used.
+ */
+#define RDS_RDMA_REJ_INCOMPAT 1
+
int rds_rdma_conn_connect(struct rds_connection *conn);
int rds_rdma_cm_event_handler(struct rdma_cm_id *cm_id,
struct rdma_cm_event *event);
diff --git a/net/rds/rds.h b/net/rds/rds.h
index 4ffe100ff5e6..0d8f67cadd74 100644
--- a/net/rds/rds.h
+++ b/net/rds/rds.h
@@ -19,10 +19,13 @@
*/
#define RDS_PROTOCOL_3_0 0x0300
#define RDS_PROTOCOL_3_1 0x0301
+#define RDS_PROTOCOL_4_0 0x0400
+#define RDS_PROTOCOL_4_1 0x0401
#define RDS_PROTOCOL_VERSION RDS_PROTOCOL_3_1
#define RDS_PROTOCOL_MAJOR(v) ((v) >> 8)
#define RDS_PROTOCOL_MINOR(v) ((v) & 255)
#define RDS_PROTOCOL(maj, min) (((maj) << 8) | min)
+#define RDS_PROTOCOL_COMPAT_VERSION RDS_PROTOCOL_3_1
/* The following ports, 16385, 18634, 18635, are registered with IANA as
* the ports to be used for RDS over TCP and UDP. Currently, only RDS over
@@ -151,9 +154,13 @@ struct rds_connection {
struct rds_cong_map *c_fcong;
/* Protocol version */
+ unsigned int c_proposed_version;
unsigned int c_version;
possible_net_t c_net;
+ /* TOS */
+ u8 c_tos;
+
struct list_head c_map_item;
unsigned long c_map_queued;
@@ -567,6 +574,7 @@ struct rds_transport {
void (*free_mr)(void *trans_private, int invalidate);
void (*flush_mrs)(void);
bool (*t_unloading)(struct rds_connection *conn);
+ u8 (*get_tos_map)(u8 tos);
};
/* Bind hash table key length. It is the sum of the size of a struct
@@ -648,6 +656,7 @@ struct rds_sock {
u8 rs_rx_traces;
u8 rs_rx_trace[RDS_MSG_RX_DGRAM_TRACE_MAX];
struct rds_msg_zcopy_queue rs_zcookie_queue;
+ u8 rs_tos;
};
static inline struct rds_sock *rds_sk_to_rs(const struct sock *sk)
@@ -756,13 +765,14 @@ void rds_conn_exit(void);
struct rds_connection *rds_conn_create(struct net *net,
const struct in6_addr *laddr,
const struct in6_addr *faddr,
- struct rds_transport *trans, gfp_t gfp,
+ struct rds_transport *trans,
+ u8 tos, gfp_t gfp,
int dev_if);
struct rds_connection *rds_conn_create_outgoing(struct net *net,
const struct in6_addr *laddr,
const struct in6_addr *faddr,
struct rds_transport *trans,
- gfp_t gfp, int dev_if);
+ u8 tos, gfp_t gfp, int dev_if);
void rds_conn_shutdown(struct rds_conn_path *cpath);
void rds_conn_destroy(struct rds_connection *conn);
void rds_conn_drop(struct rds_connection *conn);
diff --git a/net/rds/recv.c b/net/rds/recv.c
index 6bb6b16ca270..853de4876088 100644
--- a/net/rds/recv.c
+++ b/net/rds/recv.c
@@ -782,6 +782,7 @@ void rds_inc_info_copy(struct rds_incoming *inc,
minfo.seq = be64_to_cpu(inc->i_hdr.h_sequence);
minfo.len = be32_to_cpu(inc->i_hdr.h_len);
+ minfo.tos = inc->i_conn->c_tos;
if (flip) {
minfo.laddr = daddr;
diff --git a/net/rds/send.c b/net/rds/send.c
index fd8b687d5c05..166dd578c1cc 100644
--- a/net/rds/send.c
+++ b/net/rds/send.c
@@ -1277,12 +1277,13 @@ int rds_sendmsg(struct socket *sock, struct msghdr *msg, size_t payload_len)
/* rds_conn_create has a spinlock that runs with IRQ off.
* Caching the conn in the socket helps a lot. */
- if (rs->rs_conn && ipv6_addr_equal(&rs->rs_conn->c_faddr, &daddr))
+ if (rs->rs_conn && ipv6_addr_equal(&rs->rs_conn->c_faddr, &daddr) &&
+ rs->rs_tos == rs->rs_conn->c_tos) {
conn = rs->rs_conn;
- else {
+ } else {
conn = rds_conn_create_outgoing(sock_net(sock->sk),
&rs->rs_bound_addr, &daddr,
- rs->rs_transport,
+ rs->rs_transport, rs->rs_tos,
sock->sk->sk_allocation,
scope_id);
if (IS_ERR(conn)) {
diff --git a/net/rds/tcp.c b/net/rds/tcp.c
index c16f0a362c32..fd2694174607 100644
--- a/net/rds/tcp.c
+++ b/net/rds/tcp.c
@@ -267,6 +267,7 @@ static void rds_tcp_tc_info(struct socket *rds_sock, unsigned int len,
tsinfo.last_sent_nxt = tc->t_last_sent_nxt;
tsinfo.last_expected_una = tc->t_last_expected_una;
tsinfo.last_seen_una = tc->t_last_seen_una;
+ tsinfo.tos = tc->t_cpath->cp_conn->c_tos;
rds_info_copy(iter, &tsinfo, sizeof(tsinfo));
}
@@ -452,6 +453,12 @@ static void rds_tcp_destroy_conns(void)
static void rds_tcp_exit(void);
+static u8 rds_tcp_get_tos_map(u8 tos)
+{
+ /* all user tos mapped to default 0 for TCP transport */
+ return 0;
+}
+
struct rds_transport rds_tcp_transport = {
.laddr_check = rds_tcp_laddr_check,
.xmit_path_prepare = rds_tcp_xmit_path_prepare,
@@ -466,6 +473,7 @@ struct rds_transport rds_tcp_transport = {
.inc_free = rds_tcp_inc_free,
.stats_info_copy = rds_tcp_stats_info_copy,
.exit = rds_tcp_exit,
+ .get_tos_map = rds_tcp_get_tos_map,
.t_owner = THIS_MODULE,
.t_name = "tcp",
.t_type = RDS_TRANS_TCP,
diff --git a/net/rds/tcp_listen.c b/net/rds/tcp_listen.c
index c12203f646da..810a3a49e947 100644
--- a/net/rds/tcp_listen.c
+++ b/net/rds/tcp_listen.c
@@ -200,7 +200,7 @@ int rds_tcp_accept_one(struct socket *sock)
conn = rds_conn_create(sock_net(sock->sk),
my_addr, peer_addr,
- &rds_tcp_transport, GFP_KERNEL, dev_if);
+ &rds_tcp_transport, 0, GFP_KERNEL, dev_if);
if (IS_ERR(conn)) {
ret = PTR_ERR(conn);
diff --git a/net/rds/threads.c b/net/rds/threads.c
index e64f9e4c3cda..32dc50f0a303 100644
--- a/net/rds/threads.c
+++ b/net/rds/threads.c
@@ -93,6 +93,7 @@ void rds_connect_path_complete(struct rds_conn_path *cp, int curr)
queue_delayed_work(rds_wq, &cp->cp_recv_w, 0);
}
rcu_read_unlock();
+ cp->cp_conn->c_proposed_version = RDS_PROTOCOL_VERSION;
}
EXPORT_SYMBOL_GPL(rds_connect_path_complete);