summaryrefslogtreecommitdiffstats
path: root/net
diff options
context:
space:
mode:
Diffstat (limited to 'net')
-rw-r--r--net/9p/trans_fd.c14
-rw-r--r--net/bluetooth/l2cap_core.c80
-rw-r--r--net/ceph/mon_client.c6
-rw-r--r--net/ceph/osd_client.c5
-rw-r--r--net/core/dev.c25
-rw-r--r--net/core/filter.c51
-rw-r--r--net/core/gen_estimator.c4
-rw-r--r--net/core/netclassid_cgroup.c2
-rw-r--r--net/ipv4/inet_connection_sock.c2
-rw-r--r--net/ipv4/ip_sockglue.c8
-rw-r--r--net/ipv4/ip_tunnel.c6
-rw-r--r--net/ipv4/netfilter/arp_tables.c1
-rw-r--r--net/ipv4/netfilter/ip_tables.c1
-rw-r--r--net/ipv4/tcp_ipv4.c6
-rw-r--r--net/ipv4/tcp_output.c58
-rw-r--r--net/ipv6/ip6_fib.c25
-rw-r--r--net/ipv6/ip6_gre.c21
-rw-r--r--net/ipv6/ip6_tunnel.c6
-rw-r--r--net/ipv6/netfilter/ip6_tables.c1
-rw-r--r--net/ipv6/seg6_local.c4
-rw-r--r--net/ipv6/tcp_ipv6.c6
-rw-r--r--net/ipv6/udp.c1
-rw-r--r--net/netfilter/core.c2
-rw-r--r--net/netfilter/ipvs/ip_vs_proto_sctp.c8
-rw-r--r--net/netfilter/nf_nat_core.c146
-rw-r--r--net/netfilter/xt_hashlimit.c16
-rw-r--r--net/openvswitch/datapath.c3
-rw-r--r--net/sched/act_api.c23
-rw-r--r--net/sched/cls_api.c63
-rw-r--r--net/sched/cls_matchall.c1
-rw-r--r--net/sched/cls_rsvp.h2
-rw-r--r--net/sctp/sctp_diag.c33
-rw-r--r--net/sctp/socket.c40
-rw-r--r--net/sctp/ulpqueue.c3
-rw-r--r--net/sunrpc/backchannel_rqst.c4
-rw-r--r--net/sunrpc/clnt.c12
-rw-r--r--net/sunrpc/svc.c6
-rw-r--r--net/sunrpc/svcsock.c12
-rw-r--r--net/sunrpc/xprt.c57
-rw-r--r--net/sunrpc/xprtrdma/backchannel.c71
-rw-r--r--net/sunrpc/xprtrdma/fmr_ops.c10
-rw-r--r--net/sunrpc/xprtrdma/frwr_ops.c12
-rw-r--r--net/sunrpc/xprtrdma/rpc_rdma.c902
-rw-r--r--net/sunrpc/xprtrdma/svc_rdma_backchannel.c9
-rw-r--r--net/sunrpc/xprtrdma/svc_rdma_rw.c116
-rw-r--r--net/sunrpc/xprtrdma/svc_rdma_transport.c40
-rw-r--r--net/sunrpc/xprtrdma/transport.c8
-rw-r--r--net/sunrpc/xprtrdma/verbs.c22
-rw-r--r--net/sunrpc/xprtrdma/xprt_rdma.h63
-rw-r--r--net/sunrpc/xprtsock.c90
-rw-r--r--net/tls/tls_sw.c2
51 files changed, 1126 insertions, 983 deletions
diff --git a/net/9p/trans_fd.c b/net/9p/trans_fd.c
index ddfa86648f95..903a190319b9 100644
--- a/net/9p/trans_fd.c
+++ b/net/9p/trans_fd.c
@@ -272,6 +272,7 @@ static int p9_fd_read(struct p9_client *client, void *v, int len)
{
int ret;
struct p9_trans_fd *ts = NULL;
+ loff_t pos;
if (client && client->status != Disconnected)
ts = client->trans;
@@ -282,7 +283,8 @@ static int p9_fd_read(struct p9_client *client, void *v, int len)
if (!(ts->rd->f_flags & O_NONBLOCK))
p9_debug(P9_DEBUG_ERROR, "blocking read ...\n");
- ret = kernel_read(ts->rd, ts->rd->f_pos, v, len);
+ pos = ts->rd->f_pos;
+ ret = kernel_read(ts->rd, v, len, &pos);
if (ret <= 0 && ret != -ERESTARTSYS && ret != -EAGAIN)
client->status = Disconnected;
return ret;
@@ -420,8 +422,7 @@ error:
static int p9_fd_write(struct p9_client *client, void *v, int len)
{
- int ret;
- mm_segment_t oldfs;
+ ssize_t ret;
struct p9_trans_fd *ts = NULL;
if (client && client->status != Disconnected)
@@ -433,12 +434,7 @@ static int p9_fd_write(struct p9_client *client, void *v, int len)
if (!(ts->wr->f_flags & O_NONBLOCK))
p9_debug(P9_DEBUG_ERROR, "blocking write ...\n");
- oldfs = get_fs();
- set_fs(get_ds());
- /* The cast to a user pointer is valid due to the set_fs() */
- ret = vfs_write(ts->wr, (__force void __user *)v, len, &ts->wr->f_pos);
- set_fs(oldfs);
-
+ ret = kernel_write(ts->wr, v, len, &ts->wr->f_pos);
if (ret <= 0 && ret != -ERESTARTSYS && ret != -EAGAIN)
client->status = Disconnected;
return ret;
diff --git a/net/bluetooth/l2cap_core.c b/net/bluetooth/l2cap_core.c
index 303c779bfe38..43ba91c440bc 100644
--- a/net/bluetooth/l2cap_core.c
+++ b/net/bluetooth/l2cap_core.c
@@ -58,7 +58,7 @@ static struct sk_buff *l2cap_build_cmd(struct l2cap_conn *conn,
u8 code, u8 ident, u16 dlen, void *data);
static void l2cap_send_cmd(struct l2cap_conn *conn, u8 ident, u8 code, u16 len,
void *data);
-static int l2cap_build_conf_req(struct l2cap_chan *chan, void *data);
+static int l2cap_build_conf_req(struct l2cap_chan *chan, void *data, size_t data_size);
static void l2cap_send_disconn_req(struct l2cap_chan *chan, int err);
static void l2cap_tx(struct l2cap_chan *chan, struct l2cap_ctrl *control,
@@ -1473,7 +1473,7 @@ static void l2cap_conn_start(struct l2cap_conn *conn)
set_bit(CONF_REQ_SENT, &chan->conf_state);
l2cap_send_cmd(conn, l2cap_get_ident(conn), L2CAP_CONF_REQ,
- l2cap_build_conf_req(chan, buf), buf);
+ l2cap_build_conf_req(chan, buf, sizeof(buf)), buf);
chan->num_conf_req++;
}
@@ -2987,12 +2987,15 @@ static inline int l2cap_get_conf_opt(void **ptr, int *type, int *olen,
return len;
}
-static void l2cap_add_conf_opt(void **ptr, u8 type, u8 len, unsigned long val)
+static void l2cap_add_conf_opt(void **ptr, u8 type, u8 len, unsigned long val, size_t size)
{
struct l2cap_conf_opt *opt = *ptr;
BT_DBG("type 0x%2.2x len %u val 0x%lx", type, len, val);
+ if (size < L2CAP_CONF_OPT_SIZE + len)
+ return;
+
opt->type = type;
opt->len = len;
@@ -3017,7 +3020,7 @@ static void l2cap_add_conf_opt(void **ptr, u8 type, u8 len, unsigned long val)
*ptr += L2CAP_CONF_OPT_SIZE + len;
}
-static void l2cap_add_opt_efs(void **ptr, struct l2cap_chan *chan)
+static void l2cap_add_opt_efs(void **ptr, struct l2cap_chan *chan, size_t size)
{
struct l2cap_conf_efs efs;
@@ -3045,7 +3048,7 @@ static void l2cap_add_opt_efs(void **ptr, struct l2cap_chan *chan)
}
l2cap_add_conf_opt(ptr, L2CAP_CONF_EFS, sizeof(efs),
- (unsigned long) &efs);
+ (unsigned long) &efs, size);
}
static void l2cap_ack_timeout(struct work_struct *work)
@@ -3191,11 +3194,12 @@ static inline void l2cap_txwin_setup(struct l2cap_chan *chan)
chan->ack_win = chan->tx_win;
}
-static int l2cap_build_conf_req(struct l2cap_chan *chan, void *data)
+static int l2cap_build_conf_req(struct l2cap_chan *chan, void *data, size_t data_size)
{
struct l2cap_conf_req *req = data;
struct l2cap_conf_rfc rfc = { .mode = chan->mode };
void *ptr = req->data;
+ void *endptr = data + data_size;
u16 size;
BT_DBG("chan %p", chan);
@@ -3220,7 +3224,7 @@ static int l2cap_build_conf_req(struct l2cap_chan *chan, void *data)
done:
if (chan->imtu != L2CAP_DEFAULT_MTU)
- l2cap_add_conf_opt(&ptr, L2CAP_CONF_MTU, 2, chan->imtu);
+ l2cap_add_conf_opt(&ptr, L2CAP_CONF_MTU, 2, chan->imtu, endptr - ptr);
switch (chan->mode) {
case L2CAP_MODE_BASIC:
@@ -3239,7 +3243,7 @@ done:
rfc.max_pdu_size = 0;
l2cap_add_conf_opt(&ptr, L2CAP_CONF_RFC, sizeof(rfc),
- (unsigned long) &rfc);
+ (unsigned long) &rfc, endptr - ptr);
break;
case L2CAP_MODE_ERTM:
@@ -3259,21 +3263,21 @@ done:
L2CAP_DEFAULT_TX_WINDOW);
l2cap_add_conf_opt(&ptr, L2CAP_CONF_RFC, sizeof(rfc),
- (unsigned long) &rfc);
+ (unsigned long) &rfc, endptr - ptr);
if (test_bit(FLAG_EFS_ENABLE, &chan->flags))
- l2cap_add_opt_efs(&ptr, chan);
+ l2cap_add_opt_efs(&ptr, chan, endptr - ptr);
if (test_bit(FLAG_EXT_CTRL, &chan->flags))
l2cap_add_conf_opt(&ptr, L2CAP_CONF_EWS, 2,
- chan->tx_win);
+ chan->tx_win, endptr - ptr);
if (chan->conn->feat_mask & L2CAP_FEAT_FCS)
if (chan->fcs == L2CAP_FCS_NONE ||
test_bit(CONF_RECV_NO_FCS, &chan->conf_state)) {
chan->fcs = L2CAP_FCS_NONE;
l2cap_add_conf_opt(&ptr, L2CAP_CONF_FCS, 1,
- chan->fcs);
+ chan->fcs, endptr - ptr);
}
break;
@@ -3291,17 +3295,17 @@ done:
rfc.max_pdu_size = cpu_to_le16(size);
l2cap_add_conf_opt(&ptr, L2CAP_CONF_RFC, sizeof(rfc),
- (unsigned long) &rfc);
+ (unsigned long) &rfc, endptr - ptr);
if (test_bit(FLAG_EFS_ENABLE, &chan->flags))
- l2cap_add_opt_efs(&ptr, chan);
+ l2cap_add_opt_efs(&ptr, chan, endptr - ptr);
if (chan->conn->feat_mask & L2CAP_FEAT_FCS)
if (chan->fcs == L2CAP_FCS_NONE ||
test_bit(CONF_RECV_NO_FCS, &chan->conf_state)) {
chan->fcs = L2CAP_FCS_NONE;
l2cap_add_conf_opt(&ptr, L2CAP_CONF_FCS, 1,
- chan->fcs);
+ chan->fcs, endptr - ptr);
}
break;
}
@@ -3312,10 +3316,11 @@ done:
return ptr - data;
}
-static int l2cap_parse_conf_req(struct l2cap_chan *chan, void *data)
+static int l2cap_parse_conf_req(struct l2cap_chan *chan, void *data, size_t data_size)
{
struct l2cap_conf_rsp *rsp = data;
void *ptr = rsp->data;
+ void *endptr = data + data_size;
void *req = chan->conf_req;
int len = chan->conf_len;
int type, hint, olen;
@@ -3417,7 +3422,7 @@ done:
return -ECONNREFUSED;
l2cap_add_conf_opt(&ptr, L2CAP_CONF_RFC, sizeof(rfc),
- (unsigned long) &rfc);
+ (unsigned long) &rfc, endptr - ptr);
}
if (result == L2CAP_CONF_SUCCESS) {
@@ -3430,7 +3435,7 @@ done:
chan->omtu = mtu;
set_bit(CONF_MTU_DONE, &chan->conf_state);
}
- l2cap_add_conf_opt(&ptr, L2CAP_CONF_MTU, 2, chan->omtu);
+ l2cap_add_conf_opt(&ptr, L2CAP_CONF_MTU, 2, chan->omtu, endptr - ptr);
if (remote_efs) {
if (chan->local_stype != L2CAP_SERV_NOTRAFIC &&
@@ -3444,7 +3449,7 @@ done:
l2cap_add_conf_opt(&ptr, L2CAP_CONF_EFS,
sizeof(efs),
- (unsigned long) &efs);
+ (unsigned long) &efs, endptr - ptr);
} else {
/* Send PENDING Conf Rsp */
result = L2CAP_CONF_PENDING;
@@ -3477,7 +3482,7 @@ done:
set_bit(CONF_MODE_DONE, &chan->conf_state);
l2cap_add_conf_opt(&ptr, L2CAP_CONF_RFC,
- sizeof(rfc), (unsigned long) &rfc);
+ sizeof(rfc), (unsigned long) &rfc, endptr - ptr);
if (test_bit(FLAG_EFS_ENABLE, &chan->flags)) {
chan->remote_id = efs.id;
@@ -3491,7 +3496,7 @@ done:
le32_to_cpu(efs.sdu_itime);
l2cap_add_conf_opt(&ptr, L2CAP_CONF_EFS,
sizeof(efs),
- (unsigned long) &efs);
+ (unsigned long) &efs, endptr - ptr);
}
break;
@@ -3505,7 +3510,7 @@ done:
set_bit(CONF_MODE_DONE, &chan->conf_state);
l2cap_add_conf_opt(&ptr, L2CAP_CONF_RFC, sizeof(rfc),
- (unsigned long) &rfc);
+ (unsigned long) &rfc, endptr - ptr);
break;
@@ -3527,10 +3532,11 @@ done:
}
static int l2cap_parse_conf_rsp(struct l2cap_chan *chan, void *rsp, int len,
- void *data, u16 *result)
+ void *data, size_t size, u16 *result)
{
struct l2cap_conf_req *req = data;
void *ptr = req->data;
+ void *endptr = data + size;
int type, olen;
unsigned long val;
struct l2cap_conf_rfc rfc = { .mode = L2CAP_MODE_BASIC };
@@ -3548,13 +3554,13 @@ static int l2cap_parse_conf_rsp(struct l2cap_chan *chan, void *rsp, int len,
chan->imtu = L2CAP_DEFAULT_MIN_MTU;
} else
chan->imtu = val;
- l2cap_add_conf_opt(&ptr, L2CAP_CONF_MTU, 2, chan->imtu);
+ l2cap_add_conf_opt(&ptr, L2CAP_CONF_MTU, 2, chan->imtu, endptr - ptr);
break;
case L2CAP_CONF_FLUSH_TO:
chan->flush_to = val;
l2cap_add_conf_opt(&ptr, L2CAP_CONF_FLUSH_TO,
- 2, chan->flush_to);
+ 2, chan->flush_to, endptr - ptr);
break;
case L2CAP_CONF_RFC:
@@ -3568,13 +3574,13 @@ static int l2cap_parse_conf_rsp(struct l2cap_chan *chan, void *rsp, int len,
chan->fcs = 0;
l2cap_add_conf_opt(&ptr, L2CAP_CONF_RFC,
- sizeof(rfc), (unsigned long) &rfc);
+ sizeof(rfc), (unsigned long) &rfc, endptr - ptr);
break;
case L2CAP_CONF_EWS:
chan->ack_win = min_t(u16, val, chan->ack_win);
l2cap_add_conf_opt(&ptr, L2CAP_CONF_EWS, 2,
- chan->tx_win);
+ chan->tx_win, endptr - ptr);
break;
case L2CAP_CONF_EFS:
@@ -3587,7 +3593,7 @@ static int l2cap_parse_conf_rsp(struct l2cap_chan *chan, void *rsp, int len,
return -ECONNREFUSED;
l2cap_add_conf_opt(&ptr, L2CAP_CONF_EFS, sizeof(efs),
- (unsigned long) &efs);
+ (unsigned long) &efs, endptr - ptr);
break;
case L2CAP_CONF_FCS:
@@ -3692,7 +3698,7 @@ void __l2cap_connect_rsp_defer(struct l2cap_chan *chan)
return;
l2cap_send_cmd(conn, l2cap_get_ident(conn), L2CAP_CONF_REQ,
- l2cap_build_conf_req(chan, buf), buf);
+ l2cap_build_conf_req(chan, buf, sizeof(buf)), buf);
chan->num_conf_req++;
}
@@ -3900,7 +3906,7 @@ sendresp:
u8 buf[128];
set_bit(CONF_REQ_SENT, &chan->conf_state);
l2cap_send_cmd(conn, l2cap_get_ident(conn), L2CAP_CONF_REQ,
- l2cap_build_conf_req(chan, buf), buf);
+ l2cap_build_conf_req(chan, buf, sizeof(buf)), buf);
chan->num_conf_req++;
}
@@ -3978,7 +3984,7 @@ static int l2cap_connect_create_rsp(struct l2cap_conn *conn,
break;
l2cap_send_cmd(conn, l2cap_get_ident(conn), L2CAP_CONF_REQ,
- l2cap_build_conf_req(chan, req), req);
+ l2cap_build_conf_req(chan, req, sizeof(req)), req);
chan->num_conf_req++;
break;
@@ -4090,7 +4096,7 @@ static inline int l2cap_config_req(struct l2cap_conn *conn,
}
/* Complete config. */
- len = l2cap_parse_conf_req(chan, rsp);
+ len = l2cap_parse_conf_req(chan, rsp, sizeof(rsp));
if (len < 0) {
l2cap_send_disconn_req(chan, ECONNRESET);
goto unlock;
@@ -4124,7 +4130,7 @@ static inline int l2cap_config_req(struct l2cap_conn *conn,
if (!test_and_set_bit(CONF_REQ_SENT, &chan->conf_state)) {
u8 buf[64];
l2cap_send_cmd(conn, l2cap_get_ident(conn), L2CAP_CONF_REQ,
- l2cap_build_conf_req(chan, buf), buf);
+ l2cap_build_conf_req(chan, buf, sizeof(buf)), buf);
chan->num_conf_req++;
}
@@ -4184,7 +4190,7 @@ static inline int l2cap_config_rsp(struct l2cap_conn *conn,
char buf[64];
len = l2cap_parse_conf_rsp(chan, rsp->data, len,
- buf, &result);
+ buf, sizeof(buf), &result);
if (len < 0) {
l2cap_send_disconn_req(chan, ECONNRESET);
goto done;
@@ -4214,7 +4220,7 @@ static inline int l2cap_config_rsp(struct l2cap_conn *conn,
/* throw out any old stored conf requests */
result = L2CAP_CONF_SUCCESS;
len = l2cap_parse_conf_rsp(chan, rsp->data, len,
- req, &result);
+ req, sizeof(req), &result);
if (len < 0) {
l2cap_send_disconn_req(chan, ECONNRESET);
goto done;
@@ -4791,7 +4797,7 @@ static void l2cap_do_create(struct l2cap_chan *chan, int result,
set_bit(CONF_REQ_SENT, &chan->conf_state);
l2cap_send_cmd(chan->conn, l2cap_get_ident(chan->conn),
L2CAP_CONF_REQ,
- l2cap_build_conf_req(chan, buf), buf);
+ l2cap_build_conf_req(chan, buf, sizeof(buf)), buf);
chan->num_conf_req++;
}
}
@@ -7465,7 +7471,7 @@ static void l2cap_security_cfm(struct hci_conn *hcon, u8 status, u8 encrypt)
set_bit(CONF_REQ_SENT, &chan->conf_state);
l2cap_send_cmd(conn, l2cap_get_ident(conn),
L2CAP_CONF_REQ,
- l2cap_build_conf_req(chan, buf),
+ l2cap_build_conf_req(chan, buf, sizeof(buf)),
buf);
chan->num_conf_req++;
}
diff --git a/net/ceph/mon_client.c b/net/ceph/mon_client.c
index 875675765531..63edc6e5f026 100644
--- a/net/ceph/mon_client.c
+++ b/net/ceph/mon_client.c
@@ -676,7 +676,8 @@ bad:
/*
* Do a synchronous statfs().
*/
-int ceph_monc_do_statfs(struct ceph_mon_client *monc, struct ceph_statfs *buf)
+int ceph_monc_do_statfs(struct ceph_mon_client *monc, u64 data_pool,
+ struct ceph_statfs *buf)
{
struct ceph_mon_generic_request *req;
struct ceph_mon_statfs *h;
@@ -696,6 +697,7 @@ int ceph_monc_do_statfs(struct ceph_mon_client *monc, struct ceph_statfs *buf)
goto out;
req->u.st = buf;
+ req->request->hdr.version = cpu_to_le16(2);
mutex_lock(&monc->mutex);
register_generic_request(req);
@@ -705,6 +707,8 @@ int ceph_monc_do_statfs(struct ceph_mon_client *monc, struct ceph_statfs *buf)
h->monhdr.session_mon = cpu_to_le16(-1);
h->monhdr.session_mon_tid = 0;
h->fsid = monc->monmap->fsid;
+ h->contains_data_pool = (data_pool != CEPH_NOPOOL);
+ h->data_pool = cpu_to_le64(data_pool);
send_generic_request(monc, req);
mutex_unlock(&monc->mutex);
diff --git a/net/ceph/osd_client.c b/net/ceph/osd_client.c
index dcfbdd74dfd1..e02f01f534e2 100644
--- a/net/ceph/osd_client.c
+++ b/net/ceph/osd_client.c
@@ -863,8 +863,6 @@ static u32 osd_req_encode_op(struct ceph_osd_op *dst,
dst->cls.method_len = src->cls.method_len;
dst->cls.indata_len = cpu_to_le32(src->cls.indata_len);
break;
- case CEPH_OSD_OP_STARTSYNC:
- break;
case CEPH_OSD_OP_WATCH:
dst->watch.cookie = cpu_to_le64(src->watch.cookie);
dst->watch.ver = cpu_to_le64(0);
@@ -916,9 +914,6 @@ static u32 osd_req_encode_op(struct ceph_osd_op *dst,
* if the file was recently truncated, we include information about its
* old and new size so that the object can be updated appropriately. (we
* avoid synchronously deleting truncated objects because it's slow.)
- *
- * if @do_sync, include a 'startsync' command so that the osd will flush
- * data quickly.
*/
struct ceph_osd_request *ceph_osdc_new_request(struct ceph_osd_client *osdc,
struct ceph_file_layout *layout,
diff --git a/net/core/dev.c b/net/core/dev.c
index 6f845e4fec17..fb766d906148 100644
--- a/net/core/dev.c
+++ b/net/core/dev.c
@@ -3981,8 +3981,13 @@ static int netif_rx_internal(struct sk_buff *skb)
trace_netif_rx(skb);
if (static_key_false(&generic_xdp_needed)) {
- int ret = do_xdp_generic(rcu_dereference(skb->dev->xdp_prog),
- skb);
+ int ret;
+
+ preempt_disable();
+ rcu_read_lock();
+ ret = do_xdp_generic(rcu_dereference(skb->dev->xdp_prog), skb);
+ rcu_read_unlock();
+ preempt_enable();
/* Consider XDP consuming the packet a success from
* the netdev point of view we do not want to count
@@ -4500,18 +4505,20 @@ static int netif_receive_skb_internal(struct sk_buff *skb)
if (skb_defer_rx_timestamp(skb))
return NET_RX_SUCCESS;
- rcu_read_lock();
-
if (static_key_false(&generic_xdp_needed)) {
- int ret = do_xdp_generic(rcu_dereference(skb->dev->xdp_prog),
- skb);
+ int ret;
- if (ret != XDP_PASS) {
- rcu_read_unlock();
+ preempt_disable();
+ rcu_read_lock();
+ ret = do_xdp_generic(rcu_dereference(skb->dev->xdp_prog), skb);
+ rcu_read_unlock();
+ preempt_enable();
+
+ if (ret != XDP_PASS)
return NET_RX_DROP;
- }
}
+ rcu_read_lock();
#ifdef CONFIG_RPS
if (static_key_false(&rps_needed)) {
struct rps_dev_flow voidflow, *rflow = &voidflow;
diff --git a/net/core/filter.c b/net/core/filter.c
index 5912c738a7b2..24dd33dd9f04 100644
--- a/net/core/filter.c
+++ b/net/core/filter.c
@@ -1794,6 +1794,7 @@ struct redirect_info {
u32 flags;
struct bpf_map *map;
struct bpf_map *map_to_flush;
+ const struct bpf_prog *map_owner;
};
static DEFINE_PER_CPU(struct redirect_info, redirect_info);
@@ -1807,7 +1808,6 @@ BPF_CALL_2(bpf_redirect, u32, ifindex, u64, flags)
ri->ifindex = ifindex;
ri->flags = flags;
- ri->map = NULL;
return TC_ACT_REDIRECT;
}
@@ -2504,13 +2504,21 @@ static int xdp_do_redirect_map(struct net_device *dev, struct xdp_buff *xdp,
struct bpf_prog *xdp_prog)
{
struct redirect_info *ri = this_cpu_ptr(&redirect_info);
+ const struct bpf_prog *map_owner = ri->map_owner;
struct bpf_map *map = ri->map;
+ struct net_device *fwd = NULL;
u32 index = ri->ifindex;
- struct net_device *fwd;
int err;
ri->ifindex = 0;
ri->map = NULL;
+ ri->map_owner = NULL;
+
+ if (unlikely(map_owner != xdp_prog)) {
+ err = -EFAULT;
+ map = NULL;
+ goto err;
+ }
fwd = __dev_map_lookup_elem(map, index);
if (!fwd) {
@@ -2566,13 +2574,27 @@ int xdp_do_generic_redirect(struct net_device *dev, struct sk_buff *skb,
struct bpf_prog *xdp_prog)
{
struct redirect_info *ri = this_cpu_ptr(&redirect_info);
+ const struct bpf_prog *map_owner = ri->map_owner;
+ struct bpf_map *map = ri->map;
+ struct net_device *fwd = NULL;
u32 index = ri->ifindex;
- struct net_device *fwd;
unsigned int len;
int err = 0;
- fwd = dev_get_by_index_rcu(dev_net(dev), index);
ri->ifindex = 0;
+ ri->map = NULL;
+ ri->map_owner = NULL;
+
+ if (map) {
+ if (unlikely(map_owner != xdp_prog)) {
+ err = -EFAULT;
+ map = NULL;
+ goto err;
+ }
+ fwd = __dev_map_lookup_elem(map, index);
+ } else {
+ fwd = dev_get_by_index_rcu(dev_net(dev), index);
+ }
if (unlikely(!fwd)) {
err = -EINVAL;
goto err;
@@ -2590,10 +2612,12 @@ int xdp_do_generic_redirect(struct net_device *dev, struct sk_buff *skb,
}
skb->dev = fwd;
- _trace_xdp_redirect(dev, xdp_prog, index);
+ map ? _trace_xdp_redirect_map(dev, xdp_prog, fwd, map, index)
+ : _trace_xdp_redirect(dev, xdp_prog, index);
return 0;
err:
- _trace_xdp_redirect_err(dev, xdp_prog, index, err);
+ map ? _trace_xdp_redirect_map_err(dev, xdp_prog, fwd, map, index, err)
+ : _trace_xdp_redirect_err(dev, xdp_prog, index, err);
return err;
}
EXPORT_SYMBOL_GPL(xdp_do_generic_redirect);
@@ -2607,6 +2631,8 @@ BPF_CALL_2(bpf_xdp_redirect, u32, ifindex, u64, flags)
ri->ifindex = ifindex;
ri->flags = flags;
+ ri->map = NULL;
+ ri->map_owner = NULL;
return XDP_REDIRECT;
}
@@ -2619,7 +2645,8 @@ static const struct bpf_func_proto bpf_xdp_redirect_proto = {
.arg2_type = ARG_ANYTHING,
};
-BPF_CALL_3(bpf_xdp_redirect_map, struct bpf_map *, map, u32, ifindex, u64, flags)
+BPF_CALL_4(bpf_xdp_redirect_map, struct bpf_map *, map, u32, ifindex, u64, flags,
+ const struct bpf_prog *, map_owner)
{
struct redirect_info *ri = this_cpu_ptr(&redirect_info);
@@ -2629,10 +2656,14 @@ BPF_CALL_3(bpf_xdp_redirect_map, struct bpf_map *, map, u32, ifindex, u64, flags
ri->ifindex = ifindex;
ri->flags = flags;
ri->map = map;
+ ri->map_owner = map_owner;
return XDP_REDIRECT;
}
+/* Note, arg4 is hidden from users and populated by the verifier
+ * with the right pointer.
+ */
static const struct bpf_func_proto bpf_xdp_redirect_map_proto = {
.func = bpf_xdp_redirect_map,
.gpl_only = false,
@@ -3592,7 +3623,11 @@ static bool xdp_is_valid_access(int off, int size,
void bpf_warn_invalid_xdp_action(u32 act)
{
- WARN_ONCE(1, "Illegal XDP return value %u, expect packet loss\n", act);
+ const u32 act_max = XDP_REDIRECT;
+
+ WARN_ONCE(1, "%s XDP return value %u, expect packet loss!\n",
+ act > act_max ? "Illegal" : "Driver unsupported",
+ act);
}
EXPORT_SYMBOL_GPL(bpf_warn_invalid_xdp_action);
diff --git a/net/core/gen_estimator.c b/net/core/gen_estimator.c
index 0385dece1f6f..7c1ffd6f9501 100644
--- a/net/core/gen_estimator.c
+++ b/net/core/gen_estimator.c
@@ -83,10 +83,10 @@ static void est_timer(unsigned long arg)
u64 rate, brate;
est_fetch_counters(est, &b);
- brate = (b.bytes - est->last_bytes) << (8 - est->ewma_log);
+ brate = (b.bytes - est->last_bytes) << (10 - est->ewma_log - est->intvl_log);
brate -= (est->avbps >> est->ewma_log);
- rate = (u64)(b.packets - est->last_packets) << (8 - est->ewma_log);
+ rate = (u64)(b.packets - est->last_packets) << (10 - est->ewma_log - est->intvl_log);
rate -= (est->avpps >> est->ewma_log);
write_seqcount_begin(&est->seq);
diff --git a/net/core/netclassid_cgroup.c b/net/core/netclassid_cgroup.c
index 029a61ac6cdd..5e4f04004a49 100644
--- a/net/core/netclassid_cgroup.c
+++ b/net/core/netclassid_cgroup.c
@@ -100,7 +100,7 @@ static int write_classid(struct cgroup_subsys_state *css, struct cftype *cft,
cs->classid = (u32)value;
- css_task_iter_start(css, &it);
+ css_task_iter_start(css, 0, &it);
while ((p = css_task_iter_next(&it))) {
task_lock(p);
iterate_fd(p->files, 0, update_classid_sock,
diff --git a/net/ipv4/inet_connection_sock.c b/net/ipv4/inet_connection_sock.c
index 4089c013cb03..b9c64b40a83a 100644
--- a/net/ipv4/inet_connection_sock.c
+++ b/net/ipv4/inet_connection_sock.c
@@ -916,7 +916,6 @@ static void inet_child_forget(struct sock *sk, struct request_sock *req,
tcp_sk(child)->fastopen_rsk = NULL;
}
inet_csk_destroy_sock(child);
- reqsk_put(req);
}
struct sock *inet_csk_reqsk_queue_add(struct sock *sk,
@@ -987,6 +986,7 @@ void inet_csk_listen_stop(struct sock *sk)
sock_hold(child);
inet_child_forget(sk, req, child);
+ reqsk_put(req);
bh_unlock_sock(child);
local_bh_enable();
sock_put(child);
diff --git a/net/ipv4/ip_sockglue.c b/net/ipv4/ip_sockglue.c
index e558e4f9597b..a599aa83fdad 100644
--- a/net/ipv4/ip_sockglue.c
+++ b/net/ipv4/ip_sockglue.c
@@ -1207,7 +1207,6 @@ e_inval:
void ipv4_pktinfo_prepare(const struct sock *sk, struct sk_buff *skb)
{
struct in_pktinfo *pktinfo = PKTINFO_SKB_CB(skb);
- bool l3slave = ipv4_l3mdev_skb(IPCB(skb)->flags);
bool prepare = (inet_sk(sk)->cmsg_flags & IP_CMSG_PKTINFO) ||
ipv6_sk_rxinfo(sk);
@@ -1221,8 +1220,13 @@ void ipv4_pktinfo_prepare(const struct sock *sk, struct sk_buff *skb)
* (e.g., process binds socket to eth0 for Tx which is
* redirected to loopback in the rtable/dst).
*/
- if (pktinfo->ipi_ifindex == LOOPBACK_IFINDEX || l3slave)
+ struct rtable *rt = skb_rtable(skb);
+ bool l3slave = ipv4_l3mdev_skb(IPCB(skb)->flags);
+
+ if (pktinfo->ipi_ifindex == LOOPBACK_IFINDEX)
pktinfo->ipi_ifindex = inet_iif(skb);
+ else if (l3slave && rt && rt->rt_iif)
+ pktinfo->ipi_ifindex = rt->rt_iif;
pktinfo->ipi_spec_dst.s_addr = fib_compute_spec_dst(skb);
} else {
diff --git a/net/ipv4/ip_tunnel.c b/net/ipv4/ip_tunnel.c
index 129d1a3616f8..e9805ad664ac 100644
--- a/net/ipv4/ip_tunnel.c
+++ b/net/ipv4/ip_tunnel.c
@@ -176,7 +176,7 @@ skip_key_lookup:
return cand;
t = rcu_dereference(itn->collect_md_tun);
- if (t)
+ if (t && t->dev->flags & IFF_UP)
return t;
if (itn->fb_tunnel_dev && itn->fb_tunnel_dev->flags & IFF_UP)
@@ -618,8 +618,8 @@ void ip_md_tunnel_xmit(struct sk_buff *skb, struct net_device *dev, u8 proto)
ip_rt_put(rt);
goto tx_dropped;
}
- iptunnel_xmit(NULL, rt, skb, fl4.saddr, fl4.daddr, proto, key->tos,
- key->ttl, df, !net_eq(tunnel->net, dev_net(dev)));
+ iptunnel_xmit(NULL, rt, skb, fl4.saddr, fl4.daddr, proto, tos, ttl,
+ df, !net_eq(tunnel->net, dev_net(dev)));
return;
tx_error:
dev->stats.tx_errors++;
diff --git a/net/ipv4/netfilter/arp_tables.c b/net/ipv4/netfilter/arp_tables.c
index e04457198f93..9e2770fd00be 100644
--- a/net/ipv4/netfilter/arp_tables.c
+++ b/net/ipv4/netfilter/arp_tables.c
@@ -629,6 +629,7 @@ static void get_counters(const struct xt_table_info *t,
ADD_COUNTER(counters[i], bcnt, pcnt);
++i;
+ cond_resched();
}
}
}
diff --git a/net/ipv4/netfilter/ip_tables.c b/net/ipv4/netfilter/ip_tables.c
index 576cba2b57e9..39286e543ee6 100644
--- a/net/ipv4/netfilter/ip_tables.c
+++ b/net/ipv4/netfilter/ip_tables.c
@@ -776,6 +776,7 @@ get_counters(const struct xt_table_info *t,
ADD_COUNTER(counters[i], bcnt, pcnt);
++i; /* macro does multi eval of i */
+ cond_resched();
}
}
}
diff --git a/net/ipv4/tcp_ipv4.c b/net/ipv4/tcp_ipv4.c
index a63486afa7a7..d9416b5162bc 100644
--- a/net/ipv4/tcp_ipv4.c
+++ b/net/ipv4/tcp_ipv4.c
@@ -1669,9 +1669,9 @@ process:
*/
sock_hold(sk);
refcounted = true;
- if (tcp_filter(sk, skb))
- goto discard_and_relse;
- nsk = tcp_check_req(sk, skb, req, false);
+ nsk = NULL;
+ if (!tcp_filter(sk, skb))
+ nsk = tcp_check_req(sk, skb, req, false);
if (!nsk) {
reqsk_put(req);
goto discard_and_relse;
diff --git a/net/ipv4/tcp_output.c b/net/ipv4/tcp_output.c
index 5b6690d05abb..517d737059d1 100644
--- a/net/ipv4/tcp_output.c
+++ b/net/ipv4/tcp_output.c
@@ -991,6 +991,7 @@ static int tcp_transmit_skb(struct sock *sk, struct sk_buff *skb, int clone_it,
struct tcp_skb_cb *tcb;
struct tcp_out_options opts;
unsigned int tcp_options_size, tcp_header_size;
+ struct sk_buff *oskb = NULL;
struct tcp_md5sig_key *md5;
struct tcphdr *th;
int err;
@@ -998,12 +999,10 @@ static int tcp_transmit_skb(struct sock *sk, struct sk_buff *skb, int clone_it,
BUG_ON(!skb || !tcp_skb_pcount(skb));
tp = tcp_sk(sk);
- skb->skb_mstamp = tp->tcp_mstamp;
if (clone_it) {
TCP_SKB_CB(skb)->tx.in_flight = TCP_SKB_CB(skb)->end_seq
- tp->snd_una;
- tcp_rate_skb_sent(sk, skb);
-
+ oskb = skb;
if (unlikely(skb_cloned(skb)))
skb = pskb_copy(skb, gfp_mask);
else
@@ -1011,6 +1010,7 @@ static int tcp_transmit_skb(struct sock *sk, struct sk_buff *skb, int clone_it,
if (unlikely(!skb))
return -ENOBUFS;
}
+ skb->skb_mstamp = tp->tcp_mstamp;
inet = inet_sk(sk);
tcb = TCP_SKB_CB(skb);
@@ -1122,12 +1122,15 @@ static int tcp_transmit_skb(struct sock *sk, struct sk_buff *skb, int clone_it,
err = icsk->icsk_af_ops->queue_xmit(sk, skb, &inet->cork.fl);
- if (likely(err <= 0))
- return err;
-
- tcp_enter_cwr(sk);
-
- return net_xmit_eval(err);
+ if (unlikely(err > 0)) {
+ tcp_enter_cwr(sk);
+ err = net_xmit_eval(err);
+ }
+ if (!err && oskb) {
+ oskb->skb_mstamp = tp->tcp_mstamp;
+ tcp_rate_skb_sent(sk, oskb);
+ }
+ return err;
}
/* This routine just queues the buffer for sending.
@@ -1803,40 +1806,6 @@ static bool tcp_snd_wnd_test(const struct tcp_sock *tp,
return !after(end_seq, tcp_wnd_end(tp));
}
-/* This checks if the data bearing packet SKB (usually tcp_send_head(sk))
- * should be put on the wire right now. If so, it returns the number of
- * packets allowed by the congestion window.
- */
-static unsigned int tcp_snd_test(const struct sock *sk, struct sk_buff *skb,
- unsigned int cur_mss, int nonagle)
-{
- const struct tcp_sock *tp = tcp_sk(sk);
- unsigned int cwnd_quota;
-
- tcp_init_tso_segs(skb, cur_mss);
-
- if (!tcp_nagle_test(tp, skb, cur_mss, nonagle))
- return 0;
-
- cwnd_quota = tcp_cwnd_test(tp, skb);
- if (cwnd_quota && !tcp_snd_wnd_test(tp, skb, cur_mss))
- cwnd_quota = 0;
-
- return cwnd_quota;
-}
-
-/* Test if sending is allowed right now. */
-bool tcp_may_send_now(struct sock *sk)
-{
- const struct tcp_sock *tp = tcp_sk(sk);
- struct sk_buff *skb = tcp_send_head(sk);
-
- return skb &&
- tcp_snd_test(sk, skb, tcp_current_mss(sk),
- (tcp_skb_is_last(sk, skb) ?
- tp->nonagle : TCP_NAGLE_PUSH));
-}
-
/* Trim TSO SKB to LEN bytes, put the remaining data into a new packet
* which is put after SKB on the list. It is very much like
* tcp_fragment() except that it may make several kinds of assumptions
@@ -2869,10 +2838,11 @@ int __tcp_retransmit_skb(struct sock *sk, struct sk_buff *skb, int segs)
skb_headroom(skb) >= 0xFFFF)) {
struct sk_buff *nskb;
- skb->skb_mstamp = tp->tcp_mstamp;
nskb = __pskb_copy(skb, MAX_TCP_HEADER, GFP_ATOMIC);
err = nskb ? tcp_transmit_skb(sk, nskb, 0, GFP_ATOMIC) :
-ENOBUFS;
+ if (!err)
+ skb->skb_mstamp = tp->tcp_mstamp;
} else {
err = tcp_transmit_skb(sk, skb, 1, GFP_ATOMIC);
}
diff --git a/net/ipv6/ip6_fib.c b/net/ipv6/ip6_fib.c
index a3b5c163325f..e5308d7cbd75 100644
--- a/net/ipv6/ip6_fib.c
+++ b/net/ipv6/ip6_fib.c
@@ -191,6 +191,12 @@ void rt6_free_pcpu(struct rt6_info *non_pcpu_rt)
}
EXPORT_SYMBOL_GPL(rt6_free_pcpu);
+static void fib6_free_table(struct fib6_table *table)
+{
+ inetpeer_invalidate_tree(&table->tb6_peers);
+ kfree(table);
+}
+
static void fib6_link_table(struct net *net, struct fib6_table *tb)
{
unsigned int h;
@@ -2022,15 +2028,22 @@ out_timer:
static void fib6_net_exit(struct net *net)
{
+ unsigned int i;
+
rt6_ifdown(net, NULL);
del_timer_sync(&net->ipv6.ip6_fib_timer);
-#ifdef CONFIG_IPV6_MULTIPLE_TABLES
- inetpeer_invalidate_tree(&net->ipv6.fib6_local_tbl->tb6_peers);
- kfree(net->ipv6.fib6_local_tbl);
-#endif
- inetpeer_invalidate_tree(&net->ipv6.fib6_main_tbl->tb6_peers);
- kfree(net->ipv6.fib6_main_tbl);
+ for (i = 0; i < FIB6_TABLE_HASHSZ; i++) {
+ struct hlist_head *head = &net->ipv6.fib_table_hash[i];
+ struct hlist_node *tmp;
+ struct fib6_table *tb;
+
+ hlist_for_each_entry_safe(tb, tmp, head, tb6_hlist) {
+ hlist_del(&tb->tb6_hlist);
+ fib6_free_table(tb);
+ }
+ }
+
kfree(net->ipv6.fib_table_hash);
kfree(net->ipv6.rt6_stats);
fib6_notifier_exit(net);
diff --git a/net/ipv6/ip6_gre.c b/net/ipv6/ip6_gre.c
index b7a72d409334..20f66f4c9460 100644
--- a/net/ipv6/ip6_gre.c
+++ b/net/ipv6/ip6_gre.c
@@ -940,24 +940,25 @@ done:
}
static int ip6gre_header(struct sk_buff *skb, struct net_device *dev,
- unsigned short type,
- const void *daddr, const void *saddr, unsigned int len)
+ unsigned short type, const void *daddr,
+ const void *saddr, unsigned int len)
{
struct ip6_tnl *t = netdev_priv(dev);
- struct ipv6hdr *ipv6h = skb_push(skb, t->hlen);
- __be16 *p = (__be16 *)(ipv6h+1);
+ struct ipv6hdr *ipv6h;
+ __be16 *p;
- ip6_flow_hdr(ipv6h, 0,
- ip6_make_flowlabel(dev_net(dev), skb,
- t->fl.u.ip6.flowlabel, true,
- &t->fl.u.ip6));
+ ipv6h = skb_push(skb, t->hlen + sizeof(*ipv6h));
+ ip6_flow_hdr(ipv6h, 0, ip6_make_flowlabel(dev_net(dev), skb,
+ t->fl.u.ip6.flowlabel,
+ true, &t->fl.u.ip6));
ipv6h->hop_limit = t->parms.hop_limit;
ipv6h->nexthdr = NEXTHDR_GRE;
ipv6h->saddr = t->parms.laddr;
ipv6h->daddr = t->parms.raddr;
- p[0] = t->parms.o_flags;
- p[1] = htons(type);
+ p = (__be16 *)(ipv6h + 1);
+ p[0] = t->parms.o_flags;
+ p[1] = htons(type);
/*
* Set the source hardware address.
diff --git a/net/ipv6/ip6_tunnel.c b/net/ipv6/ip6_tunnel.c
index 3a0ba2ae4b0f..f2f21c24915f 100644
--- a/net/ipv6/ip6_tunnel.c
+++ b/net/ipv6/ip6_tunnel.c
@@ -171,7 +171,7 @@ ip6_tnl_lookup(struct net *net, const struct in6_addr *remote, const struct in6_
}
t = rcu_dereference(ip6n->collect_md_tun);
- if (t)
+ if (t && t->dev->flags & IFF_UP)
return t;
t = rcu_dereference(ip6n->tnls_wc[0]);
@@ -1184,6 +1184,7 @@ route_lookup:
init_tel_txopt(&opt, encap_limit);
ipv6_push_frag_opts(skb, &opt.ops, &proto);
}
+ hop_limit = hop_limit ? : ip6_dst_hoplimit(dst);
/* Calculate max headroom for all the headers and adjust
* needed_headroom if necessary.
@@ -2258,6 +2259,9 @@ static int __init ip6_tunnel_init(void)
{
int err;
+ if (!ipv6_mod_enabled())
+ return -EOPNOTSUPP;
+
err = register_pernet_device(&ip6_tnl_net_ops);
if (err < 0)
goto out_pernet;
diff --git a/net/ipv6/netfilter/ip6_tables.c b/net/ipv6/netfilter/ip6_tables.c
index 54b1e75eded1..01bd3ee5ebc6 100644
--- a/net/ipv6/netfilter/ip6_tables.c
+++ b/net/ipv6/netfilter/ip6_tables.c
@@ -795,6 +795,7 @@ get_counters(const struct xt_table_info *t,
ADD_COUNTER(counters[i], bcnt, pcnt);
++i;
+ cond_resched();
}
}
}
diff --git a/net/ipv6/seg6_local.c b/net/ipv6/seg6_local.c
index 7ff54db73a48..825b8e01f947 100644
--- a/net/ipv6/seg6_local.c
+++ b/net/ipv6/seg6_local.c
@@ -72,10 +72,6 @@ static struct ipv6_sr_hdr *get_srh(struct sk_buff *skb)
srh = (struct ipv6_sr_hdr *)(skb->data + srhoff);
- /* make sure it's a Segment Routing header (Routing Type 4) */
- if (srh->type != IPV6_SRCRT_TYPE_4)
- return NULL;
-
len = (srh->hdrlen + 1) << 3;
if (!pskb_may_pull(skb, srhoff + len))
diff --git a/net/ipv6/tcp_ipv6.c b/net/ipv6/tcp_ipv6.c
index 38f76d8b231e..64d94afa427f 100644
--- a/net/ipv6/tcp_ipv6.c
+++ b/net/ipv6/tcp_ipv6.c
@@ -1460,9 +1460,9 @@ process:
}
sock_hold(sk);
refcounted = true;
- if (tcp_filter(sk, skb))
- goto discard_and_relse;
- nsk = tcp_check_req(sk, skb, req, false);
+ nsk = NULL;
+ if (!tcp_filter(sk, skb))
+ nsk = tcp_check_req(sk, skb, req, false);
if (!nsk) {
reqsk_put(req);
goto discard_and_relse;
diff --git a/net/ipv6/udp.c b/net/ipv6/udp.c
index e2ecfb137297..40d7234c27b9 100644
--- a/net/ipv6/udp.c
+++ b/net/ipv6/udp.c
@@ -1015,6 +1015,7 @@ static void udp6_hwcsum_outgoing(struct sock *sk, struct sk_buff *skb,
*/
offset = skb_transport_offset(skb);
skb->csum = skb_checksum(skb, offset, skb->len - offset, 0);
+ csum = skb->csum;
skb->ip_summed = CHECKSUM_NONE;
diff --git a/net/netfilter/core.c b/net/netfilter/core.c
index 04fe25abc5f6..52cd2901a097 100644
--- a/net/netfilter/core.c
+++ b/net/netfilter/core.c
@@ -215,7 +215,7 @@ static void *__nf_hook_entries_try_shrink(struct nf_hook_entries __rcu **pp)
if (skip == hook_entries)
goto out_assign;
- if (WARN_ON(skip == 0))
+ if (skip == 0)
return NULL;
hook_entries -= skip;
diff --git a/net/netfilter/ipvs/ip_vs_proto_sctp.c b/net/netfilter/ipvs/ip_vs_proto_sctp.c
index e1efa446b305..57c8ee66491e 100644
--- a/net/netfilter/ipvs/ip_vs_proto_sctp.c
+++ b/net/netfilter/ipvs/ip_vs_proto_sctp.c
@@ -24,9 +24,13 @@ sctp_conn_schedule(struct netns_ipvs *ipvs, int af, struct sk_buff *skb,
if (sh) {
sch = skb_header_pointer(skb, iph->len + sizeof(_sctph),
sizeof(_schunkh), &_schunkh);
- if (sch && (sch->type == SCTP_CID_INIT ||
- sysctl_sloppy_sctp(ipvs)))
+ if (sch) {
+ if (sch->type == SCTP_CID_ABORT ||
+ !(sysctl_sloppy_sctp(ipvs) ||
+ sch->type == SCTP_CID_INIT))
+ return 1;
ports = &sh->source;
+ }
}
} else {
ports = skb_header_pointer(
diff --git a/net/netfilter/nf_nat_core.c b/net/netfilter/nf_nat_core.c
index 40573aa6c133..f393a7086025 100644
--- a/net/netfilter/nf_nat_core.c
+++ b/net/netfilter/nf_nat_core.c
@@ -30,19 +30,17 @@
#include <net/netfilter/nf_conntrack_zones.h>
#include <linux/netfilter/nf_nat.h>
+static spinlock_t nf_nat_locks[CONNTRACK_LOCKS];
+
static DEFINE_MUTEX(nf_nat_proto_mutex);
static const struct nf_nat_l3proto __rcu *nf_nat_l3protos[NFPROTO_NUMPROTO]
__read_mostly;
static const struct nf_nat_l4proto __rcu **nf_nat_l4protos[NFPROTO_NUMPROTO]
__read_mostly;
-struct nf_nat_conn_key {
- const struct net *net;
- const struct nf_conntrack_tuple *tuple;
- const struct nf_conntrack_zone *zone;
-};
-
-static struct rhltable nf_nat_bysource_table;
+static struct hlist_head *nf_nat_bysource __read_mostly;
+static unsigned int nf_nat_htable_size __read_mostly;
+static unsigned int nf_nat_hash_rnd __read_mostly;
inline const struct nf_nat_l3proto *
__nf_nat_l3proto_find(u8 family)
@@ -118,17 +116,19 @@ int nf_xfrm_me_harder(struct net *net, struct sk_buff *skb, unsigned int family)
EXPORT_SYMBOL(nf_xfrm_me_harder);
#endif /* CONFIG_XFRM */
-static u32 nf_nat_bysource_hash(const void *data, u32 len, u32 seed)
+/* We keep an extra hash for each conntrack, for fast searching. */
+static unsigned int
+hash_by_src(const struct net *n, const struct nf_conntrack_tuple *tuple)
{
- const struct nf_conntrack_tuple *t;
- const struct nf_conn *ct = data;
+ unsigned int hash;
+
+ get_random_once(&nf_nat_hash_rnd, sizeof(nf_nat_hash_rnd));
- t = &ct->tuplehash[IP_CT_DIR_ORIGINAL].tuple;
/* Original src, to ensure we map it consistently if poss. */
+ hash = jhash2((u32 *)&tuple->src, sizeof(tuple->src) / sizeof(u32),
+ tuple->dst.protonum ^ nf_nat_hash_rnd ^ net_hash_mix(n));
- seed ^= net_hash_mix(nf_ct_net(ct));
- return jhash2((const u32 *)&t->src, sizeof(t->src) / sizeof(u32),
- t->dst.protonum ^ seed);
+ return reciprocal_scale(hash, nf_nat_htable_size);
}
/* Is this tuple already taken? (not by us) */
@@ -184,28 +184,6 @@ same_src(const struct nf_conn *ct,
t->src.u.all == tuple->src.u.all);
}
-static int nf_nat_bysource_cmp(struct rhashtable_compare_arg *arg,
- const void *obj)
-{
- const struct nf_nat_conn_key *key = arg->key;
- const struct nf_conn *ct = obj;
-
- if (!same_src(ct, key->tuple) ||
- !net_eq(nf_ct_net(ct), key->net) ||
- !nf_ct_zone_equal(ct, key->zone, IP_CT_DIR_ORIGINAL))
- return 1;
-
- return 0;
-}
-
-static struct rhashtable_params nf_nat_bysource_params = {
- .head_offset = offsetof(struct nf_conn, nat_bysource),
- .obj_hashfn = nf_nat_bysource_hash,
- .obj_cmpfn = nf_nat_bysource_cmp,
- .nelem_hint = 256,
- .min_size = 1024,
-};
-
/* Only called for SRC manip */
static int
find_appropriate_src(struct net *net,
@@ -216,26 +194,22 @@ find_appropriate_src(struct net *net,
struct nf_conntrack_tuple *result,
const struct nf_nat_range *range)
{
+ unsigned int h = hash_by_src(net, tuple);
const struct nf_conn *ct;
- struct nf_nat_conn_key key = {
- .net = net,
- .tuple = tuple,
- .zone = zone
- };
- struct rhlist_head *hl, *h;
-
- hl = rhltable_lookup(&nf_nat_bysource_table, &key,
- nf_nat_bysource_params);
-
- rhl_for_each_entry_rcu(ct, h, hl, nat_bysource) {
- nf_ct_invert_tuplepr(result,
- &ct->tuplehash[IP_CT_DIR_REPLY].tuple);
- result->dst = tuple->dst;
- if (in_range(l3proto, l4proto, result, range))
- return 1;
+ hlist_for_each_entry_rcu(ct, &nf_nat_bysource[h], nat_bysource) {
+ if (same_src(ct, tuple) &&
+ net_eq(net, nf_ct_net(ct)) &&
+ nf_ct_zone_equal(ct, zone, IP_CT_DIR_ORIGINAL)) {
+ /* Copy source part from reply tuple. */
+ nf_ct_invert_tuplepr(result,
+ &ct->tuplehash[IP_CT_DIR_REPLY].tuple);
+ result->dst = tuple->dst;
+
+ if (in_range(l3proto, l4proto, result, range))
+ return 1;
+ }
}
-
return 0;
}
@@ -408,6 +382,7 @@ nf_nat_setup_info(struct nf_conn *ct,
const struct nf_nat_range *range,
enum nf_nat_manip_type maniptype)
{
+ struct net *net = nf_ct_net(ct);
struct nf_conntrack_tuple curr_tuple, new_tuple;
/* Can't setup nat info for confirmed ct. */
@@ -416,7 +391,9 @@ nf_nat_setup_info(struct nf_conn *ct,
WARN_ON(maniptype != NF_NAT_MANIP_SRC &&
maniptype != NF_NAT_MANIP_DST);
- BUG_ON(nf_nat_initialized(ct, maniptype));
+
+ if (WARN_ON(nf_nat_initialized(ct, maniptype)))
+ return NF_DROP;
/* What we've got will look like inverse of reply. Normally
* this is what is in the conntrack, except for prior
@@ -447,19 +424,16 @@ nf_nat_setup_info(struct nf_conn *ct,
}
if (maniptype == NF_NAT_MANIP_SRC) {
- struct nf_nat_conn_key key = {
- .net = nf_ct_net(ct),
- .tuple = &ct->tuplehash[IP_CT_DIR_ORIGINAL].tuple,
- .zone = nf_ct_zone(ct),
- };
- int err;
-
- err = rhltable_insert_key(&nf_nat_bysource_table,
- &key,
- &ct->nat_bysource,
- nf_nat_bysource_params);
- if (err)
- return NF_DROP;
+ unsigned int srchash;
+ spinlock_t *lock;
+
+ srchash = hash_by_src(net,
+ &ct->tuplehash[IP_CT_DIR_ORIGINAL].tuple);
+ lock = &nf_nat_locks[srchash % ARRAY_SIZE(nf_nat_locks)];
+ spin_lock_bh(lock);
+ hlist_add_head_rcu(&ct->nat_bysource,
+ &nf_nat_bysource[srchash]);
+ spin_unlock_bh(lock);
}
/* It's done. */
@@ -553,6 +527,16 @@ static int nf_nat_proto_remove(struct nf_conn *i, void *data)
return i->status & IPS_NAT_MASK ? 1 : 0;
}
+static void __nf_nat_cleanup_conntrack(struct nf_conn *ct)
+{
+ unsigned int h;
+
+ h = hash_by_src(nf_ct_net(ct), &ct->tuplehash[IP_CT_DIR_ORIGINAL].tuple);
+ spin_lock_bh(&nf_nat_locks[h % ARRAY_SIZE(nf_nat_locks)]);
+ hlist_del_rcu(&ct->nat_bysource);
+ spin_unlock_bh(&nf_nat_locks[h % ARRAY_SIZE(nf_nat_locks)]);
+}
+
static int nf_nat_proto_clean(struct nf_conn *ct, void *data)
{
if (nf_nat_proto_remove(ct, data))
@@ -568,8 +552,7 @@ static int nf_nat_proto_clean(struct nf_conn *ct, void *data)
* will delete entry from already-freed table.
*/
clear_bit(IPS_SRC_NAT_DONE_BIT, &ct->status);
- rhltable_remove(&nf_nat_bysource_table, &ct->nat_bysource,
- nf_nat_bysource_params);
+ __nf_nat_cleanup_conntrack(ct);
/* don't delete conntrack. Although that would make things a lot
* simpler, we'd end up flushing all conntracks on nat rmmod.
@@ -698,8 +681,7 @@ EXPORT_SYMBOL_GPL(nf_nat_l3proto_unregister);
static void nf_nat_cleanup_conntrack(struct nf_conn *ct)
{
if (ct->status & IPS_SRC_NAT_DONE)
- rhltable_remove(&nf_nat_bysource_table, &ct->nat_bysource,
- nf_nat_bysource_params);
+ __nf_nat_cleanup_conntrack(ct);
}
static struct nf_ct_ext_type nat_extend __read_mostly = {
@@ -821,19 +803,27 @@ static struct nf_ct_helper_expectfn follow_master_nat = {
static int __init nf_nat_init(void)
{
- int ret;
+ int ret, i;
- ret = rhltable_init(&nf_nat_bysource_table, &nf_nat_bysource_params);
- if (ret)
- return ret;
+ /* Leave them the same for the moment. */
+ nf_nat_htable_size = nf_conntrack_htable_size;
+ if (nf_nat_htable_size < ARRAY_SIZE(nf_nat_locks))
+ nf_nat_htable_size = ARRAY_SIZE(nf_nat_locks);
+
+ nf_nat_bysource = nf_ct_alloc_hashtable(&nf_nat_htable_size, 0);
+ if (!nf_nat_bysource)
+ return -ENOMEM;
ret = nf_ct_extend_register(&nat_extend);
if (ret < 0) {
- rhltable_destroy(&nf_nat_bysource_table);
+ nf_ct_free_hashtable(nf_nat_bysource, nf_nat_htable_size);
printk(KERN_ERR "nf_nat_core: Unable to register extension\n");
return ret;
}
+ for (i = 0; i < ARRAY_SIZE(nf_nat_locks); i++)
+ spin_lock_init(&nf_nat_locks[i]);
+
nf_ct_helper_expectfn_register(&follow_master_nat);
BUG_ON(nfnetlink_parse_nat_setup_hook != NULL);
@@ -863,8 +853,8 @@ static void __exit nf_nat_cleanup(void)
for (i = 0; i < NFPROTO_NUMPROTO; i++)
kfree(nf_nat_l4protos[i]);
-
- rhltable_destroy(&nf_nat_bysource_table);
+ synchronize_net();
+ nf_ct_free_hashtable(nf_nat_bysource, nf_nat_htable_size);
}
MODULE_LICENSE("GPL");
diff --git a/net/netfilter/xt_hashlimit.c b/net/netfilter/xt_hashlimit.c
index 10d48234f5f4..5da8746f7b88 100644
--- a/net/netfilter/xt_hashlimit.c
+++ b/net/netfilter/xt_hashlimit.c
@@ -35,6 +35,7 @@
#include <linux/netfilter_ipv6/ip6_tables.h>
#include <linux/netfilter/xt_hashlimit.h>
#include <linux/mutex.h>
+#include <linux/kernel.h>
MODULE_LICENSE("GPL");
MODULE_AUTHOR("Harald Welte <laforge@netfilter.org>");
@@ -279,7 +280,7 @@ static int htable_create(struct net *net, struct hashlimit_cfg3 *cfg,
size = cfg->size;
} else {
size = (totalram_pages << PAGE_SHIFT) / 16384 /
- sizeof(struct list_head);
+ sizeof(struct hlist_head);
if (totalram_pages > 1024 * 1024 * 1024 / PAGE_SIZE)
size = 8192;
if (size < 16)
@@ -287,7 +288,7 @@ static int htable_create(struct net *net, struct hashlimit_cfg3 *cfg,
}
/* FIXME: don't use vmalloc() here or anywhere else -HW */
hinfo = vmalloc(sizeof(struct xt_hashlimit_htable) +
- sizeof(struct list_head) * size);
+ sizeof(struct hlist_head) * size);
if (hinfo == NULL)
return -ENOMEM;
*out_hinfo = hinfo;
@@ -527,12 +528,12 @@ static u64 user2rate(u64 user)
}
}
-static u64 user2rate_bytes(u64 user)
+static u64 user2rate_bytes(u32 user)
{
u64 r;
- r = user ? 0xFFFFFFFFULL / user : 0xFFFFFFFFULL;
- r = (r - 1) << 4;
+ r = user ? U32_MAX / user : U32_MAX;
+ r = (r - 1) << XT_HASHLIMIT_BYTE_SHIFT;
return r;
}
@@ -588,7 +589,8 @@ static void rateinfo_init(struct dsthash_ent *dh,
dh->rateinfo.prev_window = 0;
dh->rateinfo.current_rate = 0;
if (hinfo->cfg.mode & XT_HASHLIMIT_BYTES) {
- dh->rateinfo.rate = user2rate_bytes(hinfo->cfg.avg);
+ dh->rateinfo.rate =
+ user2rate_bytes((u32)hinfo->cfg.avg);
if (hinfo->cfg.burst)
dh->rateinfo.burst =
hinfo->cfg.burst * dh->rateinfo.rate;
@@ -870,7 +872,7 @@ static int hashlimit_mt_check_common(const struct xt_mtchk_param *par,
/* Check for overflow. */
if (revision >= 3 && cfg->mode & XT_HASHLIMIT_RATE_MATCH) {
- if (cfg->avg == 0) {
+ if (cfg->avg == 0 || cfg->avg > U32_MAX) {
pr_info("hashlimit invalid rate\n");
return -ERANGE;
}
diff --git a/net/openvswitch/datapath.c b/net/openvswitch/datapath.c
index 76cf273a56c7..c3aec6227c91 100644
--- a/net/openvswitch/datapath.c
+++ b/net/openvswitch/datapath.c
@@ -1112,7 +1112,8 @@ static int ovs_nla_init_match_and_action(struct net *net,
if (!a[OVS_FLOW_ATTR_KEY]) {
OVS_NLERR(log,
"Flow key attribute not present in set flow.");
- return -EINVAL;
+ error = -EINVAL;
+ goto error;
}
*acts = get_flow_actions(net, a[OVS_FLOW_ATTR_ACTIONS], key,
diff --git a/net/sched/act_api.c b/net/sched/act_api.c
index a306974e2fb4..da6fa82c98a8 100644
--- a/net/sched/act_api.c
+++ b/net/sched/act_api.c
@@ -53,10 +53,13 @@ static void tcf_action_goto_chain_exec(const struct tc_action *a,
res->goto_tp = rcu_dereference_bh(chain->filter_chain);
}
-static void free_tcf(struct rcu_head *head)
+/* XXX: For standalone actions, we don't need a RCU grace period either, because
+ * actions are always connected to filters and filters are already destroyed in
+ * RCU callbacks, so after a RCU grace period actions are already disconnected
+ * from filters. Readers later can not find us.
+ */
+static void free_tcf(struct tc_action *p)
{
- struct tc_action *p = container_of(head, struct tc_action, tcfa_rcu);
-
free_percpu(p->cpu_bstats);
free_percpu(p->cpu_qstats);
@@ -76,11 +79,7 @@ static void tcf_idr_remove(struct tcf_idrinfo *idrinfo, struct tc_action *p)
idr_remove_ext(&idrinfo->action_idr, p->tcfa_index);
spin_unlock_bh(&idrinfo->lock);
gen_kill_estimator(&p->tcfa_rate_est);
- /*
- * gen_estimator est_timer() might access p->tcfa_lock
- * or bstats, wait a RCU grace period before freeing p
- */
- call_rcu(&p->tcfa_rcu, free_tcf);
+ free_tcf(p);
}
int __tcf_idr_release(struct tc_action *p, bool bind, bool strict)
@@ -181,7 +180,7 @@ static int tcf_del_walker(struct tcf_idrinfo *idrinfo, struct sk_buff *skb,
idr_for_each_entry_ext(idr, p, id) {
ret = __tcf_idr_release(p, false, true);
if (ret == ACT_P_DELETED) {
- module_put(p->ops->owner);
+ module_put(ops->owner);
n_i++;
} else if (ret < 0) {
goto nla_put_failure;
@@ -259,7 +258,7 @@ void tcf_idr_cleanup(struct tc_action *a, struct nlattr *est)
{
if (est)
gen_kill_estimator(&a->tcfa_rate_est);
- call_rcu(&a->tcfa_rcu, free_tcf);
+ free_tcf(a);
}
EXPORT_SYMBOL(tcf_idr_cleanup);
@@ -515,13 +514,15 @@ EXPORT_SYMBOL(tcf_action_exec);
int tcf_action_destroy(struct list_head *actions, int bind)
{
+ const struct tc_action_ops *ops;
struct tc_action *a, *tmp;
int ret = 0;
list_for_each_entry_safe(a, tmp, actions, list) {
+ ops = a->ops;
ret = __tcf_idr_release(a, bind, true);
if (ret == ACT_P_DELETED)
- module_put(a->ops->owner);
+ module_put(ops->owner);
else if (ret < 0)
return ret;
}
diff --git a/net/sched/cls_api.c b/net/sched/cls_api.c
index c743f03cfebd..0b2219adf520 100644
--- a/net/sched/cls_api.c
+++ b/net/sched/cls_api.c
@@ -182,7 +182,7 @@ static struct tcf_chain *tcf_chain_create(struct tcf_block *block,
list_add_tail(&chain->list, &block->chain_list);
chain->block = block;
chain->index = chain_index;
- chain->refcnt = 0;
+ chain->refcnt = 1;
return chain;
}
@@ -194,21 +194,20 @@ static void tcf_chain_flush(struct tcf_chain *chain)
RCU_INIT_POINTER(*chain->p_filter_chain, NULL);
while ((tp = rtnl_dereference(chain->filter_chain)) != NULL) {
RCU_INIT_POINTER(chain->filter_chain, tp->next);
+ tcf_chain_put(chain);
tcf_proto_destroy(tp);
}
}
static void tcf_chain_destroy(struct tcf_chain *chain)
{
- /* May be already removed from the list by the previous call. */
- if (!list_empty(&chain->list))
- list_del_init(&chain->list);
+ list_del(&chain->list);
+ kfree(chain);
+}
- /* There might still be a reference held when we got here from
- * tcf_block_put. Wait for the user to drop reference before free.
- */
- if (!chain->refcnt)
- kfree(chain);
+static void tcf_chain_hold(struct tcf_chain *chain)
+{
+ ++chain->refcnt;
}
struct tcf_chain *tcf_chain_get(struct tcf_block *block, u32 chain_index,
@@ -217,24 +216,19 @@ struct tcf_chain *tcf_chain_get(struct tcf_block *block, u32 chain_index,
struct tcf_chain *chain;
list_for_each_entry(chain, &block->chain_list, list) {
- if (chain->index == chain_index)
- goto incref;
+ if (chain->index == chain_index) {
+ tcf_chain_hold(chain);
+ return chain;
+ }
}
- chain = create ? tcf_chain_create(block, chain_index) : NULL;
-incref:
- if (chain)
- chain->refcnt++;
- return chain;
+ return create ? tcf_chain_create(block, chain_index) : NULL;
}
EXPORT_SYMBOL(tcf_chain_get);
void tcf_chain_put(struct tcf_chain *chain)
{
- /* Destroy unused chain, with exception of chain 0, which is the
- * default one and has to be always present.
- */
- if (--chain->refcnt == 0 && !chain->filter_chain && chain->index != 0)
+ if (--chain->refcnt == 0)
tcf_chain_destroy(chain);
}
EXPORT_SYMBOL(tcf_chain_put);
@@ -279,10 +273,31 @@ void tcf_block_put(struct tcf_block *block)
if (!block)
return;
- list_for_each_entry_safe(chain, tmp, &block->chain_list, list) {
+ /* XXX: Standalone actions are not allowed to jump to any chain, and
+ * bound actions should be all removed after flushing. However,
+ * filters are destroyed in RCU callbacks, we have to hold the chains
+ * first, otherwise we would always race with RCU callbacks on this list
+ * without proper locking.
+ */
+
+ /* Wait for existing RCU callbacks to cool down. */
+ rcu_barrier();
+
+ /* Hold a refcnt for all chains, except 0, in case they are gone. */
+ list_for_each_entry(chain, &block->chain_list, list)
+ if (chain->index)
+ tcf_chain_hold(chain);
+
+ /* No race on the list, because no chain could be destroyed. */
+ list_for_each_entry(chain, &block->chain_list, list)
tcf_chain_flush(chain);
- tcf_chain_destroy(chain);
- }
+
+ /* Wait for RCU callbacks to release the reference count. */
+ rcu_barrier();
+
+ /* At this point, all the chains should have refcnt == 1. */
+ list_for_each_entry_safe(chain, tmp, &block->chain_list, list)
+ tcf_chain_put(chain);
kfree(block);
}
EXPORT_SYMBOL(tcf_block_put);
@@ -360,6 +375,7 @@ static void tcf_chain_tp_insert(struct tcf_chain *chain,
rcu_assign_pointer(*chain->p_filter_chain, tp);
RCU_INIT_POINTER(tp->next, tcf_chain_tp_prev(chain_info));
rcu_assign_pointer(*chain_info->pprev, tp);
+ tcf_chain_hold(chain);
}
static void tcf_chain_tp_remove(struct tcf_chain *chain,
@@ -371,6 +387,7 @@ static void tcf_chain_tp_remove(struct tcf_chain *chain,
if (chain->p_filter_chain && tp == chain->filter_chain)
RCU_INIT_POINTER(*chain->p_filter_chain, next);
RCU_INIT_POINTER(*chain_info->pprev, next);
+ tcf_chain_put(chain);
}
static struct tcf_proto *tcf_chain_tp_find(struct tcf_chain *chain,
diff --git a/net/sched/cls_matchall.c b/net/sched/cls_matchall.c
index 21cc45caf842..eeac606c95ab 100644
--- a/net/sched/cls_matchall.c
+++ b/net/sched/cls_matchall.c
@@ -32,6 +32,7 @@ static int mall_classify(struct sk_buff *skb, const struct tcf_proto *tp,
if (tc_skip_sw(head->flags))
return -1;
+ *res = head->res;
return tcf_exts_exec(skb, &head->exts, res);
}
diff --git a/net/sched/cls_rsvp.h b/net/sched/cls_rsvp.h
index 98c05db85bcb..b1f6ed48bc72 100644
--- a/net/sched/cls_rsvp.h
+++ b/net/sched/cls_rsvp.h
@@ -389,7 +389,7 @@ static unsigned int gen_handle(struct tcf_proto *tp, unsigned salt)
if ((data->hgenerator += 0x10000) == 0)
data->hgenerator = 0x10000;
h = data->hgenerator|salt;
- if (rsvp_get(tp, h) == 0)
+ if (!rsvp_get(tp, h))
return h;
}
return 0;
diff --git a/net/sctp/sctp_diag.c b/net/sctp/sctp_diag.c
index e99518e79b52..22ed01a76b19 100644
--- a/net/sctp/sctp_diag.c
+++ b/net/sctp/sctp_diag.c
@@ -279,9 +279,11 @@ out:
return err;
}
-static int sctp_sock_dump(struct sock *sk, void *p)
+static int sctp_sock_dump(struct sctp_transport *tsp, void *p)
{
+ struct sctp_endpoint *ep = tsp->asoc->ep;
struct sctp_comm_param *commp = p;
+ struct sock *sk = ep->base.sk;
struct sk_buff *skb = commp->skb;
struct netlink_callback *cb = commp->cb;
const struct inet_diag_req_v2 *r = commp->r;
@@ -289,9 +291,7 @@ static int sctp_sock_dump(struct sock *sk, void *p)
int err = 0;
lock_sock(sk);
- if (!sctp_sk(sk)->ep)
- goto release;
- list_for_each_entry(assoc, &sctp_sk(sk)->ep->asocs, asocs) {
+ list_for_each_entry(assoc, &ep->asocs, asocs) {
if (cb->args[4] < cb->args[1])
goto next;
@@ -309,7 +309,6 @@ static int sctp_sock_dump(struct sock *sk, void *p)
cb->nlh->nlmsg_seq,
NLM_F_MULTI, cb->nlh,
commp->net_admin) < 0) {
- cb->args[3] = 1;
err = 1;
goto release;
}
@@ -327,40 +326,30 @@ next:
cb->args[4]++;
}
cb->args[1] = 0;
- cb->args[2]++;
cb->args[3] = 0;
cb->args[4] = 0;
release:
release_sock(sk);
- sock_put(sk);
return err;
}
-static int sctp_get_sock(struct sctp_transport *tsp, void *p)
+static int sctp_sock_filter(struct sctp_transport *tsp, void *p)
{
struct sctp_endpoint *ep = tsp->asoc->ep;
struct sctp_comm_param *commp = p;
struct sock *sk = ep->base.sk;
- struct netlink_callback *cb = commp->cb;
const struct inet_diag_req_v2 *r = commp->r;
struct sctp_association *assoc =
list_entry(ep->asocs.next, struct sctp_association, asocs);
/* find the ep only once through the transports by this condition */
if (tsp->asoc != assoc)
- goto out;
+ return 0;
if (r->sdiag_family != AF_UNSPEC && sk->sk_family != r->sdiag_family)
- goto out;
-
- sock_hold(sk);
- cb->args[5] = (long)sk;
+ return 0;
return 1;
-
-out:
- cb->args[2]++;
- return 0;
}
static int sctp_ep_dump(struct sctp_endpoint *ep, void *p)
@@ -503,12 +492,8 @@ skip:
if (!(idiag_states & ~(TCPF_LISTEN | TCPF_CLOSE)))
goto done;
-next:
- cb->args[5] = 0;
- sctp_for_each_transport(sctp_get_sock, net, cb->args[2], &commp);
-
- if (cb->args[5] && !sctp_sock_dump((struct sock *)cb->args[5], &commp))
- goto next;
+ sctp_for_each_transport(sctp_sock_filter, sctp_sock_dump,
+ net, (int *)&cb->args[2], &commp);
done:
cb->args[1] = cb->args[4];
diff --git a/net/sctp/socket.c b/net/sctp/socket.c
index 1b00a1e09b93..d4730ada7f32 100644
--- a/net/sctp/socket.c
+++ b/net/sctp/socket.c
@@ -4658,29 +4658,39 @@ int sctp_transport_lookup_process(int (*cb)(struct sctp_transport *, void *),
EXPORT_SYMBOL_GPL(sctp_transport_lookup_process);
int sctp_for_each_transport(int (*cb)(struct sctp_transport *, void *),
- struct net *net, int pos, void *p) {
+ int (*cb_done)(struct sctp_transport *, void *),
+ struct net *net, int *pos, void *p) {
struct rhashtable_iter hti;
- void *obj;
- int err;
-
- err = sctp_transport_walk_start(&hti);
- if (err)
- return err;
+ struct sctp_transport *tsp;
+ int ret;
- obj = sctp_transport_get_idx(net, &hti, pos + 1);
- for (; !IS_ERR_OR_NULL(obj); obj = sctp_transport_get_next(net, &hti)) {
- struct sctp_transport *transport = obj;
+again:
+ ret = sctp_transport_walk_start(&hti);
+ if (ret)
+ return ret;
- if (!sctp_transport_hold(transport))
+ tsp = sctp_transport_get_idx(net, &hti, *pos + 1);
+ for (; !IS_ERR_OR_NULL(tsp); tsp = sctp_transport_get_next(net, &hti)) {
+ if (!sctp_transport_hold(tsp))
continue;
- err = cb(transport, p);
- sctp_transport_put(transport);
- if (err)
+ ret = cb(tsp, p);
+ if (ret)
break;
+ (*pos)++;
+ sctp_transport_put(tsp);
}
sctp_transport_walk_stop(&hti);
- return err;
+ if (ret) {
+ if (cb_done && !cb_done(tsp, p)) {
+ (*pos)++;
+ sctp_transport_put(tsp);
+ goto again;
+ }
+ sctp_transport_put(tsp);
+ }
+
+ return ret;
}
EXPORT_SYMBOL_GPL(sctp_for_each_transport);
diff --git a/net/sctp/ulpqueue.c b/net/sctp/ulpqueue.c
index 0225d62a869f..a71be33f3afe 100644
--- a/net/sctp/ulpqueue.c
+++ b/net/sctp/ulpqueue.c
@@ -265,7 +265,8 @@ int sctp_ulpq_tail_event(struct sctp_ulpq *ulpq, struct sctp_ulpevent *event)
sctp_ulpq_clear_pd(ulpq);
if (queue == &sk->sk_receive_queue && !sp->data_ready_signalled) {
- sp->data_ready_signalled = 1;
+ if (!sock_owned_by_user(sk))
+ sp->data_ready_signalled = 1;
sk->sk_data_ready(sk);
}
return 1;
diff --git a/net/sunrpc/backchannel_rqst.c b/net/sunrpc/backchannel_rqst.c
index ac701c28f44f..c2c68a15b59d 100644
--- a/net/sunrpc/backchannel_rqst.c
+++ b/net/sunrpc/backchannel_rqst.c
@@ -171,10 +171,10 @@ int xprt_setup_bc(struct rpc_xprt *xprt, unsigned int min_reqs)
/*
* Add the temporary list to the backchannel preallocation list
*/
- spin_lock_bh(&xprt->bc_pa_lock);
+ spin_lock(&xprt->bc_pa_lock);
list_splice(&tmp_list, &xprt->bc_pa_list);
xprt_inc_alloc_count(xprt, min_reqs);
- spin_unlock_bh(&xprt->bc_pa_lock);
+ spin_unlock(&xprt->bc_pa_lock);
dprintk("RPC: setup backchannel transport done\n");
return 0;
diff --git a/net/sunrpc/clnt.c b/net/sunrpc/clnt.c
index 2e49d1f892b7..2ad827db2704 100644
--- a/net/sunrpc/clnt.c
+++ b/net/sunrpc/clnt.c
@@ -1903,6 +1903,14 @@ call_connect_status(struct rpc_task *task)
task->tk_status = 0;
switch (status) {
case -ECONNREFUSED:
+ /* A positive refusal suggests a rebind is needed. */
+ if (RPC_IS_SOFTCONN(task))
+ break;
+ if (clnt->cl_autobind) {
+ rpc_force_rebind(clnt);
+ task->tk_action = call_bind;
+ return;
+ }
case -ECONNRESET:
case -ECONNABORTED:
case -ENETUNREACH:
@@ -2139,10 +2147,6 @@ call_status(struct rpc_task *task)
rpc_delay(task, 3*HZ);
case -ETIMEDOUT:
task->tk_action = call_timeout;
- if (!(task->tk_flags & RPC_TASK_NO_RETRANS_TIMEOUT)
- && task->tk_client->cl_discrtry)
- xprt_conditional_disconnect(req->rq_xprt,
- req->rq_connect_cookie);
break;
case -ECONNREFUSED:
case -ECONNRESET:
diff --git a/net/sunrpc/svc.c b/net/sunrpc/svc.c
index 85ce0db5b0a6..aa04666f929d 100644
--- a/net/sunrpc/svc.c
+++ b/net/sunrpc/svc.c
@@ -421,7 +421,7 @@ __svc_init_bc(struct svc_serv *serv)
*/
static struct svc_serv *
__svc_create(struct svc_program *prog, unsigned int bufsize, int npools,
- struct svc_serv_ops *ops)
+ const struct svc_serv_ops *ops)
{
struct svc_serv *serv;
unsigned int vers;
@@ -486,7 +486,7 @@ __svc_create(struct svc_program *prog, unsigned int bufsize, int npools,
struct svc_serv *
svc_create(struct svc_program *prog, unsigned int bufsize,
- struct svc_serv_ops *ops)
+ const struct svc_serv_ops *ops)
{
return __svc_create(prog, bufsize, /*npools*/1, ops);
}
@@ -494,7 +494,7 @@ EXPORT_SYMBOL_GPL(svc_create);
struct svc_serv *
svc_create_pooled(struct svc_program *prog, unsigned int bufsize,
- struct svc_serv_ops *ops)
+ const struct svc_serv_ops *ops)
{
struct svc_serv *serv;
unsigned int npools = svc_pool_map_get();
diff --git a/net/sunrpc/svcsock.c b/net/sunrpc/svcsock.c
index e18500151236..ff8e06cd067e 100644
--- a/net/sunrpc/svcsock.c
+++ b/net/sunrpc/svcsock.c
@@ -693,7 +693,7 @@ static struct svc_xprt *svc_udp_create(struct svc_serv *serv,
return svc_create_socket(serv, IPPROTO_UDP, net, sa, salen, flags);
}
-static struct svc_xprt_ops svc_udp_ops = {
+static const struct svc_xprt_ops svc_udp_ops = {
.xpo_create = svc_udp_create,
.xpo_recvfrom = svc_udp_recvfrom,
.xpo_sendto = svc_udp_sendto,
@@ -1013,7 +1013,7 @@ static int receive_cb_reply(struct svc_sock *svsk, struct svc_rqst *rqstp)
if (!bc_xprt)
return -EAGAIN;
- spin_lock_bh(&bc_xprt->transport_lock);
+ spin_lock(&bc_xprt->recv_lock);
req = xprt_lookup_rqst(bc_xprt, xid);
if (!req)
goto unlock_notfound;
@@ -1031,7 +1031,7 @@ static int receive_cb_reply(struct svc_sock *svsk, struct svc_rqst *rqstp)
memcpy(dst->iov_base, src->iov_base, src->iov_len);
xprt_complete_rqst(req->rq_task, rqstp->rq_arg.len);
rqstp->rq_arg.len = 0;
- spin_unlock_bh(&bc_xprt->transport_lock);
+ spin_unlock(&bc_xprt->recv_lock);
return 0;
unlock_notfound:
printk(KERN_NOTICE
@@ -1040,7 +1040,7 @@ unlock_notfound:
__func__, ntohl(calldir),
bc_xprt, ntohl(xid));
unlock_eagain:
- spin_unlock_bh(&bc_xprt->transport_lock);
+ spin_unlock(&bc_xprt->recv_lock);
return -EAGAIN;
}
@@ -1241,7 +1241,7 @@ static void svc_bc_tcp_sock_detach(struct svc_xprt *xprt)
{
}
-static struct svc_xprt_ops svc_tcp_bc_ops = {
+static const struct svc_xprt_ops svc_tcp_bc_ops = {
.xpo_create = svc_bc_tcp_create,
.xpo_detach = svc_bc_tcp_sock_detach,
.xpo_free = svc_bc_sock_free,
@@ -1275,7 +1275,7 @@ static void svc_cleanup_bc_xprt_sock(void)
}
#endif /* CONFIG_SUNRPC_BACKCHANNEL */
-static struct svc_xprt_ops svc_tcp_ops = {
+static const struct svc_xprt_ops svc_tcp_ops = {
.xpo_create = svc_tcp_create,
.xpo_recvfrom = svc_tcp_recvfrom,
.xpo_sendto = svc_tcp_sendto,
diff --git a/net/sunrpc/xprt.c b/net/sunrpc/xprt.c
index 4654a9934269..e741ec2b4d8e 100644
--- a/net/sunrpc/xprt.c
+++ b/net/sunrpc/xprt.c
@@ -844,6 +844,50 @@ struct rpc_rqst *xprt_lookup_rqst(struct rpc_xprt *xprt, __be32 xid)
}
EXPORT_SYMBOL_GPL(xprt_lookup_rqst);
+/**
+ * xprt_pin_rqst - Pin a request on the transport receive list
+ * @req: Request to pin
+ *
+ * Caller must ensure this is atomic with the call to xprt_lookup_rqst()
+ * so should be holding the xprt transport lock.
+ */
+void xprt_pin_rqst(struct rpc_rqst *req)
+{
+ set_bit(RPC_TASK_MSG_RECV, &req->rq_task->tk_runstate);
+}
+EXPORT_SYMBOL_GPL(xprt_pin_rqst);
+
+/**
+ * xprt_unpin_rqst - Unpin a request on the transport receive list
+ * @req: Request to pin
+ *
+ * Caller should be holding the xprt transport lock.
+ */
+void xprt_unpin_rqst(struct rpc_rqst *req)
+{
+ struct rpc_task *task = req->rq_task;
+
+ clear_bit(RPC_TASK_MSG_RECV, &task->tk_runstate);
+ if (test_bit(RPC_TASK_MSG_RECV_WAIT, &task->tk_runstate))
+ wake_up_bit(&task->tk_runstate, RPC_TASK_MSG_RECV);
+}
+EXPORT_SYMBOL_GPL(xprt_unpin_rqst);
+
+static void xprt_wait_on_pinned_rqst(struct rpc_rqst *req)
+__must_hold(&req->rq_xprt->recv_lock)
+{
+ struct rpc_task *task = req->rq_task;
+
+ if (task && test_bit(RPC_TASK_MSG_RECV, &task->tk_runstate)) {
+ spin_unlock(&req->rq_xprt->recv_lock);
+ set_bit(RPC_TASK_MSG_RECV_WAIT, &task->tk_runstate);
+ wait_on_bit(&task->tk_runstate, RPC_TASK_MSG_RECV,
+ TASK_UNINTERRUPTIBLE);
+ clear_bit(RPC_TASK_MSG_RECV_WAIT, &task->tk_runstate);
+ spin_lock(&req->rq_xprt->recv_lock);
+ }
+}
+
static void xprt_update_rtt(struct rpc_task *task)
{
struct rpc_rqst *req = task->tk_rqstp;
@@ -966,13 +1010,13 @@ void xprt_transmit(struct rpc_task *task)
/*
* Add to the list only if we're expecting a reply
*/
- spin_lock_bh(&xprt->transport_lock);
/* Update the softirq receive buffer */
memcpy(&req->rq_private_buf, &req->rq_rcv_buf,
sizeof(req->rq_private_buf));
/* Add request to the receive list */
+ spin_lock(&xprt->recv_lock);
list_add_tail(&req->rq_list, &xprt->recv);
- spin_unlock_bh(&xprt->transport_lock);
+ spin_unlock(&xprt->recv_lock);
xprt_reset_majortimeo(req);
/* Turn off autodisconnect */
del_singleshot_timer_sync(&xprt->timer);
@@ -1287,12 +1331,16 @@ void xprt_release(struct rpc_task *task)
task->tk_ops->rpc_count_stats(task, task->tk_calldata);
else if (task->tk_client)
rpc_count_iostats(task, task->tk_client->cl_metrics);
+ spin_lock(&xprt->recv_lock);
+ if (!list_empty(&req->rq_list)) {
+ list_del(&req->rq_list);
+ xprt_wait_on_pinned_rqst(req);
+ }
+ spin_unlock(&xprt->recv_lock);
spin_lock_bh(&xprt->transport_lock);
xprt->ops->release_xprt(xprt, task);
if (xprt->ops->release_request)
xprt->ops->release_request(task);
- if (!list_empty(&req->rq_list))
- list_del(&req->rq_list);
xprt->last_used = jiffies;
xprt_schedule_autodisconnect(xprt);
spin_unlock_bh(&xprt->transport_lock);
@@ -1318,6 +1366,7 @@ static void xprt_init(struct rpc_xprt *xprt, struct net *net)
spin_lock_init(&xprt->transport_lock);
spin_lock_init(&xprt->reserve_lock);
+ spin_lock_init(&xprt->recv_lock);
INIT_LIST_HEAD(&xprt->free);
INIT_LIST_HEAD(&xprt->recv);
diff --git a/net/sunrpc/xprtrdma/backchannel.c b/net/sunrpc/xprtrdma/backchannel.c
index 03f6b5840764..d31d0ac5ada9 100644
--- a/net/sunrpc/xprtrdma/backchannel.c
+++ b/net/sunrpc/xprtrdma/backchannel.c
@@ -49,6 +49,7 @@ static int rpcrdma_bc_setup_rqst(struct rpcrdma_xprt *r_xprt,
if (IS_ERR(rb))
goto out_fail;
req->rl_rdmabuf = rb;
+ xdr_buf_init(&req->rl_hdrbuf, rb->rg_base, rdmab_length(rb));
size = r_xprt->rx_data.inline_rsize;
rb = rpcrdma_alloc_regbuf(size, DMA_TO_DEVICE, GFP_KERNEL);
@@ -202,20 +203,24 @@ size_t xprt_rdma_bc_maxpayload(struct rpc_xprt *xprt)
*/
int rpcrdma_bc_marshal_reply(struct rpc_rqst *rqst)
{
- struct rpc_xprt *xprt = rqst->rq_xprt;
- struct rpcrdma_xprt *r_xprt = rpcx_to_rdmax(xprt);
+ struct rpcrdma_xprt *r_xprt = rpcx_to_rdmax(rqst->rq_xprt);
struct rpcrdma_req *req = rpcr_to_rdmar(rqst);
- struct rpcrdma_msg *headerp;
-
- headerp = rdmab_to_msg(req->rl_rdmabuf);
- headerp->rm_xid = rqst->rq_xid;
- headerp->rm_vers = rpcrdma_version;
- headerp->rm_credit =
- cpu_to_be32(r_xprt->rx_buf.rb_bc_srv_max_requests);
- headerp->rm_type = rdma_msg;
- headerp->rm_body.rm_chunks[0] = xdr_zero;
- headerp->rm_body.rm_chunks[1] = xdr_zero;
- headerp->rm_body.rm_chunks[2] = xdr_zero;
+ __be32 *p;
+
+ rpcrdma_set_xdrlen(&req->rl_hdrbuf, 0);
+ xdr_init_encode(&req->rl_stream, &req->rl_hdrbuf,
+ req->rl_rdmabuf->rg_base);
+
+ p = xdr_reserve_space(&req->rl_stream, 28);
+ if (unlikely(!p))
+ return -EIO;
+ *p++ = rqst->rq_xid;
+ *p++ = rpcrdma_version;
+ *p++ = cpu_to_be32(r_xprt->rx_buf.rb_bc_srv_max_requests);
+ *p++ = rdma_msg;
+ *p++ = xdr_zero;
+ *p++ = xdr_zero;
+ *p = xdr_zero;
if (!rpcrdma_prepare_send_sges(&r_xprt->rx_ia, req, RPCRDMA_HDRLEN_MIN,
&rqst->rq_snd_buf, rpcrdma_noch))
@@ -271,9 +276,6 @@ void xprt_rdma_bc_free_rqst(struct rpc_rqst *rqst)
* @xprt: transport receiving the call
* @rep: receive buffer containing the call
*
- * Called in the RPC reply handler, which runs in a tasklet.
- * Be quick about it.
- *
* Operational assumptions:
* o Backchannel credits are ignored, just as the NFS server
* forechannel currently does
@@ -284,7 +286,6 @@ void rpcrdma_bc_receive_call(struct rpcrdma_xprt *r_xprt,
struct rpcrdma_rep *rep)
{
struct rpc_xprt *xprt = &r_xprt->rx_xprt;
- struct rpcrdma_msg *headerp;
struct svc_serv *bc_serv;
struct rpcrdma_req *req;
struct rpc_rqst *rqst;
@@ -292,24 +293,15 @@ void rpcrdma_bc_receive_call(struct rpcrdma_xprt *r_xprt,
size_t size;
__be32 *p;
- headerp = rdmab_to_msg(rep->rr_rdmabuf);
+ p = xdr_inline_decode(&rep->rr_stream, 0);
+ size = xdr_stream_remaining(&rep->rr_stream);
+
#ifdef RPCRDMA_BACKCHANNEL_DEBUG
pr_info("RPC: %s: callback XID %08x, length=%u\n",
- __func__, be32_to_cpu(headerp->rm_xid), rep->rr_len);
- pr_info("RPC: %s: %*ph\n", __func__, rep->rr_len, headerp);
+ __func__, be32_to_cpup(p), size);
+ pr_info("RPC: %s: %*ph\n", __func__, size, p);
#endif
- /* Sanity check:
- * Need at least enough bytes for RPC/RDMA header, as code
- * here references the header fields by array offset. Also,
- * backward calls are always inline, so ensure there
- * are some bytes beyond the RPC/RDMA header.
- */
- if (rep->rr_len < RPCRDMA_HDRLEN_MIN + 24)
- goto out_short;
- p = (__be32 *)((unsigned char *)headerp + RPCRDMA_HDRLEN_MIN);
- size = rep->rr_len - RPCRDMA_HDRLEN_MIN;
-
/* Grab a free bc rqst */
spin_lock(&xprt->bc_pa_lock);
if (list_empty(&xprt->bc_pa_list)) {
@@ -325,7 +317,7 @@ void rpcrdma_bc_receive_call(struct rpcrdma_xprt *r_xprt,
/* Prepare rqst */
rqst->rq_reply_bytes_recvd = 0;
rqst->rq_bytes_sent = 0;
- rqst->rq_xid = headerp->rm_xid;
+ rqst->rq_xid = *p;
rqst->rq_private_buf.len = size;
set_bit(RPC_BC_PA_IN_USE, &rqst->rq_bc_pa_state);
@@ -337,9 +329,9 @@ void rpcrdma_bc_receive_call(struct rpcrdma_xprt *r_xprt,
buf->len = size;
/* The receive buffer has to be hooked to the rpcrdma_req
- * so that it can be reposted after the server is done
- * parsing it but just before sending the backward
- * direction reply.
+ * so that it is not released while the req is pointing
+ * to its buffer, and so that it can be reposted after
+ * the Upper Layer is done decoding it.
*/
req = rpcr_to_rdmar(rqst);
dprintk("RPC: %s: attaching rep %p to req %p\n",
@@ -367,13 +359,4 @@ out_overflow:
* when the connection is re-established.
*/
return;
-
-out_short:
- pr_warn("RPC/RDMA short backward direction call\n");
-
- if (rpcrdma_ep_post_recv(&r_xprt->rx_ia, rep))
- xprt_disconnect_done(xprt);
- else
- pr_warn("RPC: %s: reposting rep %p\n",
- __func__, rep);
}
diff --git a/net/sunrpc/xprtrdma/fmr_ops.c b/net/sunrpc/xprtrdma/fmr_ops.c
index d3f84bb1d443..6c7151341194 100644
--- a/net/sunrpc/xprtrdma/fmr_ops.c
+++ b/net/sunrpc/xprtrdma/fmr_ops.c
@@ -177,7 +177,7 @@ fmr_op_maxpages(struct rpcrdma_xprt *r_xprt)
/* Use the ib_map_phys_fmr() verb to register a memory region
* for remote access via RDMA READ or RDMA WRITE.
*/
-static int
+static struct rpcrdma_mr_seg *
fmr_op_map(struct rpcrdma_xprt *r_xprt, struct rpcrdma_mr_seg *seg,
int nsegs, bool writing, struct rpcrdma_mw **out)
{
@@ -188,7 +188,7 @@ fmr_op_map(struct rpcrdma_xprt *r_xprt, struct rpcrdma_mr_seg *seg,
mw = rpcrdma_get_mw(r_xprt);
if (!mw)
- return -ENOBUFS;
+ return ERR_PTR(-ENOBUFS);
pageoff = offset_in_page(seg1->mr_offset);
seg1->mr_offset -= pageoff; /* start of page */
@@ -232,13 +232,13 @@ fmr_op_map(struct rpcrdma_xprt *r_xprt, struct rpcrdma_mr_seg *seg,
mw->mw_offset = dma_pages[0] + pageoff;
*out = mw;
- return mw->mw_nents;
+ return seg;
out_dmamap_err:
pr_err("rpcrdma: failed to DMA map sg %p sg_nents %d\n",
mw->mw_sg, i);
rpcrdma_put_mw(r_xprt, mw);
- return -EIO;
+ return ERR_PTR(-EIO);
out_maperr:
pr_err("rpcrdma: ib_map_phys_fmr %u@0x%llx+%i (%d) status %i\n",
@@ -247,7 +247,7 @@ out_maperr:
ib_dma_unmap_sg(r_xprt->rx_ia.ri_device,
mw->mw_sg, mw->mw_nents, mw->mw_dir);
rpcrdma_put_mw(r_xprt, mw);
- return -EIO;
+ return ERR_PTR(-EIO);
}
/* Invalidate all memory regions that were registered for "req".
diff --git a/net/sunrpc/xprtrdma/frwr_ops.c b/net/sunrpc/xprtrdma/frwr_ops.c
index 6aea36a38bfd..5a936a6a31a3 100644
--- a/net/sunrpc/xprtrdma/frwr_ops.c
+++ b/net/sunrpc/xprtrdma/frwr_ops.c
@@ -344,7 +344,7 @@ frwr_wc_localinv_wake(struct ib_cq *cq, struct ib_wc *wc)
/* Post a REG_MR Work Request to register a memory region
* for remote access via RDMA READ or RDMA WRITE.
*/
-static int
+static struct rpcrdma_mr_seg *
frwr_op_map(struct rpcrdma_xprt *r_xprt, struct rpcrdma_mr_seg *seg,
int nsegs, bool writing, struct rpcrdma_mw **out)
{
@@ -364,7 +364,7 @@ frwr_op_map(struct rpcrdma_xprt *r_xprt, struct rpcrdma_mr_seg *seg,
rpcrdma_defer_mr_recovery(mw);
mw = rpcrdma_get_mw(r_xprt);
if (!mw)
- return -ENOBUFS;
+ return ERR_PTR(-ENOBUFS);
} while (mw->frmr.fr_state != FRMR_IS_INVALID);
frmr = &mw->frmr;
frmr->fr_state = FRMR_IS_VALID;
@@ -429,25 +429,25 @@ frwr_op_map(struct rpcrdma_xprt *r_xprt, struct rpcrdma_mr_seg *seg,
mw->mw_offset = mr->iova;
*out = mw;
- return mw->mw_nents;
+ return seg;
out_dmamap_err:
pr_err("rpcrdma: failed to DMA map sg %p sg_nents %d\n",
mw->mw_sg, i);
frmr->fr_state = FRMR_IS_INVALID;
rpcrdma_put_mw(r_xprt, mw);
- return -EIO;
+ return ERR_PTR(-EIO);
out_mapmr_err:
pr_err("rpcrdma: failed to map mr %p (%d/%d)\n",
frmr->fr_mr, n, mw->mw_nents);
rpcrdma_defer_mr_recovery(mw);
- return -EIO;
+ return ERR_PTR(-EIO);
out_senderr:
pr_err("rpcrdma: FRMR registration ib_post_send returned %i\n", rc);
rpcrdma_defer_mr_recovery(mw);
- return -ENOTCONN;
+ return ERR_PTR(-ENOTCONN);
}
/* Invalidate all memory regions that were registered for "req".
diff --git a/net/sunrpc/xprtrdma/rpc_rdma.c b/net/sunrpc/xprtrdma/rpc_rdma.c
index ca4d6e4528f3..f1889f4d4803 100644
--- a/net/sunrpc/xprtrdma/rpc_rdma.c
+++ b/net/sunrpc/xprtrdma/rpc_rdma.c
@@ -169,40 +169,41 @@ static bool rpcrdma_results_inline(struct rpcrdma_xprt *r_xprt,
return rqst->rq_rcv_buf.buflen <= ia->ri_max_inline_read;
}
-/* Split "vec" on page boundaries into segments. FMR registers pages,
- * not a byte range. Other modes coalesce these segments into a single
- * MR when they can.
+/* Split @vec on page boundaries into SGEs. FMR registers pages, not
+ * a byte range. Other modes coalesce these SGEs into a single MR
+ * when they can.
+ *
+ * Returns pointer to next available SGE, and bumps the total number
+ * of SGEs consumed.
*/
-static int
-rpcrdma_convert_kvec(struct kvec *vec, struct rpcrdma_mr_seg *seg, int n)
+static struct rpcrdma_mr_seg *
+rpcrdma_convert_kvec(struct kvec *vec, struct rpcrdma_mr_seg *seg,
+ unsigned int *n)
{
- size_t page_offset;
- u32 remaining;
+ u32 remaining, page_offset;
char *base;
base = vec->iov_base;
page_offset = offset_in_page(base);
remaining = vec->iov_len;
- while (remaining && n < RPCRDMA_MAX_SEGS) {
- seg[n].mr_page = NULL;
- seg[n].mr_offset = base;
- seg[n].mr_len = min_t(u32, PAGE_SIZE - page_offset, remaining);
- remaining -= seg[n].mr_len;
- base += seg[n].mr_len;
- ++n;
+ while (remaining) {
+ seg->mr_page = NULL;
+ seg->mr_offset = base;
+ seg->mr_len = min_t(u32, PAGE_SIZE - page_offset, remaining);
+ remaining -= seg->mr_len;
+ base += seg->mr_len;
+ ++seg;
+ ++(*n);
page_offset = 0;
}
- return n;
+ return seg;
}
-/*
- * Chunk assembly from upper layer xdr_buf.
- *
- * Prepare the passed-in xdr_buf into representation as RPC/RDMA chunk
- * elements. Segments are then coalesced when registered, if possible
- * within the selected memreg mode.
+/* Convert @xdrbuf into SGEs no larger than a page each. As they
+ * are registered, these SGEs are then coalesced into RDMA segments
+ * when the selected memreg mode supports it.
*
- * Returns positive number of segments converted, or a negative errno.
+ * Returns positive number of SGEs consumed, or a negative errno.
*/
static int
@@ -210,47 +211,41 @@ rpcrdma_convert_iovs(struct rpcrdma_xprt *r_xprt, struct xdr_buf *xdrbuf,
unsigned int pos, enum rpcrdma_chunktype type,
struct rpcrdma_mr_seg *seg)
{
- int len, n, p, page_base;
+ unsigned long page_base;
+ unsigned int len, n;
struct page **ppages;
n = 0;
- if (pos == 0) {
- n = rpcrdma_convert_kvec(&xdrbuf->head[0], seg, n);
- if (n == RPCRDMA_MAX_SEGS)
- goto out_overflow;
- }
+ if (pos == 0)
+ seg = rpcrdma_convert_kvec(&xdrbuf->head[0], seg, &n);
len = xdrbuf->page_len;
ppages = xdrbuf->pages + (xdrbuf->page_base >> PAGE_SHIFT);
page_base = offset_in_page(xdrbuf->page_base);
- p = 0;
- while (len && n < RPCRDMA_MAX_SEGS) {
- if (!ppages[p]) {
- /* alloc the pagelist for receiving buffer */
- ppages[p] = alloc_page(GFP_ATOMIC);
- if (!ppages[p])
+ while (len) {
+ if (unlikely(!*ppages)) {
+ /* XXX: Certain upper layer operations do
+ * not provide receive buffer pages.
+ */
+ *ppages = alloc_page(GFP_ATOMIC);
+ if (!*ppages)
return -EAGAIN;
}
- seg[n].mr_page = ppages[p];
- seg[n].mr_offset = (void *)(unsigned long) page_base;
- seg[n].mr_len = min_t(u32, PAGE_SIZE - page_base, len);
- if (seg[n].mr_len > PAGE_SIZE)
- goto out_overflow;
- len -= seg[n].mr_len;
+ seg->mr_page = *ppages;
+ seg->mr_offset = (char *)page_base;
+ seg->mr_len = min_t(u32, PAGE_SIZE - page_base, len);
+ len -= seg->mr_len;
+ ++ppages;
+ ++seg;
++n;
- ++p;
- page_base = 0; /* page offset only applies to first page */
+ page_base = 0;
}
- /* Message overflows the seg array */
- if (len && n == RPCRDMA_MAX_SEGS)
- goto out_overflow;
-
/* When encoding a Read chunk, the tail iovec contains an
* XDR pad and may be omitted.
*/
if (type == rpcrdma_readch && r_xprt->rx_ia.ri_implicit_roundup)
- return n;
+ goto out;
/* When encoding a Write chunk, some servers need to see an
* extra segment for non-XDR-aligned Write chunks. The upper
@@ -258,30 +253,81 @@ rpcrdma_convert_iovs(struct rpcrdma_xprt *r_xprt, struct xdr_buf *xdrbuf,
* for this purpose.
*/
if (type == rpcrdma_writech && r_xprt->rx_ia.ri_implicit_roundup)
- return n;
+ goto out;
- if (xdrbuf->tail[0].iov_len) {
- n = rpcrdma_convert_kvec(&xdrbuf->tail[0], seg, n);
- if (n == RPCRDMA_MAX_SEGS)
- goto out_overflow;
- }
+ if (xdrbuf->tail[0].iov_len)
+ seg = rpcrdma_convert_kvec(&xdrbuf->tail[0], seg, &n);
+out:
+ if (unlikely(n > RPCRDMA_MAX_SEGS))
+ return -EIO;
return n;
+}
-out_overflow:
- pr_err("rpcrdma: segment array overflow\n");
- return -EIO;
+static inline int
+encode_item_present(struct xdr_stream *xdr)
+{
+ __be32 *p;
+
+ p = xdr_reserve_space(xdr, sizeof(*p));
+ if (unlikely(!p))
+ return -EMSGSIZE;
+
+ *p = xdr_one;
+ return 0;
}
-static inline __be32 *
+static inline int
+encode_item_not_present(struct xdr_stream *xdr)
+{
+ __be32 *p;
+
+ p = xdr_reserve_space(xdr, sizeof(*p));
+ if (unlikely(!p))
+ return -EMSGSIZE;
+
+ *p = xdr_zero;
+ return 0;
+}
+
+static void
xdr_encode_rdma_segment(__be32 *iptr, struct rpcrdma_mw *mw)
{
*iptr++ = cpu_to_be32(mw->mw_handle);
*iptr++ = cpu_to_be32(mw->mw_length);
- return xdr_encode_hyper(iptr, mw->mw_offset);
+ xdr_encode_hyper(iptr, mw->mw_offset);
}
-/* XDR-encode the Read list. Supports encoding a list of read
+static int
+encode_rdma_segment(struct xdr_stream *xdr, struct rpcrdma_mw *mw)
+{
+ __be32 *p;
+
+ p = xdr_reserve_space(xdr, 4 * sizeof(*p));
+ if (unlikely(!p))
+ return -EMSGSIZE;
+
+ xdr_encode_rdma_segment(p, mw);
+ return 0;
+}
+
+static int
+encode_read_segment(struct xdr_stream *xdr, struct rpcrdma_mw *mw,
+ u32 position)
+{
+ __be32 *p;
+
+ p = xdr_reserve_space(xdr, 6 * sizeof(*p));
+ if (unlikely(!p))
+ return -EMSGSIZE;
+
+ *p++ = xdr_one; /* Item present */
+ *p++ = cpu_to_be32(position);
+ xdr_encode_rdma_segment(p, mw);
+ return 0;
+}
+
+/* Register and XDR encode the Read list. Supports encoding a list of read
* segments that belong to a single read chunk.
*
* Encoding key for single-list chunks (HLOO = Handle32 Length32 Offset64):
@@ -290,23 +336,20 @@ xdr_encode_rdma_segment(__be32 *iptr, struct rpcrdma_mw *mw)
* N elements, position P (same P for all chunks of same arg!):
* 1 - PHLOO - 1 - PHLOO - ... - 1 - PHLOO - 0
*
- * Returns a pointer to the XDR word in the RDMA header following
- * the end of the Read list, or an error pointer.
+ * Returns zero on success, or a negative errno if a failure occurred.
+ * @xdr is advanced to the next position in the stream.
+ *
+ * Only a single @pos value is currently supported.
*/
-static __be32 *
-rpcrdma_encode_read_list(struct rpcrdma_xprt *r_xprt,
- struct rpcrdma_req *req, struct rpc_rqst *rqst,
- __be32 *iptr, enum rpcrdma_chunktype rtype)
+static noinline int
+rpcrdma_encode_read_list(struct rpcrdma_xprt *r_xprt, struct rpcrdma_req *req,
+ struct rpc_rqst *rqst, enum rpcrdma_chunktype rtype)
{
+ struct xdr_stream *xdr = &req->rl_stream;
struct rpcrdma_mr_seg *seg;
struct rpcrdma_mw *mw;
unsigned int pos;
- int n, nsegs;
-
- if (rtype == rpcrdma_noch) {
- *iptr++ = xdr_zero; /* item not present */
- return iptr;
- }
+ int nsegs;
pos = rqst->rq_snd_buf.head[0].iov_len;
if (rtype == rpcrdma_areadch)
@@ -315,40 +358,33 @@ rpcrdma_encode_read_list(struct rpcrdma_xprt *r_xprt,
nsegs = rpcrdma_convert_iovs(r_xprt, &rqst->rq_snd_buf, pos,
rtype, seg);
if (nsegs < 0)
- return ERR_PTR(nsegs);
+ return nsegs;
do {
- n = r_xprt->rx_ia.ri_ops->ro_map(r_xprt, seg, nsegs,
- false, &mw);
- if (n < 0)
- return ERR_PTR(n);
+ seg = r_xprt->rx_ia.ri_ops->ro_map(r_xprt, seg, nsegs,
+ false, &mw);
+ if (IS_ERR(seg))
+ return PTR_ERR(seg);
rpcrdma_push_mw(mw, &req->rl_registered);
- *iptr++ = xdr_one; /* item present */
-
- /* All read segments in this chunk
- * have the same "position".
- */
- *iptr++ = cpu_to_be32(pos);
- iptr = xdr_encode_rdma_segment(iptr, mw);
+ if (encode_read_segment(xdr, mw, pos) < 0)
+ return -EMSGSIZE;
dprintk("RPC: %5u %s: pos %u %u@0x%016llx:0x%08x (%s)\n",
rqst->rq_task->tk_pid, __func__, pos,
mw->mw_length, (unsigned long long)mw->mw_offset,
- mw->mw_handle, n < nsegs ? "more" : "last");
+ mw->mw_handle, mw->mw_nents < nsegs ? "more" : "last");
r_xprt->rx_stats.read_chunk_count++;
- seg += n;
- nsegs -= n;
+ nsegs -= mw->mw_nents;
} while (nsegs);
- /* Finish Read list */
- *iptr++ = xdr_zero; /* Next item not present */
- return iptr;
+ return 0;
}
-/* XDR-encode the Write list. Supports encoding a list containing
- * one array of plain segments that belong to a single write chunk.
+/* Register and XDR encode the Write list. Supports encoding a list
+ * containing one array of plain segments that belong to a single
+ * write chunk.
*
* Encoding key for single-list chunks (HLOO = Handle32 Length32 Offset64):
*
@@ -356,66 +392,65 @@ rpcrdma_encode_read_list(struct rpcrdma_xprt *r_xprt,
* N elements:
* 1 - N - HLOO - HLOO - ... - HLOO - 0
*
- * Returns a pointer to the XDR word in the RDMA header following
- * the end of the Write list, or an error pointer.
+ * Returns zero on success, or a negative errno if a failure occurred.
+ * @xdr is advanced to the next position in the stream.
+ *
+ * Only a single Write chunk is currently supported.
*/
-static __be32 *
+static noinline int
rpcrdma_encode_write_list(struct rpcrdma_xprt *r_xprt, struct rpcrdma_req *req,
- struct rpc_rqst *rqst, __be32 *iptr,
- enum rpcrdma_chunktype wtype)
+ struct rpc_rqst *rqst, enum rpcrdma_chunktype wtype)
{
+ struct xdr_stream *xdr = &req->rl_stream;
struct rpcrdma_mr_seg *seg;
struct rpcrdma_mw *mw;
- int n, nsegs, nchunks;
+ int nsegs, nchunks;
__be32 *segcount;
- if (wtype != rpcrdma_writech) {
- *iptr++ = xdr_zero; /* no Write list present */
- return iptr;
- }
-
seg = req->rl_segments;
nsegs = rpcrdma_convert_iovs(r_xprt, &rqst->rq_rcv_buf,
rqst->rq_rcv_buf.head[0].iov_len,
wtype, seg);
if (nsegs < 0)
- return ERR_PTR(nsegs);
+ return nsegs;
- *iptr++ = xdr_one; /* Write list present */
- segcount = iptr++; /* save location of segment count */
+ if (encode_item_present(xdr) < 0)
+ return -EMSGSIZE;
+ segcount = xdr_reserve_space(xdr, sizeof(*segcount));
+ if (unlikely(!segcount))
+ return -EMSGSIZE;
+ /* Actual value encoded below */
nchunks = 0;
do {
- n = r_xprt->rx_ia.ri_ops->ro_map(r_xprt, seg, nsegs,
- true, &mw);
- if (n < 0)
- return ERR_PTR(n);
+ seg = r_xprt->rx_ia.ri_ops->ro_map(r_xprt, seg, nsegs,
+ true, &mw);
+ if (IS_ERR(seg))
+ return PTR_ERR(seg);
rpcrdma_push_mw(mw, &req->rl_registered);
- iptr = xdr_encode_rdma_segment(iptr, mw);
+ if (encode_rdma_segment(xdr, mw) < 0)
+ return -EMSGSIZE;
dprintk("RPC: %5u %s: %u@0x016%llx:0x%08x (%s)\n",
rqst->rq_task->tk_pid, __func__,
mw->mw_length, (unsigned long long)mw->mw_offset,
- mw->mw_handle, n < nsegs ? "more" : "last");
+ mw->mw_handle, mw->mw_nents < nsegs ? "more" : "last");
r_xprt->rx_stats.write_chunk_count++;
r_xprt->rx_stats.total_rdma_request += seg->mr_len;
nchunks++;
- seg += n;
- nsegs -= n;
+ nsegs -= mw->mw_nents;
} while (nsegs);
/* Update count of segments in this Write chunk */
*segcount = cpu_to_be32(nchunks);
- /* Finish Write list */
- *iptr++ = xdr_zero; /* Next item not present */
- return iptr;
+ return 0;
}
-/* XDR-encode the Reply chunk. Supports encoding an array of plain
- * segments that belong to a single write (reply) chunk.
+/* Register and XDR encode the Reply chunk. Supports encoding an array
+ * of plain segments that belong to a single write (reply) chunk.
*
* Encoding key for single-list chunks (HLOO = Handle32 Length32 Offset64):
*
@@ -423,58 +458,57 @@ rpcrdma_encode_write_list(struct rpcrdma_xprt *r_xprt, struct rpcrdma_req *req,
* N elements:
* 1 - N - HLOO - HLOO - ... - HLOO
*
- * Returns a pointer to the XDR word in the RDMA header following
- * the end of the Reply chunk, or an error pointer.
+ * Returns zero on success, or a negative errno if a failure occurred.
+ * @xdr is advanced to the next position in the stream.
*/
-static __be32 *
-rpcrdma_encode_reply_chunk(struct rpcrdma_xprt *r_xprt,
- struct rpcrdma_req *req, struct rpc_rqst *rqst,
- __be32 *iptr, enum rpcrdma_chunktype wtype)
+static noinline int
+rpcrdma_encode_reply_chunk(struct rpcrdma_xprt *r_xprt, struct rpcrdma_req *req,
+ struct rpc_rqst *rqst, enum rpcrdma_chunktype wtype)
{
+ struct xdr_stream *xdr = &req->rl_stream;
struct rpcrdma_mr_seg *seg;
struct rpcrdma_mw *mw;
- int n, nsegs, nchunks;
+ int nsegs, nchunks;
__be32 *segcount;
- if (wtype != rpcrdma_replych) {
- *iptr++ = xdr_zero; /* no Reply chunk present */
- return iptr;
- }
-
seg = req->rl_segments;
nsegs = rpcrdma_convert_iovs(r_xprt, &rqst->rq_rcv_buf, 0, wtype, seg);
if (nsegs < 0)
- return ERR_PTR(nsegs);
+ return nsegs;
- *iptr++ = xdr_one; /* Reply chunk present */
- segcount = iptr++; /* save location of segment count */
+ if (encode_item_present(xdr) < 0)
+ return -EMSGSIZE;
+ segcount = xdr_reserve_space(xdr, sizeof(*segcount));
+ if (unlikely(!segcount))
+ return -EMSGSIZE;
+ /* Actual value encoded below */
nchunks = 0;
do {
- n = r_xprt->rx_ia.ri_ops->ro_map(r_xprt, seg, nsegs,
- true, &mw);
- if (n < 0)
- return ERR_PTR(n);
+ seg = r_xprt->rx_ia.ri_ops->ro_map(r_xprt, seg, nsegs,
+ true, &mw);
+ if (IS_ERR(seg))
+ return PTR_ERR(seg);
rpcrdma_push_mw(mw, &req->rl_registered);
- iptr = xdr_encode_rdma_segment(iptr, mw);
+ if (encode_rdma_segment(xdr, mw) < 0)
+ return -EMSGSIZE;
dprintk("RPC: %5u %s: %u@0x%016llx:0x%08x (%s)\n",
rqst->rq_task->tk_pid, __func__,
mw->mw_length, (unsigned long long)mw->mw_offset,
- mw->mw_handle, n < nsegs ? "more" : "last");
+ mw->mw_handle, mw->mw_nents < nsegs ? "more" : "last");
r_xprt->rx_stats.reply_chunk_count++;
r_xprt->rx_stats.total_rdma_request += seg->mr_len;
nchunks++;
- seg += n;
- nsegs -= n;
+ nsegs -= mw->mw_nents;
} while (nsegs);
/* Update count of segments in the Reply chunk */
*segcount = cpu_to_be32(nchunks);
- return iptr;
+ return 0;
}
/* Prepare the RPC-over-RDMA header SGE.
@@ -651,37 +685,52 @@ rpcrdma_unmap_sges(struct rpcrdma_ia *ia, struct rpcrdma_req *req)
req->rl_mapped_sges = 0;
}
-/*
- * Marshal a request: the primary job of this routine is to choose
- * the transfer modes. See comments below.
+/**
+ * rpcrdma_marshal_req - Marshal and send one RPC request
+ * @r_xprt: controlling transport
+ * @rqst: RPC request to be marshaled
+ *
+ * For the RPC in "rqst", this function:
+ * - Chooses the transfer mode (eg., RDMA_MSG or RDMA_NOMSG)
+ * - Registers Read, Write, and Reply chunks
+ * - Constructs the transport header
+ * - Posts a Send WR to send the transport header and request
*
- * Returns zero on success, otherwise a negative errno.
+ * Returns:
+ * %0 if the RPC was sent successfully,
+ * %-ENOTCONN if the connection was lost,
+ * %-EAGAIN if not enough pages are available for on-demand reply buffer,
+ * %-ENOBUFS if no MRs are available to register chunks,
+ * %-EMSGSIZE if the transport header is too small,
+ * %-EIO if a permanent problem occurred while marshaling.
*/
-
int
-rpcrdma_marshal_req(struct rpc_rqst *rqst)
+rpcrdma_marshal_req(struct rpcrdma_xprt *r_xprt, struct rpc_rqst *rqst)
{
- struct rpc_xprt *xprt = rqst->rq_xprt;
- struct rpcrdma_xprt *r_xprt = rpcx_to_rdmax(xprt);
struct rpcrdma_req *req = rpcr_to_rdmar(rqst);
+ struct xdr_stream *xdr = &req->rl_stream;
enum rpcrdma_chunktype rtype, wtype;
- struct rpcrdma_msg *headerp;
bool ddp_allowed;
- ssize_t hdrlen;
- size_t rpclen;
- __be32 *iptr;
+ __be32 *p;
+ int ret;
#if defined(CONFIG_SUNRPC_BACKCHANNEL)
if (test_bit(RPC_BC_PA_IN_USE, &rqst->rq_bc_pa_state))
return rpcrdma_bc_marshal_reply(rqst);
#endif
- headerp = rdmab_to_msg(req->rl_rdmabuf);
- /* don't byte-swap XID, it's already done in request */
- headerp->rm_xid = rqst->rq_xid;
- headerp->rm_vers = rpcrdma_version;
- headerp->rm_credit = cpu_to_be32(r_xprt->rx_buf.rb_max_requests);
- headerp->rm_type = rdma_msg;
+ rpcrdma_set_xdrlen(&req->rl_hdrbuf, 0);
+ xdr_init_encode(xdr, &req->rl_hdrbuf,
+ req->rl_rdmabuf->rg_base);
+
+ /* Fixed header fields */
+ ret = -EMSGSIZE;
+ p = xdr_reserve_space(xdr, 4 * sizeof(*p));
+ if (!p)
+ goto out_err;
+ *p++ = rqst->rq_xid;
+ *p++ = rpcrdma_version;
+ *p++ = cpu_to_be32(r_xprt->rx_buf.rb_max_requests);
/* When the ULP employs a GSS flavor that guarantees integrity
* or privacy, direct data placement of individual data items
@@ -721,22 +770,17 @@ rpcrdma_marshal_req(struct rpc_rqst *rqst)
* by themselves are larger than the inline threshold.
*/
if (rpcrdma_args_inline(r_xprt, rqst)) {
+ *p++ = rdma_msg;
rtype = rpcrdma_noch;
- rpclen = rqst->rq_snd_buf.len;
} else if (ddp_allowed && rqst->rq_snd_buf.flags & XDRBUF_WRITE) {
+ *p++ = rdma_msg;
rtype = rpcrdma_readch;
- rpclen = rqst->rq_snd_buf.head[0].iov_len +
- rqst->rq_snd_buf.tail[0].iov_len;
} else {
r_xprt->rx_stats.nomsg_call_count++;
- headerp->rm_type = htonl(RDMA_NOMSG);
+ *p++ = rdma_nomsg;
rtype = rpcrdma_areadch;
- rpclen = 0;
}
- req->rl_xid = rqst->rq_xid;
- rpcrdma_insert_req(&r_xprt->rx_buf, req);
-
/* This implementation supports the following combinations
* of chunk lists in one RPC-over-RDMA Call message:
*
@@ -759,79 +803,50 @@ rpcrdma_marshal_req(struct rpc_rqst *rqst)
* send a Call message with a Position Zero Read chunk and a
* regular Read chunk at the same time.
*/
- iptr = headerp->rm_body.rm_chunks;
- iptr = rpcrdma_encode_read_list(r_xprt, req, rqst, iptr, rtype);
- if (IS_ERR(iptr))
+ if (rtype != rpcrdma_noch) {
+ ret = rpcrdma_encode_read_list(r_xprt, req, rqst, rtype);
+ if (ret)
+ goto out_err;
+ }
+ ret = encode_item_not_present(xdr);
+ if (ret)
goto out_err;
- iptr = rpcrdma_encode_write_list(r_xprt, req, rqst, iptr, wtype);
- if (IS_ERR(iptr))
+
+ if (wtype == rpcrdma_writech) {
+ ret = rpcrdma_encode_write_list(r_xprt, req, rqst, wtype);
+ if (ret)
+ goto out_err;
+ }
+ ret = encode_item_not_present(xdr);
+ if (ret)
goto out_err;
- iptr = rpcrdma_encode_reply_chunk(r_xprt, req, rqst, iptr, wtype);
- if (IS_ERR(iptr))
+
+ if (wtype != rpcrdma_replych)
+ ret = encode_item_not_present(xdr);
+ else
+ ret = rpcrdma_encode_reply_chunk(r_xprt, req, rqst, wtype);
+ if (ret)
goto out_err;
- hdrlen = (unsigned char *)iptr - (unsigned char *)headerp;
- dprintk("RPC: %5u %s: %s/%s: hdrlen %zd rpclen %zd\n",
+ dprintk("RPC: %5u %s: %s/%s: hdrlen %u rpclen\n",
rqst->rq_task->tk_pid, __func__,
transfertypes[rtype], transfertypes[wtype],
- hdrlen, rpclen);
+ xdr_stream_pos(xdr));
- if (!rpcrdma_prepare_send_sges(&r_xprt->rx_ia, req, hdrlen,
+ if (!rpcrdma_prepare_send_sges(&r_xprt->rx_ia, req,
+ xdr_stream_pos(xdr),
&rqst->rq_snd_buf, rtype)) {
- iptr = ERR_PTR(-EIO);
+ ret = -EIO;
goto out_err;
}
return 0;
out_err:
- if (PTR_ERR(iptr) != -ENOBUFS) {
- pr_err("rpcrdma: rpcrdma_marshal_req failed, status %ld\n",
- PTR_ERR(iptr));
+ if (ret != -ENOBUFS) {
+ pr_err("rpcrdma: header marshaling failed (%d)\n", ret);
r_xprt->rx_stats.failed_marshal_count++;
}
- return PTR_ERR(iptr);
-}
-
-/*
- * Chase down a received write or reply chunklist to get length
- * RDMA'd by server. See map at rpcrdma_create_chunks()! :-)
- */
-static int
-rpcrdma_count_chunks(struct rpcrdma_rep *rep, int wrchunk, __be32 **iptrp)
-{
- unsigned int i, total_len;
- struct rpcrdma_write_chunk *cur_wchunk;
- char *base = (char *)rdmab_to_msg(rep->rr_rdmabuf);
-
- i = be32_to_cpu(**iptrp);
- cur_wchunk = (struct rpcrdma_write_chunk *) (*iptrp + 1);
- total_len = 0;
- while (i--) {
- struct rpcrdma_segment *seg = &cur_wchunk->wc_target;
- ifdebug(FACILITY) {
- u64 off;
- xdr_decode_hyper((__be32 *)&seg->rs_offset, &off);
- dprintk("RPC: %s: chunk %d@0x%016llx:0x%08x\n",
- __func__,
- be32_to_cpu(seg->rs_length),
- (unsigned long long)off,
- be32_to_cpu(seg->rs_handle));
- }
- total_len += be32_to_cpu(seg->rs_length);
- ++cur_wchunk;
- }
- /* check and adjust for properly terminated write chunk */
- if (wrchunk) {
- __be32 *w = (__be32 *) cur_wchunk;
- if (*w++ != xdr_zero)
- return -1;
- cur_wchunk = (struct rpcrdma_write_chunk *) w;
- }
- if ((char *)cur_wchunk > base + rep->rr_len)
- return -1;
-
- *iptrp = (__be32 *) cur_wchunk;
- return total_len;
+ return ret;
}
/**
@@ -949,37 +964,254 @@ rpcrdma_mark_remote_invalidation(struct list_head *mws,
}
}
-#if defined(CONFIG_SUNRPC_BACKCHANNEL)
/* By convention, backchannel calls arrive via rdma_msg type
* messages, and never populate the chunk lists. This makes
* the RPC/RDMA header small and fixed in size, so it is
* straightforward to check the RPC header's direction field.
*/
static bool
-rpcrdma_is_bcall(struct rpcrdma_msg *headerp)
+rpcrdma_is_bcall(struct rpcrdma_xprt *r_xprt, struct rpcrdma_rep *rep,
+ __be32 xid, __be32 proc)
+#if defined(CONFIG_SUNRPC_BACKCHANNEL)
{
- __be32 *p = (__be32 *)headerp;
+ struct xdr_stream *xdr = &rep->rr_stream;
+ __be32 *p;
- if (headerp->rm_type != rdma_msg)
+ if (proc != rdma_msg)
return false;
- if (headerp->rm_body.rm_chunks[0] != xdr_zero)
+
+ /* Peek at stream contents without advancing. */
+ p = xdr_inline_decode(xdr, 0);
+
+ /* Chunk lists */
+ if (*p++ != xdr_zero)
return false;
- if (headerp->rm_body.rm_chunks[1] != xdr_zero)
+ if (*p++ != xdr_zero)
return false;
- if (headerp->rm_body.rm_chunks[2] != xdr_zero)
+ if (*p++ != xdr_zero)
return false;
- /* sanity */
- if (p[7] != headerp->rm_xid)
+ /* RPC header */
+ if (*p++ != xid)
return false;
- /* call direction */
- if (p[8] != cpu_to_be32(RPC_CALL))
+ if (*p != cpu_to_be32(RPC_CALL))
return false;
+ /* Now that we are sure this is a backchannel call,
+ * advance to the RPC header.
+ */
+ p = xdr_inline_decode(xdr, 3 * sizeof(*p));
+ if (unlikely(!p))
+ goto out_short;
+
+ rpcrdma_bc_receive_call(r_xprt, rep);
+ return true;
+
+out_short:
+ pr_warn("RPC/RDMA short backward direction call\n");
+ if (rpcrdma_ep_post_recv(&r_xprt->rx_ia, rep))
+ xprt_disconnect_done(&r_xprt->rx_xprt);
return true;
}
+#else /* CONFIG_SUNRPC_BACKCHANNEL */
+{
+ return false;
+}
#endif /* CONFIG_SUNRPC_BACKCHANNEL */
+static int decode_rdma_segment(struct xdr_stream *xdr, u32 *length)
+{
+ __be32 *p;
+
+ p = xdr_inline_decode(xdr, 4 * sizeof(*p));
+ if (unlikely(!p))
+ return -EIO;
+
+ ifdebug(FACILITY) {
+ u64 offset;
+ u32 handle;
+
+ handle = be32_to_cpup(p++);
+ *length = be32_to_cpup(p++);
+ xdr_decode_hyper(p, &offset);
+ dprintk("RPC: %s: segment %u@0x%016llx:0x%08x\n",
+ __func__, *length, (unsigned long long)offset,
+ handle);
+ } else {
+ *length = be32_to_cpup(p + 1);
+ }
+
+ return 0;
+}
+
+static int decode_write_chunk(struct xdr_stream *xdr, u32 *length)
+{
+ u32 segcount, seglength;
+ __be32 *p;
+
+ p = xdr_inline_decode(xdr, sizeof(*p));
+ if (unlikely(!p))
+ return -EIO;
+
+ *length = 0;
+ segcount = be32_to_cpup(p);
+ while (segcount--) {
+ if (decode_rdma_segment(xdr, &seglength))
+ return -EIO;
+ *length += seglength;
+ }
+
+ dprintk("RPC: %s: segcount=%u, %u bytes\n",
+ __func__, be32_to_cpup(p), *length);
+ return 0;
+}
+
+/* In RPC-over-RDMA Version One replies, a Read list is never
+ * expected. This decoder is a stub that returns an error if
+ * a Read list is present.
+ */
+static int decode_read_list(struct xdr_stream *xdr)
+{
+ __be32 *p;
+
+ p = xdr_inline_decode(xdr, sizeof(*p));
+ if (unlikely(!p))
+ return -EIO;
+ if (unlikely(*p != xdr_zero))
+ return -EIO;
+ return 0;
+}
+
+/* Supports only one Write chunk in the Write list
+ */
+static int decode_write_list(struct xdr_stream *xdr, u32 *length)
+{
+ u32 chunklen;
+ bool first;
+ __be32 *p;
+
+ *length = 0;
+ first = true;
+ do {
+ p = xdr_inline_decode(xdr, sizeof(*p));
+ if (unlikely(!p))
+ return -EIO;
+ if (*p == xdr_zero)
+ break;
+ if (!first)
+ return -EIO;
+
+ if (decode_write_chunk(xdr, &chunklen))
+ return -EIO;
+ *length += chunklen;
+ first = false;
+ } while (true);
+ return 0;
+}
+
+static int decode_reply_chunk(struct xdr_stream *xdr, u32 *length)
+{
+ __be32 *p;
+
+ p = xdr_inline_decode(xdr, sizeof(*p));
+ if (unlikely(!p))
+ return -EIO;
+
+ *length = 0;
+ if (*p != xdr_zero)
+ if (decode_write_chunk(xdr, length))
+ return -EIO;
+ return 0;
+}
+
+static int
+rpcrdma_decode_msg(struct rpcrdma_xprt *r_xprt, struct rpcrdma_rep *rep,
+ struct rpc_rqst *rqst)
+{
+ struct xdr_stream *xdr = &rep->rr_stream;
+ u32 writelist, replychunk, rpclen;
+ char *base;
+
+ /* Decode the chunk lists */
+ if (decode_read_list(xdr))
+ return -EIO;
+ if (decode_write_list(xdr, &writelist))
+ return -EIO;
+ if (decode_reply_chunk(xdr, &replychunk))
+ return -EIO;
+
+ /* RDMA_MSG sanity checks */
+ if (unlikely(replychunk))
+ return -EIO;
+
+ /* Build the RPC reply's Payload stream in rqst->rq_rcv_buf */
+ base = (char *)xdr_inline_decode(xdr, 0);
+ rpclen = xdr_stream_remaining(xdr);
+ r_xprt->rx_stats.fixup_copy_count +=
+ rpcrdma_inline_fixup(rqst, base, rpclen, writelist & 3);
+
+ r_xprt->rx_stats.total_rdma_reply += writelist;
+ return rpclen + xdr_align_size(writelist);
+}
+
+static noinline int
+rpcrdma_decode_nomsg(struct rpcrdma_xprt *r_xprt, struct rpcrdma_rep *rep)
+{
+ struct xdr_stream *xdr = &rep->rr_stream;
+ u32 writelist, replychunk;
+
+ /* Decode the chunk lists */
+ if (decode_read_list(xdr))
+ return -EIO;
+ if (decode_write_list(xdr, &writelist))
+ return -EIO;
+ if (decode_reply_chunk(xdr, &replychunk))
+ return -EIO;
+
+ /* RDMA_NOMSG sanity checks */
+ if (unlikely(writelist))
+ return -EIO;
+ if (unlikely(!replychunk))
+ return -EIO;
+
+ /* Reply chunk buffer already is the reply vector */
+ r_xprt->rx_stats.total_rdma_reply += replychunk;
+ return replychunk;
+}
+
+static noinline int
+rpcrdma_decode_error(struct rpcrdma_xprt *r_xprt, struct rpcrdma_rep *rep,
+ struct rpc_rqst *rqst)
+{
+ struct xdr_stream *xdr = &rep->rr_stream;
+ __be32 *p;
+
+ p = xdr_inline_decode(xdr, sizeof(*p));
+ if (unlikely(!p))
+ return -EIO;
+
+ switch (*p) {
+ case err_vers:
+ p = xdr_inline_decode(xdr, 2 * sizeof(*p));
+ if (!p)
+ break;
+ dprintk("RPC: %5u: %s: server reports version error (%u-%u)\n",
+ rqst->rq_task->tk_pid, __func__,
+ be32_to_cpup(p), be32_to_cpu(*(p + 1)));
+ break;
+ case err_chunk:
+ dprintk("RPC: %5u: %s: server reports header decoding error\n",
+ rqst->rq_task->tk_pid, __func__);
+ break;
+ default:
+ dprintk("RPC: %5u: %s: server reports unrecognized error %d\n",
+ rqst->rq_task->tk_pid, __func__, be32_to_cpup(p));
+ }
+
+ r_xprt->rx_stats.bad_reply_count++;
+ return -EREMOTEIO;
+}
+
/* Process received RPC/RDMA messages.
*
* Errors must result in the RPC task either being awakened, or
@@ -991,51 +1223,48 @@ rpcrdma_reply_handler(struct work_struct *work)
struct rpcrdma_rep *rep =
container_of(work, struct rpcrdma_rep, rr_work);
struct rpcrdma_xprt *r_xprt = rep->rr_rxprt;
- struct rpcrdma_buffer *buf = &r_xprt->rx_buf;
struct rpc_xprt *xprt = &r_xprt->rx_xprt;
- struct rpcrdma_msg *headerp;
+ struct xdr_stream *xdr = &rep->rr_stream;
struct rpcrdma_req *req;
struct rpc_rqst *rqst;
- __be32 *iptr;
- int rdmalen, status, rmerr;
+ __be32 *p, xid, vers, proc;
unsigned long cwnd;
- struct list_head mws;
+ int status;
dprintk("RPC: %s: incoming rep %p\n", __func__, rep);
- if (rep->rr_len == RPCRDMA_BAD_LEN)
+ if (rep->rr_hdrbuf.head[0].iov_len == 0)
goto out_badstatus;
- if (rep->rr_len < RPCRDMA_HDRLEN_ERR)
+
+ xdr_init_decode(xdr, &rep->rr_hdrbuf,
+ rep->rr_hdrbuf.head[0].iov_base);
+
+ /* Fixed transport header fields */
+ p = xdr_inline_decode(xdr, 4 * sizeof(*p));
+ if (unlikely(!p))
goto out_shortreply;
+ xid = *p++;
+ vers = *p++;
+ p++; /* credits */
+ proc = *p++;
- headerp = rdmab_to_msg(rep->rr_rdmabuf);
-#if defined(CONFIG_SUNRPC_BACKCHANNEL)
- if (rpcrdma_is_bcall(headerp))
- goto out_bcall;
-#endif
+ if (rpcrdma_is_bcall(r_xprt, rep, xid, proc))
+ return;
/* Match incoming rpcrdma_rep to an rpcrdma_req to
* get context for handling any incoming chunks.
*/
- spin_lock(&buf->rb_lock);
- req = rpcrdma_lookup_req_locked(&r_xprt->rx_buf,
- headerp->rm_xid);
- if (!req)
- goto out_nomatch;
- if (req->rl_reply)
- goto out_duplicate;
-
- list_replace_init(&req->rl_registered, &mws);
- rpcrdma_mark_remote_invalidation(&mws, rep);
-
- /* Avoid races with signals and duplicate replies
- * by marking this req as matched.
- */
+ spin_lock(&xprt->recv_lock);
+ rqst = xprt_lookup_rqst(xprt, xid);
+ if (!rqst)
+ goto out_norqst;
+ xprt_pin_rqst(rqst);
+ spin_unlock(&xprt->recv_lock);
+ req = rpcr_to_rdmar(rqst);
req->rl_reply = rep;
- spin_unlock(&buf->rb_lock);
dprintk("RPC: %s: reply %p completes request %p (xid 0x%08x)\n",
- __func__, rep, req, be32_to_cpu(headerp->rm_xid));
+ __func__, rep, req, be32_to_cpu(xid));
/* Invalidate and unmap the data payloads before waking the
* waiting application. This guarantees the memory regions
@@ -1044,99 +1273,42 @@ rpcrdma_reply_handler(struct work_struct *work)
* waking the next RPC waits until this RPC has relinquished
* all its Send Queue entries.
*/
- if (!list_empty(&mws))
- r_xprt->rx_ia.ri_ops->ro_unmap_sync(r_xprt, &mws);
+ if (!list_empty(&req->rl_registered)) {
+ rpcrdma_mark_remote_invalidation(&req->rl_registered, rep);
+ r_xprt->rx_ia.ri_ops->ro_unmap_sync(r_xprt,
+ &req->rl_registered);
+ }
- /* Perform XID lookup, reconstruction of the RPC reply, and
- * RPC completion while holding the transport lock to ensure
- * the rep, rqst, and rq_task pointers remain stable.
- */
- spin_lock_bh(&xprt->transport_lock);
- rqst = xprt_lookup_rqst(xprt, headerp->rm_xid);
- if (!rqst)
- goto out_norqst;
xprt->reestablish_timeout = 0;
- if (headerp->rm_vers != rpcrdma_version)
+ if (vers != rpcrdma_version)
goto out_badversion;
- /* check for expected message types */
- /* The order of some of these tests is important. */
- switch (headerp->rm_type) {
+ switch (proc) {
case rdma_msg:
- /* never expect read chunks */
- /* never expect reply chunks (two ways to check) */
- if (headerp->rm_body.rm_chunks[0] != xdr_zero ||
- (headerp->rm_body.rm_chunks[1] == xdr_zero &&
- headerp->rm_body.rm_chunks[2] != xdr_zero))
- goto badheader;
- if (headerp->rm_body.rm_chunks[1] != xdr_zero) {
- /* count any expected write chunks in read reply */
- /* start at write chunk array count */
- iptr = &headerp->rm_body.rm_chunks[2];
- rdmalen = rpcrdma_count_chunks(rep, 1, &iptr);
- /* check for validity, and no reply chunk after */
- if (rdmalen < 0 || *iptr++ != xdr_zero)
- goto badheader;
- rep->rr_len -=
- ((unsigned char *)iptr - (unsigned char *)headerp);
- status = rep->rr_len + rdmalen;
- r_xprt->rx_stats.total_rdma_reply += rdmalen;
- /* special case - last chunk may omit padding */
- if (rdmalen &= 3) {
- rdmalen = 4 - rdmalen;
- status += rdmalen;
- }
- } else {
- /* else ordinary inline */
- rdmalen = 0;
- iptr = (__be32 *)((unsigned char *)headerp +
- RPCRDMA_HDRLEN_MIN);
- rep->rr_len -= RPCRDMA_HDRLEN_MIN;
- status = rep->rr_len;
- }
-
- r_xprt->rx_stats.fixup_copy_count +=
- rpcrdma_inline_fixup(rqst, (char *)iptr, rep->rr_len,
- rdmalen);
+ status = rpcrdma_decode_msg(r_xprt, rep, rqst);
break;
-
case rdma_nomsg:
- /* never expect read or write chunks, always reply chunks */
- if (headerp->rm_body.rm_chunks[0] != xdr_zero ||
- headerp->rm_body.rm_chunks[1] != xdr_zero ||
- headerp->rm_body.rm_chunks[2] != xdr_one)
- goto badheader;
- iptr = (__be32 *)((unsigned char *)headerp +
- RPCRDMA_HDRLEN_MIN);
- rdmalen = rpcrdma_count_chunks(rep, 0, &iptr);
- if (rdmalen < 0)
- goto badheader;
- r_xprt->rx_stats.total_rdma_reply += rdmalen;
- /* Reply chunk buffer already is the reply vector - no fixup. */
- status = rdmalen;
+ status = rpcrdma_decode_nomsg(r_xprt, rep);
break;
-
case rdma_error:
- goto out_rdmaerr;
-
-badheader:
+ status = rpcrdma_decode_error(r_xprt, rep, rqst);
+ break;
default:
- dprintk("RPC: %5u %s: invalid rpcrdma reply (type %u)\n",
- rqst->rq_task->tk_pid, __func__,
- be32_to_cpu(headerp->rm_type));
status = -EIO;
- r_xprt->rx_stats.bad_reply_count++;
- break;
}
+ if (status < 0)
+ goto out_badheader;
out:
+ spin_lock(&xprt->recv_lock);
cwnd = xprt->cwnd;
xprt->cwnd = atomic_read(&r_xprt->rx_buf.rb_credits) << RPC_CWNDSHIFT;
if (xprt->cwnd > cwnd)
xprt_release_rqst_cong(rqst->rq_task);
xprt_complete_rqst(rqst->rq_task, status);
- spin_unlock_bh(&xprt->transport_lock);
+ xprt_unpin_rqst(rqst);
+ spin_unlock(&xprt->recv_lock);
dprintk("RPC: %s: xprt_complete_rqst(0x%p, 0x%p, %d)\n",
__func__, xprt, rqst, status);
return;
@@ -1149,72 +1321,38 @@ out_badstatus:
}
return;
-#if defined(CONFIG_SUNRPC_BACKCHANNEL)
-out_bcall:
- rpcrdma_bc_receive_call(r_xprt, rep);
- return;
-#endif
-
/* If the incoming reply terminated a pending RPC, the next
* RPC call will post a replacement receive buffer as it is
* being marshaled.
*/
out_badversion:
dprintk("RPC: %s: invalid version %d\n",
- __func__, be32_to_cpu(headerp->rm_vers));
+ __func__, be32_to_cpu(vers));
status = -EIO;
r_xprt->rx_stats.bad_reply_count++;
goto out;
-out_rdmaerr:
- rmerr = be32_to_cpu(headerp->rm_body.rm_error.rm_err);
- switch (rmerr) {
- case ERR_VERS:
- pr_err("%s: server reports header version error (%u-%u)\n",
- __func__,
- be32_to_cpu(headerp->rm_body.rm_error.rm_vers_low),
- be32_to_cpu(headerp->rm_body.rm_error.rm_vers_high));
- break;
- case ERR_CHUNK:
- pr_err("%s: server reports header decoding error\n",
- __func__);
- break;
- default:
- pr_err("%s: server reports unknown error %d\n",
- __func__, rmerr);
- }
- status = -EREMOTEIO;
+out_badheader:
+ dprintk("RPC: %5u %s: invalid rpcrdma reply (type %u)\n",
+ rqst->rq_task->tk_pid, __func__, be32_to_cpu(proc));
r_xprt->rx_stats.bad_reply_count++;
+ status = -EIO;
goto out;
-/* The req was still available, but by the time the transport_lock
+/* The req was still available, but by the time the recv_lock
* was acquired, the rqst and task had been released. Thus the RPC
* has already been terminated.
*/
out_norqst:
- spin_unlock_bh(&xprt->transport_lock);
- rpcrdma_buffer_put(req);
- dprintk("RPC: %s: race, no rqst left for req %p\n",
- __func__, req);
- return;
+ spin_unlock(&xprt->recv_lock);
+ dprintk("RPC: %s: no match for incoming xid 0x%08x\n",
+ __func__, be32_to_cpu(xid));
+ goto repost;
out_shortreply:
dprintk("RPC: %s: short/invalid reply\n", __func__);
goto repost;
-out_nomatch:
- spin_unlock(&buf->rb_lock);
- dprintk("RPC: %s: no match for incoming xid 0x%08x len %d\n",
- __func__, be32_to_cpu(headerp->rm_xid),
- rep->rr_len);
- goto repost;
-
-out_duplicate:
- spin_unlock(&buf->rb_lock);
- dprintk("RPC: %s: "
- "duplicate reply %p to RPC request %p: xid 0x%08x\n",
- __func__, rep, req, be32_to_cpu(headerp->rm_xid));
-
/* If no pending RPC transaction was matched, post a replacement
* receive buffer before returning.
*/
diff --git a/net/sunrpc/xprtrdma/svc_rdma_backchannel.c b/net/sunrpc/xprtrdma/svc_rdma_backchannel.c
index c676ed0efb5a..ec37ad83b068 100644
--- a/net/sunrpc/xprtrdma/svc_rdma_backchannel.c
+++ b/net/sunrpc/xprtrdma/svc_rdma_backchannel.c
@@ -52,7 +52,7 @@ int svc_rdma_handle_bc_reply(struct rpc_xprt *xprt, __be32 *rdma_resp,
if (src->iov_len < 24)
goto out_shortreply;
- spin_lock_bh(&xprt->transport_lock);
+ spin_lock(&xprt->recv_lock);
req = xprt_lookup_rqst(xprt, xid);
if (!req)
goto out_notfound;
@@ -69,17 +69,20 @@ int svc_rdma_handle_bc_reply(struct rpc_xprt *xprt, __be32 *rdma_resp,
else if (credits > r_xprt->rx_buf.rb_bc_max_requests)
credits = r_xprt->rx_buf.rb_bc_max_requests;
+ spin_lock_bh(&xprt->transport_lock);
cwnd = xprt->cwnd;
xprt->cwnd = credits << RPC_CWNDSHIFT;
if (xprt->cwnd > cwnd)
xprt_release_rqst_cong(req->rq_task);
+ spin_unlock_bh(&xprt->transport_lock);
+
ret = 0;
xprt_complete_rqst(req->rq_task, rcvbuf->len);
rcvbuf->len = 0;
out_unlock:
- spin_unlock_bh(&xprt->transport_lock);
+ spin_unlock(&xprt->recv_lock);
out:
return ret;
@@ -266,7 +269,7 @@ xprt_rdma_bc_put(struct rpc_xprt *xprt)
module_put(THIS_MODULE);
}
-static struct rpc_xprt_ops xprt_rdma_bc_procs = {
+static const struct rpc_xprt_ops xprt_rdma_bc_procs = {
.reserve_xprt = xprt_reserve_xprt_cong,
.release_xprt = xprt_release_xprt_cong,
.alloc_slot = xprt_alloc_slot,
diff --git a/net/sunrpc/xprtrdma/svc_rdma_rw.c b/net/sunrpc/xprtrdma/svc_rdma_rw.c
index 933f79bed270..7dcda4597057 100644
--- a/net/sunrpc/xprtrdma/svc_rdma_rw.c
+++ b/net/sunrpc/xprtrdma/svc_rdma_rw.c
@@ -660,19 +660,21 @@ out_initerr:
return -EIO;
}
+/* Walk the segments in the Read chunk starting at @p and construct
+ * RDMA Read operations to pull the chunk to the server.
+ */
static int svc_rdma_build_read_chunk(struct svc_rqst *rqstp,
struct svc_rdma_read_info *info,
__be32 *p)
{
int ret;
+ ret = -EINVAL;
info->ri_chunklen = 0;
- while (*p++ != xdr_zero) {
+ while (*p++ != xdr_zero && be32_to_cpup(p++) == info->ri_position) {
u32 rs_handle, rs_length;
u64 rs_offset;
- if (be32_to_cpup(p++) != info->ri_position)
- break;
rs_handle = be32_to_cpup(p++);
rs_length = be32_to_cpup(p++);
p = xdr_decode_hyper(p, &rs_offset);
@@ -689,78 +691,6 @@ static int svc_rdma_build_read_chunk(struct svc_rqst *rqstp,
return ret;
}
-/* If there is inline content following the Read chunk, append it to
- * the page list immediately following the data payload. This has to
- * be done after the reader function has determined how many pages
- * were consumed for RDMA Read.
- *
- * On entry, ri_pageno and ri_pageoff point directly to the end of the
- * page list. On exit, both have been updated to the new "next byte".
- *
- * Assumptions:
- * - Inline content fits entirely in rq_pages[0]
- * - Trailing content is only a handful of bytes
- */
-static int svc_rdma_copy_tail(struct svc_rqst *rqstp,
- struct svc_rdma_read_info *info)
-{
- struct svc_rdma_op_ctxt *head = info->ri_readctxt;
- unsigned int tail_length, remaining;
- u8 *srcp, *destp;
-
- /* Assert that all inline content fits in page 0. This is an
- * implementation limit, not a protocol limit.
- */
- if (head->arg.head[0].iov_len > PAGE_SIZE) {
- pr_warn_once("svcrdma: too much trailing inline content\n");
- return -EINVAL;
- }
-
- srcp = head->arg.head[0].iov_base;
- srcp += info->ri_position;
- tail_length = head->arg.head[0].iov_len - info->ri_position;
- remaining = tail_length;
-
- /* If there is room on the last page in the page list, try to
- * fit the trailing content there.
- */
- if (info->ri_pageoff > 0) {
- unsigned int len;
-
- len = min_t(unsigned int, remaining,
- PAGE_SIZE - info->ri_pageoff);
- destp = page_address(rqstp->rq_pages[info->ri_pageno]);
- destp += info->ri_pageoff;
-
- memcpy(destp, srcp, len);
- srcp += len;
- destp += len;
- info->ri_pageoff += len;
- remaining -= len;
-
- if (info->ri_pageoff == PAGE_SIZE) {
- info->ri_pageno++;
- info->ri_pageoff = 0;
- }
- }
-
- /* Otherwise, a fresh page is needed. */
- if (remaining) {
- head->arg.pages[info->ri_pageno] =
- rqstp->rq_pages[info->ri_pageno];
- head->count++;
-
- destp = page_address(rqstp->rq_pages[info->ri_pageno]);
- memcpy(destp, srcp, remaining);
- info->ri_pageoff += remaining;
- }
-
- head->arg.page_len += tail_length;
- head->arg.len += tail_length;
- head->arg.buflen += tail_length;
- return 0;
-}
-
/* Construct RDMA Reads to pull over a normal Read chunk. The chunk
* data lands in the page list of head->arg.pages.
*
@@ -785,34 +715,28 @@ static int svc_rdma_build_normal_read_chunk(struct svc_rqst *rqstp,
if (ret < 0)
goto out;
- /* Read chunk may need XDR round-up (see RFC 5666, s. 3.7).
+ /* Split the Receive buffer between the head and tail
+ * buffers at Read chunk's position. XDR roundup of the
+ * chunk is not included in either the pagelist or in
+ * the tail.
*/
- if (info->ri_chunklen & 3) {
- u32 padlen = 4 - (info->ri_chunklen & 3);
-
- info->ri_chunklen += padlen;
+ head->arg.tail[0].iov_base =
+ head->arg.head[0].iov_base + info->ri_position;
+ head->arg.tail[0].iov_len =
+ head->arg.head[0].iov_len - info->ri_position;
+ head->arg.head[0].iov_len = info->ri_position;
- /* NB: data payload always starts on XDR alignment,
- * thus the pad can never contain a page boundary.
- */
- info->ri_pageoff += padlen;
- if (info->ri_pageoff == PAGE_SIZE) {
- info->ri_pageno++;
- info->ri_pageoff = 0;
- }
- }
+ /* Read chunk may need XDR roundup (see RFC 5666, s. 3.7).
+ *
+ * NFSv2/3 write decoders need the length of the tail to
+ * contain the size of the roundup padding.
+ */
+ head->arg.tail[0].iov_len += 4 - (info->ri_chunklen & 3);
head->arg.page_len = info->ri_chunklen;
head->arg.len += info->ri_chunklen;
head->arg.buflen += info->ri_chunklen;
- if (info->ri_position < head->arg.head[0].iov_len) {
- ret = svc_rdma_copy_tail(rqstp, info);
- if (ret < 0)
- goto out;
- }
- head->arg.head[0].iov_len = info->ri_position;
-
out:
return ret;
}
diff --git a/net/sunrpc/xprtrdma/svc_rdma_transport.c b/net/sunrpc/xprtrdma/svc_rdma_transport.c
index e660d4965b18..5caf8e722a11 100644
--- a/net/sunrpc/xprtrdma/svc_rdma_transport.c
+++ b/net/sunrpc/xprtrdma/svc_rdma_transport.c
@@ -51,6 +51,7 @@
#include <linux/workqueue.h>
#include <rdma/ib_verbs.h>
#include <rdma/rdma_cm.h>
+#include <rdma/rw.h>
#include <linux/sunrpc/svc_rdma.h>
#include <linux/export.h>
#include "xprt_rdma.h"
@@ -70,7 +71,7 @@ static int svc_rdma_has_wspace(struct svc_xprt *xprt);
static int svc_rdma_secure_port(struct svc_rqst *);
static void svc_rdma_kill_temp_xprt(struct svc_xprt *);
-static struct svc_xprt_ops svc_rdma_ops = {
+static const struct svc_xprt_ops svc_rdma_ops = {
.xpo_create = svc_rdma_create,
.xpo_recvfrom = svc_rdma_recvfrom,
.xpo_sendto = svc_rdma_sendto,
@@ -98,7 +99,7 @@ static struct svc_xprt *svc_rdma_bc_create(struct svc_serv *, struct net *,
static void svc_rdma_bc_detach(struct svc_xprt *);
static void svc_rdma_bc_free(struct svc_xprt *);
-static struct svc_xprt_ops svc_rdma_bc_ops = {
+static const struct svc_xprt_ops svc_rdma_bc_ops = {
.xpo_create = svc_rdma_bc_create,
.xpo_detach = svc_rdma_bc_detach,
.xpo_free = svc_rdma_bc_free,
@@ -167,8 +168,8 @@ static bool svc_rdma_prealloc_ctxts(struct svcxprt_rdma *xprt)
{
unsigned int i;
- /* Each RPC/RDMA credit can consume a number of send
- * and receive WQEs. One ctxt is allocated for each.
+ /* Each RPC/RDMA credit can consume one Receive and
+ * one Send WQE at the same time.
*/
i = xprt->sc_sq_depth + xprt->sc_rq_depth;
@@ -713,7 +714,7 @@ static struct svc_xprt *svc_rdma_accept(struct svc_xprt *xprt)
struct ib_qp_init_attr qp_attr;
struct ib_device *dev;
struct sockaddr *sap;
- unsigned int i;
+ unsigned int i, ctxts;
int ret = 0;
listen_rdma = container_of(xprt, struct svcxprt_rdma, sc_xprt);
@@ -742,14 +743,26 @@ static struct svc_xprt *svc_rdma_accept(struct svc_xprt *xprt)
newxprt->sc_max_sge = min((size_t)dev->attrs.max_sge,
(size_t)RPCSVC_MAXPAGES);
newxprt->sc_max_req_size = svcrdma_max_req_size;
- newxprt->sc_max_requests = min_t(u32, dev->attrs.max_qp_wr,
- svcrdma_max_requests);
- newxprt->sc_fc_credits = cpu_to_be32(newxprt->sc_max_requests);
- newxprt->sc_max_bc_requests = min_t(u32, dev->attrs.max_qp_wr,
- svcrdma_max_bc_requests);
+ newxprt->sc_max_requests = svcrdma_max_requests;
+ newxprt->sc_max_bc_requests = svcrdma_max_bc_requests;
newxprt->sc_rq_depth = newxprt->sc_max_requests +
newxprt->sc_max_bc_requests;
- newxprt->sc_sq_depth = newxprt->sc_rq_depth;
+ if (newxprt->sc_rq_depth > dev->attrs.max_qp_wr) {
+ pr_warn("svcrdma: reducing receive depth to %d\n",
+ dev->attrs.max_qp_wr);
+ newxprt->sc_rq_depth = dev->attrs.max_qp_wr;
+ newxprt->sc_max_requests = newxprt->sc_rq_depth - 2;
+ newxprt->sc_max_bc_requests = 2;
+ }
+ newxprt->sc_fc_credits = cpu_to_be32(newxprt->sc_max_requests);
+ ctxts = rdma_rw_mr_factor(dev, newxprt->sc_port_num, RPCSVC_MAXPAGES);
+ ctxts *= newxprt->sc_max_requests;
+ newxprt->sc_sq_depth = newxprt->sc_rq_depth + ctxts;
+ if (newxprt->sc_sq_depth > dev->attrs.max_qp_wr) {
+ pr_warn("svcrdma: reducing send depth to %d\n",
+ dev->attrs.max_qp_wr);
+ newxprt->sc_sq_depth = dev->attrs.max_qp_wr;
+ }
atomic_set(&newxprt->sc_sq_avail, newxprt->sc_sq_depth);
if (!svc_rdma_prealloc_ctxts(newxprt))
@@ -784,8 +797,8 @@ static struct svc_xprt *svc_rdma_accept(struct svc_xprt *xprt)
qp_attr.event_handler = qp_event_handler;
qp_attr.qp_context = &newxprt->sc_xprt;
qp_attr.port_num = newxprt->sc_port_num;
- qp_attr.cap.max_rdma_ctxs = newxprt->sc_max_requests;
- qp_attr.cap.max_send_wr = newxprt->sc_sq_depth;
+ qp_attr.cap.max_rdma_ctxs = ctxts;
+ qp_attr.cap.max_send_wr = newxprt->sc_sq_depth - ctxts;
qp_attr.cap.max_recv_wr = newxprt->sc_rq_depth;
qp_attr.cap.max_send_sge = newxprt->sc_max_sge;
qp_attr.cap.max_recv_sge = newxprt->sc_max_sge;
@@ -853,6 +866,7 @@ static struct svc_xprt *svc_rdma_accept(struct svc_xprt *xprt)
dprintk(" remote address : %pIS:%u\n", sap, rpc_get_port(sap));
dprintk(" max_sge : %d\n", newxprt->sc_max_sge);
dprintk(" sq_depth : %d\n", newxprt->sc_sq_depth);
+ dprintk(" rdma_rw_ctxs : %d\n", ctxts);
dprintk(" max_requests : %d\n", newxprt->sc_max_requests);
dprintk(" ord : %d\n", newxprt->sc_ord);
diff --git a/net/sunrpc/xprtrdma/transport.c b/net/sunrpc/xprtrdma/transport.c
index d1c458e5ec4d..c84e2b644e13 100644
--- a/net/sunrpc/xprtrdma/transport.c
+++ b/net/sunrpc/xprtrdma/transport.c
@@ -149,7 +149,7 @@ static struct ctl_table sunrpc_table[] = {
#endif
-static struct rpc_xprt_ops xprt_rdma_procs; /*forward reference */
+static const struct rpc_xprt_ops xprt_rdma_procs;
static void
xprt_rdma_format_addresses4(struct rpc_xprt *xprt, struct sockaddr *sap)
@@ -559,6 +559,7 @@ rpcrdma_get_rdmabuf(struct rpcrdma_xprt *r_xprt, struct rpcrdma_req *req,
r_xprt->rx_stats.hardway_register_count += size;
req->rl_rdmabuf = rb;
+ xdr_buf_init(&req->rl_hdrbuf, rb->rg_base, rdmab_length(rb));
return true;
}
@@ -684,7 +685,6 @@ xprt_rdma_free(struct rpc_task *task)
dprintk("RPC: %s: called on 0x%p\n", __func__, req->rl_reply);
- rpcrdma_remove_req(&r_xprt->rx_buf, req);
if (!list_empty(&req->rl_registered))
ia->ri_ops->ro_unmap_safe(r_xprt, req, !RPC_IS_ASYNC(task));
rpcrdma_unmap_sges(ia, req);
@@ -730,7 +730,7 @@ xprt_rdma_send_request(struct rpc_task *task)
if (unlikely(!list_empty(&req->rl_registered)))
r_xprt->rx_ia.ri_ops->ro_unmap_safe(r_xprt, req, false);
- rc = rpcrdma_marshal_req(rqst);
+ rc = rpcrdma_marshal_req(r_xprt, rqst);
if (rc < 0)
goto failed_marshal;
@@ -811,7 +811,7 @@ xprt_rdma_disable_swap(struct rpc_xprt *xprt)
* Plumbing for rpc transport switch and kernel module
*/
-static struct rpc_xprt_ops xprt_rdma_procs = {
+static const struct rpc_xprt_ops xprt_rdma_procs = {
.reserve_xprt = xprt_reserve_xprt_cong,
.release_xprt = xprt_release_xprt_cong, /* sunrpc/xprt.c */
.alloc_slot = xprt_alloc_slot,
diff --git a/net/sunrpc/xprtrdma/verbs.c b/net/sunrpc/xprtrdma/verbs.c
index e4171f2abe37..11a1fbf7e59e 100644
--- a/net/sunrpc/xprtrdma/verbs.c
+++ b/net/sunrpc/xprtrdma/verbs.c
@@ -139,14 +139,11 @@ rpcrdma_wc_send(struct ib_cq *cq, struct ib_wc *wc)
static void
rpcrdma_update_granted_credits(struct rpcrdma_rep *rep)
{
- struct rpcrdma_msg *rmsgp = rdmab_to_msg(rep->rr_rdmabuf);
struct rpcrdma_buffer *buffer = &rep->rr_rxprt->rx_buf;
+ __be32 *p = rep->rr_rdmabuf->rg_base;
u32 credits;
- if (rep->rr_len < RPCRDMA_HDRLEN_ERR)
- return;
-
- credits = be32_to_cpu(rmsgp->rm_credit);
+ credits = be32_to_cpup(p + 2);
if (credits == 0)
credits = 1; /* don't deadlock */
else if (credits > buffer->rb_max_requests)
@@ -173,21 +170,19 @@ rpcrdma_wc_receive(struct ib_cq *cq, struct ib_wc *wc)
goto out_fail;
/* status == SUCCESS means all fields in wc are trustworthy */
- if (wc->opcode != IB_WC_RECV)
- return;
-
dprintk("RPC: %s: rep %p opcode 'recv', length %u: success\n",
__func__, rep, wc->byte_len);
- rep->rr_len = wc->byte_len;
+ rpcrdma_set_xdrlen(&rep->rr_hdrbuf, wc->byte_len);
rep->rr_wc_flags = wc->wc_flags;
rep->rr_inv_rkey = wc->ex.invalidate_rkey;
ib_dma_sync_single_for_cpu(rdmab_device(rep->rr_rdmabuf),
rdmab_addr(rep->rr_rdmabuf),
- rep->rr_len, DMA_FROM_DEVICE);
+ wc->byte_len, DMA_FROM_DEVICE);
- rpcrdma_update_granted_credits(rep);
+ if (wc->byte_len >= RPCRDMA_HDRLEN_ERR)
+ rpcrdma_update_granted_credits(rep);
out_schedule:
queue_work(rpcrdma_receive_wq, &rep->rr_work);
@@ -198,7 +193,7 @@ out_fail:
pr_err("rpcrdma: Recv: %s (%u/0x%x)\n",
ib_wc_status_msg(wc->status),
wc->status, wc->vendor_err);
- rep->rr_len = RPCRDMA_BAD_LEN;
+ rpcrdma_set_xdrlen(&rep->rr_hdrbuf, 0);
goto out_schedule;
}
@@ -974,6 +969,8 @@ rpcrdma_create_rep(struct rpcrdma_xprt *r_xprt)
rc = PTR_ERR(rep->rr_rdmabuf);
goto out_free;
}
+ xdr_buf_init(&rep->rr_hdrbuf, rep->rr_rdmabuf->rg_base,
+ rdmab_length(rep->rr_rdmabuf));
rep->rr_cqe.done = rpcrdma_wc_receive;
rep->rr_rxprt = r_xprt;
@@ -1004,7 +1001,6 @@ rpcrdma_buffer_create(struct rpcrdma_xprt *r_xprt)
spin_lock_init(&buf->rb_recovery_lock);
INIT_LIST_HEAD(&buf->rb_mws);
INIT_LIST_HEAD(&buf->rb_all);
- INIT_LIST_HEAD(&buf->rb_pending);
INIT_LIST_HEAD(&buf->rb_stale_mrs);
INIT_DELAYED_WORK(&buf->rb_refresh_worker,
rpcrdma_mr_refresh_worker);
diff --git a/net/sunrpc/xprtrdma/xprt_rdma.h b/net/sunrpc/xprtrdma/xprt_rdma.h
index b282d3f8cdd8..e26a97d2f922 100644
--- a/net/sunrpc/xprtrdma/xprt_rdma.h
+++ b/net/sunrpc/xprtrdma/xprt_rdma.h
@@ -218,18 +218,17 @@ enum {
struct rpcrdma_rep {
struct ib_cqe rr_cqe;
- unsigned int rr_len;
int rr_wc_flags;
u32 rr_inv_rkey;
+ struct rpcrdma_regbuf *rr_rdmabuf;
struct rpcrdma_xprt *rr_rxprt;
struct work_struct rr_work;
+ struct xdr_buf rr_hdrbuf;
+ struct xdr_stream rr_stream;
struct list_head rr_list;
struct ib_recv_wr rr_recv_wr;
- struct rpcrdma_regbuf *rr_rdmabuf;
};
-#define RPCRDMA_BAD_LEN (~0U)
-
/*
* struct rpcrdma_mw - external memory region metadata
*
@@ -341,11 +340,12 @@ enum {
struct rpcrdma_buffer;
struct rpcrdma_req {
struct list_head rl_list;
- __be32 rl_xid;
unsigned int rl_mapped_sges;
unsigned int rl_connect_cookie;
struct rpcrdma_buffer *rl_buffer;
struct rpcrdma_rep *rl_reply;
+ struct xdr_stream rl_stream;
+ struct xdr_buf rl_hdrbuf;
struct ib_send_wr rl_send_wr;
struct ib_sge rl_send_sge[RPCRDMA_MAX_SEND_SGES];
struct rpcrdma_regbuf *rl_rdmabuf; /* xprt header */
@@ -403,7 +403,6 @@ struct rpcrdma_buffer {
int rb_send_count, rb_recv_count;
struct list_head rb_send_bufs;
struct list_head rb_recv_bufs;
- struct list_head rb_pending;
u32 rb_max_requests;
atomic_t rb_credits; /* most recent credit grant */
@@ -440,24 +439,27 @@ struct rpcrdma_create_data_internal {
* Statistics for RPCRDMA
*/
struct rpcrdma_stats {
+ /* accessed when sending a call */
unsigned long read_chunk_count;
unsigned long write_chunk_count;
unsigned long reply_chunk_count;
-
unsigned long long total_rdma_request;
- unsigned long long total_rdma_reply;
+ /* rarely accessed error counters */
unsigned long long pullup_copy_count;
- unsigned long long fixup_copy_count;
unsigned long hardway_register_count;
unsigned long failed_marshal_count;
unsigned long bad_reply_count;
- unsigned long nomsg_call_count;
- unsigned long bcall_count;
unsigned long mrs_recovered;
unsigned long mrs_orphaned;
unsigned long mrs_allocated;
+
+ /* accessed when receiving a reply */
+ unsigned long long total_rdma_reply;
+ unsigned long long fixup_copy_count;
unsigned long local_inv_needed;
+ unsigned long nomsg_call_count;
+ unsigned long bcall_count;
};
/*
@@ -465,7 +467,8 @@ struct rpcrdma_stats {
*/
struct rpcrdma_xprt;
struct rpcrdma_memreg_ops {
- int (*ro_map)(struct rpcrdma_xprt *,
+ struct rpcrdma_mr_seg *
+ (*ro_map)(struct rpcrdma_xprt *,
struct rpcrdma_mr_seg *, int, bool,
struct rpcrdma_mw **);
void (*ro_unmap_sync)(struct rpcrdma_xprt *,
@@ -552,34 +555,6 @@ void rpcrdma_destroy_req(struct rpcrdma_req *);
int rpcrdma_buffer_create(struct rpcrdma_xprt *);
void rpcrdma_buffer_destroy(struct rpcrdma_buffer *);
-static inline void
-rpcrdma_insert_req(struct rpcrdma_buffer *buffers, struct rpcrdma_req *req)
-{
- spin_lock(&buffers->rb_lock);
- if (list_empty(&req->rl_list))
- list_add_tail(&req->rl_list, &buffers->rb_pending);
- spin_unlock(&buffers->rb_lock);
-}
-
-static inline struct rpcrdma_req *
-rpcrdma_lookup_req_locked(struct rpcrdma_buffer *buffers, __be32 xid)
-{
- struct rpcrdma_req *pos;
-
- list_for_each_entry(pos, &buffers->rb_pending, rl_list)
- if (pos->rl_xid == xid)
- return pos;
- return NULL;
-}
-
-static inline void
-rpcrdma_remove_req(struct rpcrdma_buffer *buffers, struct rpcrdma_req *req)
-{
- spin_lock(&buffers->rb_lock);
- list_del(&req->rl_list);
- spin_unlock(&buffers->rb_lock);
-}
-
struct rpcrdma_mw *rpcrdma_get_mw(struct rpcrdma_xprt *);
void rpcrdma_put_mw(struct rpcrdma_xprt *, struct rpcrdma_mw *);
struct rpcrdma_req *rpcrdma_buffer_get(struct rpcrdma_buffer *);
@@ -638,10 +613,16 @@ enum rpcrdma_chunktype {
bool rpcrdma_prepare_send_sges(struct rpcrdma_ia *, struct rpcrdma_req *,
u32, struct xdr_buf *, enum rpcrdma_chunktype);
void rpcrdma_unmap_sges(struct rpcrdma_ia *, struct rpcrdma_req *);
-int rpcrdma_marshal_req(struct rpc_rqst *);
+int rpcrdma_marshal_req(struct rpcrdma_xprt *r_xprt, struct rpc_rqst *rqst);
void rpcrdma_set_max_header_sizes(struct rpcrdma_xprt *);
void rpcrdma_reply_handler(struct work_struct *work);
+static inline void rpcrdma_set_xdrlen(struct xdr_buf *xdr, size_t len)
+{
+ xdr->head[0].iov_len = len;
+ xdr->len = len;
+}
+
/* RPC/RDMA module init - xprtrdma/transport.c
*/
extern unsigned int xprt_rdma_max_inline_read;
diff --git a/net/sunrpc/xprtsock.c b/net/sunrpc/xprtsock.c
index 4f154d388748..9b5de31aa429 100644
--- a/net/sunrpc/xprtsock.c
+++ b/net/sunrpc/xprtsock.c
@@ -969,10 +969,12 @@ static void xs_local_data_read_skb(struct rpc_xprt *xprt,
return;
/* Look up and lock the request corresponding to the given XID */
- spin_lock_bh(&xprt->transport_lock);
+ spin_lock(&xprt->recv_lock);
rovr = xprt_lookup_rqst(xprt, *xp);
if (!rovr)
goto out_unlock;
+ xprt_pin_rqst(rovr);
+ spin_unlock(&xprt->recv_lock);
task = rovr->rq_task;
copied = rovr->rq_private_buf.buflen;
@@ -981,13 +983,16 @@ static void xs_local_data_read_skb(struct rpc_xprt *xprt,
if (xs_local_copy_to_xdr(&rovr->rq_private_buf, skb)) {
dprintk("RPC: sk_buff copy failed\n");
- goto out_unlock;
+ spin_lock(&xprt->recv_lock);
+ goto out_unpin;
}
+ spin_lock(&xprt->recv_lock);
xprt_complete_rqst(task, copied);
-
+out_unpin:
+ xprt_unpin_rqst(rovr);
out_unlock:
- spin_unlock_bh(&xprt->transport_lock);
+ spin_unlock(&xprt->recv_lock);
}
static void xs_local_data_receive(struct sock_xprt *transport)
@@ -1050,10 +1055,12 @@ static void xs_udp_data_read_skb(struct rpc_xprt *xprt,
return;
/* Look up and lock the request corresponding to the given XID */
- spin_lock_bh(&xprt->transport_lock);
+ spin_lock(&xprt->recv_lock);
rovr = xprt_lookup_rqst(xprt, *xp);
if (!rovr)
goto out_unlock;
+ xprt_pin_rqst(rovr);
+ spin_unlock(&xprt->recv_lock);
task = rovr->rq_task;
if ((copied = rovr->rq_private_buf.buflen) > repsize)
@@ -1062,16 +1069,21 @@ static void xs_udp_data_read_skb(struct rpc_xprt *xprt,
/* Suck it into the iovec, verify checksum if not done by hw. */
if (csum_partial_copy_to_xdr(&rovr->rq_private_buf, skb)) {
__UDPX_INC_STATS(sk, UDP_MIB_INERRORS);
- goto out_unlock;
+ spin_lock(&xprt->recv_lock);
+ goto out_unpin;
}
__UDPX_INC_STATS(sk, UDP_MIB_INDATAGRAMS);
+ spin_lock_bh(&xprt->transport_lock);
xprt_adjust_cwnd(xprt, task, copied);
+ spin_unlock_bh(&xprt->transport_lock);
+ spin_lock(&xprt->recv_lock);
xprt_complete_rqst(task, copied);
-
+out_unpin:
+ xprt_unpin_rqst(rovr);
out_unlock:
- spin_unlock_bh(&xprt->transport_lock);
+ spin_unlock(&xprt->recv_lock);
}
static void xs_udp_data_receive(struct sock_xprt *transport)
@@ -1277,25 +1289,12 @@ static inline void xs_tcp_read_common(struct rpc_xprt *xprt,
}
len = desc->count;
- if (len > transport->tcp_reclen - transport->tcp_offset) {
- struct xdr_skb_reader my_desc;
-
- len = transport->tcp_reclen - transport->tcp_offset;
- memcpy(&my_desc, desc, sizeof(my_desc));
- my_desc.count = len;
- r = xdr_partial_copy_from_skb(rcvbuf, transport->tcp_copied,
- &my_desc, xdr_skb_read_bits);
- desc->count -= r;
- desc->offset += r;
- } else
- r = xdr_partial_copy_from_skb(rcvbuf, transport->tcp_copied,
+ if (len > transport->tcp_reclen - transport->tcp_offset)
+ desc->count = transport->tcp_reclen - transport->tcp_offset;
+ r = xdr_partial_copy_from_skb(rcvbuf, transport->tcp_copied,
desc, xdr_skb_read_bits);
- if (r > 0) {
- transport->tcp_copied += r;
- transport->tcp_offset += r;
- }
- if (r != len) {
+ if (desc->count) {
/* Error when copying to the receive buffer,
* usually because we weren't able to allocate
* additional buffer pages. All we can do now
@@ -1315,6 +1314,10 @@ static inline void xs_tcp_read_common(struct rpc_xprt *xprt,
return;
}
+ transport->tcp_copied += r;
+ transport->tcp_offset += r;
+ desc->count = len - r;
+
dprintk("RPC: XID %08x read %zd bytes\n",
ntohl(transport->tcp_xid), r);
dprintk("RPC: xprt = %p, tcp_copied = %lu, tcp_offset = %u, "
@@ -1343,21 +1346,24 @@ static inline int xs_tcp_read_reply(struct rpc_xprt *xprt,
dprintk("RPC: read reply XID %08x\n", ntohl(transport->tcp_xid));
/* Find and lock the request corresponding to this xid */
- spin_lock_bh(&xprt->transport_lock);
+ spin_lock(&xprt->recv_lock);
req = xprt_lookup_rqst(xprt, transport->tcp_xid);
if (!req) {
dprintk("RPC: XID %08x request not found!\n",
ntohl(transport->tcp_xid));
- spin_unlock_bh(&xprt->transport_lock);
+ spin_unlock(&xprt->recv_lock);
return -1;
}
+ xprt_pin_rqst(req);
+ spin_unlock(&xprt->recv_lock);
xs_tcp_read_common(xprt, desc, req);
+ spin_lock(&xprt->recv_lock);
if (!(transport->tcp_flags & TCP_RCV_COPY_DATA))
xprt_complete_rqst(req->rq_task, transport->tcp_copied);
-
- spin_unlock_bh(&xprt->transport_lock);
+ xprt_unpin_rqst(req);
+ spin_unlock(&xprt->recv_lock);
return 0;
}
@@ -1376,11 +1382,9 @@ static int xs_tcp_read_callback(struct rpc_xprt *xprt,
container_of(xprt, struct sock_xprt, xprt);
struct rpc_rqst *req;
- /* Look up and lock the request corresponding to the given XID */
- spin_lock_bh(&xprt->transport_lock);
+ /* Look up the request corresponding to the given XID */
req = xprt_lookup_bc_request(xprt, transport->tcp_xid);
if (req == NULL) {
- spin_unlock_bh(&xprt->transport_lock);
printk(KERN_WARNING "Callback slot table overflowed\n");
xprt_force_disconnect(xprt);
return -1;
@@ -1391,7 +1395,6 @@ static int xs_tcp_read_callback(struct rpc_xprt *xprt,
if (!(transport->tcp_flags & TCP_RCV_COPY_DATA))
xprt_complete_bc_request(req, transport->tcp_copied);
- spin_unlock_bh(&xprt->transport_lock);
return 0;
}
@@ -1516,6 +1519,7 @@ static void xs_tcp_data_receive(struct sock_xprt *transport)
.arg.data = xprt,
};
unsigned long total = 0;
+ int loop;
int read = 0;
mutex_lock(&transport->recv_mutex);
@@ -1524,20 +1528,20 @@ static void xs_tcp_data_receive(struct sock_xprt *transport)
goto out;
/* We use rd_desc to pass struct xprt to xs_tcp_data_recv */
- for (;;) {
+ for (loop = 0; loop < 64; loop++) {
lock_sock(sk);
read = tcp_read_sock(sk, &rd_desc, xs_tcp_data_recv);
if (read <= 0) {
clear_bit(XPRT_SOCK_DATA_READY, &transport->sock_state);
release_sock(sk);
- if (!test_bit(XPRT_SOCK_DATA_READY, &transport->sock_state))
- break;
- } else {
- release_sock(sk);
- total += read;
+ break;
}
+ release_sock(sk);
+ total += read;
rd_desc.count = 65536;
}
+ if (test_bit(XPRT_SOCK_DATA_READY, &transport->sock_state))
+ queue_work(xprtiod_workqueue, &transport->recv_worker);
out:
mutex_unlock(&transport->recv_mutex);
trace_xs_tcp_data_ready(xprt, read, total);
@@ -2724,7 +2728,7 @@ static void bc_destroy(struct rpc_xprt *xprt)
module_put(THIS_MODULE);
}
-static struct rpc_xprt_ops xs_local_ops = {
+static const struct rpc_xprt_ops xs_local_ops = {
.reserve_xprt = xprt_reserve_xprt,
.release_xprt = xs_tcp_release_xprt,
.alloc_slot = xprt_alloc_slot,
@@ -2742,7 +2746,7 @@ static struct rpc_xprt_ops xs_local_ops = {
.disable_swap = xs_disable_swap,
};
-static struct rpc_xprt_ops xs_udp_ops = {
+static const struct rpc_xprt_ops xs_udp_ops = {
.set_buffer_size = xs_udp_set_buffer_size,
.reserve_xprt = xprt_reserve_xprt_cong,
.release_xprt = xprt_release_xprt_cong,
@@ -2764,7 +2768,7 @@ static struct rpc_xprt_ops xs_udp_ops = {
.inject_disconnect = xs_inject_disconnect,
};
-static struct rpc_xprt_ops xs_tcp_ops = {
+static const struct rpc_xprt_ops xs_tcp_ops = {
.reserve_xprt = xprt_reserve_xprt,
.release_xprt = xs_tcp_release_xprt,
.alloc_slot = xprt_lock_and_alloc_slot,
@@ -2795,7 +2799,7 @@ static struct rpc_xprt_ops xs_tcp_ops = {
* The rpc_xprt_ops for the server backchannel
*/
-static struct rpc_xprt_ops bc_tcp_ops = {
+static const struct rpc_xprt_ops bc_tcp_ops = {
.reserve_xprt = xprt_reserve_xprt,
.release_xprt = xprt_release_xprt,
.alloc_slot = xprt_alloc_slot,
diff --git a/net/tls/tls_sw.c b/net/tls/tls_sw.c
index fa596fa71ba7..7d80040a37b6 100644
--- a/net/tls/tls_sw.c
+++ b/net/tls/tls_sw.c
@@ -639,7 +639,7 @@ sendpage_end:
return ret;
}
-void tls_sw_free_resources(struct sock *sk)
+static void tls_sw_free_resources(struct sock *sk)
{
struct tls_context *tls_ctx = tls_get_ctx(sk);
struct tls_sw_context *ctx = tls_sw_ctx(tls_ctx);