From df71837d5024e2524cd51c93621e558aa7dd9f3f Mon Sep 17 00:00:00 2001 From: Trent Jaeger Date: Tue, 13 Dec 2005 23:12:27 -0800 Subject: [LSM-IPSec]: Security association restriction. This patch series implements per packet access control via the extension of the Linux Security Modules (LSM) interface by hooks in the XFRM and pfkey subsystems that leverage IPSec security associations to label packets. Extensions to the SELinux LSM are included that leverage the patch for this purpose. This patch implements the changes necessary to the XFRM subsystem, pfkey interface, ipv4/ipv6, and xfrm_user interface to restrict a socket to use only authorized security associations (or no security association) to send/receive network packets. Patch purpose: The patch is designed to enable access control per packets based on the strongly authenticated IPSec security association. Such access controls augment the existing ones based on network interface and IP address. The former are very coarse-grained, and the latter can be spoofed. By using IPSec, the system can control access to remote hosts based on cryptographic keys generated using the IPSec mechanism. This enables access control on a per-machine basis or per-application if the remote machine is running the same mechanism and trusted to enforce the access control policy. Patch design approach: The overall approach is that policy (xfrm_policy) entries set by user-level programs (e.g., setkey for ipsec-tools) are extended with a security context that is used at policy selection time in the XFRM subsystem to restrict the sockets that can send/receive packets via security associations (xfrm_states) that are built from those policies. A presentation available at www.selinux-symposium.org/2005/presentations/session2/2-3-jaeger.pdf from the SELinux symposium describes the overall approach. Patch implementation details: On output, the policy retrieved (via xfrm_policy_lookup or xfrm_sk_policy_lookup) must be authorized for the security context of the socket and the same security context is required for resultant security association (retrieved or negotiated via racoon in ipsec-tools). This is enforced in xfrm_state_find. On input, the policy retrieved must also be authorized for the socket (at __xfrm_policy_check), and the security context of the policy must also match the security association being used. The patch has virtually no impact on packets that do not use IPSec. The existing Netfilter (outgoing) and LSM rcv_skb hooks are used as before. Also, if IPSec is used without security contexts, the impact is minimal. The LSM must allow such policies to be selected for the combination of socket and remote machine, but subsequent IPSec processing proceeds as in the original case. Testing: The pfkey interface is tested using the ipsec-tools. ipsec-tools have been modified (a separate ipsec-tools patch is available for version 0.5) that supports assignment of xfrm_policy entries and security associations with security contexts via setkey and the negotiation using the security contexts via racoon. The xfrm_user interface is tested via ad hoc programs that set security contexts. These programs are also available from me, and contain programs for setting, getting, and deleting policy for testing this interface. Testing of sa functions was done by tracing kernel behavior. Signed-off-by: Trent Jaeger Signed-off-by: Herbert Xu Signed-off-by: David S. Miller --- include/net/flow.h | 7 ++++--- include/net/xfrm.h | 27 +++++++++++++++++++++++++-- 2 files changed, 29 insertions(+), 5 deletions(-) (limited to 'include/net') diff --git a/include/net/flow.h b/include/net/flow.h index 9a5c94b1a0ec..ec7eb86eb203 100644 --- a/include/net/flow.h +++ b/include/net/flow.h @@ -84,11 +84,12 @@ struct flowi { #define FLOW_DIR_OUT 1 #define FLOW_DIR_FWD 2 -typedef void (*flow_resolve_t)(struct flowi *key, u16 family, u8 dir, +struct sock; +typedef void (*flow_resolve_t)(struct flowi *key, u32 sk_sid, u16 family, u8 dir, void **objp, atomic_t **obj_refp); -extern void *flow_cache_lookup(struct flowi *key, u16 family, u8 dir, - flow_resolve_t resolver); +extern void *flow_cache_lookup(struct flowi *key, u32 sk_sid, u16 family, u8 dir, + flow_resolve_t resolver); extern void flow_cache_flush(void); extern atomic_t flow_cache_genid; diff --git a/include/net/xfrm.h b/include/net/xfrm.h index 1cdb87912137..487abca3ca6f 100644 --- a/include/net/xfrm.h +++ b/include/net/xfrm.h @@ -144,6 +144,9 @@ struct xfrm_state * transformer. */ struct xfrm_type *type; + /* Security context */ + struct xfrm_sec_ctx *security; + /* Private data of this transformer, format is opaque, * interpreted by xfrm_type methods. */ void *data; @@ -298,6 +301,7 @@ struct xfrm_policy __u8 flags; __u8 dead; __u8 xfrm_nr; + struct xfrm_sec_ctx *security; struct xfrm_tmpl xfrm_vec[XFRM_MAX_DEPTH]; }; @@ -510,6 +514,25 @@ xfrm_selector_match(struct xfrm_selector *sel, struct flowi *fl, return 0; } +#ifdef CONFIG_SECURITY_NETWORK_XFRM +/* If neither has a context --> match + * Otherwise, both must have a context and the sids, doi, alg must match + */ +static inline int xfrm_sec_ctx_match(struct xfrm_sec_ctx *s1, struct xfrm_sec_ctx *s2) +{ + return ((!s1 && !s2) || + (s1 && s2 && + (s1->ctx_sid == s2->ctx_sid) && + (s1->ctx_doi == s2->ctx_doi) && + (s1->ctx_alg == s2->ctx_alg))); +} +#else +static inline int xfrm_sec_ctx_match(struct xfrm_sec_ctx *s1, struct xfrm_sec_ctx *s2) +{ + return 1; +} +#endif + /* A struct encoding bundle of transformations to apply to some set of flow. * * dst->child points to the next element of bundle. @@ -878,8 +901,8 @@ static inline int xfrm_dst_lookup(struct xfrm_dst **dst, struct flowi *fl, unsig struct xfrm_policy *xfrm_policy_alloc(gfp_t gfp); extern int xfrm_policy_walk(int (*func)(struct xfrm_policy *, int, int, void*), void *); int xfrm_policy_insert(int dir, struct xfrm_policy *policy, int excl); -struct xfrm_policy *xfrm_policy_bysel(int dir, struct xfrm_selector *sel, - int delete); +struct xfrm_policy *xfrm_policy_bysel_ctx(int dir, struct xfrm_selector *sel, + struct xfrm_sec_ctx *ctx, int delete); struct xfrm_policy *xfrm_policy_byid(int dir, u32 id, int delete); void xfrm_policy_flush(void); u32 xfrm_get_acqseq(void); -- cgit v1.2.3 From 89cee8b1cbb9dac40c92ef1968aea2b45f82fd18 Mon Sep 17 00:00:00 2001 From: Herbert Xu Date: Tue, 13 Dec 2005 23:14:27 -0800 Subject: [IPV4]: Safer reassembly Another spin of Herbert Xu's "safer ip reassembly" patch for 2.6.16. (The original patch is here: http://marc.theaimsgroup.com/?l=linux-netdev&m=112281936522415&w=2 and my only contribution is to have tested it.) This patch (optionally) does additional checks before accepting IP fragments, which can greatly reduce the possibility of reassembling fragments which originated from different IP datagrams. Signed-off-by: Herbert Xu Signed-off-by: Arthur Kepner Signed-off-by: David S. Miller --- Documentation/networking/ip-sysctl.txt | 23 ++++++++++++ include/linux/sysctl.h | 1 + include/net/inetpeer.h | 1 + include/net/ip.h | 2 + net/ipv4/inetpeer.c | 1 + net/ipv4/ip_fragment.c | 68 +++++++++++++++++++++++++++++++++- net/ipv4/ip_output.c | 1 + net/ipv4/sysctl_net_ipv4.c | 10 +++++ 8 files changed, 106 insertions(+), 1 deletion(-) (limited to 'include/net') diff --git a/Documentation/networking/ip-sysctl.txt b/Documentation/networking/ip-sysctl.txt index ebc09a159f62..2b7cf19a06ad 100644 --- a/Documentation/networking/ip-sysctl.txt +++ b/Documentation/networking/ip-sysctl.txt @@ -46,6 +46,29 @@ ipfrag_secret_interval - INTEGER for the hash secret) for IP fragments. Default: 600 +ipfrag_max_dist - INTEGER + ipfrag_max_dist is a non-negative integer value which defines the + maximum "disorder" which is allowed among fragments which share a + common IP source address. Note that reordering of packets is + not unusual, but if a large number of fragments arrive from a source + IP address while a particular fragment queue remains incomplete, it + probably indicates that one or more fragments belonging to that queue + have been lost. When ipfrag_max_dist is positive, an additional check + is done on fragments before they are added to a reassembly queue - if + ipfrag_max_dist (or more) fragments have arrived from a particular IP + address between additions to any IP fragment queue using that source + address, it's presumed that one or more fragments in the queue are + lost. The existing fragment queue will be dropped, and a new one + started. An ipfrag_max_dist value of zero disables this check. + + Using a very small value, e.g. 1 or 2, for ipfrag_max_dist can + result in unnecessarily dropping fragment queues when normal + reordering of packets occurs, which could lead to poor application + performance. Using a very large value, e.g. 50000, increases the + likelihood of incorrectly reassembling IP fragments that originate + from different IP datagrams, which could result in data corruption. + Default: 64 + INET peer storage: inet_peer_threshold - INTEGER diff --git a/include/linux/sysctl.h b/include/linux/sysctl.h index 4be34ef8c2f7..93fa765e47d3 100644 --- a/include/linux/sysctl.h +++ b/include/linux/sysctl.h @@ -390,6 +390,7 @@ enum NET_IPV4_ICMP_ERRORS_USE_INBOUND_IFADDR=109, NET_TCP_CONG_CONTROL=110, NET_TCP_ABC=111, + NET_IPV4_IPFRAG_MAX_DIST=112, }; enum { diff --git a/include/net/inetpeer.h b/include/net/inetpeer.h index 7fda471002b6..0965515f40cf 100644 --- a/include/net/inetpeer.h +++ b/include/net/inetpeer.h @@ -25,6 +25,7 @@ struct inet_peer __u32 v4daddr; /* peer's address */ __u16 avl_height; __u16 ip_id_count; /* IP ID for the next packet */ + atomic_t rid; /* Frag reception counter */ __u32 tcp_ts; unsigned long tcp_ts_stamp; }; diff --git a/include/net/ip.h b/include/net/ip.h index e4563bbee6ea..4d6294ba038e 100644 --- a/include/net/ip.h +++ b/include/net/ip.h @@ -45,6 +45,7 @@ struct inet_skb_parm #define IPSKB_TRANSLATED 2 #define IPSKB_FORWARDED 4 #define IPSKB_XFRM_TUNNEL_SIZE 8 +#define IPSKB_FRAG_COMPLETE 16 }; struct ipcm_cookie @@ -168,6 +169,7 @@ extern int sysctl_ipfrag_high_thresh; extern int sysctl_ipfrag_low_thresh; extern int sysctl_ipfrag_time; extern int sysctl_ipfrag_secret_interval; +extern int sysctl_ipfrag_max_dist; /* From inetpeer.c */ extern int inet_peer_threshold; diff --git a/net/ipv4/inetpeer.c b/net/ipv4/inetpeer.c index 2fc3fd38924f..ce5fe3f74a3d 100644 --- a/net/ipv4/inetpeer.c +++ b/net/ipv4/inetpeer.c @@ -401,6 +401,7 @@ struct inet_peer *inet_getpeer(__u32 daddr, int create) return NULL; n->v4daddr = daddr; atomic_set(&n->refcnt, 1); + atomic_set(&n->rid, 0); n->ip_id_count = secure_ip_id(daddr); n->tcp_ts_stamp = 0; diff --git a/net/ipv4/ip_fragment.c b/net/ipv4/ip_fragment.c index 8ce0ce2ee48e..ce2b70ce4018 100644 --- a/net/ipv4/ip_fragment.c +++ b/net/ipv4/ip_fragment.c @@ -22,6 +22,7 @@ * Patrick McHardy : LRU queue of frag heads for evictor. */ +#include #include #include #include @@ -38,6 +39,7 @@ #include #include #include +#include #include #include #include @@ -56,6 +58,8 @@ int sysctl_ipfrag_high_thresh = 256*1024; int sysctl_ipfrag_low_thresh = 192*1024; +int sysctl_ipfrag_max_dist = 64; + /* Important NOTE! Fragment queue must be destroyed before MSL expires. * RFC791 is wrong proposing to prolongate timer each fragment arrival by TTL. */ @@ -89,8 +93,10 @@ struct ipq { spinlock_t lock; atomic_t refcnt; struct timer_list timer; /* when will this queue expire? */ - int iif; struct timeval stamp; + int iif; + unsigned int rid; + struct inet_peer *peer; }; /* Hash table. */ @@ -195,6 +201,9 @@ static void ip_frag_destroy(struct ipq *qp, int *work) BUG_TRAP(qp->last_in&COMPLETE); BUG_TRAP(del_timer(&qp->timer) == 0); + if (qp->peer) + inet_putpeer(qp->peer); + /* Release all fragment data. */ fp = qp->fragments; while (fp) { @@ -353,6 +362,7 @@ static struct ipq *ip_frag_create(unsigned hash, struct iphdr *iph, u32 user) qp->meat = 0; qp->fragments = NULL; qp->iif = 0; + qp->peer = sysctl_ipfrag_max_dist ? inet_getpeer(iph->saddr, 1) : NULL; /* Initialize a timer for this entry. */ init_timer(&qp->timer); @@ -398,6 +408,56 @@ static inline struct ipq *ip_find(struct iphdr *iph, u32 user) return ip_frag_create(hash, iph, user); } +/* Is the fragment too far ahead to be part of ipq? */ +static inline int ip_frag_too_far(struct ipq *qp) +{ + struct inet_peer *peer = qp->peer; + unsigned int max = sysctl_ipfrag_max_dist; + unsigned int start, end; + + int rc; + + if (!peer || !max) + return 0; + + start = qp->rid; + end = atomic_inc_return(&peer->rid); + qp->rid = end; + + rc = qp->fragments && (end - start) > max; + + if (rc) { + IP_INC_STATS_BH(IPSTATS_MIB_REASMFAILS); + } + + return rc; +} + +static int ip_frag_reinit(struct ipq *qp) +{ + struct sk_buff *fp; + + if (!mod_timer(&qp->timer, jiffies + sysctl_ipfrag_time)) { + atomic_inc(&qp->refcnt); + return -ETIMEDOUT; + } + + fp = qp->fragments; + do { + struct sk_buff *xp = fp->next; + frag_kfree_skb(fp, NULL); + fp = xp; + } while (fp); + + qp->last_in = 0; + qp->len = 0; + qp->meat = 0; + qp->fragments = NULL; + qp->iif = 0; + + return 0; +} + /* Add new segment to existing queue. */ static void ip_frag_queue(struct ipq *qp, struct sk_buff *skb) { @@ -408,6 +468,12 @@ static void ip_frag_queue(struct ipq *qp, struct sk_buff *skb) if (qp->last_in & COMPLETE) goto err; + if (!(IPCB(skb)->flags & IPSKB_FRAG_COMPLETE) && + unlikely(ip_frag_too_far(qp)) && unlikely(ip_frag_reinit(qp))) { + ipq_kill(qp); + goto err; + } + offset = ntohs(skb->nh.iph->frag_off); flags = offset & ~IP_OFFSET; offset &= IP_OFFSET; diff --git a/net/ipv4/ip_output.c b/net/ipv4/ip_output.c index eba64e2bd397..2a830de3a699 100644 --- a/net/ipv4/ip_output.c +++ b/net/ipv4/ip_output.c @@ -445,6 +445,7 @@ int ip_fragment(struct sk_buff *skb, int (*output)(struct sk_buff*)) hlen = iph->ihl * 4; mtu = dst_mtu(&rt->u.dst) - hlen; /* Size of data space */ + IPCB(skb)->flags |= IPSKB_FRAG_COMPLETE; /* When frag_list is given, use it. First, check its validity: * some transformers could create wrong frag_list or break existing diff --git a/net/ipv4/sysctl_net_ipv4.c b/net/ipv4/sysctl_net_ipv4.c index 01444a02b48b..dbf82955aabe 100644 --- a/net/ipv4/sysctl_net_ipv4.c +++ b/net/ipv4/sysctl_net_ipv4.c @@ -22,6 +22,7 @@ extern int sysctl_ip_nonlocal_bind; #ifdef CONFIG_SYSCTL +static int zero; static int tcp_retr1_max = 255; static int ip_local_port_range_min[] = { 1, 1 }; static int ip_local_port_range_max[] = { 65535, 65535 }; @@ -613,6 +614,15 @@ ctl_table ipv4_table[] = { .proc_handler = &proc_dointvec_jiffies, .strategy = &sysctl_jiffies }, + { + .ctl_name = NET_IPV4_IPFRAG_MAX_DIST, + .procname = "ipfrag_max_dist", + .data = &sysctl_ipfrag_max_dist, + .maxlen = sizeof(int), + .mode = 0644, + .proc_handler = &proc_dointvec_minmax, + .extra1 = &zero + }, { .ctl_name = NET_TCP_NO_METRICS_SAVE, .procname = "tcp_no_metrics_save", -- cgit v1.2.3 From 971af18bbfabb7b7c9c548da34a51e30869c08fc Mon Sep 17 00:00:00 2001 From: Arnaldo Carvalho de Melo Date: Tue, 13 Dec 2005 23:14:47 -0800 Subject: [IPV6]: Reuse inet_csk_get_port in tcp_v6_get_port Signed-off-by: Arnaldo Carvalho de Melo Signed-off-by: David S. Miller --- include/net/inet_connection_sock.h | 6 ++- net/dccp/ipv4.c | 3 +- net/ipv4/inet_connection_sock.c | 11 +++-- net/ipv4/tcp_ipv4.c | 3 +- net/ipv6/tcp_ipv6.c | 95 ++------------------------------------ 5 files changed, 21 insertions(+), 97 deletions(-) (limited to 'include/net') diff --git a/include/net/inet_connection_sock.h b/include/net/inet_connection_sock.h index b0c99060b78d..edc68e858d51 100644 --- a/include/net/inet_connection_sock.h +++ b/include/net/inet_connection_sock.h @@ -192,8 +192,12 @@ extern struct request_sock *inet_csk_search_req(const struct sock *sk, const __u16 rport, const __u32 raddr, const __u32 laddr); +extern int inet_csk_bind_conflict(const struct sock *sk, + const struct inet_bind_bucket *tb); extern int inet_csk_get_port(struct inet_hashinfo *hashinfo, - struct sock *sk, unsigned short snum); + struct sock *sk, unsigned short snum, + int (*bind_conflict)(const struct sock *sk, + const struct inet_bind_bucket *tb)); extern struct dst_entry* inet_csk_route_req(struct sock *sk, const struct request_sock *req); diff --git a/net/dccp/ipv4.c b/net/dccp/ipv4.c index 656e13e38cfb..1ac3e30ae791 100644 --- a/net/dccp/ipv4.c +++ b/net/dccp/ipv4.c @@ -37,7 +37,8 @@ EXPORT_SYMBOL_GPL(dccp_hashinfo); static int dccp_v4_get_port(struct sock *sk, const unsigned short snum) { - return inet_csk_get_port(&dccp_hashinfo, sk, snum); + return inet_csk_get_port(&dccp_hashinfo, sk, snum, + inet_csk_bind_conflict); } static void dccp_v4_hash(struct sock *sk) diff --git a/net/ipv4/inet_connection_sock.c b/net/ipv4/inet_connection_sock.c index 3fe021f1a566..f05b6e761102 100644 --- a/net/ipv4/inet_connection_sock.c +++ b/net/ipv4/inet_connection_sock.c @@ -37,7 +37,8 @@ EXPORT_SYMBOL(inet_csk_timer_bug_msg); */ int sysctl_local_port_range[2] = { 1024, 4999 }; -static inline int inet_csk_bind_conflict(struct sock *sk, struct inet_bind_bucket *tb) +int inet_csk_bind_conflict(const struct sock *sk, + const struct inet_bind_bucket *tb) { const u32 sk_rcv_saddr = inet_rcv_saddr(sk); struct sock *sk2; @@ -62,11 +63,15 @@ static inline int inet_csk_bind_conflict(struct sock *sk, struct inet_bind_bucke return node != NULL; } +EXPORT_SYMBOL_GPL(inet_csk_bind_conflict); + /* Obtain a reference to a local port for the given sock, * if snum is zero it means select any available local port. */ int inet_csk_get_port(struct inet_hashinfo *hashinfo, - struct sock *sk, unsigned short snum) + struct sock *sk, unsigned short snum, + int (*bind_conflict)(const struct sock *sk, + const struct inet_bind_bucket *tb)) { struct inet_bind_hashbucket *head; struct hlist_node *node; @@ -125,7 +130,7 @@ tb_found: goto success; } else { ret = 1; - if (inet_csk_bind_conflict(sk, tb)) + if (bind_conflict(sk, tb)) goto fail_unlock; } } diff --git a/net/ipv4/tcp_ipv4.c b/net/ipv4/tcp_ipv4.c index 4d5021e1929b..2aa19c89a94a 100644 --- a/net/ipv4/tcp_ipv4.c +++ b/net/ipv4/tcp_ipv4.c @@ -97,7 +97,8 @@ struct inet_hashinfo __cacheline_aligned tcp_hashinfo = { static int tcp_v4_get_port(struct sock *sk, unsigned short snum) { - return inet_csk_get_port(&tcp_hashinfo, sk, snum); + return inet_csk_get_port(&tcp_hashinfo, sk, snum, + inet_csk_bind_conflict); } static void tcp_v4_hash(struct sock *sk) diff --git a/net/ipv6/tcp_ipv6.c b/net/ipv6/tcp_ipv6.c index 8827389abaf7..76c8f5a2f7f3 100644 --- a/net/ipv6/tcp_ipv6.c +++ b/net/ipv6/tcp_ipv6.c @@ -76,8 +76,8 @@ static int tcp_v6_xmit(struct sk_buff *skb, int ipfragok); static struct tcp_func ipv6_mapped; static struct tcp_func ipv6_specific; -static inline int tcp_v6_bind_conflict(const struct sock *sk, - const struct inet_bind_bucket *tb) +int inet6_csk_bind_conflict(const struct sock *sk, + const struct inet_bind_bucket *tb) { const struct sock *sk2; const struct hlist_node *node; @@ -97,97 +97,10 @@ static inline int tcp_v6_bind_conflict(const struct sock *sk, return node != NULL; } -/* Grrr, addr_type already calculated by caller, but I don't want - * to add some silly "cookie" argument to this method just for that. - * But it doesn't matter, the recalculation is in the rarest path - * this function ever takes. - */ static int tcp_v6_get_port(struct sock *sk, unsigned short snum) { - struct inet_bind_hashbucket *head; - struct inet_bind_bucket *tb; - struct hlist_node *node; - int ret; - - local_bh_disable(); - if (snum == 0) { - int low = sysctl_local_port_range[0]; - int high = sysctl_local_port_range[1]; - int remaining = (high - low) + 1; - int rover = net_random() % (high - low) + low; - - do { - head = &tcp_hashinfo.bhash[inet_bhashfn(rover, tcp_hashinfo.bhash_size)]; - spin_lock(&head->lock); - inet_bind_bucket_for_each(tb, node, &head->chain) - if (tb->port == rover) - goto next; - break; - next: - spin_unlock(&head->lock); - if (++rover > high) - rover = low; - } while (--remaining > 0); - - /* Exhausted local port range during search? It is not - * possible for us to be holding one of the bind hash - * locks if this test triggers, because if 'remaining' - * drops to zero, we broke out of the do/while loop at - * the top level, not from the 'break;' statement. - */ - ret = 1; - if (unlikely(remaining <= 0)) - goto fail; - - /* OK, here is the one we will use. */ - snum = rover; - } else { - head = &tcp_hashinfo.bhash[inet_bhashfn(snum, tcp_hashinfo.bhash_size)]; - spin_lock(&head->lock); - inet_bind_bucket_for_each(tb, node, &head->chain) - if (tb->port == snum) - goto tb_found; - } - tb = NULL; - goto tb_not_found; -tb_found: - if (tb && !hlist_empty(&tb->owners)) { - if (tb->fastreuse > 0 && sk->sk_reuse && - sk->sk_state != TCP_LISTEN) { - goto success; - } else { - ret = 1; - if (tcp_v6_bind_conflict(sk, tb)) - goto fail_unlock; - } - } -tb_not_found: - ret = 1; - if (tb == NULL) { - tb = inet_bind_bucket_create(tcp_hashinfo.bind_bucket_cachep, head, snum); - if (tb == NULL) - goto fail_unlock; - } - if (hlist_empty(&tb->owners)) { - if (sk->sk_reuse && sk->sk_state != TCP_LISTEN) - tb->fastreuse = 1; - else - tb->fastreuse = 0; - } else if (tb->fastreuse && - (!sk->sk_reuse || sk->sk_state == TCP_LISTEN)) - tb->fastreuse = 0; - -success: - if (!inet_csk(sk)->icsk_bind_hash) - inet_bind_hash(sk, tb, snum); - BUG_TRAP(inet_csk(sk)->icsk_bind_hash == tb); - ret = 0; - -fail_unlock: - spin_unlock(&head->lock); -fail: - local_bh_enable(); - return ret; + return inet_csk_get_port(&tcp_hashinfo, sk, snum, + inet6_csk_bind_conflict); } static __inline__ void __tcp_v6_hash(struct sock *sk) -- cgit v1.2.3 From 90b19d31695371bd3ed256d4c9e280861cd6ae7e Mon Sep 17 00:00:00 2001 From: Arnaldo Carvalho de Melo Date: Tue, 13 Dec 2005 23:15:01 -0800 Subject: [IPV6]: Generalise __tcp_v6_hash, renaming it to __inet6_hash Signed-off-by: Arnaldo Carvalho de Melo Signed-off-by: David S. Miller --- include/net/inet6_hashtables.h | 26 ++++++++++++++++++++++++++ net/ipv6/tcp_ipv6.c | 34 ++++------------------------------ 2 files changed, 30 insertions(+), 30 deletions(-) (limited to 'include/net') diff --git a/include/net/inet6_hashtables.h b/include/net/inet6_hashtables.h index 5a2beed5a770..a4a204f99ea6 100644 --- a/include/net/inet6_hashtables.h +++ b/include/net/inet6_hashtables.h @@ -48,6 +48,32 @@ static inline int inet6_sk_ehashfn(const struct sock *sk) return inet6_ehashfn(laddr, lport, faddr, fport); } +static inline void __inet6_hash(struct inet_hashinfo *hashinfo, + struct sock *sk) +{ + struct hlist_head *list; + rwlock_t *lock; + + BUG_TRAP(sk_unhashed(sk)); + + if (sk->sk_state == TCP_LISTEN) { + list = &hashinfo->listening_hash[inet_sk_listen_hashfn(sk)]; + lock = &hashinfo->lhash_lock; + inet_listen_wlock(hashinfo); + } else { + unsigned int hash; + sk->sk_hash = hash = inet6_sk_ehashfn(sk); + hash &= (hashinfo->ehash_size - 1); + list = &hashinfo->ehash[hash].chain; + lock = &hashinfo->ehash[hash].lock; + write_lock(lock); + } + + __sk_add_node(sk, list); + sock_prot_inc_use(sk->sk_prot); + write_unlock(lock); +} + /* * Sockets in TCP_CLOSE state are _always_ taken out of the hash, so * we need not check it for TCP lookups anymore, thanks Alexey. -DaveM diff --git a/net/ipv6/tcp_ipv6.c b/net/ipv6/tcp_ipv6.c index 76c8f5a2f7f3..bf41f84d6692 100644 --- a/net/ipv6/tcp_ipv6.c +++ b/net/ipv6/tcp_ipv6.c @@ -103,32 +103,6 @@ static int tcp_v6_get_port(struct sock *sk, unsigned short snum) inet6_csk_bind_conflict); } -static __inline__ void __tcp_v6_hash(struct sock *sk) -{ - struct hlist_head *list; - rwlock_t *lock; - - BUG_TRAP(sk_unhashed(sk)); - - if (sk->sk_state == TCP_LISTEN) { - list = &tcp_hashinfo.listening_hash[inet_sk_listen_hashfn(sk)]; - lock = &tcp_hashinfo.lhash_lock; - inet_listen_wlock(&tcp_hashinfo); - } else { - unsigned int hash; - sk->sk_hash = hash = inet6_sk_ehashfn(sk); - hash &= (tcp_hashinfo.ehash_size - 1); - list = &tcp_hashinfo.ehash[hash].chain; - lock = &tcp_hashinfo.ehash[hash].lock; - write_lock(lock); - } - - __sk_add_node(sk, list); - sock_prot_inc_use(sk->sk_prot); - write_unlock(lock); -} - - static void tcp_v6_hash(struct sock *sk) { if (sk->sk_state != TCP_CLOSE) { @@ -139,7 +113,7 @@ static void tcp_v6_hash(struct sock *sk) return; } local_bh_disable(); - __tcp_v6_hash(sk); + __inet6_hash(&tcp_hashinfo, sk); local_bh_enable(); } } @@ -374,7 +348,7 @@ ok: inet_bind_hash(sk, tb, port); if (sk_unhashed(sk)) { inet_sk(sk)->sport = htons(port); - __tcp_v6_hash(sk); + __inet6_hash(&tcp_hashinfo, sk); } spin_unlock(&head->lock); @@ -392,7 +366,7 @@ ok: spin_lock_bh(&head->lock); if (sk_head(&tb->owners) == sk && !sk->sk_bind_node.next) { - __tcp_v6_hash(sk); + __inet6_hash(&tcp_hashinfo, sk); spin_unlock_bh(&head->lock); return 0; } else { @@ -1295,7 +1269,7 @@ static struct sock * tcp_v6_syn_recv_sock(struct sock *sk, struct sk_buff *skb, newinet->daddr = newinet->saddr = newinet->rcv_saddr = LOOPBACK4_IPV6; - __tcp_v6_hash(newsk); + __inet6_hash(&tcp_hashinfo, newsk); inet_inherit_port(&tcp_hashinfo, sk, newsk); return newsk; -- cgit v1.2.3 From c2977c2213993bff51911f4117281b31c4612591 Mon Sep 17 00:00:00 2001 From: Arnaldo Carvalho de Melo Date: Tue, 13 Dec 2005 23:15:12 -0800 Subject: [ICSK]: make inet_csk_reqsk_queue_hash_add timeout arg unsigned long Signed-off-by: Arnaldo Carvalho de Melo Signed-off-by: David S. Miller --- include/net/inet_connection_sock.h | 2 +- net/ipv4/inet_connection_sock.c | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) (limited to 'include/net') diff --git a/include/net/inet_connection_sock.h b/include/net/inet_connection_sock.h index edc68e858d51..ccc81a1c550c 100644 --- a/include/net/inet_connection_sock.h +++ b/include/net/inet_connection_sock.h @@ -211,7 +211,7 @@ static inline void inet_csk_reqsk_queue_add(struct sock *sk, extern void inet_csk_reqsk_queue_hash_add(struct sock *sk, struct request_sock *req, - const unsigned timeout); + unsigned long timeout); static inline void inet_csk_reqsk_queue_removed(struct sock *sk, struct request_sock *req) diff --git a/net/ipv4/inet_connection_sock.c b/net/ipv4/inet_connection_sock.c index f05b6e761102..e2bf508ed770 100644 --- a/net/ipv4/inet_connection_sock.c +++ b/net/ipv4/inet_connection_sock.c @@ -385,7 +385,7 @@ struct request_sock *inet_csk_search_req(const struct sock *sk, EXPORT_SYMBOL_GPL(inet_csk_search_req); void inet_csk_reqsk_queue_hash_add(struct sock *sk, struct request_sock *req, - const unsigned timeout) + unsigned long timeout) { struct inet_connection_sock *icsk = inet_csk(sk); struct listen_sock *lopt = icsk->icsk_accept_queue.listen_opt; -- cgit v1.2.3 From 8129765ac07c2455c927051e3a8b048b619b56ee Mon Sep 17 00:00:00 2001 From: Arnaldo Carvalho de Melo Date: Tue, 13 Dec 2005 23:15:24 -0800 Subject: [IPV6]: Generalise tcp_v6_search_req & tcp_v6_synq_add More work is needed tho to introduce inet6_request_sock from tcp6_request_sock, in the same layout considerations as ipv6_pinfo in inet_sock, next changeset will do that. Signed-off-by: Arnaldo Carvalho de Melo Signed-off-by: David S. Miller --- include/net/inet6_connection_sock.h | 31 ++++++++++++ include/net/request_sock.h | 2 +- net/ipv6/Makefile | 3 +- net/ipv6/inet6_connection_sock.c | 96 +++++++++++++++++++++++++++++++++++++ net/ipv6/tcp_ipv6.c | 78 +++--------------------------- 5 files changed, 137 insertions(+), 73 deletions(-) create mode 100644 include/net/inet6_connection_sock.h create mode 100644 net/ipv6/inet6_connection_sock.c (limited to 'include/net') diff --git a/include/net/inet6_connection_sock.h b/include/net/inet6_connection_sock.h new file mode 100644 index 000000000000..aa30ebde70dc --- /dev/null +++ b/include/net/inet6_connection_sock.h @@ -0,0 +1,31 @@ +/* + * NET Generic infrastructure for INET6 connection oriented protocols. + * + * Authors: Many people, see the TCPv6 sources + * + * From code originally in TCPv6 + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * as published by the Free Software Foundation; either version + * 2 of the License, or (at your option) any later version. + */ +#ifndef _INET6_CONNECTION_SOCK_H +#define _INET6_CONNECTION_SOCK_H + +#include + +struct sock; +struct request_sock; + +extern struct request_sock *inet6_csk_search_req(const struct sock *sk, + struct request_sock ***prevp, + const __u16 rport, + const struct in6_addr *raddr, + const struct in6_addr *laddr, + const int iif); + +extern void inet6_csk_reqsk_queue_hash_add(struct sock *sk, + struct request_sock *req, + const unsigned long timeout); +#endif /* _INET6_CONNECTION_SOCK_H */ diff --git a/include/net/request_sock.h b/include/net/request_sock.h index b52cc52ffe39..11641c9384f7 100644 --- a/include/net/request_sock.h +++ b/include/net/request_sock.h @@ -244,7 +244,7 @@ static inline int reqsk_queue_is_full(const struct request_sock_queue *queue) static inline void reqsk_queue_hash_req(struct request_sock_queue *queue, u32 hash, struct request_sock *req, - unsigned timeout) + unsigned long timeout) { struct listen_sock *lopt = queue->listen_opt; diff --git a/net/ipv6/Makefile b/net/ipv6/Makefile index 6460eec834b7..9601fd7f9d66 100644 --- a/net/ipv6/Makefile +++ b/net/ipv6/Makefile @@ -8,7 +8,8 @@ ipv6-objs := af_inet6.o anycast.o ip6_output.o ip6_input.o addrconf.o sit.o \ route.o ip6_fib.o ipv6_sockglue.o ndisc.o udp.o raw.o \ protocol.o icmp.o mcast.o reassembly.o tcp_ipv6.o \ exthdrs.o sysctl_net_ipv6.o datagram.o proc.o \ - ip6_flowlabel.o ipv6_syms.o netfilter.o + ip6_flowlabel.o ipv6_syms.o netfilter.o \ + inet6_connection_sock.o ipv6-$(CONFIG_XFRM) += xfrm6_policy.o xfrm6_state.o xfrm6_input.o \ xfrm6_output.o diff --git a/net/ipv6/inet6_connection_sock.c b/net/ipv6/inet6_connection_sock.c new file mode 100644 index 000000000000..04ff44344f90 --- /dev/null +++ b/net/ipv6/inet6_connection_sock.c @@ -0,0 +1,96 @@ +/* + * INET An implementation of the TCP/IP protocol suite for the LINUX + * operating system. INET is implemented using the BSD Socket + * interface as the means of communication with the user level. + * + * Support for INET6 connection oriented protocols. + * + * Authors: See the TCPv6 sources + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * as published by the Free Software Foundation; either version + * 2 of the License, or(at your option) any later version. + */ + +#include +#include +#include +#include +#include + +#include +#include +#include + +/* + * request_sock (formerly open request) hash tables. + */ +static u32 inet6_synq_hash(const struct in6_addr *raddr, const u16 rport, + const u32 rnd, const u16 synq_hsize) +{ + u32 a = raddr->s6_addr32[0]; + u32 b = raddr->s6_addr32[1]; + u32 c = raddr->s6_addr32[2]; + + a += JHASH_GOLDEN_RATIO; + b += JHASH_GOLDEN_RATIO; + c += rnd; + __jhash_mix(a, b, c); + + a += raddr->s6_addr32[3]; + b += (u32)rport; + __jhash_mix(a, b, c); + + return c & (synq_hsize - 1); +} + +struct request_sock *inet6_csk_search_req(const struct sock *sk, + struct request_sock ***prevp, + const __u16 rport, + const struct in6_addr *raddr, + const struct in6_addr *laddr, + const int iif) +{ + const struct inet_connection_sock *icsk = inet_csk(sk); + struct listen_sock *lopt = icsk->icsk_accept_queue.listen_opt; + struct request_sock *req, **prev; + + for (prev = &lopt->syn_table[inet6_synq_hash(raddr, rport, + lopt->hash_rnd, + lopt->nr_table_entries)]; + (req = *prev) != NULL; + prev = &req->dl_next) { + const struct tcp6_request_sock *treq = tcp6_rsk(req); + + if (inet_rsk(req)->rmt_port == rport && + req->rsk_ops->family == AF_INET6 && + ipv6_addr_equal(&treq->rmt_addr, raddr) && + ipv6_addr_equal(&treq->loc_addr, laddr) && + (!treq->iif || treq->iif == iif)) { + BUG_TRAP(req->sk == NULL); + *prevp = prev; + return req; + } + } + + return NULL; +} + +EXPORT_SYMBOL_GPL(inet6_csk_search_req); + +void inet6_csk_reqsk_queue_hash_add(struct sock *sk, + struct request_sock *req, + const unsigned long timeout) +{ + struct inet_connection_sock *icsk = inet_csk(sk); + struct listen_sock *lopt = icsk->icsk_accept_queue.listen_opt; + const u32 h = inet6_synq_hash(&tcp6_rsk(req)->rmt_addr, + inet_rsk(req)->rmt_port, + lopt->hash_rnd, lopt->nr_table_entries); + + reqsk_queue_hash_req(&icsk->icsk_accept_queue, h, req, timeout); + inet_csk_reqsk_queue_added(sk, timeout); +} + +EXPORT_SYMBOL_GPL(inet6_csk_reqsk_queue_hash_add); diff --git a/net/ipv6/tcp_ipv6.c b/net/ipv6/tcp_ipv6.c index bf41f84d6692..5a10d30cec4a 100644 --- a/net/ipv6/tcp_ipv6.c +++ b/net/ipv6/tcp_ipv6.c @@ -48,6 +48,7 @@ #include #include #include +#include #include #include #include @@ -118,60 +119,6 @@ static void tcp_v6_hash(struct sock *sk) } } -/* - * Open request hash tables. - */ - -static u32 tcp_v6_synq_hash(const struct in6_addr *raddr, const u16 rport, const u32 rnd) -{ - u32 a, b, c; - - a = raddr->s6_addr32[0]; - b = raddr->s6_addr32[1]; - c = raddr->s6_addr32[2]; - - a += JHASH_GOLDEN_RATIO; - b += JHASH_GOLDEN_RATIO; - c += rnd; - __jhash_mix(a, b, c); - - a += raddr->s6_addr32[3]; - b += (u32) rport; - __jhash_mix(a, b, c); - - return c & (TCP_SYNQ_HSIZE - 1); -} - -static struct request_sock *tcp_v6_search_req(const struct sock *sk, - struct request_sock ***prevp, - __u16 rport, - struct in6_addr *raddr, - struct in6_addr *laddr, - int iif) -{ - const struct inet_connection_sock *icsk = inet_csk(sk); - struct listen_sock *lopt = icsk->icsk_accept_queue.listen_opt; - struct request_sock *req, **prev; - - for (prev = &lopt->syn_table[tcp_v6_synq_hash(raddr, rport, lopt->hash_rnd)]; - (req = *prev) != NULL; - prev = &req->dl_next) { - const struct tcp6_request_sock *treq = tcp6_rsk(req); - - if (inet_rsk(req)->rmt_port == rport && - req->rsk_ops->family == AF_INET6 && - ipv6_addr_equal(&treq->rmt_addr, raddr) && - ipv6_addr_equal(&treq->loc_addr, laddr) && - (!treq->iif || treq->iif == iif)) { - BUG_TRAP(req->sk == NULL); - *prevp = prev; - return req; - } - } - - return NULL; -} - static __inline__ u16 tcp_v6_check(struct tcphdr *th, int len, struct in6_addr *saddr, struct in6_addr *daddr, @@ -662,8 +609,8 @@ static void tcp_v6_err(struct sk_buff *skb, struct inet6_skb_parm *opt, if (sock_owned_by_user(sk)) goto out; - req = tcp_v6_search_req(sk, &prev, th->dest, &hdr->daddr, - &hdr->saddr, inet6_iif(skb)); + req = inet6_csk_search_req(sk, &prev, th->dest, &hdr->daddr, + &hdr->saddr, inet6_iif(skb)); if (!req) goto out; @@ -978,8 +925,9 @@ static struct sock *tcp_v6_hnd_req(struct sock *sk,struct sk_buff *skb) struct sock *nsk; /* Find possible connection requests. */ - req = tcp_v6_search_req(sk, &prev, th->source, &skb->nh.ipv6h->saddr, - &skb->nh.ipv6h->daddr, inet6_iif(skb)); + req = inet6_csk_search_req(sk, &prev, th->source, + &skb->nh.ipv6h->saddr, + &skb->nh.ipv6h->daddr, inet6_iif(skb)); if (req) return tcp_check_req(sk, skb, req, prev); @@ -1003,17 +951,6 @@ static struct sock *tcp_v6_hnd_req(struct sock *sk,struct sk_buff *skb) return sk; } -static void tcp_v6_synq_add(struct sock *sk, struct request_sock *req) -{ - struct inet_connection_sock *icsk = inet_csk(sk); - struct listen_sock *lopt = icsk->icsk_accept_queue.listen_opt; - const u32 h = tcp_v6_synq_hash(&tcp6_rsk(req)->rmt_addr, inet_rsk(req)->rmt_port, lopt->hash_rnd); - - reqsk_queue_hash_req(&icsk->icsk_accept_queue, h, req, TCP_TIMEOUT_INIT); - inet_csk_reqsk_queue_added(sk, TCP_TIMEOUT_INIT); -} - - /* FIXME: this is substantially similar to the ipv4 code. * Can some kind of merge be done? -- erics */ @@ -1083,8 +1020,7 @@ static int tcp_v6_conn_request(struct sock *sk, struct sk_buff *skb) if (tcp_v6_send_synack(sk, req, NULL)) goto drop; - tcp_v6_synq_add(sk, req); - + inet6_csk_reqsk_queue_hash_add(sk, req, TCP_TIMEOUT_INIT); return 0; drop: -- cgit v1.2.3 From 8292a17a399ffb7c5c8b083db4ad994e090055f7 Mon Sep 17 00:00:00 2001 From: Arnaldo Carvalho de Melo Date: Tue, 13 Dec 2005 23:15:52 -0800 Subject: [ICSK]: Rename struct tcp_func to struct inet_connection_sock_af_ops And move it to struct inet_connection_sock. DCCP will use it in the upcoming changesets. Signed-off-by: Arnaldo Carvalho de Melo Signed-off-by: David S. Miller --- include/linux/tcp.h | 2 -- include/net/inet_connection_sock.h | 26 ++++++++++++++++++++ include/net/tcp.h | 50 +------------------------------------- include/net/transp_v6.h | 2 +- net/ipv4/syncookies.c | 4 +-- net/ipv4/tcp.c | 8 +++--- net/ipv4/tcp_input.c | 11 +++++---- net/ipv4/tcp_ipv4.c | 11 ++++----- net/ipv4/tcp_minisocks.c | 8 +++--- net/ipv4/tcp_output.c | 17 ++++++------- net/ipv6/ipv6_sockglue.c | 2 +- net/ipv6/tcp_ipv6.c | 28 ++++++++++----------- 12 files changed, 71 insertions(+), 98 deletions(-) (limited to 'include/net') diff --git a/include/linux/tcp.h b/include/linux/tcp.h index 0e1da6602e05..4e1434007f44 100644 --- a/include/linux/tcp.h +++ b/include/linux/tcp.h @@ -295,8 +295,6 @@ struct tcp_sock { struct sk_buff_head out_of_order_queue; /* Out of order segments go here */ - struct tcp_func *af_specific; /* Operations which are AF_INET{4,6} specific */ - __u32 rcv_wnd; /* Current receiver window */ __u32 rcv_wup; /* rcv_nxt on last window update sent */ __u32 write_seq; /* Tail(+1) of data held in tcp send buffer */ diff --git a/include/net/inet_connection_sock.h b/include/net/inet_connection_sock.h index ccc81a1c550c..9e20d201e951 100644 --- a/include/net/inet_connection_sock.h +++ b/include/net/inet_connection_sock.h @@ -15,6 +15,7 @@ #ifndef _INET_CONNECTION_SOCK_H #define _INET_CONNECTION_SOCK_H +#include #include #include #include @@ -29,6 +30,29 @@ struct inet_bind_bucket; struct inet_hashinfo; struct tcp_congestion_ops; +/* + * Pointers to address related TCP functions + * (i.e. things that depend on the address family) + */ +struct inet_connection_sock_af_ops { + int (*queue_xmit)(struct sk_buff *skb, int ipfragok); + void (*send_check)(struct sock *sk, int len, + struct sk_buff *skb); + int (*rebuild_header)(struct sock *sk); + int (*conn_request)(struct sock *sk, struct sk_buff *skb); + struct sock *(*syn_recv_sock)(struct sock *sk, struct sk_buff *skb, + struct request_sock *req, + struct dst_entry *dst); + int (*remember_stamp)(struct sock *sk); + __u16 net_header_len; + int (*setsockopt)(struct sock *sk, int level, int optname, + char __user *optval, int optlen); + int (*getsockopt)(struct sock *sk, int level, int optname, + char __user *optval, int __user *optlen); + void (*addr2sockaddr)(struct sock *sk, struct sockaddr *); + int sockaddr_len; +}; + /** inet_connection_sock - INET connection oriented sock * * @icsk_accept_queue: FIFO of established children @@ -37,6 +61,7 @@ struct tcp_congestion_ops; * @icsk_retransmit_timer: Resend (no ack) * @icsk_rto: Retransmit timeout * @icsk_ca_ops Pluggable congestion control hook + * @icsk_af_ops Operations which are AF_INET{4,6} specific * @icsk_ca_state: Congestion control state * @icsk_retransmits: Number of unrecovered [RTO] timeouts * @icsk_pending: Scheduled timer event @@ -55,6 +80,7 @@ struct inet_connection_sock { struct timer_list icsk_delack_timer; __u32 icsk_rto; struct tcp_congestion_ops *icsk_ca_ops; + struct inet_connection_sock_af_ops *icsk_af_ops; __u8 icsk_ca_state; __u8 icsk_retransmits; __u8 icsk_pending; diff --git a/include/net/tcp.h b/include/net/tcp.h index d78025f9fbea..83b117a25c2a 100644 --- a/include/net/tcp.h +++ b/include/net/tcp.h @@ -224,53 +224,6 @@ extern atomic_t tcp_memory_allocated; extern atomic_t tcp_sockets_allocated; extern int tcp_memory_pressure; -/* - * Pointers to address related TCP functions - * (i.e. things that depend on the address family) - */ - -struct tcp_func { - int (*queue_xmit) (struct sk_buff *skb, - int ipfragok); - - void (*send_check) (struct sock *sk, - struct tcphdr *th, - int len, - struct sk_buff *skb); - - int (*rebuild_header) (struct sock *sk); - - int (*conn_request) (struct sock *sk, - struct sk_buff *skb); - - struct sock * (*syn_recv_sock) (struct sock *sk, - struct sk_buff *skb, - struct request_sock *req, - struct dst_entry *dst); - - int (*remember_stamp) (struct sock *sk); - - __u16 net_header_len; - - int (*setsockopt) (struct sock *sk, - int level, - int optname, - char __user *optval, - int optlen); - - int (*getsockopt) (struct sock *sk, - int level, - int optname, - char __user *optval, - int __user *optlen); - - - void (*addr2sockaddr) (struct sock *sk, - struct sockaddr *); - - int sockaddr_len; -}; - /* * The next routines deal with comparing 32 bit unsigned ints * and worry about wraparound (automatic with unsigned arithmetic). @@ -405,8 +358,7 @@ extern void tcp_parse_options(struct sk_buff *skb, * TCP v4 functions exported for the inet6 API */ -extern void tcp_v4_send_check(struct sock *sk, - struct tcphdr *th, int len, +extern void tcp_v4_send_check(struct sock *sk, int len, struct sk_buff *skb); extern int tcp_v4_conn_request(struct sock *sk, diff --git a/include/net/transp_v6.h b/include/net/transp_v6.h index 4e86f2de6638..61f724c1036f 100644 --- a/include/net/transp_v6.h +++ b/include/net/transp_v6.h @@ -44,7 +44,7 @@ extern int datagram_send_ctl(struct msghdr *msg, /* * address family specific functions */ -extern struct tcp_func ipv4_specific; +extern struct inet_connection_sock_af_ops ipv4_specific; extern int inet6_destroy_sock(struct sock *sk); diff --git a/net/ipv4/syncookies.c b/net/ipv4/syncookies.c index a34e60ea48a1..e20be3331f67 100644 --- a/net/ipv4/syncookies.c +++ b/net/ipv4/syncookies.c @@ -173,10 +173,10 @@ static inline struct sock *get_cookie_sock(struct sock *sk, struct sk_buff *skb, struct request_sock *req, struct dst_entry *dst) { - struct tcp_sock *tp = tcp_sk(sk); + struct inet_connection_sock *icsk = inet_csk(sk); struct sock *child; - child = tp->af_specific->syn_recv_sock(sk, skb, req, dst); + child = icsk->icsk_af_ops->syn_recv_sock(sk, skb, req, dst); if (child) inet_csk_reqsk_queue_add(sk, req, child); else diff --git a/net/ipv4/tcp.c b/net/ipv4/tcp.c index ef98b14ac56d..eacfe6a3442c 100644 --- a/net/ipv4/tcp.c +++ b/net/ipv4/tcp.c @@ -1696,8 +1696,8 @@ int tcp_setsockopt(struct sock *sk, int level, int optname, char __user *optval, int err = 0; if (level != SOL_TCP) - return tp->af_specific->setsockopt(sk, level, optname, - optval, optlen); + return icsk->icsk_af_ops->setsockopt(sk, level, optname, + optval, optlen); /* This is a string value all the others are int's */ if (optname == TCP_CONGESTION) { @@ -1939,8 +1939,8 @@ int tcp_getsockopt(struct sock *sk, int level, int optname, char __user *optval, int val, len; if (level != SOL_TCP) - return tp->af_specific->getsockopt(sk, level, optname, - optval, optlen); + return icsk->icsk_af_ops->getsockopt(sk, level, optname, + optval, optlen); if (get_user(len, optlen)) return -EFAULT; diff --git a/net/ipv4/tcp_input.c b/net/ipv4/tcp_input.c index bf2e23086bce..7de6184d4bd8 100644 --- a/net/ipv4/tcp_input.c +++ b/net/ipv4/tcp_input.c @@ -4071,8 +4071,10 @@ static int tcp_rcv_synsent_state_process(struct sock *sk, struct sk_buff *skb, mb(); tcp_set_state(sk, TCP_ESTABLISHED); + icsk = inet_csk(sk); + /* Make sure socket is routed, for correct metrics. */ - tp->af_specific->rebuild_header(sk); + icsk->icsk_af_ops->rebuild_header(sk); tcp_init_metrics(sk); @@ -4098,8 +4100,6 @@ static int tcp_rcv_synsent_state_process(struct sock *sk, struct sk_buff *skb, sk_wake_async(sk, 0, POLL_OUT); } - icsk = inet_csk(sk); - if (sk->sk_write_pending || icsk->icsk_accept_queue.rskq_defer_accept || icsk->icsk_ack.pingpong) { @@ -4220,6 +4220,7 @@ int tcp_rcv_state_process(struct sock *sk, struct sk_buff *skb, struct tcphdr *th, unsigned len) { struct tcp_sock *tp = tcp_sk(sk); + struct inet_connection_sock *icsk = inet_csk(sk); int queued = 0; tp->rx_opt.saw_tstamp = 0; @@ -4236,7 +4237,7 @@ int tcp_rcv_state_process(struct sock *sk, struct sk_buff *skb, goto discard; if(th->syn) { - if(tp->af_specific->conn_request(sk, skb) < 0) + if (icsk->icsk_af_ops->conn_request(sk, skb) < 0) return 1; /* Now we have several options: In theory there is @@ -4349,7 +4350,7 @@ int tcp_rcv_state_process(struct sock *sk, struct sk_buff *skb, /* Make sure socket is routed, for * correct metrics. */ - tp->af_specific->rebuild_header(sk); + icsk->icsk_af_ops->rebuild_header(sk); tcp_init_metrics(sk); diff --git a/net/ipv4/tcp_ipv4.c b/net/ipv4/tcp_ipv4.c index 2aa19c89a94a..704cf2105795 100644 --- a/net/ipv4/tcp_ipv4.c +++ b/net/ipv4/tcp_ipv4.c @@ -86,8 +86,7 @@ int sysctl_tcp_low_latency; /* Socket used for sending RSTs */ static struct socket *tcp_socket; -void tcp_v4_send_check(struct sock *sk, struct tcphdr *th, int len, - struct sk_buff *skb); +void tcp_v4_send_check(struct sock *sk, int len, struct sk_buff *skb); struct inet_hashinfo __cacheline_aligned tcp_hashinfo = { .lhash_lock = RW_LOCK_UNLOCKED, @@ -645,10 +644,10 @@ out: } /* This routine computes an IPv4 TCP checksum. */ -void tcp_v4_send_check(struct sock *sk, struct tcphdr *th, int len, - struct sk_buff *skb) +void tcp_v4_send_check(struct sock *sk, int len, struct sk_buff *skb) { struct inet_sock *inet = inet_sk(sk); + struct tcphdr *th = skb->h.th; if (skb->ip_summed == CHECKSUM_HW) { th->check = ~tcp_v4_check(th, len, inet->saddr, inet->daddr, 0); @@ -1383,7 +1382,7 @@ int tcp_v4_tw_remember_stamp(struct inet_timewait_sock *tw) return 0; } -struct tcp_func ipv4_specific = { +struct inet_connection_sock_af_ops ipv4_specific = { .queue_xmit = ip_queue_xmit, .send_check = tcp_v4_send_check, .rebuild_header = inet_sk_rebuild_header, @@ -1434,7 +1433,7 @@ static int tcp_v4_init_sock(struct sock *sk) sk->sk_write_space = sk_stream_write_space; sock_set_flag(sk, SOCK_USE_WRITE_QUEUE); - tp->af_specific = &ipv4_specific; + icsk->icsk_af_ops = &ipv4_specific; sk->sk_sndbuf = sysctl_tcp_wmem[1]; sk->sk_rcvbuf = sysctl_tcp_rmem[1]; diff --git a/net/ipv4/tcp_minisocks.c b/net/ipv4/tcp_minisocks.c index 1b66a2ac4321..9c029683a626 100644 --- a/net/ipv4/tcp_minisocks.c +++ b/net/ipv4/tcp_minisocks.c @@ -274,18 +274,18 @@ kill: void tcp_time_wait(struct sock *sk, int state, int timeo) { struct inet_timewait_sock *tw = NULL; + const struct inet_connection_sock *icsk = inet_csk(sk); const struct tcp_sock *tp = tcp_sk(sk); int recycle_ok = 0; if (tcp_death_row.sysctl_tw_recycle && tp->rx_opt.ts_recent_stamp) - recycle_ok = tp->af_specific->remember_stamp(sk); + recycle_ok = icsk->icsk_af_ops->remember_stamp(sk); if (tcp_death_row.tw_count < tcp_death_row.sysctl_max_tw_buckets) tw = inet_twsk_alloc(sk, state); if (tw != NULL) { struct tcp_timewait_sock *tcptw = tcp_twsk((struct sock *)tw); - const struct inet_connection_sock *icsk = inet_csk(sk); const int rto = (icsk->icsk_rto << 2) - (icsk->icsk_rto >> 1); tw->tw_rcv_wscale = tp->rx_opt.rcv_wscale; @@ -456,7 +456,6 @@ struct sock *tcp_check_req(struct sock *sk,struct sk_buff *skb, struct request_sock **prev) { struct tcphdr *th = skb->h.th; - struct tcp_sock *tp = tcp_sk(sk); u32 flg = tcp_flag_word(th) & (TCP_FLAG_RST|TCP_FLAG_SYN|TCP_FLAG_ACK); int paws_reject = 0; struct tcp_options_received tmp_opt; @@ -613,7 +612,8 @@ struct sock *tcp_check_req(struct sock *sk,struct sk_buff *skb, * ESTABLISHED STATE. If it will be dropped after * socket is created, wait for troubles. */ - child = tp->af_specific->syn_recv_sock(sk, skb, req, NULL); + child = inet_csk(sk)->icsk_af_ops->syn_recv_sock(sk, skb, + req, NULL); if (child == NULL) goto listen_overflow; diff --git a/net/ipv4/tcp_output.c b/net/ipv4/tcp_output.c index b7325e0b406a..af1946c52c37 100644 --- a/net/ipv4/tcp_output.c +++ b/net/ipv4/tcp_output.c @@ -371,7 +371,7 @@ static int tcp_transmit_skb(struct sock *sk, struct sk_buff *skb, int clone_it, TCP_ECN_send(sk, tp, skb, tcp_header_size); } - tp->af_specific->send_check(sk, th, skb->len, skb); + icsk->icsk_af_ops->send_check(sk, skb->len, skb); if (likely(tcb->flags & TCPCB_FLAG_ACK)) tcp_event_ack_sent(sk, tcp_skb_pcount(skb)); @@ -381,7 +381,7 @@ static int tcp_transmit_skb(struct sock *sk, struct sk_buff *skb, int clone_it, TCP_INC_STATS(TCP_MIB_OUTSEGS); - err = tp->af_specific->queue_xmit(skb, 0); + err = icsk->icsk_af_ops->queue_xmit(skb, 0); if (unlikely(err <= 0)) return err; @@ -638,12 +638,11 @@ int tcp_trim_head(struct sock *sk, struct sk_buff *skb, u32 len) unsigned int tcp_sync_mss(struct sock *sk, u32 pmtu) { struct tcp_sock *tp = tcp_sk(sk); - int mss_now; - /* Calculate base mss without TCP options: It is MMS_S - sizeof(tcphdr) of rfc1122 */ - mss_now = pmtu - tp->af_specific->net_header_len - sizeof(struct tcphdr); + int mss_now = (pmtu - inet_csk(sk)->icsk_af_ops->net_header_len - + sizeof(struct tcphdr)); /* Clamp it (mss_clamp does not include tcp options) */ if (mss_now > tp->rx_opt.mss_clamp) @@ -705,9 +704,9 @@ unsigned int tcp_current_mss(struct sock *sk, int large_allowed) xmit_size_goal = mss_now; if (doing_tso) { - xmit_size_goal = 65535 - - tp->af_specific->net_header_len - - tp->ext_header_len - tp->tcp_header_len; + xmit_size_goal = (65535 - + inet_csk(sk)->icsk_af_ops->net_header_len - + tp->ext_header_len - tp->tcp_header_len); if (tp->max_window && (xmit_size_goal > (tp->max_window >> 1))) @@ -1422,7 +1421,7 @@ int tcp_retransmit_skb(struct sock *sk, struct sk_buff *skb) (sysctl_tcp_retrans_collapse != 0)) tcp_retrans_try_collapse(sk, skb, cur_mss); - if(tp->af_specific->rebuild_header(sk)) + if (inet_csk(sk)->icsk_af_ops->rebuild_header(sk)) return -EHOSTUNREACH; /* Routing failure or similar. */ /* Some Solaris stacks overoptimize and ignore the FIN on a diff --git a/net/ipv6/ipv6_sockglue.c b/net/ipv6/ipv6_sockglue.c index 3620718defe6..b6b63fa8454c 100644 --- a/net/ipv6/ipv6_sockglue.c +++ b/net/ipv6/ipv6_sockglue.c @@ -170,7 +170,7 @@ int ipv6_setsockopt(struct sock *sk, int level, int optname, sock_prot_inc_use(&tcp_prot); local_bh_enable(); sk->sk_prot = &tcp_prot; - tp->af_specific = &ipv4_specific; + inet_csk(sk)->icsk_af_ops = &ipv4_specific; sk->sk_socket->ops = &inet_stream_ops; sk->sk_family = PF_INET; tcp_sync_mss(sk, tp->pmtu_cookie); diff --git a/net/ipv6/tcp_ipv6.c b/net/ipv6/tcp_ipv6.c index c2472d771664..8ce8a1359d2b 100644 --- a/net/ipv6/tcp_ipv6.c +++ b/net/ipv6/tcp_ipv6.c @@ -68,14 +68,14 @@ static void tcp_v6_send_reset(struct sk_buff *skb); static void tcp_v6_reqsk_send_ack(struct sk_buff *skb, struct request_sock *req); -static void tcp_v6_send_check(struct sock *sk, struct tcphdr *th, int len, +static void tcp_v6_send_check(struct sock *sk, int len, struct sk_buff *skb); static int tcp_v6_do_rcv(struct sock *sk, struct sk_buff *skb); static int tcp_v6_xmit(struct sk_buff *skb, int ipfragok); -static struct tcp_func ipv6_mapped; -static struct tcp_func ipv6_specific; +static struct inet_connection_sock_af_ops ipv6_mapped; +static struct inet_connection_sock_af_ops ipv6_specific; int inet6_csk_bind_conflict(const struct sock *sk, const struct inet_bind_bucket *tb) @@ -107,9 +107,7 @@ static int tcp_v6_get_port(struct sock *sk, unsigned short snum) static void tcp_v6_hash(struct sock *sk) { if (sk->sk_state != TCP_CLOSE) { - struct tcp_sock *tp = tcp_sk(sk); - - if (tp->af_specific == &ipv6_mapped) { + if (inet_csk(sk)->icsk_af_ops == &ipv6_mapped) { tcp_prot.hash(sk); return; } @@ -417,14 +415,14 @@ static int tcp_v6_connect(struct sock *sk, struct sockaddr *uaddr, sin.sin_port = usin->sin6_port; sin.sin_addr.s_addr = usin->sin6_addr.s6_addr32[3]; - tp->af_specific = &ipv6_mapped; + inet_csk(sk)->icsk_af_ops = &ipv6_mapped; sk->sk_backlog_rcv = tcp_v4_do_rcv; err = tcp_v4_connect(sk, (struct sockaddr *)&sin, sizeof(sin)); if (err) { tp->ext_header_len = exthdrlen; - tp->af_specific = &ipv6_specific; + inet_csk(sk)->icsk_af_ops = &ipv6_specific; sk->sk_backlog_rcv = tcp_v6_do_rcv; goto failure; } else { @@ -751,10 +749,10 @@ static int ipv6_opt_accepted(struct sock *sk, struct sk_buff *skb) } -static void tcp_v6_send_check(struct sock *sk, struct tcphdr *th, int len, - struct sk_buff *skb) +static void tcp_v6_send_check(struct sock *sk, int len, struct sk_buff *skb) { struct ipv6_pinfo *np = inet6_sk(sk); + struct tcphdr *th = skb->h.th; if (skb->ip_summed == CHECKSUM_HW) { th->check = ~csum_ipv6_magic(&np->saddr, &np->daddr, len, IPPROTO_TCP, 0); @@ -1070,7 +1068,7 @@ static struct sock * tcp_v6_syn_recv_sock(struct sock *sk, struct sk_buff *skb, ipv6_addr_copy(&newnp->rcv_saddr, &newnp->saddr); - newtp->af_specific = &ipv6_mapped; + inet_csk(newsk)->icsk_af_ops = &ipv6_mapped; newsk->sk_backlog_rcv = tcp_v4_do_rcv; newnp->pktoptions = NULL; newnp->opt = NULL; @@ -1084,7 +1082,7 @@ static struct sock * tcp_v6_syn_recv_sock(struct sock *sk, struct sk_buff *skb, */ /* It is tricky place. Until this moment IPv4 tcp - worked with IPv6 af_tcp.af_specific. + worked with IPv6 icsk.icsk_af_ops. Sync it now. */ tcp_sync_mss(newsk, newtp->pmtu_cookie); @@ -1631,7 +1629,7 @@ static int tcp_v6_remember_stamp(struct sock *sk) return 0; } -static struct tcp_func ipv6_specific = { +static struct inet_connection_sock_af_ops ipv6_specific = { .queue_xmit = tcp_v6_xmit, .send_check = tcp_v6_send_check, .rebuild_header = tcp_v6_rebuild_header, @@ -1650,7 +1648,7 @@ static struct tcp_func ipv6_specific = { * TCP over IPv4 via INET6 API */ -static struct tcp_func ipv6_mapped = { +static struct inet_connection_sock_af_ops ipv6_mapped = { .queue_xmit = ip_queue_xmit, .send_check = tcp_v4_send_check, .rebuild_header = inet_sk_rebuild_header, @@ -1700,7 +1698,7 @@ static int tcp_v6_init_sock(struct sock *sk) sk->sk_state = TCP_CLOSE; - tp->af_specific = &ipv6_specific; + icsk->icsk_af_ops = &ipv6_specific; icsk->icsk_ca_ops = &tcp_init_congestion_ops; sk->sk_write_space = sk_stream_write_space; sock_set_flag(sk, SOCK_USE_WRITE_QUEUE); -- cgit v1.2.3 From af05dc9394feb193d221bc9d4c6db768facb4b40 Mon Sep 17 00:00:00 2001 From: Arnaldo Carvalho de Melo Date: Tue, 13 Dec 2005 23:16:04 -0800 Subject: [ICSK]: Move v4_addr2sockaddr from TCP to icsk Renaming it to inet_csk_addr2sockaddr. Signed-off-by: Arnaldo Carvalho de Melo Signed-off-by: David S. Miller --- include/net/inet_connection_sock.h | 2 ++ net/ipv4/inet_connection_sock.c | 12 ++++++++++++ net/ipv4/tcp_ipv4.c | 12 +----------- 3 files changed, 15 insertions(+), 11 deletions(-) (limited to 'include/net') diff --git a/include/net/inet_connection_sock.h b/include/net/inet_connection_sock.h index 9e20d201e951..e50e2b890c6d 100644 --- a/include/net/inet_connection_sock.h +++ b/include/net/inet_connection_sock.h @@ -303,4 +303,6 @@ static inline unsigned int inet_csk_listen_poll(const struct sock *sk) extern int inet_csk_listen_start(struct sock *sk, const int nr_table_entries); extern void inet_csk_listen_stop(struct sock *sk); +extern void inet_csk_addr2sockaddr(struct sock *sk, struct sockaddr *uaddr); + #endif /* _INET_CONNECTION_SOCK_H */ diff --git a/net/ipv4/inet_connection_sock.c b/net/ipv4/inet_connection_sock.c index e2bf508ed770..ae20281d8deb 100644 --- a/net/ipv4/inet_connection_sock.c +++ b/net/ipv4/inet_connection_sock.c @@ -636,3 +636,15 @@ void inet_csk_listen_stop(struct sock *sk) } EXPORT_SYMBOL_GPL(inet_csk_listen_stop); + +void inet_csk_addr2sockaddr(struct sock *sk, struct sockaddr *uaddr) +{ + struct sockaddr_in *sin = (struct sockaddr_in *)uaddr; + const struct inet_sock *inet = inet_sk(sk); + + sin->sin_family = AF_INET; + sin->sin_addr.s_addr = inet->daddr; + sin->sin_port = inet->dport; +} + +EXPORT_SYMBOL_GPL(inet_csk_addr2sockaddr); diff --git a/net/ipv4/tcp_ipv4.c b/net/ipv4/tcp_ipv4.c index 704cf2105795..0b5ab04d3c5a 100644 --- a/net/ipv4/tcp_ipv4.c +++ b/net/ipv4/tcp_ipv4.c @@ -1314,16 +1314,6 @@ do_time_wait: goto discard_it; } -static void v4_addr2sockaddr(struct sock *sk, struct sockaddr * uaddr) -{ - struct sockaddr_in *sin = (struct sockaddr_in *) uaddr; - struct inet_sock *inet = inet_sk(sk); - - sin->sin_family = AF_INET; - sin->sin_addr.s_addr = inet->daddr; - sin->sin_port = inet->dport; -} - /* VJ's idea. Save last timestamp seen from this destination * and hold it at least for normal timewait interval to use for duplicate * segment detection in subsequent connections, before they enter synchronized @@ -1392,7 +1382,7 @@ struct inet_connection_sock_af_ops ipv4_specific = { .net_header_len = sizeof(struct iphdr), .setsockopt = ip_setsockopt, .getsockopt = ip_getsockopt, - .addr2sockaddr = v4_addr2sockaddr, + .addr2sockaddr = inet_csk_addr2sockaddr, .sockaddr_len = sizeof(struct sockaddr_in), }; -- cgit v1.2.3 From c1cbe4b7ad0bc4b1d98ea708a3fecb7362aa4088 Mon Sep 17 00:00:00 2001 From: Benjamin LaHaise Date: Tue, 13 Dec 2005 23:22:19 -0800 Subject: [NET]: Avoid atomic xchg() for non-error case It also looks like there were 2 places where the test on sk_err was missing from the event wait logic (in sk_stream_wait_connect and sk_stream_wait_memory), while the rest of the sock_error() users look to be doing the right thing. This version of the patch fixes those, and cleans up a few places that were testing ->sk_err directly. Signed-off-by: Benjamin LaHaise Signed-off-by: David S. Miller --- include/net/sock.h | 5 ++++- net/bluetooth/af_bluetooth.c | 5 ++--- net/bluetooth/l2cap.c | 5 +++-- net/bluetooth/sco.c | 5 +++-- net/core/stream.c | 10 +++++++--- net/irda/af_irda.c | 5 +++-- net/llc/af_llc.c | 5 ++--- 7 files changed, 24 insertions(+), 16 deletions(-) (limited to 'include/net') diff --git a/include/net/sock.h b/include/net/sock.h index 982b4ecd187b..0fbae85c6d55 100644 --- a/include/net/sock.h +++ b/include/net/sock.h @@ -1166,7 +1166,10 @@ static inline int sock_queue_err_skb(struct sock *sk, struct sk_buff *skb) static inline int sock_error(struct sock *sk) { - int err = xchg(&sk->sk_err, 0); + int err; + if (likely(!sk->sk_err)) + return 0; + err = xchg(&sk->sk_err, 0); return -err; } diff --git a/net/bluetooth/af_bluetooth.c b/net/bluetooth/af_bluetooth.c index ea616e3fc98e..fb031fe9be9e 100644 --- a/net/bluetooth/af_bluetooth.c +++ b/net/bluetooth/af_bluetooth.c @@ -287,10 +287,9 @@ int bt_sock_wait_state(struct sock *sk, int state, unsigned long timeo) timeo = schedule_timeout(timeo); lock_sock(sk); - if (sk->sk_err) { - err = sock_error(sk); + err = sock_error(sk); + if (err) break; - } } set_current_state(TASK_RUNNING); remove_wait_queue(sk->sk_sleep, &wait); diff --git a/net/bluetooth/l2cap.c b/net/bluetooth/l2cap.c index e3bb11ca4235..95f33cc7a24e 100644 --- a/net/bluetooth/l2cap.c +++ b/net/bluetooth/l2cap.c @@ -767,8 +767,9 @@ static int l2cap_sock_sendmsg(struct kiocb *iocb, struct socket *sock, struct ms BT_DBG("sock %p, sk %p", sock, sk); - if (sk->sk_err) - return sock_error(sk); + err = sock_error(sk); + if (err) + return err; if (msg->msg_flags & MSG_OOB) return -EOPNOTSUPP; diff --git a/net/bluetooth/sco.c b/net/bluetooth/sco.c index 9cb00dc6c08c..648181430699 100644 --- a/net/bluetooth/sco.c +++ b/net/bluetooth/sco.c @@ -637,8 +637,9 @@ static int sco_sock_sendmsg(struct kiocb *iocb, struct socket *sock, BT_DBG("sock %p, sk %p", sock, sk); - if (sk->sk_err) - return sock_error(sk); + err = sock_error(sk); + if (err) + return err; if (msg->msg_flags & MSG_OOB) return -EOPNOTSUPP; diff --git a/net/core/stream.c b/net/core/stream.c index 15bfd03e8024..35e25259fd95 100644 --- a/net/core/stream.c +++ b/net/core/stream.c @@ -55,8 +55,9 @@ int sk_stream_wait_connect(struct sock *sk, long *timeo_p) int done; do { - if (sk->sk_err) - return sock_error(sk); + int err = sock_error(sk); + if (err) + return err; if ((1 << sk->sk_state) & ~(TCPF_SYN_SENT | TCPF_SYN_RECV)) return -EPIPE; if (!*timeo_p) @@ -67,6 +68,7 @@ int sk_stream_wait_connect(struct sock *sk, long *timeo_p) prepare_to_wait(sk->sk_sleep, &wait, TASK_INTERRUPTIBLE); sk->sk_write_pending++; done = sk_wait_event(sk, timeo_p, + !sk->sk_err && !((1 << sk->sk_state) & ~(TCPF_ESTABLISHED | TCPF_CLOSE_WAIT))); finish_wait(sk->sk_sleep, &wait); @@ -137,7 +139,9 @@ int sk_stream_wait_memory(struct sock *sk, long *timeo_p) set_bit(SOCK_NOSPACE, &sk->sk_socket->flags); sk->sk_write_pending++; - sk_wait_event(sk, ¤t_timeo, sk_stream_memory_free(sk) && + sk_wait_event(sk, ¤t_timeo, !sk->sk_err && + !(sk->sk_shutdown & SEND_SHUTDOWN) && + sk_stream_memory_free(sk) && vm_wait); sk->sk_write_pending--; diff --git a/net/irda/af_irda.c b/net/irda/af_irda.c index 6f92f9c62990..f121f7de2032 100644 --- a/net/irda/af_irda.c +++ b/net/irda/af_irda.c @@ -1438,8 +1438,9 @@ static int irda_recvmsg_stream(struct kiocb *iocb, struct socket *sock, /* * POSIX 1003.1g mandates this order. */ - if (sk->sk_err) - ret = sock_error(sk); + ret = sock_error(sk); + if (ret) + break; else if (sk->sk_shutdown & RCV_SHUTDOWN) ; else if (noblock) diff --git a/net/llc/af_llc.c b/net/llc/af_llc.c index c3f0b0783453..b6d3df5c911c 100644 --- a/net/llc/af_llc.c +++ b/net/llc/af_llc.c @@ -566,10 +566,9 @@ static int llc_wait_data(struct sock *sk, long timeo) /* * POSIX 1003.1g mandates this order. */ - if (sk->sk_err) { - rc = sock_error(sk); + rc = sock_error(sk); + if (rc) break; - } rc = 0; if (sk->sk_shutdown & RCV_SHUTDOWN) break; -- cgit v1.2.3 From b9750ce13c08aa8a71a9b138d741f3046aefd991 Mon Sep 17 00:00:00 2001 From: Arnaldo Carvalho de Melo Date: Tue, 13 Dec 2005 23:22:54 -0800 Subject: [IPV6]: Generalise some functions Using sk->sk_protocol instead of IPPROTO_TCP. Will be used by DCCPv6 in the next changesets. Signed-off-by: Arnaldo Carvalho de Melo Signed-off-by: David S. Miller --- include/linux/ipv6.h | 2 + include/net/inet6_connection_sock.h | 13 ++- net/ipv6/af_inet6.c | 52 ++++++++++++ net/ipv6/inet6_connection_sock.c | 103 ++++++++++++++++++++++++ net/ipv6/tcp_ipv6.c | 153 +----------------------------------- 5 files changed, 173 insertions(+), 150 deletions(-) (limited to 'include/net') diff --git a/include/linux/ipv6.h b/include/linux/ipv6.h index 7d3e86d9576e..69a0decfbdf4 100644 --- a/include/linux/ipv6.h +++ b/include/linux/ipv6.h @@ -297,6 +297,8 @@ struct tcp6_sock { struct ipv6_pinfo inet6; }; +extern int inet6_sk_rebuild_header(struct sock *sk); + #if defined(CONFIG_IPV6) || defined(CONFIG_IPV6_MODULE) static inline struct ipv6_pinfo * inet6_sk(const struct sock *__sk) { diff --git a/include/net/inet6_connection_sock.h b/include/net/inet6_connection_sock.h index aa30ebde70dc..b33b438bffcc 100644 --- a/include/net/inet6_connection_sock.h +++ b/include/net/inet6_connection_sock.h @@ -15,8 +15,15 @@ #include -struct sock; +struct in6_addr; +struct inet_bind_bucket; struct request_sock; +struct sk_buff; +struct sock; +struct sockaddr; + +extern int inet6_csk_bind_conflict(const struct sock *sk, + const struct inet_bind_bucket *tb); extern struct request_sock *inet6_csk_search_req(const struct sock *sk, struct request_sock ***prevp, @@ -28,4 +35,8 @@ extern struct request_sock *inet6_csk_search_req(const struct sock *sk, extern void inet6_csk_reqsk_queue_hash_add(struct sock *sk, struct request_sock *req, const unsigned long timeout); + +extern void inet6_csk_addr2sockaddr(struct sock *sk, struct sockaddr *uaddr); + +extern int inet6_csk_xmit(struct sk_buff *skb, int ipfragok); #endif /* _INET6_CONNECTION_SOCK_H */ diff --git a/net/ipv6/af_inet6.c b/net/ipv6/af_inet6.c index d9546380fa04..fd040e9a1f47 100644 --- a/net/ipv6/af_inet6.c +++ b/net/ipv6/af_inet6.c @@ -609,6 +609,58 @@ inet6_unregister_protosw(struct inet_protosw *p) } } +int inet6_sk_rebuild_header(struct sock *sk) +{ + int err; + struct dst_entry *dst; + struct ipv6_pinfo *np = inet6_sk(sk); + + dst = __sk_dst_check(sk, np->dst_cookie); + + if (dst == NULL) { + struct inet_sock *inet = inet_sk(sk); + struct in6_addr *final_p = NULL, final; + struct flowi fl; + + memset(&fl, 0, sizeof(fl)); + fl.proto = sk->sk_protocol; + ipv6_addr_copy(&fl.fl6_dst, &np->daddr); + ipv6_addr_copy(&fl.fl6_src, &np->saddr); + fl.fl6_flowlabel = np->flow_label; + fl.oif = sk->sk_bound_dev_if; + fl.fl_ip_dport = inet->dport; + fl.fl_ip_sport = inet->sport; + + if (np->opt && np->opt->srcrt) { + struct rt0_hdr *rt0 = (struct rt0_hdr *) np->opt->srcrt; + ipv6_addr_copy(&final, &fl.fl6_dst); + ipv6_addr_copy(&fl.fl6_dst, rt0->addr); + final_p = &final; + } + + err = ip6_dst_lookup(sk, &dst, &fl); + if (err) { + sk->sk_route_caps = 0; + return err; + } + if (final_p) + ipv6_addr_copy(&fl.fl6_dst, final_p); + + if ((err = xfrm_lookup(&dst, &fl, sk, 0)) < 0) { + sk->sk_err_soft = -err; + return err; + } + + ip6_dst_store(sk, dst, NULL); + sk->sk_route_caps = dst->dev->features & + ~(NETIF_F_IP_CSUM | NETIF_F_TSO); + } + + return 0; +} + +EXPORT_SYMBOL_GPL(inet6_sk_rebuild_header); + int snmp6_mib_init(void *ptr[2], size_t mibsize, size_t mibalign) { diff --git a/net/ipv6/inet6_connection_sock.c b/net/ipv6/inet6_connection_sock.c index fe874eeaa40c..792f90f0f9ec 100644 --- a/net/ipv6/inet6_connection_sock.c +++ b/net/ipv6/inet6_connection_sock.c @@ -21,8 +21,34 @@ #include #include +#include +#include +#include #include +int inet6_csk_bind_conflict(const struct sock *sk, + const struct inet_bind_bucket *tb) +{ + const struct sock *sk2; + const struct hlist_node *node; + + /* We must walk the whole port owner list in this case. -DaveM */ + sk_for_each_bound(sk2, node, &tb->owners) { + if (sk != sk2 && + (!sk->sk_bound_dev_if || + !sk2->sk_bound_dev_if || + sk->sk_bound_dev_if == sk2->sk_bound_dev_if) && + (!sk->sk_reuse || !sk2->sk_reuse || + sk2->sk_state == TCP_LISTEN) && + ipv6_rcv_saddr_equal(sk, sk2)) + break; + } + + return node != NULL; +} + +EXPORT_SYMBOL_GPL(inet6_csk_bind_conflict); + /* * request_sock (formerly open request) hash tables. */ @@ -94,3 +120,80 @@ void inet6_csk_reqsk_queue_hash_add(struct sock *sk, } EXPORT_SYMBOL_GPL(inet6_csk_reqsk_queue_hash_add); + +void inet6_csk_addr2sockaddr(struct sock *sk, struct sockaddr * uaddr) +{ + struct ipv6_pinfo *np = inet6_sk(sk); + struct sockaddr_in6 *sin6 = (struct sockaddr_in6 *) uaddr; + + sin6->sin6_family = AF_INET6; + ipv6_addr_copy(&sin6->sin6_addr, &np->daddr); + sin6->sin6_port = inet_sk(sk)->dport; + /* We do not store received flowlabel for TCP */ + sin6->sin6_flowinfo = 0; + sin6->sin6_scope_id = 0; + if (sk->sk_bound_dev_if && + ipv6_addr_type(&sin6->sin6_addr) & IPV6_ADDR_LINKLOCAL) + sin6->sin6_scope_id = sk->sk_bound_dev_if; +} + +EXPORT_SYMBOL_GPL(inet6_csk_addr2sockaddr); + +int inet6_csk_xmit(struct sk_buff *skb, int ipfragok) +{ + struct sock *sk = skb->sk; + struct inet_sock *inet = inet_sk(sk); + struct ipv6_pinfo *np = inet6_sk(sk); + struct flowi fl; + struct dst_entry *dst; + struct in6_addr *final_p = NULL, final; + + memset(&fl, 0, sizeof(fl)); + fl.proto = sk->sk_protocol; + ipv6_addr_copy(&fl.fl6_dst, &np->daddr); + ipv6_addr_copy(&fl.fl6_src, &np->saddr); + fl.fl6_flowlabel = np->flow_label; + IP6_ECN_flow_xmit(sk, fl.fl6_flowlabel); + fl.oif = sk->sk_bound_dev_if; + fl.fl_ip_sport = inet->sport; + fl.fl_ip_dport = inet->dport; + + if (np->opt && np->opt->srcrt) { + struct rt0_hdr *rt0 = (struct rt0_hdr *)np->opt->srcrt; + ipv6_addr_copy(&final, &fl.fl6_dst); + ipv6_addr_copy(&fl.fl6_dst, rt0->addr); + final_p = &final; + } + + dst = __sk_dst_check(sk, np->dst_cookie); + + if (dst == NULL) { + int err = ip6_dst_lookup(sk, &dst, &fl); + + if (err) { + sk->sk_err_soft = -err; + return err; + } + + if (final_p) + ipv6_addr_copy(&fl.fl6_dst, final_p); + + if ((err = xfrm_lookup(&dst, &fl, sk, 0)) < 0) { + sk->sk_route_caps = 0; + return err; + } + + ip6_dst_store(sk, dst, NULL); + sk->sk_route_caps = dst->dev->features & + ~(NETIF_F_IP_CSUM | NETIF_F_TSO); + } + + skb->dst = dst_clone(dst); + + /* Restore final destination back after routing done */ + ipv6_addr_copy(&fl.fl6_dst, &np->daddr); + + return ip6_xmit(sk, skb, &fl, np->opt, 0); +} + +EXPORT_SYMBOL_GPL(inet6_csk_xmit); diff --git a/net/ipv6/tcp_ipv6.c b/net/ipv6/tcp_ipv6.c index 8ce8a1359d2b..2f932ce72610 100644 --- a/net/ipv6/tcp_ipv6.c +++ b/net/ipv6/tcp_ipv6.c @@ -72,32 +72,10 @@ static void tcp_v6_send_check(struct sock *sk, int len, struct sk_buff *skb); static int tcp_v6_do_rcv(struct sock *sk, struct sk_buff *skb); -static int tcp_v6_xmit(struct sk_buff *skb, int ipfragok); static struct inet_connection_sock_af_ops ipv6_mapped; static struct inet_connection_sock_af_ops ipv6_specific; -int inet6_csk_bind_conflict(const struct sock *sk, - const struct inet_bind_bucket *tb) -{ - const struct sock *sk2; - const struct hlist_node *node; - - /* We must walk the whole port owner list in this case. -DaveM */ - sk_for_each_bound(sk2, node, &tb->owners) { - if (sk != sk2 && - (!sk->sk_bound_dev_if || - !sk2->sk_bound_dev_if || - sk->sk_bound_dev_if == sk2->sk_bound_dev_if) && - (!sk->sk_reuse || !sk2->sk_reuse || - sk2->sk_state == TCP_LISTEN) && - ipv6_rcv_saddr_equal(sk, sk2)) - break; - } - - return node != NULL; -} - static int tcp_v6_get_port(struct sock *sk, unsigned short snum) { return inet_csk_get_port(&tcp_hashinfo, sk, snum, @@ -1500,129 +1478,6 @@ do_time_wait: goto discard_it; } -static int tcp_v6_rebuild_header(struct sock *sk) -{ - int err; - struct dst_entry *dst; - struct ipv6_pinfo *np = inet6_sk(sk); - - dst = __sk_dst_check(sk, np->dst_cookie); - - if (dst == NULL) { - struct inet_sock *inet = inet_sk(sk); - struct in6_addr *final_p = NULL, final; - struct flowi fl; - - memset(&fl, 0, sizeof(fl)); - fl.proto = IPPROTO_TCP; - ipv6_addr_copy(&fl.fl6_dst, &np->daddr); - ipv6_addr_copy(&fl.fl6_src, &np->saddr); - fl.fl6_flowlabel = np->flow_label; - fl.oif = sk->sk_bound_dev_if; - fl.fl_ip_dport = inet->dport; - fl.fl_ip_sport = inet->sport; - - if (np->opt && np->opt->srcrt) { - struct rt0_hdr *rt0 = (struct rt0_hdr *) np->opt->srcrt; - ipv6_addr_copy(&final, &fl.fl6_dst); - ipv6_addr_copy(&fl.fl6_dst, rt0->addr); - final_p = &final; - } - - err = ip6_dst_lookup(sk, &dst, &fl); - if (err) { - sk->sk_route_caps = 0; - return err; - } - if (final_p) - ipv6_addr_copy(&fl.fl6_dst, final_p); - - if ((err = xfrm_lookup(&dst, &fl, sk, 0)) < 0) { - sk->sk_err_soft = -err; - return err; - } - - ip6_dst_store(sk, dst, NULL); - sk->sk_route_caps = dst->dev->features & - ~(NETIF_F_IP_CSUM | NETIF_F_TSO); - } - - return 0; -} - -static int tcp_v6_xmit(struct sk_buff *skb, int ipfragok) -{ - struct sock *sk = skb->sk; - struct inet_sock *inet = inet_sk(sk); - struct ipv6_pinfo *np = inet6_sk(sk); - struct flowi fl; - struct dst_entry *dst; - struct in6_addr *final_p = NULL, final; - - memset(&fl, 0, sizeof(fl)); - fl.proto = IPPROTO_TCP; - ipv6_addr_copy(&fl.fl6_dst, &np->daddr); - ipv6_addr_copy(&fl.fl6_src, &np->saddr); - fl.fl6_flowlabel = np->flow_label; - IP6_ECN_flow_xmit(sk, fl.fl6_flowlabel); - fl.oif = sk->sk_bound_dev_if; - fl.fl_ip_sport = inet->sport; - fl.fl_ip_dport = inet->dport; - - if (np->opt && np->opt->srcrt) { - struct rt0_hdr *rt0 = (struct rt0_hdr *) np->opt->srcrt; - ipv6_addr_copy(&final, &fl.fl6_dst); - ipv6_addr_copy(&fl.fl6_dst, rt0->addr); - final_p = &final; - } - - dst = __sk_dst_check(sk, np->dst_cookie); - - if (dst == NULL) { - int err = ip6_dst_lookup(sk, &dst, &fl); - - if (err) { - sk->sk_err_soft = -err; - return err; - } - - if (final_p) - ipv6_addr_copy(&fl.fl6_dst, final_p); - - if ((err = xfrm_lookup(&dst, &fl, sk, 0)) < 0) { - sk->sk_route_caps = 0; - return err; - } - - ip6_dst_store(sk, dst, NULL); - sk->sk_route_caps = dst->dev->features & - ~(NETIF_F_IP_CSUM | NETIF_F_TSO); - } - - skb->dst = dst_clone(dst); - - /* Restore final destination back after routing done */ - ipv6_addr_copy(&fl.fl6_dst, &np->daddr); - - return ip6_xmit(sk, skb, &fl, np->opt, 0); -} - -static void v6_addr2sockaddr(struct sock *sk, struct sockaddr * uaddr) -{ - struct ipv6_pinfo *np = inet6_sk(sk); - struct sockaddr_in6 *sin6 = (struct sockaddr_in6 *) uaddr; - - sin6->sin6_family = AF_INET6; - ipv6_addr_copy(&sin6->sin6_addr, &np->daddr); - sin6->sin6_port = inet_sk(sk)->dport; - /* We do not store received flowlabel for TCP */ - sin6->sin6_flowinfo = 0; - sin6->sin6_scope_id = 0; - if (sk->sk_bound_dev_if && - ipv6_addr_type(&sin6->sin6_addr) & IPV6_ADDR_LINKLOCAL) - sin6->sin6_scope_id = sk->sk_bound_dev_if; -} - static int tcp_v6_remember_stamp(struct sock *sk) { /* Alas, not yet... */ @@ -1630,9 +1485,9 @@ static int tcp_v6_remember_stamp(struct sock *sk) } static struct inet_connection_sock_af_ops ipv6_specific = { - .queue_xmit = tcp_v6_xmit, + .queue_xmit = inet6_csk_xmit, .send_check = tcp_v6_send_check, - .rebuild_header = tcp_v6_rebuild_header, + .rebuild_header = inet6_sk_rebuild_header, .conn_request = tcp_v6_conn_request, .syn_recv_sock = tcp_v6_syn_recv_sock, .remember_stamp = tcp_v6_remember_stamp, @@ -1640,7 +1495,7 @@ static struct inet_connection_sock_af_ops ipv6_specific = { .setsockopt = ipv6_setsockopt, .getsockopt = ipv6_getsockopt, - .addr2sockaddr = v6_addr2sockaddr, + .addr2sockaddr = inet6_csk_addr2sockaddr, .sockaddr_len = sizeof(struct sockaddr_in6) }; @@ -1659,7 +1514,7 @@ static struct inet_connection_sock_af_ops ipv6_mapped = { .setsockopt = ipv6_setsockopt, .getsockopt = ipv6_getsockopt, - .addr2sockaddr = v6_addr2sockaddr, + .addr2sockaddr = inet6_csk_addr2sockaddr, .sockaddr_len = sizeof(struct sockaddr_in6) }; -- cgit v1.2.3 From 0fa1a53e1f055a6c790f40e7728f42a825b29248 Mon Sep 17 00:00:00 2001 From: Arnaldo Carvalho de Melo Date: Tue, 13 Dec 2005 23:23:09 -0800 Subject: [IPV6]: Introduce inet6_timewait_sock Out of tcp6_timewait_sock, that now is just an aggregation of inet_timewait_sock and inet6_timewait_sock, using tw_ipv6_offset in struct inet_timewait_sock, that is common to the IPv6 transport protocols that use timewait sockets, like DCCP and TCP. tw_ipv6_offset plays the struct inet_sock pinfo6 role, i.e. for the generic code to find the IPv6 area in a timewait sock. Signed-off-by: Arnaldo Carvalho de Melo Signed-off-by: David S. Miller --- include/linux/ipv6.h | 32 +++++++++++++++++++++----------- include/net/inet6_hashtables.h | 6 +++--- include/net/inet_timewait_sock.h | 3 ++- net/ipv4/inet_diag.c | 6 +++--- net/ipv4/tcp_minisocks.c | 8 +++++--- net/ipv6/addrconf.c | 2 +- net/ipv6/tcp_ipv6.c | 12 ++++++------ 7 files changed, 41 insertions(+), 28 deletions(-) (limited to 'include/net') diff --git a/include/linux/ipv6.h b/include/linux/ipv6.h index 69a0decfbdf4..7d3908594fac 100644 --- a/include/linux/ipv6.h +++ b/include/linux/ipv6.h @@ -348,26 +348,36 @@ static inline void inet_sk_copy_descendant(struct sock *sk_to, #include +struct inet6_timewait_sock { + struct in6_addr tw_v6_daddr; + struct in6_addr tw_v6_rcv_saddr; +}; + struct tcp6_timewait_sock { - struct tcp_timewait_sock tw_v6_sk; - struct in6_addr tw_v6_daddr; - struct in6_addr tw_v6_rcv_saddr; + struct tcp_timewait_sock tcp6tw_tcp; + struct inet6_timewait_sock tcp6tw_inet6; }; -static inline struct tcp6_timewait_sock *tcp6_twsk(const struct sock *sk) +static inline u16 inet6_tw_offset(const struct proto *prot) +{ + return prot->twsk_obj_size - sizeof(struct inet6_timewait_sock); +} + +static inline struct inet6_timewait_sock *inet6_twsk(const struct sock *sk) { - return (struct tcp6_timewait_sock *)sk; + return (struct inet6_timewait_sock *)(((u8 *)sk) + + inet_twsk(sk)->tw_ipv6_offset); } -static inline struct in6_addr *__tcp_v6_rcv_saddr(const struct sock *sk) +static inline struct in6_addr *__inet6_rcv_saddr(const struct sock *sk) { return likely(sk->sk_state != TCP_TIME_WAIT) ? - &inet6_sk(sk)->rcv_saddr : &tcp6_twsk(sk)->tw_v6_rcv_saddr; + &inet6_sk(sk)->rcv_saddr : &inet6_twsk(sk)->tw_v6_rcv_saddr; } -static inline struct in6_addr *tcp_v6_rcv_saddr(const struct sock *sk) +static inline struct in6_addr *inet6_rcv_saddr(const struct sock *sk) { - return sk->sk_family == AF_INET6 ? __tcp_v6_rcv_saddr(sk) : NULL; + return sk->sk_family == AF_INET6 ? __inet6_rcv_saddr(sk) : NULL; } static inline int inet_v6_ipv6only(const struct sock *sk) @@ -395,8 +405,8 @@ static inline struct raw6_sock *raw6_sk(const struct sock *sk) return NULL; } -#define __tcp_v6_rcv_saddr(__sk) NULL -#define tcp_v6_rcv_saddr(__sk) NULL +#define __inet6_rcv_saddr(__sk) NULL +#define inet6_rcv_saddr(__sk) NULL #define tcp_twsk_ipv6only(__sk) 0 #define inet_v6_ipv6only(__sk) 0 #endif /* defined(CONFIG_IPV6) || defined(CONFIG_IPV6_MODULE) */ diff --git a/include/net/inet6_hashtables.h b/include/net/inet6_hashtables.h index a4a204f99ea6..25f708ff020e 100644 --- a/include/net/inet6_hashtables.h +++ b/include/net/inet6_hashtables.h @@ -110,10 +110,10 @@ static inline struct sock * if(*((__u32 *)&(tw->tw_dport)) == ports && sk->sk_family == PF_INET6) { - const struct tcp6_timewait_sock *tcp6tw = tcp6_twsk(sk); + const struct inet6_timewait_sock *tw6 = inet6_twsk(sk); - if (ipv6_addr_equal(&tcp6tw->tw_v6_daddr, saddr) && - ipv6_addr_equal(&tcp6tw->tw_v6_rcv_saddr, daddr) && + if (ipv6_addr_equal(&tw6->tw_v6_daddr, saddr) && + ipv6_addr_equal(&tw6->tw_v6_rcv_saddr, daddr) && (!sk->sk_bound_dev_if || sk->sk_bound_dev_if == dif)) goto hit; } diff --git a/include/net/inet_timewait_sock.h b/include/net/inet_timewait_sock.h index 28f7b2103505..ca240f856c46 100644 --- a/include/net/inet_timewait_sock.h +++ b/include/net/inet_timewait_sock.h @@ -127,7 +127,8 @@ struct inet_timewait_sock { __u16 tw_num; /* And these are ours. */ __u8 tw_ipv6only:1; - /* 31 bits hole, try to pack */ + /* 15 bits hole, try to pack */ + __u16 tw_ipv6_offset; int tw_timeout; unsigned long tw_ttd; struct inet_bind_bucket *tw_tb; diff --git a/net/ipv4/inet_diag.c b/net/ipv4/inet_diag.c index 3ce73b141d7e..c49908192047 100644 --- a/net/ipv4/inet_diag.c +++ b/net/ipv4/inet_diag.c @@ -112,12 +112,12 @@ static int inet_diag_fill(struct sk_buff *skb, struct sock *sk, r->idiag_inode = 0; #if defined(CONFIG_IPV6) || defined (CONFIG_IPV6_MODULE) if (r->idiag_family == AF_INET6) { - const struct tcp6_timewait_sock *tcp6tw = tcp6_twsk(sk); + const struct inet6_timewait_sock *tw6 = inet6_twsk(sk); ipv6_addr_copy((struct in6_addr *)r->id.idiag_src, - &tcp6tw->tw_v6_rcv_saddr); + &tw6->tw_v6_rcv_saddr); ipv6_addr_copy((struct in6_addr *)r->id.idiag_dst, - &tcp6tw->tw_v6_daddr); + &tw6->tw_v6_daddr); } #endif nlh->nlmsg_len = skb->tail - b; diff --git a/net/ipv4/tcp_minisocks.c b/net/ipv4/tcp_minisocks.c index 9c029683a626..2b9b7f6c7f7c 100644 --- a/net/ipv4/tcp_minisocks.c +++ b/net/ipv4/tcp_minisocks.c @@ -298,10 +298,12 @@ void tcp_time_wait(struct sock *sk, int state, int timeo) #if defined(CONFIG_IPV6) || defined(CONFIG_IPV6_MODULE) if (tw->tw_family == PF_INET6) { struct ipv6_pinfo *np = inet6_sk(sk); - struct tcp6_timewait_sock *tcp6tw = tcp6_twsk((struct sock *)tw); + struct inet6_timewait_sock *tw6; - ipv6_addr_copy(&tcp6tw->tw_v6_daddr, &np->daddr); - ipv6_addr_copy(&tcp6tw->tw_v6_rcv_saddr, &np->rcv_saddr); + tw->tw_ipv6_offset = inet6_tw_offset(sk->sk_prot); + tw6 = inet6_twsk((struct sock *)tw); + ipv6_addr_copy(&tw6->tw_v6_daddr, &np->daddr); + ipv6_addr_copy(&tw6->tw_v6_rcv_saddr, &np->rcv_saddr); tw->tw_ipv6only = np->ipv6only; } #endif diff --git a/net/ipv6/addrconf.c b/net/ipv6/addrconf.c index a60585fd85ad..704fb73e6c5f 100644 --- a/net/ipv6/addrconf.c +++ b/net/ipv6/addrconf.c @@ -1195,7 +1195,7 @@ struct inet6_ifaddr * ipv6_get_ifaddr(struct in6_addr *addr, struct net_device * int ipv6_rcv_saddr_equal(const struct sock *sk, const struct sock *sk2) { const struct in6_addr *sk_rcv_saddr6 = &inet6_sk(sk)->rcv_saddr; - const struct in6_addr *sk2_rcv_saddr6 = tcp_v6_rcv_saddr(sk2); + const struct in6_addr *sk2_rcv_saddr6 = inet6_rcv_saddr(sk2); u32 sk_rcv_saddr = inet_sk(sk)->rcv_saddr; u32 sk2_rcv_saddr = inet_rcv_saddr(sk2); int sk_ipv6only = ipv6_only_sock(sk); diff --git a/net/ipv6/tcp_ipv6.c b/net/ipv6/tcp_ipv6.c index 2f932ce72610..cb880079daf3 100644 --- a/net/ipv6/tcp_ipv6.c +++ b/net/ipv6/tcp_ipv6.c @@ -138,14 +138,14 @@ static int __tcp_v6_check_established(struct sock *sk, const __u16 lport, /* Check TIME-WAIT sockets first. */ sk_for_each(sk2, node, &(head + tcp_hashinfo.ehash_size)->chain) { - const struct tcp6_timewait_sock *tcp6tw = tcp6_twsk(sk2); + const struct inet6_timewait_sock *tw6 = inet6_twsk(sk2); tw = inet_twsk(sk2); if(*((__u32 *)&(tw->tw_dport)) == ports && sk2->sk_family == PF_INET6 && - ipv6_addr_equal(&tcp6tw->tw_v6_daddr, saddr) && - ipv6_addr_equal(&tcp6tw->tw_v6_rcv_saddr, daddr) && + ipv6_addr_equal(&tw6->tw_v6_daddr, saddr) && + ipv6_addr_equal(&tw6->tw_v6_rcv_saddr, daddr) && sk2->sk_bound_dev_if == sk->sk_bound_dev_if) { const struct tcp_timewait_sock *tcptw = tcp_twsk(sk2); struct tcp_sock *tp = tcp_sk(sk); @@ -1663,14 +1663,14 @@ static void get_timewait6_sock(struct seq_file *seq, { struct in6_addr *dest, *src; __u16 destp, srcp; - struct tcp6_timewait_sock *tcp6tw = tcp6_twsk((struct sock *)tw); + struct inet6_timewait_sock *tw6 = inet6_twsk((struct sock *)tw); int ttd = tw->tw_ttd - jiffies; if (ttd < 0) ttd = 0; - dest = &tcp6tw->tw_v6_daddr; - src = &tcp6tw->tw_v6_rcv_saddr; + dest = &tw6->tw_v6_daddr; + src = &tw6->tw_v6_rcv_saddr; destp = ntohs(tw->tw_dport); srcp = ntohs(tw->tw_sport); -- cgit v1.2.3 From 399c07def62a77678d633f5b3005431423a424a8 Mon Sep 17 00:00:00 2001 From: Arnaldo Carvalho de Melo Date: Tue, 13 Dec 2005 23:24:28 -0800 Subject: [IPV6]: Export ipv6_opt_accepted It was already non-TCP specific, will be used by DCCPv6. Signed-off-by: Arnaldo Carvalho de Melo Signed-off-by: David S. Miller --- include/net/ipv6.h | 2 ++ net/ipv6/af_inet6.c | 21 +++++++++++++++++++++ net/ipv6/tcp_ipv6.c | 16 ---------------- 3 files changed, 23 insertions(+), 16 deletions(-) (limited to 'include/net') diff --git a/include/net/ipv6.h b/include/net/ipv6.h index 0a2ad51cff82..851376108ac2 100644 --- a/include/net/ipv6.h +++ b/include/net/ipv6.h @@ -240,6 +240,8 @@ extern struct ipv6_txoptions * ipv6_renew_options(struct sock *sk, struct ipv6_t struct ipv6_txoptions *ipv6_fixup_options(struct ipv6_txoptions *opt_space, struct ipv6_txoptions *opt); +extern int ipv6_opt_accepted(struct sock *sk, struct sk_buff *skb); + extern int ip6_frag_nqueues; extern atomic_t ip6_frag_mem; diff --git a/net/ipv6/af_inet6.c b/net/ipv6/af_inet6.c index 23675ef1f428..bf17aab9b776 100644 --- a/net/ipv6/af_inet6.c +++ b/net/ipv6/af_inet6.c @@ -663,6 +663,27 @@ int inet6_sk_rebuild_header(struct sock *sk) EXPORT_SYMBOL_GPL(inet6_sk_rebuild_header); +int ipv6_opt_accepted(struct sock *sk, struct sk_buff *skb) +{ + struct ipv6_pinfo *np = inet6_sk(sk); + struct inet6_skb_parm *opt = IP6CB(skb); + + if (np->rxopt.all) { + if ((opt->hop && (np->rxopt.bits.hopopts || + np->rxopt.bits.ohopopts)) || + ((IPV6_FLOWINFO_MASK & *(u32*)skb->nh.raw) && + np->rxopt.bits.rxflow) || + (opt->srcrt && (np->rxopt.bits.srcrt || + np->rxopt.bits.osrcrt)) || + ((opt->dst1 || opt->dst0) && + (np->rxopt.bits.dstopts || np->rxopt.bits.odstopts))) + return 1; + } + return 0; +} + +EXPORT_SYMBOL_GPL(ipv6_opt_accepted); + int snmp6_mib_init(void *ptr[2], size_t mibsize, size_t mibalign) { diff --git a/net/ipv6/tcp_ipv6.c b/net/ipv6/tcp_ipv6.c index cb880079daf3..e5c8a669e84e 100644 --- a/net/ipv6/tcp_ipv6.c +++ b/net/ipv6/tcp_ipv6.c @@ -711,22 +711,6 @@ static struct request_sock_ops tcp6_request_sock_ops = { .send_reset = tcp_v6_send_reset }; -static int ipv6_opt_accepted(struct sock *sk, struct sk_buff *skb) -{ - struct ipv6_pinfo *np = inet6_sk(sk); - struct inet6_skb_parm *opt = IP6CB(skb); - - if (np->rxopt.all) { - if ((opt->hop && (np->rxopt.bits.hopopts || np->rxopt.bits.ohopopts)) || - ((IPV6_FLOWINFO_MASK & *(u32*)skb->nh.raw) && np->rxopt.bits.rxflow) || - (opt->srcrt && (np->rxopt.bits.srcrt || np->rxopt.bits.osrcrt)) || - ((opt->dst1 || opt->dst0) && (np->rxopt.bits.dstopts || np->rxopt.bits.odstopts))) - return 1; - } - return 0; -} - - static void tcp_v6_send_check(struct sock *sk, int len, struct sk_buff *skb) { struct ipv6_pinfo *np = inet6_sk(sk); -- cgit v1.2.3 From 6d6ee43e0b8b8d4847627fd43739b98ec2b9404f Mon Sep 17 00:00:00 2001 From: Arnaldo Carvalho de Melo Date: Tue, 13 Dec 2005 23:25:19 -0800 Subject: [TWSK]: Introduce struct timewait_sock_ops So that we can share several timewait sockets related functions and make the timewait mini sockets infrastructure closer to the request mini sockets one. Next changesets will take advantage of this, moving more code out of TCP and DCCP v4 and v6 to common infrastructure. Signed-off-by: Arnaldo Carvalho de Melo Signed-off-by: David S. Miller --- include/linux/ipv6.h | 3 +- include/net/inet_timewait_sock.h | 3 +- include/net/sock.h | 4 +-- include/net/tcp.h | 3 ++ include/net/timewait_sock.h | 31 ++++++++++++++++++ net/core/sock.c | 21 ++++++------ net/dccp/ipv4.c | 9 ++++- net/dccp/ipv6.c | 6 +++- net/ipv4/inet_timewait_sock.c | 5 +-- net/ipv4/tcp_ipv4.c | 71 ++++++++++++++++++++++++---------------- net/ipv6/tcp_ipv6.c | 25 +++++--------- 11 files changed, 118 insertions(+), 63 deletions(-) create mode 100644 include/net/timewait_sock.h (limited to 'include/net') diff --git a/include/linux/ipv6.h b/include/linux/ipv6.h index 7d3908594fac..a0d04891fe12 100644 --- a/include/linux/ipv6.h +++ b/include/linux/ipv6.h @@ -360,7 +360,8 @@ struct tcp6_timewait_sock { static inline u16 inet6_tw_offset(const struct proto *prot) { - return prot->twsk_obj_size - sizeof(struct inet6_timewait_sock); + return prot->twsk_prot->twsk_obj_size - + sizeof(struct inet6_timewait_sock); } static inline struct inet6_timewait_sock *inet6_twsk(const struct sock *sk) diff --git a/include/net/inet_timewait_sock.h b/include/net/inet_timewait_sock.h index ca240f856c46..e396a65473d7 100644 --- a/include/net/inet_timewait_sock.h +++ b/include/net/inet_timewait_sock.h @@ -26,6 +26,7 @@ #include #include +#include #include @@ -200,7 +201,7 @@ static inline void inet_twsk_put(struct inet_timewait_sock *tw) printk(KERN_DEBUG "%s timewait_sock %p released\n", tw->tw_prot->name, tw); #endif - kmem_cache_free(tw->tw_prot->twsk_slab, tw); + kmem_cache_free(tw->tw_prot->twsk_prot->twsk_slab, tw); module_put(owner); } } diff --git a/include/net/sock.h b/include/net/sock.h index 0fbae85c6d55..91d28957dc10 100644 --- a/include/net/sock.h +++ b/include/net/sock.h @@ -493,6 +493,7 @@ extern void sk_stream_kill_queues(struct sock *sk); extern int sk_wait_data(struct sock *sk, long *timeo); struct request_sock_ops; +struct timewait_sock_ops; /* Networking protocol blocks we attach to sockets. * socket layer -> transport layer interface @@ -557,11 +558,10 @@ struct proto { kmem_cache_t *slab; unsigned int obj_size; - kmem_cache_t *twsk_slab; - unsigned int twsk_obj_size; atomic_t *orphan_count; struct request_sock_ops *rsk_prot; + struct timewait_sock_ops *twsk_prot; struct module *owner; diff --git a/include/net/tcp.h b/include/net/tcp.h index 83b117a25c2a..176221cd0cce 100644 --- a/include/net/tcp.h +++ b/include/net/tcp.h @@ -287,6 +287,9 @@ extern int tcp_rcv_established(struct sock *sk, extern void tcp_rcv_space_adjust(struct sock *sk); +extern int tcp_twsk_unique(struct sock *sk, + struct sock *sktw, void *twp); + static inline void tcp_dec_quickack_mode(struct sock *sk, const unsigned int pkts) { diff --git a/include/net/timewait_sock.h b/include/net/timewait_sock.h new file mode 100644 index 000000000000..2544281e1d5e --- /dev/null +++ b/include/net/timewait_sock.h @@ -0,0 +1,31 @@ +/* + * NET Generic infrastructure for Network protocols. + * + * Authors: Arnaldo Carvalho de Melo + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * as published by the Free Software Foundation; either version + * 2 of the License, or (at your option) any later version. + */ +#ifndef _TIMEWAIT_SOCK_H +#define _TIMEWAIT_SOCK_H + +#include +#include + +struct timewait_sock_ops { + kmem_cache_t *twsk_slab; + unsigned int twsk_obj_size; + int (*twsk_unique)(struct sock *sk, + struct sock *sktw, void *twp); +}; + +static inline int twsk_unique(struct sock *sk, struct sock *sktw, void *twp) +{ + if (sk->sk_prot->twsk_prot->twsk_unique != NULL) + return sk->sk_prot->twsk_prot->twsk_unique(sk, sktw, twp); + return 0; +} + +#endif /* _TIMEWAIT_SOCK_H */ diff --git a/net/core/sock.c b/net/core/sock.c index 13cc3be4f056..6465b0e4c8cb 100644 --- a/net/core/sock.c +++ b/net/core/sock.c @@ -1488,7 +1488,7 @@ int proto_register(struct proto *prot, int alloc_slab) } } - if (prot->twsk_obj_size) { + if (prot->twsk_prot != NULL) { static const char mask[] = "tw_sock_%s"; timewait_sock_slab_name = kmalloc(strlen(prot->name) + sizeof(mask) - 1, GFP_KERNEL); @@ -1497,11 +1497,12 @@ int proto_register(struct proto *prot, int alloc_slab) goto out_free_request_sock_slab; sprintf(timewait_sock_slab_name, mask, prot->name); - prot->twsk_slab = kmem_cache_create(timewait_sock_slab_name, - prot->twsk_obj_size, - 0, SLAB_HWCACHE_ALIGN, - NULL, NULL); - if (prot->twsk_slab == NULL) + prot->twsk_prot->twsk_slab = + kmem_cache_create(timewait_sock_slab_name, + prot->twsk_prot->twsk_obj_size, + 0, SLAB_HWCACHE_ALIGN, + NULL, NULL); + if (prot->twsk_prot->twsk_slab == NULL) goto out_free_timewait_sock_slab_name; } } @@ -1548,12 +1549,12 @@ void proto_unregister(struct proto *prot) prot->rsk_prot->slab = NULL; } - if (prot->twsk_slab != NULL) { - const char *name = kmem_cache_name(prot->twsk_slab); + if (prot->twsk_prot != NULL && prot->twsk_prot->twsk_slab != NULL) { + const char *name = kmem_cache_name(prot->twsk_prot->twsk_slab); - kmem_cache_destroy(prot->twsk_slab); + kmem_cache_destroy(prot->twsk_prot->twsk_slab); kfree(name); - prot->twsk_slab = NULL; + prot->twsk_prot->twsk_slab = NULL; } } diff --git a/net/dccp/ipv4.c b/net/dccp/ipv4.c index bc28d71905e2..e11cda0cb6b3 100644 --- a/net/dccp/ipv4.c +++ b/net/dccp/ipv4.c @@ -20,6 +20,7 @@ #include #include #include +#include #include #include @@ -1309,6 +1310,10 @@ static struct request_sock_ops dccp_request_sock_ops = { .send_reset = dccp_v4_ctl_send_reset, }; +static struct timewait_sock_ops dccp_timewait_sock_ops = { + .twsk_obj_size = sizeof(struct inet_timewait_sock), +}; + struct proto dccp_prot = { .name = "DCCP", .owner = THIS_MODULE, @@ -1332,5 +1337,7 @@ struct proto dccp_prot = { .max_header = MAX_DCCP_HEADER, .obj_size = sizeof(struct dccp_sock), .rsk_prot = &dccp_request_sock_ops, - .twsk_obj_size = sizeof(struct inet_timewait_sock), + .twsk_prot = &dccp_timewait_sock_ops, }; + +EXPORT_SYMBOL_GPL(dccp_prot); diff --git a/net/dccp/ipv6.c b/net/dccp/ipv6.c index a7d2aee5b3af..4d078f5b911b 100644 --- a/net/dccp/ipv6.c +++ b/net/dccp/ipv6.c @@ -652,6 +652,10 @@ static struct request_sock_ops dccp6_request_sock_ops = { .send_reset = dccp_v6_ctl_send_reset, }; +static struct timewait_sock_ops dccp6_timewait_sock_ops = { + .twsk_obj_size = sizeof(struct dccp6_timewait_sock), +}; + static void dccp_v6_send_check(struct sock *sk, int len, struct sk_buff *skb) { struct ipv6_pinfo *np = inet6_sk(sk); @@ -1359,7 +1363,7 @@ static struct proto dccp_v6_prot = { .max_header = MAX_DCCP_HEADER, .obj_size = sizeof(struct dccp6_sock), .rsk_prot = &dccp6_request_sock_ops, - .twsk_obj_size = sizeof(struct dccp6_timewait_sock), + .twsk_prot = &dccp6_timewait_sock_ops, }; static struct inet6_protocol dccp_v6_protocol = { diff --git a/net/ipv4/inet_timewait_sock.c b/net/ipv4/inet_timewait_sock.c index a010e9a68811..417f126c749e 100644 --- a/net/ipv4/inet_timewait_sock.c +++ b/net/ipv4/inet_timewait_sock.c @@ -90,8 +90,9 @@ EXPORT_SYMBOL_GPL(__inet_twsk_hashdance); struct inet_timewait_sock *inet_twsk_alloc(const struct sock *sk, const int state) { - struct inet_timewait_sock *tw = kmem_cache_alloc(sk->sk_prot_creator->twsk_slab, - SLAB_ATOMIC); + struct inet_timewait_sock *tw = + kmem_cache_alloc(sk->sk_prot_creator->twsk_prot->twsk_slab, + SLAB_ATOMIC); if (tw != NULL) { const struct inet_sock *inet = inet_sk(sk); diff --git a/net/ipv4/tcp_ipv4.c b/net/ipv4/tcp_ipv4.c index 0b5ab04d3c5a..6728772a943a 100644 --- a/net/ipv4/tcp_ipv4.c +++ b/net/ipv4/tcp_ipv4.c @@ -69,6 +69,7 @@ #include #include #include +#include #include #include @@ -118,6 +119,39 @@ static inline __u32 tcp_v4_init_sequence(struct sock *sk, struct sk_buff *skb) skb->h.th->source); } +int tcp_twsk_unique(struct sock *sk, struct sock *sktw, void *twp) +{ + const struct tcp_timewait_sock *tcptw = tcp_twsk(sktw); + struct tcp_sock *tp = tcp_sk(sk); + + /* With PAWS, it is safe from the viewpoint + of data integrity. Even without PAWS it is safe provided sequence + spaces do not overlap i.e. at data rates <= 80Mbit/sec. + + Actually, the idea is close to VJ's one, only timestamp cache is + held not per host, but per port pair and TW bucket is used as state + holder. + + If TW bucket has been already destroyed we fall back to VJ's scheme + and use initial timestamp retrieved from peer table. + */ + if (tcptw->tw_ts_recent_stamp && + (twp == NULL || (sysctl_tcp_tw_reuse && + xtime.tv_sec - tcptw->tw_ts_recent_stamp > 1))) { + tp->write_seq = tcptw->tw_snd_nxt + 65535 + 2; + if (tp->write_seq == 0) + tp->write_seq = 1; + tp->rx_opt.ts_recent = tcptw->tw_ts_recent; + tp->rx_opt.ts_recent_stamp = tcptw->tw_ts_recent_stamp; + sock_hold(sktw); + return 1; + } + + return 0; +} + +EXPORT_SYMBOL_GPL(tcp_twsk_unique); + /* called with local bh disabled */ static int __tcp_v4_check_established(struct sock *sk, __u16 lport, struct inet_timewait_sock **twp) @@ -142,35 +176,9 @@ static int __tcp_v4_check_established(struct sock *sk, __u16 lport, tw = inet_twsk(sk2); if (INET_TW_MATCH(sk2, hash, acookie, saddr, daddr, ports, dif)) { - const struct tcp_timewait_sock *tcptw = tcp_twsk(sk2); - struct tcp_sock *tp = tcp_sk(sk); - - /* With PAWS, it is safe from the viewpoint - of data integrity. Even without PAWS it - is safe provided sequence spaces do not - overlap i.e. at data rates <= 80Mbit/sec. - - Actually, the idea is close to VJ's one, - only timestamp cache is held not per host, - but per port pair and TW bucket is used - as state holder. - - If TW bucket has been already destroyed we - fall back to VJ's scheme and use initial - timestamp retrieved from peer table. - */ - if (tcptw->tw_ts_recent_stamp && - (!twp || (sysctl_tcp_tw_reuse && - xtime.tv_sec - - tcptw->tw_ts_recent_stamp > 1))) { - tp->write_seq = tcptw->tw_snd_nxt + 65535 + 2; - if (tp->write_seq == 0) - tp->write_seq = 1; - tp->rx_opt.ts_recent = tcptw->tw_ts_recent; - tp->rx_opt.ts_recent_stamp = tcptw->tw_ts_recent_stamp; - sock_hold(sk2); + if (twsk_unique(sk, sk2, twp)) goto unique; - } else + else goto not_unique; } } @@ -869,6 +877,11 @@ struct request_sock_ops tcp_request_sock_ops = { .send_reset = tcp_v4_send_reset, }; +static struct timewait_sock_ops tcp_timewait_sock_ops = { + .twsk_obj_size = sizeof(struct tcp_timewait_sock), + .twsk_unique = tcp_twsk_unique, +}; + int tcp_v4_conn_request(struct sock *sk, struct sk_buff *skb) { struct inet_request_sock *ireq; @@ -1979,7 +1992,7 @@ struct proto tcp_prot = { .sysctl_rmem = sysctl_tcp_rmem, .max_header = MAX_TCP_HEADER, .obj_size = sizeof(struct tcp_sock), - .twsk_obj_size = sizeof(struct tcp_timewait_sock), + .twsk_prot = &tcp_timewait_sock_ops, .rsk_prot = &tcp_request_sock_ops, }; diff --git a/net/ipv6/tcp_ipv6.c b/net/ipv6/tcp_ipv6.c index e5c8a669e84e..514b57bb80b7 100644 --- a/net/ipv6/tcp_ipv6.c +++ b/net/ipv6/tcp_ipv6.c @@ -60,6 +60,7 @@ #include #include #include +#include #include @@ -147,22 +148,9 @@ static int __tcp_v6_check_established(struct sock *sk, const __u16 lport, ipv6_addr_equal(&tw6->tw_v6_daddr, saddr) && ipv6_addr_equal(&tw6->tw_v6_rcv_saddr, daddr) && sk2->sk_bound_dev_if == sk->sk_bound_dev_if) { - const struct tcp_timewait_sock *tcptw = tcp_twsk(sk2); - struct tcp_sock *tp = tcp_sk(sk); - - if (tcptw->tw_ts_recent_stamp && - (!twp || - (sysctl_tcp_tw_reuse && - xtime.tv_sec - tcptw->tw_ts_recent_stamp > 1))) { - /* See comment in tcp_ipv4.c */ - tp->write_seq = tcptw->tw_snd_nxt + 65535 + 2; - if (!tp->write_seq) - tp->write_seq = 1; - tp->rx_opt.ts_recent = tcptw->tw_ts_recent; - tp->rx_opt.ts_recent_stamp = tcptw->tw_ts_recent_stamp; - sock_hold(sk2); + if (twsk_unique(sk, sk2, twp)) goto unique; - } else + else goto not_unique; } } @@ -711,6 +699,11 @@ static struct request_sock_ops tcp6_request_sock_ops = { .send_reset = tcp_v6_send_reset }; +static struct timewait_sock_ops tcp6_timewait_sock_ops = { + .twsk_obj_size = sizeof(struct tcp6_timewait_sock), + .twsk_unique = tcp_twsk_unique, +}; + static void tcp_v6_send_check(struct sock *sk, int len, struct sk_buff *skb) { struct ipv6_pinfo *np = inet6_sk(sk); @@ -1752,7 +1745,7 @@ struct proto tcpv6_prot = { .sysctl_rmem = sysctl_tcp_rmem, .max_header = MAX_TCP_HEADER, .obj_size = sizeof(struct tcp6_sock), - .twsk_obj_size = sizeof(struct tcp6_timewait_sock), + .twsk_prot = &tcp6_timewait_sock_ops, .rsk_prot = &tcp6_request_sock_ops, }; -- cgit v1.2.3 From a7f5e7f164788a22eb5d3de8e2d3cee1bf58fdca Mon Sep 17 00:00:00 2001 From: Arnaldo Carvalho de Melo Date: Tue, 13 Dec 2005 23:25:31 -0800 Subject: [INET]: Generalise tcp_v4_hash_connect Renaming it to inet_hash_connect, making it possible to ditch dccp_v4_hash_connect and share the same code with TCP instead. Signed-off-by: Arnaldo Carvalho de Melo Signed-off-by: David S. Miller --- drivers/char/random.c | 6 +- include/linux/random.h | 2 +- include/net/inet_hashtables.h | 3 + net/dccp/ipv4.c | 160 +------------------------------------ net/ipv4/inet_hashtables.c | 178 ++++++++++++++++++++++++++++++++++++++++++ net/ipv4/tcp_ipv4.c | 173 +--------------------------------------- 6 files changed, 186 insertions(+), 336 deletions(-) (limited to 'include/net') diff --git a/drivers/char/random.c b/drivers/char/random.c index 7999da25fe40..79b59d986af4 100644 --- a/drivers/char/random.c +++ b/drivers/char/random.c @@ -1554,10 +1554,8 @@ __u32 secure_tcp_sequence_number(__u32 saddr, __u32 daddr, EXPORT_SYMBOL(secure_tcp_sequence_number); - - -/* Generate secure starting point for ephemeral TCP port search */ -u32 secure_tcp_port_ephemeral(__u32 saddr, __u32 daddr, __u16 dport) +/* Generate secure starting point for ephemeral IPV4 transport port search */ +u32 secure_ipv4_port_ephemeral(__u32 saddr, __u32 daddr, __u16 dport) { struct keydata *keyptr = get_keyptr(); u32 hash[4]; diff --git a/include/linux/random.h b/include/linux/random.h index 7b2adb3322d5..01424a8e621c 100644 --- a/include/linux/random.h +++ b/include/linux/random.h @@ -52,7 +52,7 @@ extern void get_random_bytes(void *buf, int nbytes); void generate_random_uuid(unsigned char uuid_out[16]); extern __u32 secure_ip_id(__u32 daddr); -extern u32 secure_tcp_port_ephemeral(__u32 saddr, __u32 daddr, __u16 dport); +extern u32 secure_ipv4_port_ephemeral(__u32 saddr, __u32 daddr, __u16 dport); extern u32 secure_tcpv6_port_ephemeral(const __u32 *saddr, const __u32 *daddr, __u16 dport); extern __u32 secure_tcp_sequence_number(__u32 saddr, __u32 daddr, diff --git a/include/net/inet_hashtables.h b/include/net/inet_hashtables.h index 07840baa9341..c83baa79f66e 100644 --- a/include/net/inet_hashtables.h +++ b/include/net/inet_hashtables.h @@ -434,4 +434,7 @@ static inline struct sock *inet_lookup(struct inet_hashinfo *hashinfo, return sk; } + +extern int inet_hash_connect(struct inet_timewait_death_row *death_row, + struct sock *sk); #endif /* _INET_HASHTABLES_H */ diff --git a/net/dccp/ipv4.c b/net/dccp/ipv4.c index e11cda0cb6b3..671fbf3b2379 100644 --- a/net/dccp/ipv4.c +++ b/net/dccp/ipv4.c @@ -54,164 +54,6 @@ void dccp_unhash(struct sock *sk) EXPORT_SYMBOL_GPL(dccp_unhash); -/* called with local bh disabled */ -static int __dccp_v4_check_established(struct sock *sk, const __u16 lport, - struct inet_timewait_sock **twp) -{ - struct inet_sock *inet = inet_sk(sk); - const u32 daddr = inet->rcv_saddr; - const u32 saddr = inet->daddr; - const int dif = sk->sk_bound_dev_if; - INET_ADDR_COOKIE(acookie, saddr, daddr) - const __u32 ports = INET_COMBINED_PORTS(inet->dport, lport); - unsigned int hash = inet_ehashfn(daddr, lport, saddr, inet->dport); - struct inet_ehash_bucket *head = inet_ehash_bucket(&dccp_hashinfo, hash); - const struct sock *sk2; - const struct hlist_node *node; - struct inet_timewait_sock *tw; - - prefetch(head->chain.first); - write_lock(&head->lock); - - /* Check TIME-WAIT sockets first. */ - sk_for_each(sk2, node, &(head + dccp_hashinfo.ehash_size)->chain) { - tw = inet_twsk(sk2); - - if (INET_TW_MATCH(sk2, hash, acookie, saddr, daddr, ports, dif)) - goto not_unique; - } - tw = NULL; - - /* And established part... */ - sk_for_each(sk2, node, &head->chain) { - if (INET_MATCH(sk2, hash, acookie, saddr, daddr, ports, dif)) - goto not_unique; - } - - /* Must record num and sport now. Otherwise we will see - * in hash table socket with a funny identity. */ - inet->num = lport; - inet->sport = htons(lport); - sk->sk_hash = hash; - BUG_TRAP(sk_unhashed(sk)); - __sk_add_node(sk, &head->chain); - sock_prot_inc_use(sk->sk_prot); - write_unlock(&head->lock); - - if (twp != NULL) { - *twp = tw; - NET_INC_STATS_BH(LINUX_MIB_TIMEWAITRECYCLED); - } else if (tw != NULL) { - /* Silly. Should hash-dance instead... */ - inet_twsk_deschedule(tw, &dccp_death_row); - NET_INC_STATS_BH(LINUX_MIB_TIMEWAITRECYCLED); - - inet_twsk_put(tw); - } - - return 0; - -not_unique: - write_unlock(&head->lock); - return -EADDRNOTAVAIL; -} - -/* - * Bind a port for a connect operation and hash it. - */ -static int dccp_v4_hash_connect(struct sock *sk) -{ - const unsigned short snum = inet_sk(sk)->num; - struct inet_bind_hashbucket *head; - struct inet_bind_bucket *tb; - int ret; - - if (snum == 0) { - int low = sysctl_local_port_range[0]; - int high = sysctl_local_port_range[1]; - int remaining = (high - low) + 1; - int rover = net_random() % (high - low) + low; - struct hlist_node *node; - struct inet_timewait_sock *tw = NULL; - - local_bh_disable(); - do { - head = &dccp_hashinfo.bhash[inet_bhashfn(rover, - dccp_hashinfo.bhash_size)]; - spin_lock(&head->lock); - - /* Does not bother with rcv_saddr checks, - * because the established check is already - * unique enough. - */ - inet_bind_bucket_for_each(tb, node, &head->chain) { - if (tb->port == rover) { - BUG_TRAP(!hlist_empty(&tb->owners)); - if (tb->fastreuse >= 0) - goto next_port; - if (!__dccp_v4_check_established(sk, - rover, - &tw)) - goto ok; - goto next_port; - } - } - - tb = inet_bind_bucket_create(dccp_hashinfo.bind_bucket_cachep, - head, rover); - if (tb == NULL) { - spin_unlock(&head->lock); - break; - } - tb->fastreuse = -1; - goto ok; - - next_port: - spin_unlock(&head->lock); - if (++rover > high) - rover = low; - } while (--remaining > 0); - - local_bh_enable(); - - return -EADDRNOTAVAIL; - -ok: - /* All locks still held and bhs disabled */ - inet_bind_hash(sk, tb, rover); - if (sk_unhashed(sk)) { - inet_sk(sk)->sport = htons(rover); - __inet_hash(&dccp_hashinfo, sk, 0); - } - spin_unlock(&head->lock); - - if (tw != NULL) { - inet_twsk_deschedule(tw, &dccp_death_row); - inet_twsk_put(tw); - } - - ret = 0; - goto out; - } - - head = &dccp_hashinfo.bhash[inet_bhashfn(snum, - dccp_hashinfo.bhash_size)]; - tb = inet_csk(sk)->icsk_bind_hash; - spin_lock_bh(&head->lock); - if (sk_head(&tb->owners) == sk && sk->sk_bind_node.next == NULL) { - __inet_hash(&dccp_hashinfo, sk, 0); - spin_unlock_bh(&head->lock); - return 0; - } else { - spin_unlock(&head->lock); - /* No definite answer... Walk to established hash table */ - ret = __dccp_v4_check_established(sk, snum, NULL); -out: - local_bh_enable(); - return ret; - } -} - int dccp_v4_connect(struct sock *sk, struct sockaddr *uaddr, int addr_len) { struct inet_sock *inet = inet_sk(sk); @@ -272,7 +114,7 @@ int dccp_v4_connect(struct sock *sk, struct sockaddr *uaddr, int addr_len) * complete initialization after this. */ dccp_set_state(sk, DCCP_REQUESTING); - err = dccp_v4_hash_connect(sk); + err = inet_hash_connect(&dccp_death_row, sk); if (err != 0) goto failure; diff --git a/net/ipv4/inet_hashtables.c b/net/ipv4/inet_hashtables.c index e8d29fe736d2..33228115cda4 100644 --- a/net/ipv4/inet_hashtables.c +++ b/net/ipv4/inet_hashtables.c @@ -15,12 +15,14 @@ #include #include +#include #include #include #include #include #include +#include /* * Allocate and initialize a new local port bind bucket. @@ -163,3 +165,179 @@ struct sock *__inet_lookup_listener(const struct hlist_head *head, const u32 dad } EXPORT_SYMBOL_GPL(__inet_lookup_listener); + +/* called with local bh disabled */ +static int __inet_check_established(struct inet_timewait_death_row *death_row, + struct sock *sk, __u16 lport, + struct inet_timewait_sock **twp) +{ + struct inet_hashinfo *hinfo = death_row->hashinfo; + struct inet_sock *inet = inet_sk(sk); + u32 daddr = inet->rcv_saddr; + u32 saddr = inet->daddr; + int dif = sk->sk_bound_dev_if; + INET_ADDR_COOKIE(acookie, saddr, daddr) + const __u32 ports = INET_COMBINED_PORTS(inet->dport, lport); + unsigned int hash = inet_ehashfn(daddr, lport, saddr, inet->dport); + struct inet_ehash_bucket *head = inet_ehash_bucket(hinfo, hash); + struct sock *sk2; + const struct hlist_node *node; + struct inet_timewait_sock *tw; + + prefetch(head->chain.first); + write_lock(&head->lock); + + /* Check TIME-WAIT sockets first. */ + sk_for_each(sk2, node, &(head + hinfo->ehash_size)->chain) { + tw = inet_twsk(sk2); + + if (INET_TW_MATCH(sk2, hash, acookie, saddr, daddr, ports, dif)) { + if (twsk_unique(sk, sk2, twp)) + goto unique; + else + goto not_unique; + } + } + tw = NULL; + + /* And established part... */ + sk_for_each(sk2, node, &head->chain) { + if (INET_MATCH(sk2, hash, acookie, saddr, daddr, ports, dif)) + goto not_unique; + } + +unique: + /* Must record num and sport now. Otherwise we will see + * in hash table socket with a funny identity. */ + inet->num = lport; + inet->sport = htons(lport); + sk->sk_hash = hash; + BUG_TRAP(sk_unhashed(sk)); + __sk_add_node(sk, &head->chain); + sock_prot_inc_use(sk->sk_prot); + write_unlock(&head->lock); + + if (twp) { + *twp = tw; + NET_INC_STATS_BH(LINUX_MIB_TIMEWAITRECYCLED); + } else if (tw) { + /* Silly. Should hash-dance instead... */ + inet_twsk_deschedule(tw, death_row); + NET_INC_STATS_BH(LINUX_MIB_TIMEWAITRECYCLED); + + inet_twsk_put(tw); + } + + return 0; + +not_unique: + write_unlock(&head->lock); + return -EADDRNOTAVAIL; +} + +static inline u32 inet_sk_port_offset(const struct sock *sk) +{ + const struct inet_sock *inet = inet_sk(sk); + return secure_ipv4_port_ephemeral(inet->rcv_saddr, inet->daddr, + inet->dport); +} + +/* + * Bind a port for a connect operation and hash it. + */ +int inet_hash_connect(struct inet_timewait_death_row *death_row, + struct sock *sk) +{ + struct inet_hashinfo *hinfo = death_row->hashinfo; + const unsigned short snum = inet_sk(sk)->num; + struct inet_bind_hashbucket *head; + struct inet_bind_bucket *tb; + int ret; + + if (!snum) { + int low = sysctl_local_port_range[0]; + int high = sysctl_local_port_range[1]; + int range = high - low; + int i; + int port; + static u32 hint; + u32 offset = hint + inet_sk_port_offset(sk); + struct hlist_node *node; + struct inet_timewait_sock *tw = NULL; + + local_bh_disable(); + for (i = 1; i <= range; i++) { + port = low + (i + offset) % range; + head = &hinfo->bhash[inet_bhashfn(port, hinfo->bhash_size)]; + spin_lock(&head->lock); + + /* Does not bother with rcv_saddr checks, + * because the established check is already + * unique enough. + */ + inet_bind_bucket_for_each(tb, node, &head->chain) { + if (tb->port == port) { + BUG_TRAP(!hlist_empty(&tb->owners)); + if (tb->fastreuse >= 0) + goto next_port; + if (!__inet_check_established(death_row, + sk, port, + &tw)) + goto ok; + goto next_port; + } + } + + tb = inet_bind_bucket_create(hinfo->bind_bucket_cachep, head, port); + if (!tb) { + spin_unlock(&head->lock); + break; + } + tb->fastreuse = -1; + goto ok; + + next_port: + spin_unlock(&head->lock); + } + local_bh_enable(); + + return -EADDRNOTAVAIL; + +ok: + hint += i; + + /* Head lock still held and bh's disabled */ + inet_bind_hash(sk, tb, port); + if (sk_unhashed(sk)) { + inet_sk(sk)->sport = htons(port); + __inet_hash(hinfo, sk, 0); + } + spin_unlock(&head->lock); + + if (tw) { + inet_twsk_deschedule(tw, death_row);; + inet_twsk_put(tw); + } + + ret = 0; + goto out; + } + + head = &hinfo->bhash[inet_bhashfn(snum, hinfo->bhash_size)]; + tb = inet_csk(sk)->icsk_bind_hash; + spin_lock_bh(&head->lock); + if (sk_head(&tb->owners) == sk && !sk->sk_bind_node.next) { + __inet_hash(hinfo, sk, 0); + spin_unlock_bh(&head->lock); + return 0; + } else { + spin_unlock(&head->lock); + /* No definite answer... Walk to established hash table */ + ret = __inet_check_established(death_row, sk, snum, NULL); +out: + local_bh_enable(); + return ret; + } +} + +EXPORT_SYMBOL_GPL(inet_hash_connect); diff --git a/net/ipv4/tcp_ipv4.c b/net/ipv4/tcp_ipv4.c index 6728772a943a..c2fe61becd61 100644 --- a/net/ipv4/tcp_ipv4.c +++ b/net/ipv4/tcp_ipv4.c @@ -152,177 +152,6 @@ int tcp_twsk_unique(struct sock *sk, struct sock *sktw, void *twp) EXPORT_SYMBOL_GPL(tcp_twsk_unique); -/* called with local bh disabled */ -static int __tcp_v4_check_established(struct sock *sk, __u16 lport, - struct inet_timewait_sock **twp) -{ - struct inet_sock *inet = inet_sk(sk); - u32 daddr = inet->rcv_saddr; - u32 saddr = inet->daddr; - int dif = sk->sk_bound_dev_if; - INET_ADDR_COOKIE(acookie, saddr, daddr) - const __u32 ports = INET_COMBINED_PORTS(inet->dport, lport); - unsigned int hash = inet_ehashfn(daddr, lport, saddr, inet->dport); - struct inet_ehash_bucket *head = inet_ehash_bucket(&tcp_hashinfo, hash); - struct sock *sk2; - const struct hlist_node *node; - struct inet_timewait_sock *tw; - - prefetch(head->chain.first); - write_lock(&head->lock); - - /* Check TIME-WAIT sockets first. */ - sk_for_each(sk2, node, &(head + tcp_hashinfo.ehash_size)->chain) { - tw = inet_twsk(sk2); - - if (INET_TW_MATCH(sk2, hash, acookie, saddr, daddr, ports, dif)) { - if (twsk_unique(sk, sk2, twp)) - goto unique; - else - goto not_unique; - } - } - tw = NULL; - - /* And established part... */ - sk_for_each(sk2, node, &head->chain) { - if (INET_MATCH(sk2, hash, acookie, saddr, daddr, ports, dif)) - goto not_unique; - } - -unique: - /* Must record num and sport now. Otherwise we will see - * in hash table socket with a funny identity. */ - inet->num = lport; - inet->sport = htons(lport); - sk->sk_hash = hash; - BUG_TRAP(sk_unhashed(sk)); - __sk_add_node(sk, &head->chain); - sock_prot_inc_use(sk->sk_prot); - write_unlock(&head->lock); - - if (twp) { - *twp = tw; - NET_INC_STATS_BH(LINUX_MIB_TIMEWAITRECYCLED); - } else if (tw) { - /* Silly. Should hash-dance instead... */ - inet_twsk_deschedule(tw, &tcp_death_row); - NET_INC_STATS_BH(LINUX_MIB_TIMEWAITRECYCLED); - - inet_twsk_put(tw); - } - - return 0; - -not_unique: - write_unlock(&head->lock); - return -EADDRNOTAVAIL; -} - -static inline u32 connect_port_offset(const struct sock *sk) -{ - const struct inet_sock *inet = inet_sk(sk); - - return secure_tcp_port_ephemeral(inet->rcv_saddr, inet->daddr, - inet->dport); -} - -/* - * Bind a port for a connect operation and hash it. - */ -static inline int tcp_v4_hash_connect(struct sock *sk) -{ - const unsigned short snum = inet_sk(sk)->num; - struct inet_bind_hashbucket *head; - struct inet_bind_bucket *tb; - int ret; - - if (!snum) { - int low = sysctl_local_port_range[0]; - int high = sysctl_local_port_range[1]; - int range = high - low; - int i; - int port; - static u32 hint; - u32 offset = hint + connect_port_offset(sk); - struct hlist_node *node; - struct inet_timewait_sock *tw = NULL; - - local_bh_disable(); - for (i = 1; i <= range; i++) { - port = low + (i + offset) % range; - head = &tcp_hashinfo.bhash[inet_bhashfn(port, tcp_hashinfo.bhash_size)]; - spin_lock(&head->lock); - - /* Does not bother with rcv_saddr checks, - * because the established check is already - * unique enough. - */ - inet_bind_bucket_for_each(tb, node, &head->chain) { - if (tb->port == port) { - BUG_TRAP(!hlist_empty(&tb->owners)); - if (tb->fastreuse >= 0) - goto next_port; - if (!__tcp_v4_check_established(sk, - port, - &tw)) - goto ok; - goto next_port; - } - } - - tb = inet_bind_bucket_create(tcp_hashinfo.bind_bucket_cachep, head, port); - if (!tb) { - spin_unlock(&head->lock); - break; - } - tb->fastreuse = -1; - goto ok; - - next_port: - spin_unlock(&head->lock); - } - local_bh_enable(); - - return -EADDRNOTAVAIL; - -ok: - hint += i; - - /* Head lock still held and bh's disabled */ - inet_bind_hash(sk, tb, port); - if (sk_unhashed(sk)) { - inet_sk(sk)->sport = htons(port); - __inet_hash(&tcp_hashinfo, sk, 0); - } - spin_unlock(&head->lock); - - if (tw) { - inet_twsk_deschedule(tw, &tcp_death_row);; - inet_twsk_put(tw); - } - - ret = 0; - goto out; - } - - head = &tcp_hashinfo.bhash[inet_bhashfn(snum, tcp_hashinfo.bhash_size)]; - tb = inet_csk(sk)->icsk_bind_hash; - spin_lock_bh(&head->lock); - if (sk_head(&tb->owners) == sk && !sk->sk_bind_node.next) { - __inet_hash(&tcp_hashinfo, sk, 0); - spin_unlock_bh(&head->lock); - return 0; - } else { - spin_unlock(&head->lock); - /* No definite answer... Walk to established hash table */ - ret = __tcp_v4_check_established(sk, snum, NULL); -out: - local_bh_enable(); - return ret; - } -} - /* This will initiate an outgoing connection. */ int tcp_v4_connect(struct sock *sk, struct sockaddr *uaddr, int addr_len) { @@ -403,7 +232,7 @@ int tcp_v4_connect(struct sock *sk, struct sockaddr *uaddr, int addr_len) * complete initialization after this. */ tcp_set_state(sk, TCP_SYN_SENT); - err = tcp_v4_hash_connect(sk); + err = inet_hash_connect(&tcp_death_row, sk); if (err) goto failure; -- cgit v1.2.3 From d8313f5ca2b1f86b7df6c99fc4b3fffa1f84e92b Mon Sep 17 00:00:00 2001 From: Arnaldo Carvalho de Melo Date: Tue, 13 Dec 2005 23:25:44 -0800 Subject: [INET6]: Generalise tcp_v6_hash_connect Renaming it to inet6_hash_connect, making it possible to ditch dccp_v6_hash_connect and share the same code with TCP instead. Signed-off-by: Arnaldo Carvalho de Melo Signed-off-by: David S. Miller --- drivers/char/random.c | 4 +- include/linux/random.h | 4 +- include/net/ipv6.h | 3 + net/dccp/ipv6.c | 171 +---------------------------------------- net/ipv6/inet6_hashtables.c | 183 +++++++++++++++++++++++++++++++++++++++++++- net/ipv6/tcp_ipv6.c | 173 +---------------------------------------- 6 files changed, 190 insertions(+), 348 deletions(-) (limited to 'include/net') diff --git a/drivers/char/random.c b/drivers/char/random.c index 79b59d986af4..bdfdfd28594d 100644 --- a/drivers/char/random.c +++ b/drivers/char/random.c @@ -1573,7 +1573,7 @@ u32 secure_ipv4_port_ephemeral(__u32 saddr, __u32 daddr, __u16 dport) } #if defined(CONFIG_IPV6) || defined(CONFIG_IPV6_MODULE) -u32 secure_tcpv6_port_ephemeral(const __u32 *saddr, const __u32 *daddr, __u16 dport) +u32 secure_ipv6_port_ephemeral(const __u32 *saddr, const __u32 *daddr, __u16 dport) { struct keydata *keyptr = get_keyptr(); u32 hash[12]; @@ -1584,7 +1584,7 @@ u32 secure_tcpv6_port_ephemeral(const __u32 *saddr, const __u32 *daddr, __u16 dp return twothirdsMD4Transform(daddr, hash); } -EXPORT_SYMBOL(secure_tcpv6_port_ephemeral); +EXPORT_SYMBOL(secure_ipv6_port_ephemeral); #endif #if defined(CONFIG_IP_DCCP) || defined(CONFIG_IP_DCCP_MODULE) diff --git a/include/linux/random.h b/include/linux/random.h index 01424a8e621c..5d6456bcdeba 100644 --- a/include/linux/random.h +++ b/include/linux/random.h @@ -53,8 +53,8 @@ void generate_random_uuid(unsigned char uuid_out[16]); extern __u32 secure_ip_id(__u32 daddr); extern u32 secure_ipv4_port_ephemeral(__u32 saddr, __u32 daddr, __u16 dport); -extern u32 secure_tcpv6_port_ephemeral(const __u32 *saddr, const __u32 *daddr, - __u16 dport); +extern u32 secure_ipv6_port_ephemeral(const __u32 *saddr, const __u32 *daddr, + __u16 dport); extern __u32 secure_tcp_sequence_number(__u32 saddr, __u32 daddr, __u16 sport, __u16 dport); extern __u32 secure_tcpv6_sequence_number(__u32 *saddr, __u32 *daddr, diff --git a/include/net/ipv6.h b/include/net/ipv6.h index 851376108ac2..e3d5d7bc8837 100644 --- a/include/net/ipv6.h +++ b/include/net/ipv6.h @@ -527,6 +527,9 @@ extern int inet6_getname(struct socket *sock, struct sockaddr *uaddr, extern int inet6_ioctl(struct socket *sock, unsigned int cmd, unsigned long arg); +extern int inet6_hash_connect(struct inet_timewait_death_row *death_row, + struct sock *sk); + /* * reassembly.c */ diff --git a/net/dccp/ipv6.c b/net/dccp/ipv6.c index 4d078f5b911b..71bf04eb21e1 100644 --- a/net/dccp/ipv6.c +++ b/net/dccp/ipv6.c @@ -84,175 +84,6 @@ static __u32 dccp_v6_init_sequence(struct sock *sk, struct sk_buff *skb) dh->dccph_sport); } -static int __dccp_v6_check_established(struct sock *sk, const __u16 lport, - struct inet_timewait_sock **twp) -{ - struct inet_sock *inet = inet_sk(sk); - const struct ipv6_pinfo *np = inet6_sk(sk); - const struct in6_addr *daddr = &np->rcv_saddr; - const struct in6_addr *saddr = &np->daddr; - const int dif = sk->sk_bound_dev_if; - const u32 ports = INET_COMBINED_PORTS(inet->dport, lport); - const unsigned int hash = inet6_ehashfn(daddr, inet->num, - saddr, inet->dport); - struct inet_ehash_bucket *head = inet_ehash_bucket(&dccp_hashinfo, hash); - struct sock *sk2; - const struct hlist_node *node; - struct inet_timewait_sock *tw; - - prefetch(head->chain.first); - write_lock(&head->lock); - - /* Check TIME-WAIT sockets first. */ - sk_for_each(sk2, node, &(head + dccp_hashinfo.ehash_size)->chain) { - const struct inet6_timewait_sock *tw6 = inet6_twsk(sk2); - - tw = inet_twsk(sk2); - - if(*((__u32 *)&(tw->tw_dport)) == ports && - sk2->sk_family == PF_INET6 && - ipv6_addr_equal(&tw6->tw_v6_daddr, saddr) && - ipv6_addr_equal(&tw6->tw_v6_rcv_saddr, daddr) && - sk2->sk_bound_dev_if == sk->sk_bound_dev_if) - goto not_unique; - } - tw = NULL; - - /* And established part... */ - sk_for_each(sk2, node, &head->chain) { - if (INET6_MATCH(sk2, hash, saddr, daddr, ports, dif)) - goto not_unique; - } - - BUG_TRAP(sk_unhashed(sk)); - __sk_add_node(sk, &head->chain); - sk->sk_hash = hash; - sock_prot_inc_use(sk->sk_prot); - write_unlock(&head->lock); - - if (twp) { - *twp = tw; - NET_INC_STATS_BH(LINUX_MIB_TIMEWAITRECYCLED); - } else if (tw) { - /* Silly. Should hash-dance instead... */ - inet_twsk_deschedule(tw, &dccp_death_row); - NET_INC_STATS_BH(LINUX_MIB_TIMEWAITRECYCLED); - - inet_twsk_put(tw); - } - return 0; - -not_unique: - write_unlock(&head->lock); - return -EADDRNOTAVAIL; -} - -static inline u32 dccp_v6_port_offset(const struct sock *sk) -{ - const struct inet_sock *inet = inet_sk(sk); - const struct ipv6_pinfo *np = inet6_sk(sk); - - return secure_tcpv6_port_ephemeral(np->rcv_saddr.s6_addr32, - np->daddr.s6_addr32, - inet->dport); -} - -static int dccp_v6_hash_connect(struct sock *sk) -{ - const unsigned short snum = inet_sk(sk)->num; - struct inet_bind_hashbucket *head; - struct inet_bind_bucket *tb; - int ret; - - if (snum == 0) { - int low = sysctl_local_port_range[0]; - int high = sysctl_local_port_range[1]; - int range = high - low; - int i; - int port; - static u32 hint; - u32 offset = hint + dccp_v6_port_offset(sk); - struct hlist_node *node; - struct inet_timewait_sock *tw = NULL; - - local_bh_disable(); - for (i = 1; i <= range; i++) { - port = low + (i + offset) % range; - head = &dccp_hashinfo.bhash[inet_bhashfn(port, - dccp_hashinfo.bhash_size)]; - spin_lock(&head->lock); - - /* Does not bother with rcv_saddr checks, - * because the established check is already - * unique enough. - */ - inet_bind_bucket_for_each(tb, node, &head->chain) { - if (tb->port == port) { - BUG_TRAP(!hlist_empty(&tb->owners)); - if (tb->fastreuse >= 0) - goto next_port; - if (!__dccp_v6_check_established(sk, - port, - &tw)) - goto ok; - goto next_port; - } - } - - tb = inet_bind_bucket_create(dccp_hashinfo.bind_bucket_cachep, - head, port); - if (!tb) { - spin_unlock(&head->lock); - break; - } - tb->fastreuse = -1; - goto ok; - - next_port: - spin_unlock(&head->lock); - } - local_bh_enable(); - - return -EADDRNOTAVAIL; -ok: - hint += i; - - /* Head lock still held and bh's disabled */ - inet_bind_hash(sk, tb, port); - if (sk_unhashed(sk)) { - inet_sk(sk)->sport = htons(port); - __inet6_hash(&dccp_hashinfo, sk); - } - spin_unlock(&head->lock); - - if (tw) { - inet_twsk_deschedule(tw, &dccp_death_row); - inet_twsk_put(tw); - } - - ret = 0; - goto out; - } - - head = &dccp_hashinfo.bhash[inet_bhashfn(snum, - dccp_hashinfo.bhash_size)]; - tb = inet_csk(sk)->icsk_bind_hash; - spin_lock_bh(&head->lock); - - if (sk_head(&tb->owners) == sk && !sk->sk_bind_node.next) { - __inet6_hash(&dccp_hashinfo, sk); - spin_unlock_bh(&head->lock); - return 0; - } else { - spin_unlock(&head->lock); - /* No definite answer... Walk to established hash table */ - ret = __dccp_v6_check_established(sk, snum, NULL); -out: - local_bh_enable(); - return ret; - } -} - static int dccp_v6_connect(struct sock *sk, struct sockaddr *uaddr, int addr_len) { @@ -403,7 +234,7 @@ static int dccp_v6_connect(struct sock *sk, struct sockaddr *uaddr, inet->dport = usin->sin6_port; dccp_set_state(sk, DCCP_REQUESTING); - err = dccp_v6_hash_connect(sk); + err = inet6_hash_connect(&dccp_death_row, sk); if (err) goto late_failure; /* FIXME */ diff --git a/net/ipv6/inet6_hashtables.c b/net/ipv6/inet6_hashtables.c index 01d5f46d4e40..4154f3a8b6cf 100644 --- a/net/ipv6/inet6_hashtables.c +++ b/net/ipv6/inet6_hashtables.c @@ -5,7 +5,8 @@ * * Generic INET6 transport hashtables * - * Authors: Lotsa people, from code originally in tcp + * Authors: Lotsa people, from code originally in tcp, generalised here + * by Arnaldo Carvalho de Melo * * This program is free software; you can redistribute it and/or * modify it under the terms of the GNU General Public License @@ -14,12 +15,13 @@ */ #include - #include +#include #include #include #include +#include struct sock *inet6_lookup_listener(struct inet_hashinfo *hashinfo, const struct in6_addr *daddr, @@ -79,3 +81,180 @@ struct sock *inet6_lookup(struct inet_hashinfo *hashinfo, } EXPORT_SYMBOL_GPL(inet6_lookup); + +static int __inet6_check_established(struct inet_timewait_death_row *death_row, + struct sock *sk, const __u16 lport, + struct inet_timewait_sock **twp) +{ + struct inet_hashinfo *hinfo = death_row->hashinfo; + const struct inet_sock *inet = inet_sk(sk); + const struct ipv6_pinfo *np = inet6_sk(sk); + const struct in6_addr *daddr = &np->rcv_saddr; + const struct in6_addr *saddr = &np->daddr; + const int dif = sk->sk_bound_dev_if; + const u32 ports = INET_COMBINED_PORTS(inet->dport, lport); + const unsigned int hash = inet6_ehashfn(daddr, inet->num, saddr, + inet->dport); + struct inet_ehash_bucket *head = inet_ehash_bucket(hinfo, hash); + struct sock *sk2; + const struct hlist_node *node; + struct inet_timewait_sock *tw; + + prefetch(head->chain.first); + write_lock(&head->lock); + + /* Check TIME-WAIT sockets first. */ + sk_for_each(sk2, node, &(head + hinfo->ehash_size)->chain) { + const struct inet6_timewait_sock *tw6 = inet6_twsk(sk2); + + tw = inet_twsk(sk2); + + if(*((__u32 *)&(tw->tw_dport)) == ports && + sk2->sk_family == PF_INET6 && + ipv6_addr_equal(&tw6->tw_v6_daddr, saddr) && + ipv6_addr_equal(&tw6->tw_v6_rcv_saddr, daddr) && + sk2->sk_bound_dev_if == sk->sk_bound_dev_if) { + if (twsk_unique(sk, sk2, twp)) + goto unique; + else + goto not_unique; + } + } + tw = NULL; + + /* And established part... */ + sk_for_each(sk2, node, &head->chain) { + if (INET6_MATCH(sk2, hash, saddr, daddr, ports, dif)) + goto not_unique; + } + +unique: + BUG_TRAP(sk_unhashed(sk)); + __sk_add_node(sk, &head->chain); + sk->sk_hash = hash; + sock_prot_inc_use(sk->sk_prot); + write_unlock(&head->lock); + + if (twp != NULL) { + *twp = tw; + NET_INC_STATS_BH(LINUX_MIB_TIMEWAITRECYCLED); + } else if (tw != NULL) { + /* Silly. Should hash-dance instead... */ + inet_twsk_deschedule(tw, death_row); + NET_INC_STATS_BH(LINUX_MIB_TIMEWAITRECYCLED); + + inet_twsk_put(tw); + } + return 0; + +not_unique: + write_unlock(&head->lock); + return -EADDRNOTAVAIL; +} + +static inline u32 inet6_sk_port_offset(const struct sock *sk) +{ + const struct inet_sock *inet = inet_sk(sk); + const struct ipv6_pinfo *np = inet6_sk(sk); + return secure_ipv6_port_ephemeral(np->rcv_saddr.s6_addr32, + np->daddr.s6_addr32, + inet->dport); +} + +int inet6_hash_connect(struct inet_timewait_death_row *death_row, + struct sock *sk) +{ + struct inet_hashinfo *hinfo = death_row->hashinfo; + const unsigned short snum = inet_sk(sk)->num; + struct inet_bind_hashbucket *head; + struct inet_bind_bucket *tb; + int ret; + + if (snum == 0) { + const int low = sysctl_local_port_range[0]; + const int high = sysctl_local_port_range[1]; + const int range = high - low; + int i, port; + static u32 hint; + const u32 offset = hint + inet6_sk_port_offset(sk); + struct hlist_node *node; + struct inet_timewait_sock *tw = NULL; + + local_bh_disable(); + for (i = 1; i <= range; i++) { + port = low + (i + offset) % range; + head = &hinfo->bhash[inet_bhashfn(port, hinfo->bhash_size)]; + spin_lock(&head->lock); + + /* Does not bother with rcv_saddr checks, + * because the established check is already + * unique enough. + */ + inet_bind_bucket_for_each(tb, node, &head->chain) { + if (tb->port == port) { + BUG_TRAP(!hlist_empty(&tb->owners)); + if (tb->fastreuse >= 0) + goto next_port; + if (!__inet6_check_established(death_row, + sk, port, + &tw)) + goto ok; + goto next_port; + } + } + + tb = inet_bind_bucket_create(hinfo->bind_bucket_cachep, + head, port); + if (!tb) { + spin_unlock(&head->lock); + break; + } + tb->fastreuse = -1; + goto ok; + + next_port: + spin_unlock(&head->lock); + } + local_bh_enable(); + + return -EADDRNOTAVAIL; + +ok: + hint += i; + + /* Head lock still held and bh's disabled */ + inet_bind_hash(sk, tb, port); + if (sk_unhashed(sk)) { + inet_sk(sk)->sport = htons(port); + __inet6_hash(hinfo, sk); + } + spin_unlock(&head->lock); + + if (tw) { + inet_twsk_deschedule(tw, death_row); + inet_twsk_put(tw); + } + + ret = 0; + goto out; + } + + head = &hinfo->bhash[inet_bhashfn(snum, hinfo->bhash_size)]; + tb = inet_csk(sk)->icsk_bind_hash; + spin_lock_bh(&head->lock); + + if (sk_head(&tb->owners) == sk && sk->sk_bind_node.next == NULL) { + __inet6_hash(hinfo, sk); + spin_unlock_bh(&head->lock); + return 0; + } else { + spin_unlock(&head->lock); + /* No definite answer... Walk to established hash table */ + ret = __inet6_check_established(death_row, sk, snum, NULL); +out: + local_bh_enable(); + return ret; + } +} + +EXPORT_SYMBOL_GPL(inet6_hash_connect); diff --git a/net/ipv6/tcp_ipv6.c b/net/ipv6/tcp_ipv6.c index 514b57bb80b7..a682eb9093e1 100644 --- a/net/ipv6/tcp_ipv6.c +++ b/net/ipv6/tcp_ipv6.c @@ -119,177 +119,6 @@ static __u32 tcp_v6_init_sequence(struct sock *sk, struct sk_buff *skb) } } -static int __tcp_v6_check_established(struct sock *sk, const __u16 lport, - struct inet_timewait_sock **twp) -{ - struct inet_sock *inet = inet_sk(sk); - const struct ipv6_pinfo *np = inet6_sk(sk); - const struct in6_addr *daddr = &np->rcv_saddr; - const struct in6_addr *saddr = &np->daddr; - const int dif = sk->sk_bound_dev_if; - const u32 ports = INET_COMBINED_PORTS(inet->dport, lport); - unsigned int hash = inet6_ehashfn(daddr, inet->num, saddr, inet->dport); - struct inet_ehash_bucket *head = inet_ehash_bucket(&tcp_hashinfo, hash); - struct sock *sk2; - const struct hlist_node *node; - struct inet_timewait_sock *tw; - - prefetch(head->chain.first); - write_lock(&head->lock); - - /* Check TIME-WAIT sockets first. */ - sk_for_each(sk2, node, &(head + tcp_hashinfo.ehash_size)->chain) { - const struct inet6_timewait_sock *tw6 = inet6_twsk(sk2); - - tw = inet_twsk(sk2); - - if(*((__u32 *)&(tw->tw_dport)) == ports && - sk2->sk_family == PF_INET6 && - ipv6_addr_equal(&tw6->tw_v6_daddr, saddr) && - ipv6_addr_equal(&tw6->tw_v6_rcv_saddr, daddr) && - sk2->sk_bound_dev_if == sk->sk_bound_dev_if) { - if (twsk_unique(sk, sk2, twp)) - goto unique; - else - goto not_unique; - } - } - tw = NULL; - - /* And established part... */ - sk_for_each(sk2, node, &head->chain) { - if (INET6_MATCH(sk2, hash, saddr, daddr, ports, dif)) - goto not_unique; - } - -unique: - BUG_TRAP(sk_unhashed(sk)); - __sk_add_node(sk, &head->chain); - sk->sk_hash = hash; - sock_prot_inc_use(sk->sk_prot); - write_unlock(&head->lock); - - if (twp) { - *twp = tw; - NET_INC_STATS_BH(LINUX_MIB_TIMEWAITRECYCLED); - } else if (tw) { - /* Silly. Should hash-dance instead... */ - inet_twsk_deschedule(tw, &tcp_death_row); - NET_INC_STATS_BH(LINUX_MIB_TIMEWAITRECYCLED); - - inet_twsk_put(tw); - } - return 0; - -not_unique: - write_unlock(&head->lock); - return -EADDRNOTAVAIL; -} - -static inline u32 tcpv6_port_offset(const struct sock *sk) -{ - const struct inet_sock *inet = inet_sk(sk); - const struct ipv6_pinfo *np = inet6_sk(sk); - - return secure_tcpv6_port_ephemeral(np->rcv_saddr.s6_addr32, - np->daddr.s6_addr32, - inet->dport); -} - -static int tcp_v6_hash_connect(struct sock *sk) -{ - unsigned short snum = inet_sk(sk)->num; - struct inet_bind_hashbucket *head; - struct inet_bind_bucket *tb; - int ret; - - if (!snum) { - int low = sysctl_local_port_range[0]; - int high = sysctl_local_port_range[1]; - int range = high - low; - int i; - int port; - static u32 hint; - u32 offset = hint + tcpv6_port_offset(sk); - struct hlist_node *node; - struct inet_timewait_sock *tw = NULL; - - local_bh_disable(); - for (i = 1; i <= range; i++) { - port = low + (i + offset) % range; - head = &tcp_hashinfo.bhash[inet_bhashfn(port, tcp_hashinfo.bhash_size)]; - spin_lock(&head->lock); - - /* Does not bother with rcv_saddr checks, - * because the established check is already - * unique enough. - */ - inet_bind_bucket_for_each(tb, node, &head->chain) { - if (tb->port == port) { - BUG_TRAP(!hlist_empty(&tb->owners)); - if (tb->fastreuse >= 0) - goto next_port; - if (!__tcp_v6_check_established(sk, - port, - &tw)) - goto ok; - goto next_port; - } - } - - tb = inet_bind_bucket_create(tcp_hashinfo.bind_bucket_cachep, head, port); - if (!tb) { - spin_unlock(&head->lock); - break; - } - tb->fastreuse = -1; - goto ok; - - next_port: - spin_unlock(&head->lock); - } - local_bh_enable(); - - return -EADDRNOTAVAIL; - -ok: - hint += i; - - /* Head lock still held and bh's disabled */ - inet_bind_hash(sk, tb, port); - if (sk_unhashed(sk)) { - inet_sk(sk)->sport = htons(port); - __inet6_hash(&tcp_hashinfo, sk); - } - spin_unlock(&head->lock); - - if (tw) { - inet_twsk_deschedule(tw, &tcp_death_row); - inet_twsk_put(tw); - } - - ret = 0; - goto out; - } - - head = &tcp_hashinfo.bhash[inet_bhashfn(snum, tcp_hashinfo.bhash_size)]; - tb = inet_csk(sk)->icsk_bind_hash; - spin_lock_bh(&head->lock); - - if (sk_head(&tb->owners) == sk && !sk->sk_bind_node.next) { - __inet6_hash(&tcp_hashinfo, sk); - spin_unlock_bh(&head->lock); - return 0; - } else { - spin_unlock(&head->lock); - /* No definite answer... Walk to established hash table */ - ret = __tcp_v6_check_established(sk, snum, NULL); -out: - local_bh_enable(); - return ret; - } -} - static int tcp_v6_connect(struct sock *sk, struct sockaddr *uaddr, int addr_len) { @@ -450,7 +279,7 @@ static int tcp_v6_connect(struct sock *sk, struct sockaddr *uaddr, inet->dport = usin->sin6_port; tcp_set_state(sk, TCP_SYN_SENT); - err = tcp_v6_hash_connect(sk); + err = inet6_hash_connect(&tcp_death_row, sk); if (err) goto late_failure; -- cgit v1.2.3 From 22712813620fa8e682dbfb253a60ca0131da1e07 Mon Sep 17 00:00:00 2001 From: Arnaldo Carvalho de Melo Date: Tue, 13 Dec 2005 23:25:56 -0800 Subject: [TCP]: Move the TCPF_ enum to tcp_states.h Upcoming patches will make, for instance, ip_sockglue.c need just this enum and not all of tcp.h. Signed-off-by: Arnaldo Carvalho de Melo Signed-off-by: David S. Miller --- include/linux/tcp.h | 16 ---------------- include/net/tcp_states.h | 16 ++++++++++++++++ 2 files changed, 16 insertions(+), 16 deletions(-) (limited to 'include/net') diff --git a/include/linux/tcp.h b/include/linux/tcp.h index 4e1434007f44..da38eea1994b 100644 --- a/include/linux/tcp.h +++ b/include/linux/tcp.h @@ -55,22 +55,6 @@ struct tcphdr { __u16 urg_ptr; }; -#define TCP_ACTION_FIN (1 << 7) - -enum { - TCPF_ESTABLISHED = (1 << 1), - TCPF_SYN_SENT = (1 << 2), - TCPF_SYN_RECV = (1 << 3), - TCPF_FIN_WAIT1 = (1 << 4), - TCPF_FIN_WAIT2 = (1 << 5), - TCPF_TIME_WAIT = (1 << 6), - TCPF_CLOSE = (1 << 7), - TCPF_CLOSE_WAIT = (1 << 8), - TCPF_LAST_ACK = (1 << 9), - TCPF_LISTEN = (1 << 10), - TCPF_CLOSING = (1 << 11) -}; - /* * The union cast uses a gcc extension to avoid aliasing problems * (union is compatible to any of its members) diff --git a/include/net/tcp_states.h b/include/net/tcp_states.h index b9d4176b2d15..b0b645988bd8 100644 --- a/include/net/tcp_states.h +++ b/include/net/tcp_states.h @@ -31,4 +31,20 @@ enum { #define TCP_STATE_MASK 0xF +#define TCP_ACTION_FIN (1 << 7) + +enum { + TCPF_ESTABLISHED = (1 << 1), + TCPF_SYN_SENT = (1 << 2), + TCPF_SYN_RECV = (1 << 3), + TCPF_FIN_WAIT1 = (1 << 4), + TCPF_FIN_WAIT2 = (1 << 5), + TCPF_TIME_WAIT = (1 << 6), + TCPF_CLOSE = (1 << 7), + TCPF_CLOSE_WAIT = (1 << 8), + TCPF_LAST_ACK = (1 << 9), + TCPF_LISTEN = (1 << 10), + TCPF_CLOSING = (1 << 11) +}; + #endif /* _LINUX_TCP_STATES_H */ -- cgit v1.2.3 From d83d8461f902c672bc1bd8fbc6a94e19f092da97 Mon Sep 17 00:00:00 2001 From: Arnaldo Carvalho de Melo Date: Tue, 13 Dec 2005 23:26:10 -0800 Subject: [IP_SOCKGLUE]: Remove most of the tcp specific calls As DCCP needs to be called in the same spots. Now we have a member in inet_sock (is_icsk), set at sock creation time from struct inet_protosw->flags (if INET_PROTOSW_ICSK is set, like for TCP and DCCP) to see if a struct sock instance is a inet_connection_sock for places like the ones in ip_sockglue.c (v4 and v6) where we previously were looking if sk_type was SOCK_STREAM, that is insufficient because we now use the same code for DCCP, that has sk_type SOCK_DCCP. Signed-off-by: Arnaldo Carvalho de Melo Signed-off-by: David S. Miller --- include/linux/dccp.h | 4 ---- include/linux/ip.h | 1 + include/linux/tcp.h | 3 +-- include/net/inet_connection_sock.h | 6 +++++- include/net/protocol.h | 1 + net/dccp/diag.c | 2 +- net/dccp/input.c | 2 +- net/dccp/ipv4.c | 12 +++++++----- net/dccp/ipv6.c | 26 ++++++++++++++------------ net/dccp/output.c | 7 ++++--- net/dccp/proto.c | 2 +- net/ipv4/af_inet.c | 4 +++- net/ipv4/ip_sockglue.c | 13 ++++++------- net/ipv4/tcp.c | 2 +- net/ipv4/tcp_input.c | 10 ++++------ net/ipv4/tcp_ipv4.c | 12 ++++++------ net/ipv4/tcp_output.c | 18 ++++++++++-------- net/ipv6/af_inet6.c | 1 + net/ipv6/ipv6_sockglue.c | 24 +++++++++++++----------- net/ipv6/tcp_ipv6.c | 30 +++++++++++++++++------------- 20 files changed, 97 insertions(+), 83 deletions(-) (limited to 'include/net') diff --git a/include/linux/dccp.h b/include/linux/dccp.h index 71fab4311e92..d0bdb499cf8d 100644 --- a/include/linux/dccp.h +++ b/include/linux/dccp.h @@ -408,8 +408,6 @@ struct dccp_ackvec; * @dccps_gar - greatest valid ack number received on a non-Sync; initialized to %dccps_iss * @dccps_timestamp_time - time of latest TIMESTAMP option * @dccps_timestamp_echo - latest timestamp received on a TIMESTAMP option - * @dccps_ext_header_len - network protocol overhead (IP/IPv6 options) - * @dccps_pmtu_cookie - Last pmtu seen by socket * @dccps_packet_size - Set thru setsockopt * @dccps_role - Role of this sock, one of %dccp_role * @dccps_ndp_count - number of Non Data Packets since last data packet @@ -434,8 +432,6 @@ struct dccp_sock { __u32 dccps_timestamp_echo; __u32 dccps_packet_size; unsigned long dccps_ndp_count; - __u16 dccps_ext_header_len; - __u32 dccps_pmtu_cookie; __u32 dccps_mss_cache; struct dccp_options dccps_options; struct dccp_ackvec *dccps_hc_rx_ackvec; diff --git a/include/linux/ip.h b/include/linux/ip.h index 5a560daeade5..6ccc596c19c8 100644 --- a/include/linux/ip.h +++ b/include/linux/ip.h @@ -155,6 +155,7 @@ struct inet_sock { __u8 mc_ttl; /* Multicasting TTL */ __u8 pmtudisc; unsigned recverr : 1, + is_icsk : 1, /* inet_connection_sock? */ freebind : 1, hdrincl : 1, mc_loop : 1; diff --git a/include/linux/tcp.h b/include/linux/tcp.h index da38eea1994b..f2bb2396853f 100644 --- a/include/linux/tcp.h +++ b/include/linux/tcp.h @@ -238,10 +238,9 @@ struct tcp_sock { __u32 snd_wl1; /* Sequence for window update */ __u32 snd_wnd; /* The window we expect to receive */ __u32 max_window; /* Maximal window ever seen from peer */ - __u32 pmtu_cookie; /* Last pmtu seen by socket */ __u32 mss_cache; /* Cached effective mss, not including SACKS */ __u16 xmit_size_goal; /* Goal for segmenting output packets */ - __u16 ext_header_len; /* Network protocol overhead (IP/IPv6 options) */ + /* XXX Two bytes hole, try to pack */ __u32 window_clamp; /* Maximal window to advertise */ __u32 rcv_ssthresh; /* Current window clamp */ diff --git a/include/net/inet_connection_sock.h b/include/net/inet_connection_sock.h index e50e2b890c6d..91888967d3e3 100644 --- a/include/net/inet_connection_sock.h +++ b/include/net/inet_connection_sock.h @@ -60,6 +60,7 @@ struct inet_connection_sock_af_ops { * @icsk_timeout: Timeout * @icsk_retransmit_timer: Resend (no ack) * @icsk_rto: Retransmit timeout + * @icsk_pmtu_cookie Last pmtu seen by socket * @icsk_ca_ops Pluggable congestion control hook * @icsk_af_ops Operations which are AF_INET{4,6} specific * @icsk_ca_state: Congestion control state @@ -68,6 +69,7 @@ struct inet_connection_sock_af_ops { * @icsk_backoff: Backoff * @icsk_syn_retries: Number of allowed SYN (or equivalent) retries * @icsk_probes_out: unanswered 0 window probes + * @icsk_ext_hdr_len: Network protocol overhead (IP/IPv6 options) * @icsk_ack: Delayed ACK control data */ struct inet_connection_sock { @@ -79,15 +81,17 @@ struct inet_connection_sock { struct timer_list icsk_retransmit_timer; struct timer_list icsk_delack_timer; __u32 icsk_rto; + __u32 icsk_pmtu_cookie; struct tcp_congestion_ops *icsk_ca_ops; struct inet_connection_sock_af_ops *icsk_af_ops; + unsigned int (*icsk_sync_mss)(struct sock *sk, u32 pmtu); __u8 icsk_ca_state; __u8 icsk_retransmits; __u8 icsk_pending; __u8 icsk_backoff; __u8 icsk_syn_retries; __u8 icsk_probes_out; - /* 2 BYTES HOLE, TRY TO PACK! */ + __u16 icsk_ext_hdr_len; struct { __u8 pending; /* ACK is pending */ __u8 quick; /* Scheduled number of quick acks */ diff --git a/include/net/protocol.h b/include/net/protocol.h index 357691f6a45f..a29cb29647d0 100644 --- a/include/net/protocol.h +++ b/include/net/protocol.h @@ -76,6 +76,7 @@ struct inet_protosw { }; #define INET_PROTOSW_REUSE 0x01 /* Are ports automatically reusable? */ #define INET_PROTOSW_PERMANENT 0x02 /* Permanent protocols are unremovable. */ +#define INET_PROTOSW_ICSK 0x04 /* Is this an inet_connection_sock? */ extern struct net_protocol *inet_protocol_base; extern struct net_protocol *inet_protos[MAX_INET_PROTOS]; diff --git a/net/dccp/diag.c b/net/dccp/diag.c index f675d8e642d3..3f78c00e3822 100644 --- a/net/dccp/diag.c +++ b/net/dccp/diag.c @@ -28,7 +28,7 @@ static void dccp_get_info(struct sock *sk, struct tcp_info *info) info->tcpi_retransmits = icsk->icsk_retransmits; info->tcpi_probes = icsk->icsk_probes_out; info->tcpi_backoff = icsk->icsk_backoff; - info->tcpi_pmtu = dp->dccps_pmtu_cookie; + info->tcpi_pmtu = icsk->icsk_pmtu_cookie; if (dp->dccps_options.dccpo_send_ack_vector) info->tcpi_options |= TCPI_OPT_SACK; diff --git a/net/dccp/input.c b/net/dccp/input.c index 9a724ff2a622..55e921bdd131 100644 --- a/net/dccp/input.c +++ b/net/dccp/input.c @@ -311,7 +311,7 @@ static int dccp_rcv_request_sent_state_process(struct sock *sk, goto out_invalid_packet; } - dccp_sync_mss(sk, dp->dccps_pmtu_cookie); + dccp_sync_mss(sk, icsk->icsk_pmtu_cookie); /* * Step 10: Process REQUEST state (second part) diff --git a/net/dccp/ipv4.c b/net/dccp/ipv4.c index 671fbf3b2379..c363051a7f16 100644 --- a/net/dccp/ipv4.c +++ b/net/dccp/ipv4.c @@ -104,9 +104,9 @@ int dccp_v4_connect(struct sock *sk, struct sockaddr *uaddr, int addr_len) inet->dport = usin->sin_port; inet->daddr = daddr; - dp->dccps_ext_header_len = 0; + inet_csk(sk)->icsk_ext_hdr_len = 0; if (inet->opt != NULL) - dp->dccps_ext_header_len = inet->opt->optlen; + inet_csk(sk)->icsk_ext_hdr_len = inet->opt->optlen; /* * Socket identity is still unknown (sport may be zero). * However we set state to DCCP_REQUESTING and not releasing socket @@ -191,7 +191,7 @@ static inline void dccp_do_pmtu_discovery(struct sock *sk, mtu = dst_mtu(dst); if (inet->pmtudisc != IP_PMTUDISC_DONT && - dp->dccps_pmtu_cookie > mtu) { + inet_csk(sk)->icsk_pmtu_cookie > mtu) { dccp_sync_mss(sk, mtu); /* @@ -1051,6 +1051,7 @@ struct inet_connection_sock_af_ops dccp_ipv4_af_ops = { int dccp_v4_init_sock(struct sock *sk) { struct dccp_sock *dp = dccp_sk(sk); + struct inet_connection_sock *icsk = inet_csk(sk); static int dccp_ctl_socket_init = 1; dccp_options_init(&dp->dccps_options); @@ -1090,10 +1091,11 @@ int dccp_v4_init_sock(struct sock *sk) dccp_ctl_socket_init = 0; dccp_init_xmit_timers(sk); - inet_csk(sk)->icsk_rto = DCCP_TIMEOUT_INIT; + icsk->icsk_rto = DCCP_TIMEOUT_INIT; sk->sk_state = DCCP_CLOSED; sk->sk_write_space = dccp_write_space; - inet_csk(sk)->icsk_af_ops = &dccp_ipv4_af_ops; + icsk->icsk_af_ops = &dccp_ipv4_af_ops; + icsk->icsk_sync_mss = dccp_sync_mss; dp->dccps_mss_cache = 536; dp->dccps_role = DCCP_ROLE_UNDEFINED; dp->dccps_service = DCCP_SERVICE_INVALID_VALUE; diff --git a/net/dccp/ipv6.c b/net/dccp/ipv6.c index 71bf04eb21e1..599b0be21515 100644 --- a/net/dccp/ipv6.c +++ b/net/dccp/ipv6.c @@ -88,6 +88,7 @@ static int dccp_v6_connect(struct sock *sk, struct sockaddr *uaddr, int addr_len) { struct sockaddr_in6 *usin = (struct sockaddr_in6 *) uaddr; + struct inet_connection_sock *icsk = inet_csk(sk); struct inet_sock *inet = inet_sk(sk); struct ipv6_pinfo *np = inet6_sk(sk); struct dccp_sock *dp = dccp_sk(sk); @@ -158,7 +159,7 @@ static int dccp_v6_connect(struct sock *sk, struct sockaddr *uaddr, */ if (addr_type == IPV6_ADDR_MAPPED) { - u32 exthdrlen = dp->dccps_ext_header_len; + u32 exthdrlen = icsk->icsk_ext_hdr_len; struct sockaddr_in sin; SOCK_DEBUG(sk, "connect: ipv4 mapped\n"); @@ -170,14 +171,14 @@ static int dccp_v6_connect(struct sock *sk, struct sockaddr *uaddr, sin.sin_port = usin->sin6_port; sin.sin_addr.s_addr = usin->sin6_addr.s6_addr32[3]; - inet_csk(sk)->icsk_af_ops = &dccp_ipv6_mapped; + icsk->icsk_af_ops = &dccp_ipv6_mapped; sk->sk_backlog_rcv = dccp_v4_do_rcv; err = dccp_v4_connect(sk, (struct sockaddr *)&sin, sizeof(sin)); if (err) { - dp->dccps_ext_header_len = exthdrlen; - inet_csk(sk)->icsk_af_ops = &dccp_ipv6_af_ops; + icsk->icsk_ext_hdr_len = exthdrlen; + icsk->icsk_af_ops = &dccp_ipv6_af_ops; sk->sk_backlog_rcv = dccp_v6_do_rcv; goto failure; } else { @@ -227,9 +228,10 @@ static int dccp_v6_connect(struct sock *sk, struct sockaddr *uaddr, ip6_dst_store(sk, dst, NULL); - dp->dccps_ext_header_len = 0; + icsk->icsk_ext_hdr_len = 0; if (np->opt) - dp->dccps_ext_header_len = np->opt->opt_flen + np->opt->opt_nflen; + icsk->icsk_ext_hdr_len = (np->opt->opt_flen + + np->opt->opt_nflen); inet->dport = usin->sin6_port; @@ -292,7 +294,6 @@ static void dccp_v6_err(struct sk_buff *skb, struct inet6_skb_parm *opt, np = inet6_sk(sk); if (type == ICMPV6_PKT_TOOBIG) { - struct dccp_sock *dp = dccp_sk(sk); struct dst_entry *dst = NULL; if (sock_owned_by_user(sk)) @@ -332,7 +333,7 @@ static void dccp_v6_err(struct sk_buff *skb, struct inet6_skb_parm *opt, } else dst_hold(dst); - if (dp->dccps_pmtu_cookie > dst_mtu(dst)) { + if (inet_csk(sk)->icsk_pmtu_cookie > dst_mtu(dst)) { dccp_sync_mss(sk, dst_mtu(dst)); } /* else let the usual retransmit timer handle it */ dst_release(dst); @@ -808,7 +809,7 @@ static struct sock *dccp_v6_request_recv_sock(struct sock *sk, worked with IPv6 icsk.icsk_af_ops. Sync it now. */ - dccp_sync_mss(newsk, newdp->dccps_pmtu_cookie); + dccp_sync_mss(newsk, inet_csk(newsk)->icsk_pmtu_cookie); return newsk; } @@ -916,10 +917,10 @@ static struct sock *dccp_v6_request_recv_sock(struct sock *sk, sock_kfree_s(sk, opt, opt->tot_len); } - newdp->dccps_ext_header_len = 0; + inet_csk(newsk)->icsk_ext_hdr_len = 0; if (newnp->opt) - newdp->dccps_ext_header_len = newnp->opt->opt_nflen + - newnp->opt->opt_flen; + inet_csk(newsk)->icsk_ext_hdr_len = (newnp->opt->opt_nflen + + newnp->opt->opt_flen); dccp_sync_mss(newsk, dst_mtu(dst)); @@ -1230,6 +1231,7 @@ static struct inet_protosw dccp_v6_protosw = { .prot = &dccp_v6_prot, .ops = &inet6_dccp_ops, .capability = -1, + .flags = INET_PROTOSW_ICSK, }; static int __init dccp_v6_init(void) diff --git a/net/dccp/output.c b/net/dccp/output.c index c40f7f8a328b..95a3c2c6a3ce 100644 --- a/net/dccp/output.c +++ b/net/dccp/output.c @@ -134,12 +134,13 @@ static int dccp_transmit_skb(struct sock *sk, struct sk_buff *skb) unsigned int dccp_sync_mss(struct sock *sk, u32 pmtu) { + struct inet_connection_sock *icsk = inet_csk(sk); struct dccp_sock *dp = dccp_sk(sk); - int mss_now = (pmtu - inet_csk(sk)->icsk_af_ops->net_header_len - + int mss_now = (pmtu - icsk->icsk_af_ops->net_header_len - sizeof(struct dccp_hdr) - sizeof(struct dccp_hdr_ext)); /* Now subtract optional transport overhead */ - mss_now -= dp->dccps_ext_header_len; + mss_now -= icsk->icsk_ext_hdr_len; /* * FIXME: this should come from the CCID infrastructure, where, say, @@ -152,7 +153,7 @@ unsigned int dccp_sync_mss(struct sock *sk, u32 pmtu) mss_now -= ((5 + 6 + 10 + 6 + 6 + 6 + 3) / 4) * 4; /* And store cached results */ - dp->dccps_pmtu_cookie = pmtu; + icsk->icsk_pmtu_cookie = pmtu; dp->dccps_mss_cache = mss_now; return mss_now; diff --git a/net/dccp/proto.c b/net/dccp/proto.c index 51dfacd22a6e..40a4c6899051 100644 --- a/net/dccp/proto.c +++ b/net/dccp/proto.c @@ -712,7 +712,7 @@ static struct inet_protosw dccp_v4_protosw = { .ops = &inet_dccp_ops, .capability = -1, .no_check = 0, - .flags = 0, + .flags = INET_PROTOSW_ICSK, }; /* diff --git a/net/ipv4/af_inet.c b/net/ipv4/af_inet.c index d368cf249000..617e858beff1 100644 --- a/net/ipv4/af_inet.c +++ b/net/ipv4/af_inet.c @@ -302,6 +302,7 @@ lookup_protocol: sk->sk_reuse = 1; inet = inet_sk(sk); + inet->is_icsk = INET_PROTOSW_ICSK & answer_flags; if (SOCK_RAW == sock->type) { inet->num = protocol; @@ -869,7 +870,8 @@ static struct inet_protosw inetsw_array[] = .ops = &inet_stream_ops, .capability = -1, .no_check = 0, - .flags = INET_PROTOSW_PERMANENT, + .flags = INET_PROTOSW_PERMANENT | + INET_PROTOSW_ICSK, }, { diff --git a/net/ipv4/ip_sockglue.c b/net/ipv4/ip_sockglue.c index 4f2d87257309..add019c746f8 100644 --- a/net/ipv4/ip_sockglue.c +++ b/net/ipv4/ip_sockglue.c @@ -29,8 +29,7 @@ #include #include #include -#include -#include +#include #include #include #include @@ -427,8 +426,8 @@ int ip_setsockopt(struct sock *sk, int level, int optname, char __user *optval, err = ip_options_get_from_user(&opt, optval, optlen); if (err) break; - if (sk->sk_type == SOCK_STREAM) { - struct tcp_sock *tp = tcp_sk(sk); + if (inet->is_icsk) { + struct inet_connection_sock *icsk = inet_csk(sk); #if defined(CONFIG_IPV6) || defined(CONFIG_IPV6_MODULE) if (sk->sk_family == PF_INET || (!((1 << sk->sk_state) & @@ -436,10 +435,10 @@ int ip_setsockopt(struct sock *sk, int level, int optname, char __user *optval, inet->daddr != LOOPBACK4_IPV6)) { #endif if (inet->opt) - tp->ext_header_len -= inet->opt->optlen; + icsk->icsk_ext_hdr_len -= inet->opt->optlen; if (opt) - tp->ext_header_len += opt->optlen; - tcp_sync_mss(sk, tp->pmtu_cookie); + icsk->icsk_ext_hdr_len += opt->optlen; + icsk->icsk_sync_mss(sk, icsk->icsk_pmtu_cookie); #if defined(CONFIG_IPV6) || defined(CONFIG_IPV6_MODULE) } #endif diff --git a/net/ipv4/tcp.c b/net/ipv4/tcp.c index eacfe6a3442c..00aa80e93243 100644 --- a/net/ipv4/tcp.c +++ b/net/ipv4/tcp.c @@ -1914,7 +1914,7 @@ void tcp_get_info(struct sock *sk, struct tcp_info *info) info->tcpi_last_data_recv = jiffies_to_msecs(now - icsk->icsk_ack.lrcvtime); info->tcpi_last_ack_recv = jiffies_to_msecs(now - tp->rcv_tstamp); - info->tcpi_pmtu = tp->pmtu_cookie; + info->tcpi_pmtu = icsk->icsk_pmtu_cookie; info->tcpi_rcv_ssthresh = tp->rcv_ssthresh; info->tcpi_rtt = jiffies_to_usecs(tp->srtt)>>3; info->tcpi_rttvar = jiffies_to_usecs(tp->mdev)>>2; diff --git a/net/ipv4/tcp_input.c b/net/ipv4/tcp_input.c index 7de6184d4bd8..981d1203b152 100644 --- a/net/ipv4/tcp_input.c +++ b/net/ipv4/tcp_input.c @@ -2342,7 +2342,7 @@ static int tcp_ack_update_window(struct sock *sk, struct tcp_sock *tp, if (nwin > tp->max_window) { tp->max_window = nwin; - tcp_sync_mss(sk, tp->pmtu_cookie); + tcp_sync_mss(sk, inet_csk(sk)->icsk_pmtu_cookie); } } } @@ -3967,12 +3967,12 @@ static int tcp_rcv_synsent_state_process(struct sock *sk, struct sk_buff *skb, struct tcphdr *th, unsigned len) { struct tcp_sock *tp = tcp_sk(sk); + struct inet_connection_sock *icsk = inet_csk(sk); int saved_clamp = tp->rx_opt.mss_clamp; tcp_parse_options(skb, &tp->rx_opt, 0); if (th->ack) { - struct inet_connection_sock *icsk; /* rfc793: * "If the state is SYN-SENT then * first check the ACK bit @@ -4061,7 +4061,7 @@ static int tcp_rcv_synsent_state_process(struct sock *sk, struct sk_buff *skb, if (tp->rx_opt.sack_ok && sysctl_tcp_fack) tp->rx_opt.sack_ok |= 2; - tcp_sync_mss(sk, tp->pmtu_cookie); + tcp_sync_mss(sk, icsk->icsk_pmtu_cookie); tcp_initialize_rcv_mss(sk); /* Remember, tcp_poll() does not lock socket! @@ -4071,8 +4071,6 @@ static int tcp_rcv_synsent_state_process(struct sock *sk, struct sk_buff *skb, mb(); tcp_set_state(sk, TCP_ESTABLISHED); - icsk = inet_csk(sk); - /* Make sure socket is routed, for correct metrics. */ icsk->icsk_af_ops->rebuild_header(sk); @@ -4173,7 +4171,7 @@ discard: if (tp->ecn_flags&TCP_ECN_OK) sock_set_flag(sk, SOCK_NO_LARGESEND); - tcp_sync_mss(sk, tp->pmtu_cookie); + tcp_sync_mss(sk, icsk->icsk_pmtu_cookie); tcp_initialize_rcv_mss(sk); diff --git a/net/ipv4/tcp_ipv4.c b/net/ipv4/tcp_ipv4.c index c2fe61becd61..9b62d80bb20f 100644 --- a/net/ipv4/tcp_ipv4.c +++ b/net/ipv4/tcp_ipv4.c @@ -220,9 +220,9 @@ int tcp_v4_connect(struct sock *sk, struct sockaddr *uaddr, int addr_len) inet->dport = usin->sin_port; inet->daddr = daddr; - tp->ext_header_len = 0; + inet_csk(sk)->icsk_ext_hdr_len = 0; if (inet->opt) - tp->ext_header_len = inet->opt->optlen; + inet_csk(sk)->icsk_ext_hdr_len = inet->opt->optlen; tp->rx_opt.mss_clamp = 536; @@ -275,7 +275,6 @@ static inline void do_pmtu_discovery(struct sock *sk, struct iphdr *iph, { struct dst_entry *dst; struct inet_sock *inet = inet_sk(sk); - struct tcp_sock *tp = tcp_sk(sk); /* We are not interested in TCP_LISTEN and open_requests (SYN-ACKs * send out by Linux are always <576bytes so they should go through @@ -304,7 +303,7 @@ static inline void do_pmtu_discovery(struct sock *sk, struct iphdr *iph, mtu = dst_mtu(dst); if (inet->pmtudisc != IP_PMTUDISC_DONT && - tp->pmtu_cookie > mtu) { + inet_csk(sk)->icsk_pmtu_cookie > mtu) { tcp_sync_mss(sk, mtu); /* Resend the TCP packet because it's @@ -895,9 +894,9 @@ struct sock *tcp_v4_syn_recv_sock(struct sock *sk, struct sk_buff *skb, ireq->opt = NULL; newinet->mc_index = inet_iif(skb); newinet->mc_ttl = skb->nh.iph->ttl; - newtp->ext_header_len = 0; + inet_csk(newsk)->icsk_ext_hdr_len = 0; if (newinet->opt) - newtp->ext_header_len = newinet->opt->optlen; + inet_csk(newsk)->icsk_ext_hdr_len = newinet->opt->optlen; newinet->id = newtp->write_seq ^ jiffies; tcp_sync_mss(newsk, dst_mtu(dst)); @@ -1266,6 +1265,7 @@ static int tcp_v4_init_sock(struct sock *sk) sock_set_flag(sk, SOCK_USE_WRITE_QUEUE); icsk->icsk_af_ops = &ipv4_specific; + icsk->icsk_sync_mss = tcp_sync_mss; sk->sk_sndbuf = sysctl_tcp_wmem[1]; sk->sk_rcvbuf = sysctl_tcp_rmem[1]; diff --git a/net/ipv4/tcp_output.c b/net/ipv4/tcp_output.c index af1946c52c37..3a0a914de917 100644 --- a/net/ipv4/tcp_output.c +++ b/net/ipv4/tcp_output.c @@ -621,7 +621,7 @@ int tcp_trim_head(struct sock *sk, struct sk_buff *skb, u32 len) It is minimum of user_mss and mss received with SYN. It also does not include TCP options. - tp->pmtu_cookie is last pmtu, seen by this function. + inet_csk(sk)->icsk_pmtu_cookie is last pmtu, seen by this function. tp->mss_cache is current effective sending mss, including all tcp options except for SACKs. It is evaluated, @@ -631,17 +631,18 @@ int tcp_trim_head(struct sock *sk, struct sk_buff *skb, u32 len) NOTE1. rfc1122 clearly states that advertised MSS DOES NOT include either tcp or ip options. - NOTE2. tp->pmtu_cookie and tp->mss_cache are READ ONLY outside - this function. --ANK (980731) + NOTE2. inet_csk(sk)->icsk_pmtu_cookie and tp->mss_cache + are READ ONLY outside this function. --ANK (980731) */ unsigned int tcp_sync_mss(struct sock *sk, u32 pmtu) { struct tcp_sock *tp = tcp_sk(sk); + struct inet_connection_sock *icsk = inet_csk(sk); /* Calculate base mss without TCP options: It is MMS_S - sizeof(tcphdr) of rfc1122 */ - int mss_now = (pmtu - inet_csk(sk)->icsk_af_ops->net_header_len - + int mss_now = (pmtu - icsk->icsk_af_ops->net_header_len - sizeof(struct tcphdr)); /* Clamp it (mss_clamp does not include tcp options) */ @@ -649,7 +650,7 @@ unsigned int tcp_sync_mss(struct sock *sk, u32 pmtu) mss_now = tp->rx_opt.mss_clamp; /* Now subtract optional transport overhead */ - mss_now -= tp->ext_header_len; + mss_now -= icsk->icsk_ext_hdr_len; /* Then reserve room for full set of TCP options and 8 bytes of data */ if (mss_now < 48) @@ -663,7 +664,7 @@ unsigned int tcp_sync_mss(struct sock *sk, u32 pmtu) mss_now = max((tp->max_window>>1), 68U - tp->tcp_header_len); /* And store cached results */ - tp->pmtu_cookie = pmtu; + icsk->icsk_pmtu_cookie = pmtu; tp->mss_cache = mss_now; return mss_now; @@ -693,7 +694,7 @@ unsigned int tcp_current_mss(struct sock *sk, int large_allowed) if (dst) { u32 mtu = dst_mtu(dst); - if (mtu != tp->pmtu_cookie) + if (mtu != inet_csk(sk)->icsk_pmtu_cookie) mss_now = tcp_sync_mss(sk, mtu); } @@ -706,7 +707,8 @@ unsigned int tcp_current_mss(struct sock *sk, int large_allowed) if (doing_tso) { xmit_size_goal = (65535 - inet_csk(sk)->icsk_af_ops->net_header_len - - tp->ext_header_len - tp->tcp_header_len); + inet_csk(sk)->icsk_ext_hdr_len - + tp->tcp_header_len); if (tp->max_window && (xmit_size_goal > (tp->max_window >> 1))) diff --git a/net/ipv6/af_inet6.c b/net/ipv6/af_inet6.c index bf17aab9b776..70a510ff31ee 100644 --- a/net/ipv6/af_inet6.c +++ b/net/ipv6/af_inet6.c @@ -167,6 +167,7 @@ lookup_protocol: sk->sk_reuse = 1; inet = inet_sk(sk); + inet->is_icsk = INET_PROTOSW_ICSK & answer_flags; if (SOCK_RAW == sock->type) { inet->num = protocol; diff --git a/net/ipv6/ipv6_sockglue.c b/net/ipv6/ipv6_sockglue.c index b6b63fa8454c..c63868dd2ca2 100644 --- a/net/ipv6/ipv6_sockglue.c +++ b/net/ipv6/ipv6_sockglue.c @@ -163,17 +163,17 @@ int ipv6_setsockopt(struct sock *sk, int level, int optname, sk_refcnt_debug_dec(sk); if (sk->sk_protocol == IPPROTO_TCP) { - struct tcp_sock *tp = tcp_sk(sk); + struct inet_connection_sock *icsk = inet_csk(sk); local_bh_disable(); sock_prot_dec_use(sk->sk_prot); sock_prot_inc_use(&tcp_prot); local_bh_enable(); sk->sk_prot = &tcp_prot; - inet_csk(sk)->icsk_af_ops = &ipv4_specific; + icsk->icsk_af_ops = &ipv4_specific; sk->sk_socket->ops = &inet_stream_ops; sk->sk_family = PF_INET; - tcp_sync_mss(sk, tp->pmtu_cookie); + tcp_sync_mss(sk, icsk->icsk_pmtu_cookie); } else { local_bh_disable(); sock_prot_dec_use(sk->sk_prot); @@ -317,14 +317,15 @@ int ipv6_setsockopt(struct sock *sk, int level, int optname, } retv = 0; - if (sk->sk_type == SOCK_STREAM) { + if (inet_sk(sk)->is_icsk) { if (opt) { - struct tcp_sock *tp = tcp_sk(sk); + struct inet_connection_sock *icsk = inet_csk(sk); if (!((1 << sk->sk_state) & (TCPF_LISTEN | TCPF_CLOSE)) && inet_sk(sk)->daddr != LOOPBACK4_IPV6) { - tp->ext_header_len = opt->opt_flen + opt->opt_nflen; - tcp_sync_mss(sk, tp->pmtu_cookie); + icsk->icsk_ext_hdr_len = + opt->opt_flen + opt->opt_nflen; + icsk->icsk_sync_mss(sk, icsk->icsk_pmtu_cookie); } } opt = xchg(&np->opt, opt); @@ -380,14 +381,15 @@ sticky_done: goto done; update: retv = 0; - if (sk->sk_type == SOCK_STREAM) { + if (inet_sk(sk)->is_icsk) { if (opt) { - struct tcp_sock *tp = tcp_sk(sk); + struct inet_connection_sock *icsk = inet_csk(sk); if (!((1 << sk->sk_state) & (TCPF_LISTEN | TCPF_CLOSE)) && inet_sk(sk)->daddr != LOOPBACK4_IPV6) { - tp->ext_header_len = opt->opt_flen + opt->opt_nflen; - tcp_sync_mss(sk, tp->pmtu_cookie); + icsk->icsk_ext_hdr_len = + opt->opt_flen + opt->opt_nflen; + icsk->icsk_sync_mss(sk, icsk->icsk_pmtu_cookie); } } opt = xchg(&np->opt, opt); diff --git a/net/ipv6/tcp_ipv6.c b/net/ipv6/tcp_ipv6.c index a682eb9093e1..2947bc56d8a0 100644 --- a/net/ipv6/tcp_ipv6.c +++ b/net/ipv6/tcp_ipv6.c @@ -123,7 +123,8 @@ static int tcp_v6_connect(struct sock *sk, struct sockaddr *uaddr, int addr_len) { struct sockaddr_in6 *usin = (struct sockaddr_in6 *) uaddr; - struct inet_sock *inet = inet_sk(sk); + struct inet_sock *inet = inet_sk(sk); + struct inet_connection_sock *icsk = inet_csk(sk); struct ipv6_pinfo *np = inet6_sk(sk); struct tcp_sock *tp = tcp_sk(sk); struct in6_addr *saddr = NULL, *final_p = NULL, final; @@ -198,7 +199,7 @@ static int tcp_v6_connect(struct sock *sk, struct sockaddr *uaddr, */ if (addr_type == IPV6_ADDR_MAPPED) { - u32 exthdrlen = tp->ext_header_len; + u32 exthdrlen = icsk->icsk_ext_hdr_len; struct sockaddr_in sin; SOCK_DEBUG(sk, "connect: ipv4 mapped\n"); @@ -210,14 +211,14 @@ static int tcp_v6_connect(struct sock *sk, struct sockaddr *uaddr, sin.sin_port = usin->sin6_port; sin.sin_addr.s_addr = usin->sin6_addr.s6_addr32[3]; - inet_csk(sk)->icsk_af_ops = &ipv6_mapped; + icsk->icsk_af_ops = &ipv6_mapped; sk->sk_backlog_rcv = tcp_v4_do_rcv; err = tcp_v4_connect(sk, (struct sockaddr *)&sin, sizeof(sin)); if (err) { - tp->ext_header_len = exthdrlen; - inet_csk(sk)->icsk_af_ops = &ipv6_specific; + icsk->icsk_ext_hdr_len = exthdrlen; + icsk->icsk_af_ops = &ipv6_specific; sk->sk_backlog_rcv = tcp_v6_do_rcv; goto failure; } else { @@ -270,9 +271,10 @@ static int tcp_v6_connect(struct sock *sk, struct sockaddr *uaddr, sk->sk_route_caps = dst->dev->features & ~(NETIF_F_IP_CSUM | NETIF_F_TSO); - tp->ext_header_len = 0; + icsk->icsk_ext_hdr_len = 0; if (np->opt) - tp->ext_header_len = np->opt->opt_flen + np->opt->opt_nflen; + icsk->icsk_ext_hdr_len = (np->opt->opt_flen + + np->opt->opt_nflen); tp->rx_opt.mss_clamp = IPV6_MIN_MTU - sizeof(struct tcphdr) - sizeof(struct ipv6hdr); @@ -385,7 +387,7 @@ static void tcp_v6_err(struct sk_buff *skb, struct inet6_skb_parm *opt, } else dst_hold(dst); - if (tp->pmtu_cookie > dst_mtu(dst)) { + if (inet_csk(sk)->icsk_pmtu_cookie > dst_mtu(dst)) { tcp_sync_mss(sk, dst_mtu(dst)); tcp_simple_retransmit(sk); } /* else let the usual retransmit timer handle it */ @@ -869,7 +871,7 @@ static struct sock * tcp_v6_syn_recv_sock(struct sock *sk, struct sk_buff *skb, worked with IPv6 icsk.icsk_af_ops. Sync it now. */ - tcp_sync_mss(newsk, newtp->pmtu_cookie); + tcp_sync_mss(newsk, inet_csk(newsk)->icsk_pmtu_cookie); return newsk; } @@ -976,10 +978,10 @@ static struct sock * tcp_v6_syn_recv_sock(struct sock *sk, struct sk_buff *skb, sock_kfree_s(sk, opt, opt->tot_len); } - newtp->ext_header_len = 0; + inet_csk(newsk)->icsk_ext_hdr_len = 0; if (newnp->opt) - newtp->ext_header_len = newnp->opt->opt_nflen + - newnp->opt->opt_flen; + inet_csk(newsk)->icsk_ext_hdr_len = (newnp->opt->opt_nflen + + newnp->opt->opt_flen); tcp_sync_mss(newsk, dst_mtu(dst)); newtp->advmss = dst_metric(dst, RTAX_ADVMSS); @@ -1361,6 +1363,7 @@ static int tcp_v6_init_sock(struct sock *sk) icsk->icsk_af_ops = &ipv6_specific; icsk->icsk_ca_ops = &tcp_init_congestion_ops; + icsk->icsk_sync_mss = tcp_sync_mss; sk->sk_write_space = sk_stream_write_space; sock_set_flag(sk, SOCK_USE_WRITE_QUEUE); @@ -1591,7 +1594,8 @@ static struct inet_protosw tcpv6_protosw = { .ops = &inet6_stream_ops, .capability = -1, .no_check = 0, - .flags = INET_PROTOSW_PERMANENT, + .flags = INET_PROTOSW_PERMANENT | + INET_PROTOSW_ICSK, }; void __init tcpv6_init(void) -- cgit v1.2.3 From fbe9cc4a87030d5cad5f944ffaef6af7efd119e4 Mon Sep 17 00:00:00 2001 From: "David S. Miller" Date: Tue, 13 Dec 2005 23:26:29 -0800 Subject: [AF_UNIX]: Use spinlock for unix_table_lock This lock is actually taken mostly as a writer, so using a rwlock actually just makes performance worse especially on chips like the Intel P4. Signed-off-by: David S. Miller --- include/net/af_unix.h | 2 +- net/unix/af_unix.c | 34 +++++++++++++++++----------------- net/unix/garbage.c | 4 ++-- 3 files changed, 20 insertions(+), 20 deletions(-) (limited to 'include/net') diff --git a/include/net/af_unix.h b/include/net/af_unix.h index b5d785ab4a0e..3f302ae98c03 100644 --- a/include/net/af_unix.h +++ b/include/net/af_unix.h @@ -13,7 +13,7 @@ extern void unix_gc(void); #define UNIX_HASH_SIZE 256 extern struct hlist_head unix_socket_table[UNIX_HASH_SIZE + 1]; -extern rwlock_t unix_table_lock; +extern spinlock_t unix_table_lock; extern atomic_t unix_tot_inflight; diff --git a/net/unix/af_unix.c b/net/unix/af_unix.c index 1dc3685048f3..04e850e04e3d 100644 --- a/net/unix/af_unix.c +++ b/net/unix/af_unix.c @@ -121,7 +121,7 @@ int sysctl_unix_max_dgram_qlen = 10; struct hlist_head unix_socket_table[UNIX_HASH_SIZE + 1]; -DEFINE_RWLOCK(unix_table_lock); +DEFINE_SPINLOCK(unix_table_lock); static atomic_t unix_nr_socks = ATOMIC_INIT(0); #define unix_sockets_unbound (&unix_socket_table[UNIX_HASH_SIZE]) @@ -130,7 +130,7 @@ static atomic_t unix_nr_socks = ATOMIC_INIT(0); /* * SMP locking strategy: - * hash table is protected with rwlock unix_table_lock + * hash table is protected with spinlock unix_table_lock * each socket state is protected by separate rwlock. */ @@ -214,16 +214,16 @@ static void __unix_insert_socket(struct hlist_head *list, struct sock *sk) static inline void unix_remove_socket(struct sock *sk) { - write_lock(&unix_table_lock); + spin_lock(&unix_table_lock); __unix_remove_socket(sk); - write_unlock(&unix_table_lock); + spin_unlock(&unix_table_lock); } static inline void unix_insert_socket(struct hlist_head *list, struct sock *sk) { - write_lock(&unix_table_lock); + spin_lock(&unix_table_lock); __unix_insert_socket(list, sk); - write_unlock(&unix_table_lock); + spin_unlock(&unix_table_lock); } static struct sock *__unix_find_socket_byname(struct sockaddr_un *sunname, @@ -250,11 +250,11 @@ static inline struct sock *unix_find_socket_byname(struct sockaddr_un *sunname, { struct sock *s; - read_lock(&unix_table_lock); + spin_lock(&unix_table_lock); s = __unix_find_socket_byname(sunname, len, type, hash); if (s) sock_hold(s); - read_unlock(&unix_table_lock); + spin_unlock(&unix_table_lock); return s; } @@ -263,7 +263,7 @@ static struct sock *unix_find_socket_byinode(struct inode *i) struct sock *s; struct hlist_node *node; - read_lock(&unix_table_lock); + spin_lock(&unix_table_lock); sk_for_each(s, node, &unix_socket_table[i->i_ino & (UNIX_HASH_SIZE - 1)]) { struct dentry *dentry = unix_sk(s)->dentry; @@ -276,7 +276,7 @@ static struct sock *unix_find_socket_byinode(struct inode *i) } s = NULL; found: - read_unlock(&unix_table_lock); + spin_unlock(&unix_table_lock); return s; } @@ -642,12 +642,12 @@ retry: addr->len = sprintf(addr->name->sun_path+1, "%05x", ordernum) + 1 + sizeof(short); addr->hash = unix_hash_fold(csum_partial((void*)addr->name, addr->len, 0)); - write_lock(&unix_table_lock); + spin_lock(&unix_table_lock); ordernum = (ordernum+1)&0xFFFFF; if (__unix_find_socket_byname(addr->name, addr->len, sock->type, addr->hash)) { - write_unlock(&unix_table_lock); + spin_unlock(&unix_table_lock); /* Sanity yield. It is unusual case, but yet... */ if (!(ordernum&0xFF)) yield(); @@ -658,7 +658,7 @@ retry: __unix_remove_socket(sk); u->addr = addr; __unix_insert_socket(&unix_socket_table[addr->hash], sk); - write_unlock(&unix_table_lock); + spin_unlock(&unix_table_lock); err = 0; out: up(&u->readsem); @@ -791,7 +791,7 @@ static int unix_bind(struct socket *sock, struct sockaddr *uaddr, int addr_len) addr->hash = UNIX_HASH_SIZE; } - write_lock(&unix_table_lock); + spin_lock(&unix_table_lock); if (!sunaddr->sun_path[0]) { err = -EADDRINUSE; @@ -814,7 +814,7 @@ static int unix_bind(struct socket *sock, struct sockaddr *uaddr, int addr_len) __unix_insert_socket(list, sk); out_unlock: - write_unlock(&unix_table_lock); + spin_unlock(&unix_table_lock); out_up: up(&u->readsem); out: @@ -1916,7 +1916,7 @@ static struct sock *unix_seq_idx(int *iter, loff_t pos) static void *unix_seq_start(struct seq_file *seq, loff_t *pos) { - read_lock(&unix_table_lock); + spin_lock(&unix_table_lock); return *pos ? unix_seq_idx(seq->private, *pos - 1) : ((void *) 1); } @@ -1931,7 +1931,7 @@ static void *unix_seq_next(struct seq_file *seq, void *v, loff_t *pos) static void unix_seq_stop(struct seq_file *seq, void *v) { - read_unlock(&unix_table_lock); + spin_unlock(&unix_table_lock); } static int unix_seq_show(struct seq_file *seq, void *v) diff --git a/net/unix/garbage.c b/net/unix/garbage.c index 6ffc64e1712d..411802bd4d37 100644 --- a/net/unix/garbage.c +++ b/net/unix/garbage.c @@ -182,7 +182,7 @@ void unix_gc(void) if (down_trylock(&unix_gc_sem)) return; - read_lock(&unix_table_lock); + spin_lock(&unix_table_lock); forall_unix_sockets(i, s) { @@ -301,7 +301,7 @@ void unix_gc(void) } u->gc_tree = GC_ORPHAN; } - read_unlock(&unix_table_lock); + spin_unlock(&unix_table_lock); /* * Here we are. Hitlist is filled. Die. -- cgit v1.2.3 From 52ccb8e90c0ace233b8b740f2fc5de0dbd706b27 Mon Sep 17 00:00:00 2001 From: Frank Filz Date: Thu, 22 Dec 2005 11:36:46 -0800 Subject: [SCTP]: Update SCTP_PEER_ADDR_PARAMS socket option to the latest api draft. This patch adds support to set/get heartbeat interval, maximum number of retransmissions, pathmtu, sackdelay time for a particular transport/ association/socket as per the latest SCTP sockets api draft11. Signed-off-by: Frank Filz Signed-off-by: Sridhar Samudrala Signed-off-by: David S. Miller --- include/net/sctp/structs.h | 76 +++++-- include/net/sctp/user.h | 16 ++ net/sctp/associola.c | 81 ++++--- net/sctp/input.c | 36 +++- net/sctp/output.c | 17 +- net/sctp/sm_sideeffect.c | 26 ++- net/sctp/sm_statefuns.c | 20 +- net/sctp/socket.c | 511 ++++++++++++++++++++++++++++++++++----------- net/sctp/transport.c | 32 +-- 9 files changed, 595 insertions(+), 220 deletions(-) (limited to 'include/net') diff --git a/include/net/sctp/structs.h b/include/net/sctp/structs.h index 8e7794ee27ff..f5c22d77feab 100644 --- a/include/net/sctp/structs.h +++ b/include/net/sctp/structs.h @@ -277,6 +277,24 @@ struct sctp_sock { __u32 default_context; __u32 default_timetolive; + /* Heartbeat interval: The endpoint sends out a Heartbeat chunk to + * the destination address every heartbeat interval. This value + * will be inherited by all new associations. + */ + __u32 hbinterval; + + /* This is the max_retrans value for new associations. */ + __u16 pathmaxrxt; + + /* The initial Path MTU to use for new associations. */ + __u32 pathmtu; + + /* The default SACK delay timeout for new associations. */ + __u32 sackdelay; + + /* Flags controling Heartbeat, SACK delay, and Path MTU Discovery. */ + __u32 param_flags; + struct sctp_initmsg initmsg; struct sctp_rtoinfo rtoinfo; struct sctp_paddrparams paddrparam; @@ -845,9 +863,6 @@ struct sctp_transport { /* Data that has been sent, but not acknowledged. */ __u32 flight_size; - /* PMTU : The current known path MTU. */ - __u32 pmtu; - /* Destination */ struct dst_entry *dst; /* Source address. */ @@ -862,7 +877,22 @@ struct sctp_transport { /* Heartbeat interval: The endpoint sends out a Heartbeat chunk to * the destination address every heartbeat interval. */ - int hb_interval; + __u32 hbinterval; + + /* This is the max_retrans value for the transport and will + * be initialized from the assocs value. This can be changed + * using SCTP_SET_PEER_ADDR_PARAMS socket option. + */ + __u16 pathmaxrxt; + + /* PMTU : The current known path MTU. */ + __u32 pathmtu; + + /* SACK delay timeout */ + __u32 sackdelay; + + /* Flags controling Heartbeat, SACK delay, and Path MTU Discovery. */ + __u32 param_flags; /* When was the last time (in jiffies) that we heard from this * transport? We use this to pick new active and retran paths. @@ -882,22 +912,11 @@ struct sctp_transport { */ int state; - /* hb_allowed : The current heartbeat state of this destination, - * : i.e. ALLOW-HB, NO-HEARTBEAT, etc. - */ - int hb_allowed; - /* These are the error stats for this destination. */ /* Error count : The current error count for this destination. */ unsigned short error_count; - /* This is the max_retrans value for the transport and will - * be initialized to proto.max_retrans.path. This can be changed - * using SCTP_SET_PEER_ADDR_PARAMS socket option. - */ - int max_retrans; - /* Per : A timer used by each destination. * Destination : * Timer : @@ -1502,6 +1521,28 @@ struct sctp_association { /* The largest timeout or RTO value to use in attempting an INIT */ __u16 max_init_timeo; + /* Heartbeat interval: The endpoint sends out a Heartbeat chunk to + * the destination address every heartbeat interval. This value + * will be inherited by all new transports. + */ + __u32 hbinterval; + + /* This is the max_retrans value for new transports in the + * association. + */ + __u16 pathmaxrxt; + + /* Association : The smallest PMTU discovered for all of the + * PMTU : peer's transport addresses. + */ + __u32 pathmtu; + + /* SACK delay timeout */ + __u32 sackdelay; + + /* Flags controling Heartbeat, SACK delay, and Path MTU Discovery. */ + __u32 param_flags; + int timeouts[SCTP_NUM_TIMEOUT_TYPES]; struct timer_list timers[SCTP_NUM_TIMEOUT_TYPES]; @@ -1571,11 +1612,6 @@ struct sctp_association { */ wait_queue_head_t wait; - /* Association : The smallest PMTU discovered for all of the - * PMTU : peer's transport addresses. - */ - __u32 pmtu; - /* The message size at which SCTP fragmentation will occur. */ __u32 frag_point; diff --git a/include/net/sctp/user.h b/include/net/sctp/user.h index f1c3bc54526a..b9052864fa5a 100644 --- a/include/net/sctp/user.h +++ b/include/net/sctp/user.h @@ -503,11 +503,27 @@ struct sctp_setadaption { * unreachable. The following structure is used to access and modify an * address's parameters: */ +enum sctp_spp_flags { + SPP_HB_ENABLE = 1, /*Enable heartbeats*/ + SPP_HB_DISABLE = 2, /*Disable heartbeats*/ + SPP_HB = SPP_HB_ENABLE | SPP_HB_DISABLE, + SPP_HB_DEMAND = 4, /*Send heartbeat immediately*/ + SPP_PMTUD_ENABLE = 8, /*Enable PMTU discovery*/ + SPP_PMTUD_DISABLE = 16, /*Disable PMTU discovery*/ + SPP_PMTUD = SPP_PMTUD_ENABLE | SPP_PMTUD_DISABLE, + SPP_SACKDELAY_ENABLE = 32, /*Enable SACK*/ + SPP_SACKDELAY_DISABLE = 64, /*Disable SACK*/ + SPP_SACKDELAY = SPP_SACKDELAY_ENABLE | SPP_SACKDELAY_DISABLE, +}; + struct sctp_paddrparams { sctp_assoc_t spp_assoc_id; struct sockaddr_storage spp_address; __u32 spp_hbinterval; __u16 spp_pathmaxrxt; + __u32 spp_pathmtu; + __u32 spp_sackdelay; + __u32 spp_flags; } __attribute__((packed, aligned(4))); /* diff --git a/net/sctp/associola.c b/net/sctp/associola.c index dec68a604773..9d05e13e92f6 100644 --- a/net/sctp/associola.c +++ b/net/sctp/associola.c @@ -110,7 +110,6 @@ static struct sctp_association *sctp_association_init(struct sctp_association *a asoc->cookie_life.tv_sec = sp->assocparams.sasoc_cookie_life / 1000; asoc->cookie_life.tv_usec = (sp->assocparams.sasoc_cookie_life % 1000) * 1000; - asoc->pmtu = 0; asoc->frag_point = 0; /* Set the association max_retrans and RTO values from the @@ -123,6 +122,25 @@ static struct sctp_association *sctp_association_init(struct sctp_association *a asoc->overall_error_count = 0; + /* Initialize the association's heartbeat interval based on the + * sock configured value. + */ + asoc->hbinterval = msecs_to_jiffies(sp->hbinterval); + + /* Initialize path max retrans value. */ + asoc->pathmaxrxt = sp->pathmaxrxt; + + /* Initialize default path MTU. */ + asoc->pathmtu = sp->pathmtu; + + /* Set association default SACK delay */ + asoc->sackdelay = msecs_to_jiffies(sp->sackdelay); + + /* Set the association default flags controlling + * Heartbeat, SACK delay, and Path MTU Discovery. + */ + asoc->param_flags = sp->param_flags; + /* Initialize the maximum mumber of new data packets that can be sent * in a burst. */ @@ -144,8 +162,7 @@ static struct sctp_association *sctp_association_init(struct sctp_association *a = 5 * asoc->rto_max; asoc->timeouts[SCTP_EVENT_TIMEOUT_HEARTBEAT] = 0; - asoc->timeouts[SCTP_EVENT_TIMEOUT_SACK] = - SCTP_DEFAULT_TIMEOUT_SACK; + asoc->timeouts[SCTP_EVENT_TIMEOUT_SACK] = asoc->sackdelay; asoc->timeouts[SCTP_EVENT_TIMEOUT_AUTOCLOSE] = sp->autoclose * HZ; @@ -540,23 +557,46 @@ struct sctp_transport *sctp_assoc_add_peer(struct sctp_association *asoc, sctp_transport_set_owner(peer, asoc); + /* Initialize the peer's heartbeat interval based on the + * association configured value. + */ + peer->hbinterval = asoc->hbinterval; + + /* Set the path max_retrans. */ + peer->pathmaxrxt = asoc->pathmaxrxt; + + /* Initialize the peer's SACK delay timeout based on the + * association configured value. + */ + peer->sackdelay = asoc->sackdelay; + + /* Enable/disable heartbeat, SACK delay, and path MTU discovery + * based on association setting. + */ + peer->param_flags = asoc->param_flags; + /* Initialize the pmtu of the transport. */ - sctp_transport_pmtu(peer); + if (peer->param_flags & SPP_PMTUD_ENABLE) + sctp_transport_pmtu(peer); + else if (asoc->pathmtu) + peer->pathmtu = asoc->pathmtu; + else + peer->pathmtu = SCTP_DEFAULT_MAXSEGMENT; /* If this is the first transport addr on this association, * initialize the association PMTU to the peer's PMTU. * If not and the current association PMTU is higher than the new * peer's PMTU, reset the association PMTU to the new peer's PMTU. */ - if (asoc->pmtu) - asoc->pmtu = min_t(int, peer->pmtu, asoc->pmtu); + if (asoc->pathmtu) + asoc->pathmtu = min_t(int, peer->pathmtu, asoc->pathmtu); else - asoc->pmtu = peer->pmtu; + asoc->pathmtu = peer->pathmtu; SCTP_DEBUG_PRINTK("sctp_assoc_add_peer:association %p PMTU set to " - "%d\n", asoc, asoc->pmtu); + "%d\n", asoc, asoc->pathmtu); - asoc->frag_point = sctp_frag_point(sp, asoc->pmtu); + asoc->frag_point = sctp_frag_point(sp, asoc->pathmtu); /* The asoc->peer.port might not be meaningful yet, but * initialize the packet structure anyway. @@ -574,7 +614,7 @@ struct sctp_transport *sctp_assoc_add_peer(struct sctp_association *asoc, * (for example, implementations MAY use the size of the * receiver advertised window). */ - peer->cwnd = min(4*asoc->pmtu, max_t(__u32, 2*asoc->pmtu, 4380)); + peer->cwnd = min(4*asoc->pathmtu, max_t(__u32, 2*asoc->pathmtu, 4380)); /* At this point, we may not have the receiver's advertised window, * so initialize ssthresh to the default value and it will be set @@ -585,17 +625,6 @@ struct sctp_transport *sctp_assoc_add_peer(struct sctp_association *asoc, peer->partial_bytes_acked = 0; peer->flight_size = 0; - /* By default, enable heartbeat for peer address. */ - peer->hb_allowed = 1; - - /* Initialize the peer's heartbeat interval based on the - * sock configured value. - */ - peer->hb_interval = msecs_to_jiffies(sp->paddrparam.spp_hbinterval); - - /* Set the path max_retrans. */ - peer->max_retrans = sp->paddrparam.spp_pathmaxrxt; - /* Set the transport's RTO.initial value */ peer->rto = asoc->rto_initial; @@ -1155,18 +1184,18 @@ void sctp_assoc_sync_pmtu(struct sctp_association *asoc) /* Get the lowest pmtu of all the transports. */ list_for_each(pos, &asoc->peer.transport_addr_list) { t = list_entry(pos, struct sctp_transport, transports); - if (!pmtu || (t->pmtu < pmtu)) - pmtu = t->pmtu; + if (!pmtu || (t->pathmtu < pmtu)) + pmtu = t->pathmtu; } if (pmtu) { struct sctp_sock *sp = sctp_sk(asoc->base.sk); - asoc->pmtu = pmtu; + asoc->pathmtu = pmtu; asoc->frag_point = sctp_frag_point(sp, pmtu); } SCTP_DEBUG_PRINTK("%s: asoc:%p, pmtu:%d, frag_point:%d\n", - __FUNCTION__, asoc, asoc->pmtu, asoc->frag_point); + __FUNCTION__, asoc, asoc->pathmtu, asoc->frag_point); } /* Should we send a SACK to update our peer? */ @@ -1179,7 +1208,7 @@ static inline int sctp_peer_needs_update(struct sctp_association *asoc) case SCTP_STATE_SHUTDOWN_SENT: if ((asoc->rwnd > asoc->a_rwnd) && ((asoc->rwnd - asoc->a_rwnd) >= - min_t(__u32, (asoc->base.sk->sk_rcvbuf >> 1), asoc->pmtu))) + min_t(__u32, (asoc->base.sk->sk_rcvbuf >> 1), asoc->pathmtu))) return 1; break; default: diff --git a/net/sctp/input.c b/net/sctp/input.c index b24ff2c1aef5..238f1bffa684 100644 --- a/net/sctp/input.c +++ b/net/sctp/input.c @@ -305,18 +305,36 @@ int sctp_backlog_rcv(struct sock *sk, struct sk_buff *skb) void sctp_icmp_frag_needed(struct sock *sk, struct sctp_association *asoc, struct sctp_transport *t, __u32 pmtu) { - if (unlikely(pmtu < SCTP_DEFAULT_MINSEGMENT)) { - printk(KERN_WARNING "%s: Reported pmtu %d too low, " - "using default minimum of %d\n", __FUNCTION__, pmtu, - SCTP_DEFAULT_MINSEGMENT); - pmtu = SCTP_DEFAULT_MINSEGMENT; - } + if (sock_owned_by_user(sk) || !t || (t->pathmtu == pmtu)) + return; - if (!sock_owned_by_user(sk) && t && (t->pmtu != pmtu)) { - t->pmtu = pmtu; + if (t->param_flags & SPP_PMTUD_ENABLE) { + if (unlikely(pmtu < SCTP_DEFAULT_MINSEGMENT)) { + printk(KERN_WARNING "%s: Reported pmtu %d too low, " + "using default minimum of %d\n", + __FUNCTION__, pmtu, + SCTP_DEFAULT_MINSEGMENT); + /* Use default minimum segment size and disable + * pmtu discovery on this transport. + */ + t->pathmtu = SCTP_DEFAULT_MINSEGMENT; + t->param_flags = (t->param_flags & ~SPP_HB) | + SPP_PMTUD_DISABLE; + } else { + t->pathmtu = pmtu; + } + + /* Update association pmtu. */ sctp_assoc_sync_pmtu(asoc); - sctp_retransmit(&asoc->outqueue, t, SCTP_RTXR_PMTUD); } + + /* Retransmit with the new pmtu setting. + * Normally, if PMTU discovery is disabled, an ICMP Fragmentation + * Needed will never be sent, but if a message was sent before + * PMTU discovery was disabled that was larger than the PMTU, it + * would not be fragmented, so it must be re-transmitted fragmented. + */ + sctp_retransmit(&asoc->outqueue, t, SCTP_RTXR_PMTUD); } /* diff --git a/net/sctp/output.c b/net/sctp/output.c index 931371633464..a40991ef72c9 100644 --- a/net/sctp/output.c +++ b/net/sctp/output.c @@ -234,8 +234,8 @@ sctp_xmit_t sctp_packet_append_chunk(struct sctp_packet *packet, goto finish; pmtu = ((packet->transport->asoc) ? - (packet->transport->asoc->pmtu) : - (packet->transport->pmtu)); + (packet->transport->asoc->pathmtu) : + (packet->transport->pathmtu)); too_big = (psize + chunk_len > pmtu); @@ -482,7 +482,9 @@ int sctp_packet_transmit(struct sctp_packet *packet) if (!dst || (dst->obsolete > 1)) { dst_release(dst); sctp_transport_route(tp, NULL, sctp_sk(sk)); - sctp_assoc_sync_pmtu(asoc); + if (asoc->param_flags & SPP_PMTUD_ENABLE) { + sctp_assoc_sync_pmtu(asoc); + } } nskb->dst = dst_clone(tp->dst); @@ -492,7 +494,10 @@ int sctp_packet_transmit(struct sctp_packet *packet) SCTP_DEBUG_PRINTK("***sctp_transmit_packet*** skb len %d\n", nskb->len); - (*tp->af_specific->sctp_xmit)(nskb, tp, packet->ipfragok); + if (tp->param_flags & SPP_PMTUD_ENABLE) + (*tp->af_specific->sctp_xmit)(nskb, tp, packet->ipfragok); + else + (*tp->af_specific->sctp_xmit)(nskb, tp, 1); out: packet->size = packet->overhead; @@ -577,7 +582,7 @@ static sctp_xmit_t sctp_packet_append_data(struct sctp_packet *packet, * if ((flightsize + Max.Burst * MTU) < cwnd) * cwnd = flightsize + Max.Burst * MTU */ - max_burst_bytes = asoc->max_burst * asoc->pmtu; + max_burst_bytes = asoc->max_burst * asoc->pathmtu; if ((transport->flight_size + max_burst_bytes) < transport->cwnd) { transport->cwnd = transport->flight_size + max_burst_bytes; SCTP_DEBUG_PRINTK("%s: cwnd limited by max_burst: " @@ -622,7 +627,7 @@ static sctp_xmit_t sctp_packet_append_data(struct sctp_packet *packet, * data will fit or delay in hopes of bundling a full * sized packet. */ - if (len < asoc->pmtu - packet->overhead) { + if (len < asoc->pathmtu - packet->overhead) { retval = SCTP_XMIT_NAGLE_DELAY; goto finish; } diff --git a/net/sctp/sm_sideeffect.c b/net/sctp/sm_sideeffect.c index 823947170a33..2d7d8a5db2ac 100644 --- a/net/sctp/sm_sideeffect.c +++ b/net/sctp/sm_sideeffect.c @@ -157,9 +157,12 @@ static int sctp_gen_sack(struct sctp_association *asoc, int force, { __u32 ctsn, max_tsn_seen; struct sctp_chunk *sack; + struct sctp_transport *trans = asoc->peer.last_data_from; int error = 0; - if (force) + if (force || + (!trans && (asoc->param_flags & SPP_SACKDELAY_DISABLE)) || + (trans && (trans->param_flags & SPP_SACKDELAY_DISABLE))) asoc->peer.sack_needed = 1; ctsn = sctp_tsnmap_get_ctsn(&asoc->peer.tsn_map); @@ -189,7 +192,22 @@ static int sctp_gen_sack(struct sctp_association *asoc, int force, if (!asoc->peer.sack_needed) { /* We will need a SACK for the next packet. */ asoc->peer.sack_needed = 1; - goto out; + + /* Set the SACK delay timeout based on the + * SACK delay for the last transport + * data was received from, or the default + * for the association. + */ + if (trans) + asoc->timeouts[SCTP_EVENT_TIMEOUT_SACK] = + trans->sackdelay; + else + asoc->timeouts[SCTP_EVENT_TIMEOUT_SACK] = + asoc->sackdelay; + + /* Restart the SACK timer. */ + sctp_add_cmd_sf(commands, SCTP_CMD_TIMER_RESTART, + SCTP_TO(SCTP_EVENT_TIMEOUT_SACK)); } else { if (asoc->a_rwnd > asoc->rwnd) asoc->a_rwnd = asoc->rwnd; @@ -205,7 +223,7 @@ static int sctp_gen_sack(struct sctp_association *asoc, int force, sctp_add_cmd_sf(commands, SCTP_CMD_TIMER_STOP, SCTP_TO(SCTP_EVENT_TIMEOUT_SACK)); } -out: + return error; nomem: error = -ENOMEM; @@ -415,7 +433,7 @@ static void sctp_do_8_2_transport_strike(struct sctp_association *asoc, asoc->overall_error_count++; if (transport->state != SCTP_INACTIVE && - (transport->error_count++ >= transport->max_retrans)) { + (transport->error_count++ >= transport->pathmaxrxt)) { SCTP_DEBUG_PRINTK_IPADDR("transport_strike:association %p", " transport IP: port:%d failed.\n", asoc, diff --git a/net/sctp/sm_statefuns.c b/net/sctp/sm_statefuns.c index 475bfb4972d9..557a7d90b92a 100644 --- a/net/sctp/sm_statefuns.c +++ b/net/sctp/sm_statefuns.c @@ -900,7 +900,7 @@ sctp_disposition_t sctp_sf_sendbeat_8_3(const struct sctp_endpoint *ep, * HEARTBEAT is sent (see Section 8.3). */ - if (transport->hb_allowed) { + if (transport->param_flags & SPP_HB_ENABLE) { if (SCTP_DISPOSITION_NOMEM == sctp_sf_heartbeat(ep, asoc, type, arg, commands)) @@ -1051,7 +1051,7 @@ sctp_disposition_t sctp_sf_backbeat_8_3(const struct sctp_endpoint *ep, return SCTP_DISPOSITION_DISCARD; } - max_interval = link->hb_interval + link->rto; + max_interval = link->hbinterval + link->rto; /* Check if the timestamp looks valid. */ if (time_after(hbinfo->sent_at, jiffies) || @@ -2691,14 +2691,9 @@ sctp_disposition_t sctp_sf_eat_data_6_2(const struct sctp_endpoint *ep, * document allow. However, an SCTP transmitter MUST NOT be * more aggressive than the following algorithms allow. */ - if (chunk->end_of_packet) { + if (chunk->end_of_packet) sctp_add_cmd_sf(commands, SCTP_CMD_GEN_SACK, SCTP_NOFORCE()); - /* Start the SACK timer. */ - sctp_add_cmd_sf(commands, SCTP_CMD_TIMER_RESTART, - SCTP_TO(SCTP_EVENT_TIMEOUT_SACK)); - } - return SCTP_DISPOSITION_CONSUME; discard_force: @@ -2721,13 +2716,9 @@ discard_force: return SCTP_DISPOSITION_DISCARD; discard_noforce: - if (chunk->end_of_packet) { + if (chunk->end_of_packet) sctp_add_cmd_sf(commands, SCTP_CMD_GEN_SACK, SCTP_NOFORCE()); - /* Start the SACK timer. */ - sctp_add_cmd_sf(commands, SCTP_CMD_TIMER_RESTART, - SCTP_TO(SCTP_EVENT_TIMEOUT_SACK)); - } return SCTP_DISPOSITION_DISCARD; consume: return SCTP_DISPOSITION_CONSUME; @@ -3442,9 +3433,6 @@ sctp_disposition_t sctp_sf_eat_fwd_tsn(const struct sctp_endpoint *ep, * send another. */ sctp_add_cmd_sf(commands, SCTP_CMD_GEN_SACK, SCTP_NOFORCE()); - /* Start the SACK timer. */ - sctp_add_cmd_sf(commands, SCTP_CMD_TIMER_RESTART, - SCTP_TO(SCTP_EVENT_TIMEOUT_SACK)); return SCTP_DISPOSITION_CONSUME; diff --git a/net/sctp/socket.c b/net/sctp/socket.c index 9df888e932c5..adb5ee62c2a6 100644 --- a/net/sctp/socket.c +++ b/net/sctp/socket.c @@ -1941,106 +1941,275 @@ static int sctp_setsockopt_autoclose(struct sock *sk, char __user *optval, * address's parameters: * * struct sctp_paddrparams { - * sctp_assoc_t spp_assoc_id; - * struct sockaddr_storage spp_address; - * uint32_t spp_hbinterval; - * uint16_t spp_pathmaxrxt; - * }; - * - * spp_assoc_id - (UDP style socket) This is filled in the application, - * and identifies the association for this query. + * sctp_assoc_t spp_assoc_id; + * struct sockaddr_storage spp_address; + * uint32_t spp_hbinterval; + * uint16_t spp_pathmaxrxt; + * uint32_t spp_pathmtu; + * uint32_t spp_sackdelay; + * uint32_t spp_flags; + * }; + * + * spp_assoc_id - (one-to-many style socket) This is filled in the + * application, and identifies the association for + * this query. * spp_address - This specifies which address is of interest. * spp_hbinterval - This contains the value of the heartbeat interval, - * in milliseconds. A value of 0, when modifying the - * parameter, specifies that the heartbeat on this - * address should be disabled. A value of UINT32_MAX - * (4294967295), when modifying the parameter, - * specifies that a heartbeat should be sent - * immediately to the peer address, and the current - * interval should remain unchanged. + * in milliseconds. If a value of zero + * is present in this field then no changes are to + * be made to this parameter. * spp_pathmaxrxt - This contains the maximum number of * retransmissions before this address shall be - * considered unreachable. + * considered unreachable. If a value of zero + * is present in this field then no changes are to + * be made to this parameter. + * spp_pathmtu - When Path MTU discovery is disabled the value + * specified here will be the "fixed" path mtu. + * Note that if the spp_address field is empty + * then all associations on this address will + * have this fixed path mtu set upon them. + * + * spp_sackdelay - When delayed sack is enabled, this value specifies + * the number of milliseconds that sacks will be delayed + * for. This value will apply to all addresses of an + * association if the spp_address field is empty. Note + * also, that if delayed sack is enabled and this + * value is set to 0, no change is made to the last + * recorded delayed sack timer value. + * + * spp_flags - These flags are used to control various features + * on an association. The flag field may contain + * zero or more of the following options. + * + * SPP_HB_ENABLE - Enable heartbeats on the + * specified address. Note that if the address + * field is empty all addresses for the association + * have heartbeats enabled upon them. + * + * SPP_HB_DISABLE - Disable heartbeats on the + * speicifed address. Note that if the address + * field is empty all addresses for the association + * will have their heartbeats disabled. Note also + * that SPP_HB_ENABLE and SPP_HB_DISABLE are + * mutually exclusive, only one of these two should + * be specified. Enabling both fields will have + * undetermined results. + * + * SPP_HB_DEMAND - Request a user initiated heartbeat + * to be made immediately. + * + * SPP_PMTUD_ENABLE - This field will enable PMTU + * discovery upon the specified address. Note that + * if the address feild is empty then all addresses + * on the association are effected. + * + * SPP_PMTUD_DISABLE - This field will disable PMTU + * discovery upon the specified address. Note that + * if the address feild is empty then all addresses + * on the association are effected. Not also that + * SPP_PMTUD_ENABLE and SPP_PMTUD_DISABLE are mutually + * exclusive. Enabling both will have undetermined + * results. + * + * SPP_SACKDELAY_ENABLE - Setting this flag turns + * on delayed sack. The time specified in spp_sackdelay + * is used to specify the sack delay for this address. Note + * that if spp_address is empty then all addresses will + * enable delayed sack and take on the sack delay + * value specified in spp_sackdelay. + * SPP_SACKDELAY_DISABLE - Setting this flag turns + * off delayed sack. If the spp_address field is blank then + * delayed sack is disabled for the entire association. Note + * also that this field is mutually exclusive to + * SPP_SACKDELAY_ENABLE, setting both will have undefined + * results. */ +int sctp_apply_peer_addr_params(struct sctp_paddrparams *params, + struct sctp_transport *trans, + struct sctp_association *asoc, + struct sctp_sock *sp, + int hb_change, + int pmtud_change, + int sackdelay_change) +{ + int error; + + if (params->spp_flags & SPP_HB_DEMAND && trans) { + error = sctp_primitive_REQUESTHEARTBEAT (trans->asoc, trans); + if (error) + return error; + } + + if (params->spp_hbinterval) { + if (trans) { + trans->hbinterval = msecs_to_jiffies(params->spp_hbinterval); + } else if (asoc) { + asoc->hbinterval = msecs_to_jiffies(params->spp_hbinterval); + } else { + sp->hbinterval = params->spp_hbinterval; + } + } + + if (hb_change) { + if (trans) { + trans->param_flags = + (trans->param_flags & ~SPP_HB) | hb_change; + } else if (asoc) { + asoc->param_flags = + (asoc->param_flags & ~SPP_HB) | hb_change; + } else { + sp->param_flags = + (sp->param_flags & ~SPP_HB) | hb_change; + } + } + + if (params->spp_pathmtu) { + if (trans) { + trans->pathmtu = params->spp_pathmtu; + sctp_assoc_sync_pmtu(asoc); + } else if (asoc) { + asoc->pathmtu = params->spp_pathmtu; + sctp_frag_point(sp, params->spp_pathmtu); + } else { + sp->pathmtu = params->spp_pathmtu; + } + } + + if (pmtud_change) { + if (trans) { + int update = (trans->param_flags & SPP_PMTUD_DISABLE) && + (params->spp_flags & SPP_PMTUD_ENABLE); + trans->param_flags = + (trans->param_flags & ~SPP_PMTUD) | pmtud_change; + if (update) { + sctp_transport_pmtu(trans); + sctp_assoc_sync_pmtu(asoc); + } + } else if (asoc) { + asoc->param_flags = + (asoc->param_flags & ~SPP_PMTUD) | pmtud_change; + } else { + sp->param_flags = + (sp->param_flags & ~SPP_PMTUD) | pmtud_change; + } + } + + if (params->spp_sackdelay) { + if (trans) { + trans->sackdelay = + msecs_to_jiffies(params->spp_sackdelay); + } else if (asoc) { + asoc->sackdelay = + msecs_to_jiffies(params->spp_sackdelay); + } else { + sp->sackdelay = params->spp_sackdelay; + } + } + + if (sackdelay_change) { + if (trans) { + trans->param_flags = + (trans->param_flags & ~SPP_SACKDELAY) | + sackdelay_change; + } else if (asoc) { + asoc->param_flags = + (asoc->param_flags & ~SPP_SACKDELAY) | + sackdelay_change; + } else { + sp->param_flags = + (sp->param_flags & ~SPP_SACKDELAY) | + sackdelay_change; + } + } + + if (params->spp_pathmaxrxt) { + if (trans) { + trans->pathmaxrxt = params->spp_pathmaxrxt; + } else if (asoc) { + asoc->pathmaxrxt = params->spp_pathmaxrxt; + } else { + sp->pathmaxrxt = params->spp_pathmaxrxt; + } + } + + return 0; +} + static int sctp_setsockopt_peer_addr_params(struct sock *sk, char __user *optval, int optlen) { - struct sctp_paddrparams params; - struct sctp_transport *trans; + struct sctp_paddrparams params; + struct sctp_transport *trans = NULL; + struct sctp_association *asoc = NULL; + struct sctp_sock *sp = sctp_sk(sk); int error; + int hb_change, pmtud_change, sackdelay_change; if (optlen != sizeof(struct sctp_paddrparams)) - return -EINVAL; + return - EINVAL; + if (copy_from_user(¶ms, optval, optlen)) return -EFAULT; - /* - * API 7. Socket Options (setting the default value for the endpoint) - * All options that support specific settings on an association by - * filling in either an association id variable or a sockaddr_storage - * SHOULD also support setting of the same value for the entire endpoint - * (i.e. future associations). To accomplish this the following logic is - * used when setting one of these options: - - * c) If neither the sockaddr_storage or association identification is - * set i.e. the sockaddr_storage is set to all 0's (INADDR_ANY) and - * the association identification is 0, the settings are a default - * and to be applied to the endpoint (all future associations). - */ + /* Validate flags and value parameters. */ + hb_change = params.spp_flags & SPP_HB; + pmtud_change = params.spp_flags & SPP_PMTUD; + sackdelay_change = params.spp_flags & SPP_SACKDELAY; + + if (hb_change == SPP_HB || + pmtud_change == SPP_PMTUD || + sackdelay_change == SPP_SACKDELAY || + params.spp_sackdelay > 500 || + (params.spp_pathmtu + && params.spp_pathmtu < SCTP_DEFAULT_MINSEGMENT)) + return -EINVAL; - /* update default value for endpoint (all future associations) */ - if (!params.spp_assoc_id && - sctp_is_any(( union sctp_addr *)¶ms.spp_address)) { - /* Manual heartbeat on an endpoint is invalid. */ - if (0xffffffff == params.spp_hbinterval) + /* If an address other than INADDR_ANY is specified, and + * no transport is found, then the request is invalid. + */ + if (!sctp_is_any(( union sctp_addr *)¶ms.spp_address)) { + trans = sctp_addr_id2transport(sk, ¶ms.spp_address, + params.spp_assoc_id); + if (!trans) return -EINVAL; - else if (params.spp_hbinterval) - sctp_sk(sk)->paddrparam.spp_hbinterval = - params.spp_hbinterval; - if (params.spp_pathmaxrxt) - sctp_sk(sk)->paddrparam.spp_pathmaxrxt = - params.spp_pathmaxrxt; - return 0; } - trans = sctp_addr_id2transport(sk, ¶ms.spp_address, - params.spp_assoc_id); - if (!trans) + /* Get association, if assoc_id != 0 and the socket is a one + * to many style socket, and an association was not found, then + * the id was invalid. + */ + asoc = sctp_id2assoc(sk, params.spp_assoc_id); + if (!asoc && params.spp_assoc_id && sctp_style(sk, UDP)) return -EINVAL; - /* Applications can enable or disable heartbeats for any peer address - * of an association, modify an address's heartbeat interval, force a - * heartbeat to be sent immediately, and adjust the address's maximum - * number of retransmissions sent before an address is considered - * unreachable. - * - * The value of the heartbeat interval, in milliseconds. A value of - * UINT32_MAX (4294967295), when modifying the parameter, specifies - * that a heartbeat should be sent immediately to the peer address, - * and the current interval should remain unchanged. - */ - if (0xffffffff == params.spp_hbinterval) { - error = sctp_primitive_REQUESTHEARTBEAT (trans->asoc, trans); - if (error) - return error; - } else { - /* The value of the heartbeat interval, in milliseconds. A value of 0, - * when modifying the parameter, specifies that the heartbeat on this - * address should be disabled. + /* Heartbeat demand can only be sent on a transport or + * association, but not a socket. */ - if (params.spp_hbinterval) { - trans->hb_allowed = 1; - trans->hb_interval = - msecs_to_jiffies(params.spp_hbinterval); - } else - trans->hb_allowed = 0; - } + if (params.spp_flags & SPP_HB_DEMAND && !trans && !asoc) + return -EINVAL; + + /* Process parameters. */ + error = sctp_apply_peer_addr_params(¶ms, trans, asoc, sp, + hb_change, pmtud_change, + sackdelay_change); - /* spp_pathmaxrxt contains the maximum number of retransmissions - * before this address shall be considered unreachable. + if (error) + return error; + + /* If changes are for association, also apply parameters to each + * transport. */ - if (params.spp_pathmaxrxt) - trans->max_retrans = params.spp_pathmaxrxt; + if (!trans && asoc) { + struct list_head *pos; + + list_for_each(pos, &asoc->peer.transport_addr_list) { + trans = list_entry(pos, struct sctp_transport, + transports); + sctp_apply_peer_addr_params(¶ms, trans, asoc, sp, + hb_change, pmtud_change, + sackdelay_change); + } + } return 0; } @@ -2334,7 +2503,7 @@ static int sctp_setsockopt_maxseg(struct sock *sk, char __user *optval, int optl /* Update the frag_point of the existing associations. */ list_for_each(pos, &(sp->ep->asocs)) { asoc = list_entry(pos, struct sctp_association, asocs); - asoc->frag_point = sctp_frag_point(sp, asoc->pmtu); + asoc->frag_point = sctp_frag_point(sp, asoc->pathmtu); } return 0; @@ -2715,8 +2884,13 @@ SCTP_STATIC int sctp_init_sock(struct sock *sk) /* Default Peer Address Parameters. These defaults can * be modified via SCTP_PEER_ADDR_PARAMS */ - sp->paddrparam.spp_hbinterval = jiffies_to_msecs(sctp_hb_interval); - sp->paddrparam.spp_pathmaxrxt = sctp_max_retrans_path; + sp->hbinterval = jiffies_to_msecs(sctp_hb_interval); + sp->pathmaxrxt = sctp_max_retrans_path; + sp->pathmtu = 0; // allow default discovery + sp->sackdelay = sctp_sack_timeout; + sp->param_flags = SPP_HB_ENABLE | + SPP_PMTUD_ENABLE | + SPP_SACKDELAY_ENABLE; /* If enabled no SCTP message fragmentation will be performed. * Configure through SCTP_DISABLE_FRAGMENTS socket option. @@ -2865,7 +3039,7 @@ static int sctp_getsockopt_sctp_status(struct sock *sk, int len, status.sstat_primary.spinfo_cwnd = transport->cwnd; status.sstat_primary.spinfo_srtt = transport->srtt; status.sstat_primary.spinfo_rto = jiffies_to_msecs(transport->rto); - status.sstat_primary.spinfo_mtu = transport->pmtu; + status.sstat_primary.spinfo_mtu = transport->pathmtu; if (status.sstat_primary.spinfo_state == SCTP_UNKNOWN) status.sstat_primary.spinfo_state = SCTP_ACTIVE; @@ -2924,7 +3098,7 @@ static int sctp_getsockopt_peer_addr_info(struct sock *sk, int len, pinfo.spinfo_cwnd = transport->cwnd; pinfo.spinfo_srtt = transport->srtt; pinfo.spinfo_rto = jiffies_to_msecs(transport->rto); - pinfo.spinfo_mtu = transport->pmtu; + pinfo.spinfo_mtu = transport->pathmtu; if (pinfo.spinfo_state == SCTP_UNKNOWN) pinfo.spinfo_state = SCTP_ACTIVE; @@ -3086,69 +3260,154 @@ out: * address's parameters: * * struct sctp_paddrparams { - * sctp_assoc_t spp_assoc_id; - * struct sockaddr_storage spp_address; - * uint32_t spp_hbinterval; - * uint16_t spp_pathmaxrxt; - * }; - * - * spp_assoc_id - (UDP style socket) This is filled in the application, - * and identifies the association for this query. + * sctp_assoc_t spp_assoc_id; + * struct sockaddr_storage spp_address; + * uint32_t spp_hbinterval; + * uint16_t spp_pathmaxrxt; + * uint32_t spp_pathmtu; + * uint32_t spp_sackdelay; + * uint32_t spp_flags; + * }; + * + * spp_assoc_id - (one-to-many style socket) This is filled in the + * application, and identifies the association for + * this query. * spp_address - This specifies which address is of interest. * spp_hbinterval - This contains the value of the heartbeat interval, - * in milliseconds. A value of 0, when modifying the - * parameter, specifies that the heartbeat on this - * address should be disabled. A value of UINT32_MAX - * (4294967295), when modifying the parameter, - * specifies that a heartbeat should be sent - * immediately to the peer address, and the current - * interval should remain unchanged. + * in milliseconds. If a value of zero + * is present in this field then no changes are to + * be made to this parameter. * spp_pathmaxrxt - This contains the maximum number of * retransmissions before this address shall be - * considered unreachable. + * considered unreachable. If a value of zero + * is present in this field then no changes are to + * be made to this parameter. + * spp_pathmtu - When Path MTU discovery is disabled the value + * specified here will be the "fixed" path mtu. + * Note that if the spp_address field is empty + * then all associations on this address will + * have this fixed path mtu set upon them. + * + * spp_sackdelay - When delayed sack is enabled, this value specifies + * the number of milliseconds that sacks will be delayed + * for. This value will apply to all addresses of an + * association if the spp_address field is empty. Note + * also, that if delayed sack is enabled and this + * value is set to 0, no change is made to the last + * recorded delayed sack timer value. + * + * spp_flags - These flags are used to control various features + * on an association. The flag field may contain + * zero or more of the following options. + * + * SPP_HB_ENABLE - Enable heartbeats on the + * specified address. Note that if the address + * field is empty all addresses for the association + * have heartbeats enabled upon them. + * + * SPP_HB_DISABLE - Disable heartbeats on the + * speicifed address. Note that if the address + * field is empty all addresses for the association + * will have their heartbeats disabled. Note also + * that SPP_HB_ENABLE and SPP_HB_DISABLE are + * mutually exclusive, only one of these two should + * be specified. Enabling both fields will have + * undetermined results. + * + * SPP_HB_DEMAND - Request a user initiated heartbeat + * to be made immediately. + * + * SPP_PMTUD_ENABLE - This field will enable PMTU + * discovery upon the specified address. Note that + * if the address feild is empty then all addresses + * on the association are effected. + * + * SPP_PMTUD_DISABLE - This field will disable PMTU + * discovery upon the specified address. Note that + * if the address feild is empty then all addresses + * on the association are effected. Not also that + * SPP_PMTUD_ENABLE and SPP_PMTUD_DISABLE are mutually + * exclusive. Enabling both will have undetermined + * results. + * + * SPP_SACKDELAY_ENABLE - Setting this flag turns + * on delayed sack. The time specified in spp_sackdelay + * is used to specify the sack delay for this address. Note + * that if spp_address is empty then all addresses will + * enable delayed sack and take on the sack delay + * value specified in spp_sackdelay. + * SPP_SACKDELAY_DISABLE - Setting this flag turns + * off delayed sack. If the spp_address field is blank then + * delayed sack is disabled for the entire association. Note + * also that this field is mutually exclusive to + * SPP_SACKDELAY_ENABLE, setting both will have undefined + * results. */ static int sctp_getsockopt_peer_addr_params(struct sock *sk, int len, - char __user *optval, int __user *optlen) + char __user *optval, int __user *optlen) { - struct sctp_paddrparams params; - struct sctp_transport *trans; + struct sctp_paddrparams params; + struct sctp_transport *trans = NULL; + struct sctp_association *asoc = NULL; + struct sctp_sock *sp = sctp_sk(sk); if (len != sizeof(struct sctp_paddrparams)) return -EINVAL; + if (copy_from_user(¶ms, optval, len)) return -EFAULT; - /* If no association id is specified retrieve the default value - * for the endpoint that will be used for all future associations + /* If an address other than INADDR_ANY is specified, and + * no transport is found, then the request is invalid. */ - if (!params.spp_assoc_id && - sctp_is_any(( union sctp_addr *)¶ms.spp_address)) { - params.spp_hbinterval = sctp_sk(sk)->paddrparam.spp_hbinterval; - params.spp_pathmaxrxt = sctp_sk(sk)->paddrparam.spp_pathmaxrxt; - - goto done; + if (!sctp_is_any(( union sctp_addr *)¶ms.spp_address)) { + trans = sctp_addr_id2transport(sk, ¶ms.spp_address, + params.spp_assoc_id); + if (!trans) { + SCTP_DEBUG_PRINTK("Failed no transport\n"); + return -EINVAL; + } } - trans = sctp_addr_id2transport(sk, ¶ms.spp_address, - params.spp_assoc_id); - if (!trans) - return -EINVAL; - - /* The value of the heartbeat interval, in milliseconds. A value of 0, - * when modifying the parameter, specifies that the heartbeat on this - * address should be disabled. + /* Get association, if assoc_id != 0 and the socket is a one + * to many style socket, and an association was not found, then + * the id was invalid. */ - if (!trans->hb_allowed) - params.spp_hbinterval = 0; - else - params.spp_hbinterval = jiffies_to_msecs(trans->hb_interval); + asoc = sctp_id2assoc(sk, params.spp_assoc_id); + if (!asoc && params.spp_assoc_id && sctp_style(sk, UDP)) { + SCTP_DEBUG_PRINTK("Failed no association\n"); + return -EINVAL; + } - /* spp_pathmaxrxt contains the maximum number of retransmissions - * before this address shall be considered unreachable. - */ - params.spp_pathmaxrxt = trans->max_retrans; + if (trans) { + /* Fetch transport values. */ + params.spp_hbinterval = jiffies_to_msecs(trans->hbinterval); + params.spp_pathmtu = trans->pathmtu; + params.spp_pathmaxrxt = trans->pathmaxrxt; + params.spp_sackdelay = jiffies_to_msecs(trans->sackdelay); + + /*draft-11 doesn't say what to return in spp_flags*/ + params.spp_flags = trans->param_flags; + } else if (asoc) { + /* Fetch association values. */ + params.spp_hbinterval = jiffies_to_msecs(asoc->hbinterval); + params.spp_pathmtu = asoc->pathmtu; + params.spp_pathmaxrxt = asoc->pathmaxrxt; + params.spp_sackdelay = jiffies_to_msecs(asoc->sackdelay); + + /*draft-11 doesn't say what to return in spp_flags*/ + params.spp_flags = asoc->param_flags; + } else { + /* Fetch socket values. */ + params.spp_hbinterval = sp->hbinterval; + params.spp_pathmtu = sp->pathmtu; + params.spp_sackdelay = sp->sackdelay; + params.spp_pathmaxrxt = sp->pathmaxrxt; + + /*draft-11 doesn't say what to return in spp_flags*/ + params.spp_flags = sp->param_flags; + } -done: if (copy_to_user(optval, ¶ms, len)) return -EFAULT; diff --git a/net/sctp/transport.c b/net/sctp/transport.c index 268ddaf2dc0f..68d73e2dd155 100644 --- a/net/sctp/transport.c +++ b/net/sctp/transport.c @@ -86,10 +86,13 @@ static struct sctp_transport *sctp_transport_init(struct sctp_transport *peer, peer->init_sent_count = 0; peer->state = SCTP_ACTIVE; - peer->hb_allowed = 0; + peer->param_flags = SPP_HB_DISABLE | + SPP_PMTUD_ENABLE | + SPP_SACKDELAY_ENABLE; + peer->hbinterval = 0; /* Initialize the default path max_retrans. */ - peer->max_retrans = sctp_max_retrans_path; + peer->pathmaxrxt = sctp_max_retrans_path; peer->error_count = 0; INIT_LIST_HEAD(&peer->transmitted); @@ -229,10 +232,10 @@ void sctp_transport_pmtu(struct sctp_transport *transport) dst = transport->af_specific->get_dst(NULL, &transport->ipaddr, NULL); if (dst) { - transport->pmtu = dst_mtu(dst); + transport->pathmtu = dst_mtu(dst); dst_release(dst); } else - transport->pmtu = SCTP_DEFAULT_MAXSEGMENT; + transport->pathmtu = SCTP_DEFAULT_MAXSEGMENT; } /* Caches the dst entry and source address for a transport's destination @@ -254,8 +257,11 @@ void sctp_transport_route(struct sctp_transport *transport, af->get_saddr(asoc, dst, daddr, &transport->saddr); transport->dst = dst; + if ((transport->param_flags & SPP_PMTUD_DISABLE) && transport->pathmtu) { + return; + } if (dst) { - transport->pmtu = dst_mtu(dst); + transport->pathmtu = dst_mtu(dst); /* Initialize sk->sk_rcv_saddr, if the transport is the * association's active path for getsockname(). @@ -264,7 +270,7 @@ void sctp_transport_route(struct sctp_transport *transport, opt->pf->af->to_sk_saddr(&transport->saddr, asoc->base.sk); } else - transport->pmtu = SCTP_DEFAULT_MAXSEGMENT; + transport->pathmtu = SCTP_DEFAULT_MAXSEGMENT; } /* Hold a reference to a transport. */ @@ -369,7 +375,7 @@ void sctp_transport_raise_cwnd(struct sctp_transport *transport, ssthresh = transport->ssthresh; pba = transport->partial_bytes_acked; - pmtu = transport->asoc->pmtu; + pmtu = transport->asoc->pathmtu; if (cwnd <= ssthresh) { /* RFC 2960 7.2.1, sctpimpguide-05 2.14.2 When cwnd is less @@ -441,8 +447,8 @@ void sctp_transport_lower_cwnd(struct sctp_transport *transport, * partial_bytes_acked = 0 */ transport->ssthresh = max(transport->cwnd/2, - 4*transport->asoc->pmtu); - transport->cwnd = transport->asoc->pmtu; + 4*transport->asoc->pathmtu); + transport->cwnd = transport->asoc->pathmtu; break; case SCTP_LOWER_CWND_FAST_RTX: @@ -459,7 +465,7 @@ void sctp_transport_lower_cwnd(struct sctp_transport *transport, * partial_bytes_acked = 0 */ transport->ssthresh = max(transport->cwnd/2, - 4*transport->asoc->pmtu); + 4*transport->asoc->pathmtu); transport->cwnd = transport->ssthresh; break; @@ -479,7 +485,7 @@ void sctp_transport_lower_cwnd(struct sctp_transport *transport, if ((jiffies - transport->last_time_ecne_reduced) > transport->rtt) { transport->ssthresh = max(transport->cwnd/2, - 4*transport->asoc->pmtu); + 4*transport->asoc->pathmtu); transport->cwnd = transport->ssthresh; transport->last_time_ecne_reduced = jiffies; } @@ -496,7 +502,7 @@ void sctp_transport_lower_cwnd(struct sctp_transport *transport, */ if ((jiffies - transport->last_time_used) > transport->rto) transport->cwnd = max(transport->cwnd/2, - 4*transport->asoc->pmtu); + 4*transport->asoc->pathmtu); break; }; @@ -511,7 +517,7 @@ void sctp_transport_lower_cwnd(struct sctp_transport *transport, unsigned long sctp_transport_timeout(struct sctp_transport *t) { unsigned long timeout; - timeout = t->hb_interval + t->rto + sctp_jitter(t->rto); + timeout = t->hbinterval + t->rto + sctp_jitter(t->rto); timeout += jiffies; return timeout; } -- cgit v1.2.3 From 7708610b1bff4a0ba8a73733d3c7c4bda9f94b21 Mon Sep 17 00:00:00 2001 From: Frank Filz Date: Thu, 22 Dec 2005 11:37:30 -0800 Subject: [SCTP]: Add support for SCTP_DELAYED_ACK_TIME socket option. Signed-off-by: Frank Filz Signed-off-by: Sridhar Samudrala Signed-off-by: David S. Miller --- include/net/sctp/user.h | 14 ++++ net/sctp/socket.c | 184 ++++++++++++++++++++++++++++++++++++++++++++++++ 2 files changed, 198 insertions(+) (limited to 'include/net') diff --git a/include/net/sctp/user.h b/include/net/sctp/user.h index b9052864fa5a..8a6bef6f91eb 100644 --- a/include/net/sctp/user.h +++ b/include/net/sctp/user.h @@ -93,6 +93,8 @@ enum sctp_optname { #define SCTP_STATUS SCTP_STATUS SCTP_GET_PEER_ADDR_INFO, #define SCTP_GET_PEER_ADDR_INFO SCTP_GET_PEER_ADDR_INFO + SCTP_DELAYED_ACK_TIME, +#define SCTP_DELAYED_ACK_TIME SCTP_DELAYED_ACK_TIME /* Internal Socket Options. Some of the sctp library functions are * implemented using these socket options. @@ -526,6 +528,18 @@ struct sctp_paddrparams { __u32 spp_flags; } __attribute__((packed, aligned(4))); +/* 7.1.24. Delayed Ack Timer (SCTP_DELAYED_ACK_TIME) + * + * This options will get or set the delayed ack timer. The time is set + * in milliseconds. If the assoc_id is 0, then this sets or gets the + * endpoints default delayed ack timer value. If the assoc_id field is + * non-zero, then the set or get effects the specified association. + */ +struct sctp_assoc_value { + sctp_assoc_t assoc_id; + uint32_t assoc_value; +}; + /* * 7.2.2 Peer Address Information * diff --git a/net/sctp/socket.c b/net/sctp/socket.c index adb5ee62c2a6..fc04d185fa33 100644 --- a/net/sctp/socket.c +++ b/net/sctp/socket.c @@ -2214,6 +2214,109 @@ static int sctp_setsockopt_peer_addr_params(struct sock *sk, return 0; } +/* 7.1.24. Delayed Ack Timer (SCTP_DELAYED_ACK_TIME) + * + * This options will get or set the delayed ack timer. The time is set + * in milliseconds. If the assoc_id is 0, then this sets or gets the + * endpoints default delayed ack timer value. If the assoc_id field is + * non-zero, then the set or get effects the specified association. + * + * struct sctp_assoc_value { + * sctp_assoc_t assoc_id; + * uint32_t assoc_value; + * }; + * + * assoc_id - This parameter, indicates which association the + * user is preforming an action upon. Note that if + * this field's value is zero then the endpoints + * default value is changed (effecting future + * associations only). + * + * assoc_value - This parameter contains the number of milliseconds + * that the user is requesting the delayed ACK timer + * be set to. Note that this value is defined in + * the standard to be between 200 and 500 milliseconds. + * + * Note: a value of zero will leave the value alone, + * but disable SACK delay. A non-zero value will also + * enable SACK delay. + */ + +static int sctp_setsockopt_delayed_ack_time(struct sock *sk, + char __user *optval, int optlen) +{ + struct sctp_assoc_value params; + struct sctp_transport *trans = NULL; + struct sctp_association *asoc = NULL; + struct sctp_sock *sp = sctp_sk(sk); + + if (optlen != sizeof(struct sctp_assoc_value)) + return - EINVAL; + + if (copy_from_user(¶ms, optval, optlen)) + return -EFAULT; + + /* Validate value parameter. */ + if (params.assoc_value > 500) + return -EINVAL; + + /* Get association, if assoc_id != 0 and the socket is a one + * to many style socket, and an association was not found, then + * the id was invalid. + */ + asoc = sctp_id2assoc(sk, params.assoc_id); + if (!asoc && params.assoc_id && sctp_style(sk, UDP)) + return -EINVAL; + + if (params.assoc_value) { + if (asoc) { + asoc->sackdelay = + msecs_to_jiffies(params.assoc_value); + asoc->param_flags = + (asoc->param_flags & ~SPP_SACKDELAY) | + SPP_SACKDELAY_ENABLE; + } else { + sp->sackdelay = params.assoc_value; + sp->param_flags = + (sp->param_flags & ~SPP_SACKDELAY) | + SPP_SACKDELAY_ENABLE; + } + } else { + if (asoc) { + asoc->param_flags = + (asoc->param_flags & ~SPP_SACKDELAY) | + SPP_SACKDELAY_DISABLE; + } else { + sp->param_flags = + (sp->param_flags & ~SPP_SACKDELAY) | + SPP_SACKDELAY_DISABLE; + } + } + + /* If change is for association, also apply to each transport. */ + if (asoc) { + struct list_head *pos; + + list_for_each(pos, &asoc->peer.transport_addr_list) { + trans = list_entry(pos, struct sctp_transport, + transports); + if (params.assoc_value) { + trans->sackdelay = + msecs_to_jiffies(params.assoc_value); + trans->param_flags = + (trans->param_flags & ~SPP_SACKDELAY) | + SPP_SACKDELAY_ENABLE; + } else { + trans->param_flags = + (trans->param_flags & ~SPP_SACKDELAY) | + SPP_SACKDELAY_DISABLE; + } + } + } + + return 0; +} + /* 7.1.3 Initialization Parameters (SCTP_INITMSG) * * Applications can specify protocol parameters for the default association @@ -2660,6 +2763,10 @@ SCTP_STATIC int sctp_setsockopt(struct sock *sk, int level, int optname, retval = sctp_setsockopt_peer_addr_params(sk, optval, optlen); break; + case SCTP_DELAYED_ACK_TIME: + retval = sctp_setsockopt_delayed_ack_time(sk, optval, optlen); + break; + case SCTP_INITMSG: retval = sctp_setsockopt_initmsg(sk, optval, optlen); break; @@ -3417,6 +3524,79 @@ static int sctp_getsockopt_peer_addr_params(struct sock *sk, int len, return 0; } +/* 7.1.24. Delayed Ack Timer (SCTP_DELAYED_ACK_TIME) + * + * This options will get or set the delayed ack timer. The time is set + * in milliseconds. If the assoc_id is 0, then this sets or gets the + * endpoints default delayed ack timer value. If the assoc_id field is + * non-zero, then the set or get effects the specified association. + * + * struct sctp_assoc_value { + * sctp_assoc_t assoc_id; + * uint32_t assoc_value; + * }; + * + * assoc_id - This parameter, indicates which association the + * user is preforming an action upon. Note that if + * this field's value is zero then the endpoints + * default value is changed (effecting future + * associations only). + * + * assoc_value - This parameter contains the number of milliseconds + * that the user is requesting the delayed ACK timer + * be set to. Note that this value is defined in + * the standard to be between 200 and 500 milliseconds. + * + * Note: a value of zero will leave the value alone, + * but disable SACK delay. A non-zero value will also + * enable SACK delay. + */ +static int sctp_getsockopt_delayed_ack_time(struct sock *sk, int len, + char __user *optval, + int __user *optlen) +{ + struct sctp_assoc_value params; + struct sctp_association *asoc = NULL; + struct sctp_sock *sp = sctp_sk(sk); + + if (len != sizeof(struct sctp_assoc_value)) + return - EINVAL; + + if (copy_from_user(¶ms, optval, len)) + return -EFAULT; + + /* Get association, if assoc_id != 0 and the socket is a one + * to many style socket, and an association was not found, then + * the id was invalid. + */ + asoc = sctp_id2assoc(sk, params.assoc_id); + if (!asoc && params.assoc_id && sctp_style(sk, UDP)) + return -EINVAL; + + if (asoc) { + /* Fetch association values. */ + if (asoc->param_flags & SPP_SACKDELAY_ENABLE) + params.assoc_value = jiffies_to_msecs( + asoc->sackdelay); + else + params.assoc_value = 0; + } else { + /* Fetch socket values. */ + if (sp->param_flags & SPP_SACKDELAY_ENABLE) + params.assoc_value = sp->sackdelay; + else + params.assoc_value = 0; + } + + if (copy_to_user(optval, ¶ms, len)) + return -EFAULT; + + if (put_user(len, optlen)) + return -EFAULT; + + return 0; +} + /* 7.1.3 Initialization Parameters (SCTP_INITMSG) * * Applications can specify protocol parameters for the default association @@ -4274,6 +4454,10 @@ SCTP_STATIC int sctp_getsockopt(struct sock *sk, int level, int optname, retval = sctp_getsockopt_peer_addr_params(sk, len, optval, optlen); break; + case SCTP_DELAYED_ACK_TIME: + retval = sctp_getsockopt_delayed_ack_time(sk, len, optval, + optlen); + break; case SCTP_INITMSG: retval = sctp_getsockopt_initmsg(sk, len, optval, optlen); break; -- cgit v1.2.3 From 90ddc4f0470427df306f308ad03db6b6b21644b8 Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Thu, 22 Dec 2005 12:49:22 -0800 Subject: [NET]: move struct proto_ops to const I noticed that some of 'struct proto_ops' used in the kernel may share a cache line used by locks or other heavily modified data. (default linker alignement is 32 bytes, and L1_CACHE_LINE is 64 or 128 at least) This patch makes sure a 'struct proto_ops' can be declared as const, so that all cpus can share all parts of it without false sharing. This is not mandatory : a driver can still use a read/write structure if it needs to (and eventually a __read_mostly) I made a global stubstitute to change all existing occurences to make them const. This should reduce the possibility of false sharing on SMP, and speedup some socket system calls. Signed-off-by: Eric Dumazet Signed-off-by: David S. Miller --- include/linux/net.h | 4 ++-- include/net/inet_common.h | 4 ++-- include/net/ipv6.h | 4 ++-- include/net/protocol.h | 2 +- net/appletalk/ddp.c | 4 ++-- net/atm/pvc.c | 2 +- net/atm/svc.c | 2 +- net/ax25/af_ax25.c | 4 ++-- net/bluetooth/bnep/sock.c | 2 +- net/bluetooth/cmtp/sock.c | 2 +- net/bluetooth/hci_sock.c | 2 +- net/bluetooth/hidp/sock.c | 2 +- net/bluetooth/l2cap.c | 4 ++-- net/bluetooth/rfcomm/sock.c | 4 ++-- net/bluetooth/sco.c | 4 ++-- net/dccp/proto.c | 2 +- net/decnet/af_decnet.c | 4 ++-- net/econet/af_econet.c | 4 ++-- net/ipv4/af_inet.c | 6 +++--- net/ipv6/af_inet6.c | 6 +++--- net/ipx/af_ipx.c | 4 ++-- net/irda/af_irda.c | 16 ++++++++-------- net/key/af_key.c | 4 ++-- net/llc/af_llc.c | 4 ++-- net/netlink/af_netlink.c | 4 ++-- net/netrom/af_netrom.c | 4 ++-- net/packet/af_packet.c | 8 ++++---- net/sctp/ipv6.c | 2 +- net/sctp/protocol.c | 2 +- net/sunrpc/svcsock.c | 2 +- net/unix/af_unix.c | 6 +++--- net/wanrouter/af_wanpipe.c | 4 ++-- net/x25/af_x25.c | 4 ++-- 33 files changed, 66 insertions(+), 66 deletions(-) (limited to 'include/net') diff --git a/include/linux/net.h b/include/linux/net.h index d6a41e6577f6..28195a2d8ff0 100644 --- a/include/linux/net.h +++ b/include/linux/net.h @@ -107,7 +107,7 @@ enum sock_type { struct socket { socket_state state; unsigned long flags; - struct proto_ops *ops; + const struct proto_ops *ops; struct fasync_struct *fasync_list; struct file *file; struct sock *sk; @@ -260,7 +260,7 @@ SOCKCALL_WRAP(name, recvmsg, (struct kiocb *iocb, struct socket *sock, struct ms SOCKCALL_WRAP(name, mmap, (struct file *file, struct socket *sock, struct vm_area_struct *vma), \ (file, sock, vma)) \ \ -static struct proto_ops name##_ops = { \ +static const struct proto_ops name##_ops = { \ .family = fam, \ .owner = THIS_MODULE, \ .release = __lock_##name##_release, \ diff --git a/include/net/inet_common.h b/include/net/inet_common.h index f943306ce5ff..227adcbdfec8 100644 --- a/include/net/inet_common.h +++ b/include/net/inet_common.h @@ -1,8 +1,8 @@ #ifndef _INET_COMMON_H #define _INET_COMMON_H -extern struct proto_ops inet_stream_ops; -extern struct proto_ops inet_dgram_ops; +extern const struct proto_ops inet_stream_ops; +extern const struct proto_ops inet_dgram_ops; /* * INET4 prototypes used by INET6 diff --git a/include/net/ipv6.h b/include/net/ipv6.h index e3d5d7bc8837..11a725662c36 100644 --- a/include/net/ipv6.h +++ b/include/net/ipv6.h @@ -538,8 +538,8 @@ extern int sysctl_ip6frag_low_thresh; extern int sysctl_ip6frag_time; extern int sysctl_ip6frag_secret_interval; -extern struct proto_ops inet6_stream_ops; -extern struct proto_ops inet6_dgram_ops; +extern const struct proto_ops inet6_stream_ops; +extern const struct proto_ops inet6_dgram_ops; extern int ip6_mc_source(int add, int omode, struct sock *sk, struct group_source_req *pgsr); diff --git a/include/net/protocol.h b/include/net/protocol.h index a29cb29647d0..63f7db99c2a6 100644 --- a/include/net/protocol.h +++ b/include/net/protocol.h @@ -65,7 +65,7 @@ struct inet_protosw { int protocol; /* This is the L4 protocol number. */ struct proto *prot; - struct proto_ops *ops; + const struct proto_ops *ops; int capability; /* Which (if any) capability do * we need to use this socket diff --git a/net/appletalk/ddp.c b/net/appletalk/ddp.c index 7982656b9c83..296f186802ff 100644 --- a/net/appletalk/ddp.c +++ b/net/appletalk/ddp.c @@ -63,7 +63,7 @@ #include struct datalink_proto *ddp_dl, *aarp_dl; -static struct proto_ops atalk_dgram_ops; +static const struct proto_ops atalk_dgram_ops; /**************************************************************************\ * * @@ -1841,7 +1841,7 @@ static struct net_proto_family atalk_family_ops = { .owner = THIS_MODULE, }; -static struct proto_ops SOCKOPS_WRAPPED(atalk_dgram_ops) = { +static const struct proto_ops SOCKOPS_WRAPPED(atalk_dgram_ops) = { .family = PF_APPLETALK, .owner = THIS_MODULE, .release = atalk_release, diff --git a/net/atm/pvc.c b/net/atm/pvc.c index 2684a92da22b..f2c541774dcd 100644 --- a/net/atm/pvc.c +++ b/net/atm/pvc.c @@ -102,7 +102,7 @@ static int pvc_getname(struct socket *sock,struct sockaddr *sockaddr, } -static struct proto_ops pvc_proto_ops = { +static const struct proto_ops pvc_proto_ops = { .family = PF_ATMPVC, .owner = THIS_MODULE, diff --git a/net/atm/svc.c b/net/atm/svc.c index d7b266136bf6..3a180cfd7b48 100644 --- a/net/atm/svc.c +++ b/net/atm/svc.c @@ -613,7 +613,7 @@ static int svc_ioctl(struct socket *sock, unsigned int cmd, unsigned long arg) return error; } -static struct proto_ops svc_proto_ops = { +static const struct proto_ops svc_proto_ops = { .family = PF_ATMSVC, .owner = THIS_MODULE, diff --git a/net/ax25/af_ax25.c b/net/ax25/af_ax25.c index 1b683f302657..8b5d10aaba05 100644 --- a/net/ax25/af_ax25.c +++ b/net/ax25/af_ax25.c @@ -54,7 +54,7 @@ HLIST_HEAD(ax25_list); DEFINE_SPINLOCK(ax25_list_lock); -static struct proto_ops ax25_proto_ops; +static const struct proto_ops ax25_proto_ops; static void ax25_free_sock(struct sock *sk) { @@ -1944,7 +1944,7 @@ static struct net_proto_family ax25_family_ops = { .owner = THIS_MODULE, }; -static struct proto_ops ax25_proto_ops = { +static const struct proto_ops ax25_proto_ops = { .family = PF_AX25, .owner = THIS_MODULE, .release = ax25_release, diff --git a/net/bluetooth/bnep/sock.c b/net/bluetooth/bnep/sock.c index 9778c6acd53b..ccbaf69afc5b 100644 --- a/net/bluetooth/bnep/sock.c +++ b/net/bluetooth/bnep/sock.c @@ -146,7 +146,7 @@ static int bnep_sock_ioctl(struct socket *sock, unsigned int cmd, unsigned long return 0; } -static struct proto_ops bnep_sock_ops = { +static const struct proto_ops bnep_sock_ops = { .family = PF_BLUETOOTH, .owner = THIS_MODULE, .release = bnep_sock_release, diff --git a/net/bluetooth/cmtp/sock.c b/net/bluetooth/cmtp/sock.c index beb045bf5714..5e22343b6090 100644 --- a/net/bluetooth/cmtp/sock.c +++ b/net/bluetooth/cmtp/sock.c @@ -137,7 +137,7 @@ static int cmtp_sock_ioctl(struct socket *sock, unsigned int cmd, unsigned long return -EINVAL; } -static struct proto_ops cmtp_sock_ops = { +static const struct proto_ops cmtp_sock_ops = { .family = PF_BLUETOOTH, .owner = THIS_MODULE, .release = cmtp_sock_release, diff --git a/net/bluetooth/hci_sock.c b/net/bluetooth/hci_sock.c index 1d6d0a15c099..84e6c93a044a 100644 --- a/net/bluetooth/hci_sock.c +++ b/net/bluetooth/hci_sock.c @@ -575,7 +575,7 @@ static int hci_sock_getsockopt(struct socket *sock, int level, int optname, char return 0; } -static struct proto_ops hci_sock_ops = { +static const struct proto_ops hci_sock_ops = { .family = PF_BLUETOOTH, .owner = THIS_MODULE, .release = hci_sock_release, diff --git a/net/bluetooth/hidp/sock.c b/net/bluetooth/hidp/sock.c index f8986f881431..8f8dd931b294 100644 --- a/net/bluetooth/hidp/sock.c +++ b/net/bluetooth/hidp/sock.c @@ -143,7 +143,7 @@ static int hidp_sock_ioctl(struct socket *sock, unsigned int cmd, unsigned long return -EINVAL; } -static struct proto_ops hidp_sock_ops = { +static const struct proto_ops hidp_sock_ops = { .family = PF_BLUETOOTH, .owner = THIS_MODULE, .release = hidp_sock_release, diff --git a/net/bluetooth/l2cap.c b/net/bluetooth/l2cap.c index 95f33cc7a24e..7f0781e4326f 100644 --- a/net/bluetooth/l2cap.c +++ b/net/bluetooth/l2cap.c @@ -57,7 +57,7 @@ #define VERSION "2.8" -static struct proto_ops l2cap_sock_ops; +static const struct proto_ops l2cap_sock_ops; static struct bt_sock_list l2cap_sk_list = { .lock = RW_LOCK_UNLOCKED @@ -2161,7 +2161,7 @@ static ssize_t l2cap_sysfs_show(struct class *dev, char *buf) static CLASS_ATTR(l2cap, S_IRUGO, l2cap_sysfs_show, NULL); -static struct proto_ops l2cap_sock_ops = { +static const struct proto_ops l2cap_sock_ops = { .family = PF_BLUETOOTH, .owner = THIS_MODULE, .release = l2cap_sock_release, diff --git a/net/bluetooth/rfcomm/sock.c b/net/bluetooth/rfcomm/sock.c index 6c34261b232e..757d2dd3b02f 100644 --- a/net/bluetooth/rfcomm/sock.c +++ b/net/bluetooth/rfcomm/sock.c @@ -58,7 +58,7 @@ #define BT_DBG(D...) #endif -static struct proto_ops rfcomm_sock_ops; +static const struct proto_ops rfcomm_sock_ops; static struct bt_sock_list rfcomm_sk_list = { .lock = RW_LOCK_UNLOCKED @@ -907,7 +907,7 @@ static ssize_t rfcomm_sock_sysfs_show(struct class *dev, char *buf) static CLASS_ATTR(rfcomm, S_IRUGO, rfcomm_sock_sysfs_show, NULL); -static struct proto_ops rfcomm_sock_ops = { +static const struct proto_ops rfcomm_sock_ops = { .family = PF_BLUETOOTH, .owner = THIS_MODULE, .release = rfcomm_sock_release, diff --git a/net/bluetooth/sco.c b/net/bluetooth/sco.c index 648181430699..6b61323ce23c 100644 --- a/net/bluetooth/sco.c +++ b/net/bluetooth/sco.c @@ -56,7 +56,7 @@ #define VERSION "0.5" -static struct proto_ops sco_sock_ops; +static const struct proto_ops sco_sock_ops; static struct bt_sock_list sco_sk_list = { .lock = RW_LOCK_UNLOCKED @@ -914,7 +914,7 @@ static ssize_t sco_sysfs_show(struct class *dev, char *buf) static CLASS_ATTR(sco, S_IRUGO, sco_sysfs_show, NULL); -static struct proto_ops sco_sock_ops = { +static const struct proto_ops sco_sock_ops = { .family = PF_BLUETOOTH, .owner = THIS_MODULE, .release = sco_sock_release, diff --git a/net/dccp/proto.c b/net/dccp/proto.c index 40a4c6899051..e4e629ed9bf7 100644 --- a/net/dccp/proto.c +++ b/net/dccp/proto.c @@ -680,7 +680,7 @@ void dccp_shutdown(struct sock *sk, int how) EXPORT_SYMBOL_GPL(dccp_shutdown); -static struct proto_ops inet_dccp_ops = { +static const struct proto_ops inet_dccp_ops = { .family = PF_INET, .owner = THIS_MODULE, .release = inet_release, diff --git a/net/decnet/af_decnet.c b/net/decnet/af_decnet.c index d402e9020c68..65e3baed0251 100644 --- a/net/decnet/af_decnet.c +++ b/net/decnet/af_decnet.c @@ -149,7 +149,7 @@ static void dn_keepalive(struct sock *sk); #define DN_SK_HASH_MASK (DN_SK_HASH_SIZE - 1) -static struct proto_ops dn_proto_ops; +static const struct proto_ops dn_proto_ops; static DEFINE_RWLOCK(dn_hash_lock); static struct hlist_head dn_sk_hash[DN_SK_HASH_SIZE]; static struct hlist_head dn_wild_sk; @@ -2342,7 +2342,7 @@ static struct net_proto_family dn_family_ops = { .owner = THIS_MODULE, }; -static struct proto_ops dn_proto_ops = { +static const struct proto_ops dn_proto_ops = { .family = AF_DECnet, .owner = THIS_MODULE, .release = dn_release, diff --git a/net/econet/af_econet.c b/net/econet/af_econet.c index 34fdac51df96..ff58f49c8b4a 100644 --- a/net/econet/af_econet.c +++ b/net/econet/af_econet.c @@ -45,7 +45,7 @@ #include #include -static struct proto_ops econet_ops; +static const struct proto_ops econet_ops; static struct hlist_head econet_sklist; static DEFINE_RWLOCK(econet_lock); @@ -698,7 +698,7 @@ static struct net_proto_family econet_family_ops = { .owner = THIS_MODULE, }; -static struct proto_ops SOCKOPS_WRAPPED(econet_ops) = { +static const struct proto_ops SOCKOPS_WRAPPED(econet_ops) = { .family = PF_ECONET, .owner = THIS_MODULE, .release = econet_release, diff --git a/net/ipv4/af_inet.c b/net/ipv4/af_inet.c index 617e858beff1..4ed8a814c6cb 100644 --- a/net/ipv4/af_inet.c +++ b/net/ipv4/af_inet.c @@ -785,7 +785,7 @@ int inet_ioctl(struct socket *sock, unsigned int cmd, unsigned long arg) return err; } -struct proto_ops inet_stream_ops = { +const struct proto_ops inet_stream_ops = { .family = PF_INET, .owner = THIS_MODULE, .release = inet_release, @@ -806,7 +806,7 @@ struct proto_ops inet_stream_ops = { .sendpage = tcp_sendpage }; -struct proto_ops inet_dgram_ops = { +const struct proto_ops inet_dgram_ops = { .family = PF_INET, .owner = THIS_MODULE, .release = inet_release, @@ -831,7 +831,7 @@ struct proto_ops inet_dgram_ops = { * For SOCK_RAW sockets; should be the same as inet_dgram_ops but without * udp_poll */ -static struct proto_ops inet_sockraw_ops = { +static const struct proto_ops inet_sockraw_ops = { .family = PF_INET, .owner = THIS_MODULE, .release = inet_release, diff --git a/net/ipv6/af_inet6.c b/net/ipv6/af_inet6.c index 70a510ff31ee..7c9f19269f21 100644 --- a/net/ipv6/af_inet6.c +++ b/net/ipv6/af_inet6.c @@ -462,7 +462,7 @@ int inet6_ioctl(struct socket *sock, unsigned int cmd, unsigned long arg) return(0); } -struct proto_ops inet6_stream_ops = { +const struct proto_ops inet6_stream_ops = { .family = PF_INET6, .owner = THIS_MODULE, .release = inet6_release, @@ -483,7 +483,7 @@ struct proto_ops inet6_stream_ops = { .sendpage = tcp_sendpage }; -struct proto_ops inet6_dgram_ops = { +const struct proto_ops inet6_dgram_ops = { .family = PF_INET6, .owner = THIS_MODULE, .release = inet6_release, @@ -511,7 +511,7 @@ static struct net_proto_family inet6_family_ops = { }; /* Same as inet6_dgram_ops, sans udp_poll. */ -static struct proto_ops inet6_sockraw_ops = { +static const struct proto_ops inet6_sockraw_ops = { .family = PF_INET6, .owner = THIS_MODULE, .release = inet6_release, diff --git a/net/ipx/af_ipx.c b/net/ipx/af_ipx.c index 34b3bb868409..6c464c11bb09 100644 --- a/net/ipx/af_ipx.c +++ b/net/ipx/af_ipx.c @@ -75,7 +75,7 @@ static struct datalink_proto *pEII_datalink; static struct datalink_proto *p8023_datalink; static struct datalink_proto *pSNAP_datalink; -static struct proto_ops ipx_dgram_ops; +static const struct proto_ops ipx_dgram_ops; LIST_HEAD(ipx_interfaces); DEFINE_SPINLOCK(ipx_interfaces_lock); @@ -1901,7 +1901,7 @@ static struct net_proto_family ipx_family_ops = { .owner = THIS_MODULE, }; -static struct proto_ops SOCKOPS_WRAPPED(ipx_dgram_ops) = { +static const struct proto_ops SOCKOPS_WRAPPED(ipx_dgram_ops) = { .family = PF_IPX, .owner = THIS_MODULE, .release = ipx_release, diff --git a/net/irda/af_irda.c b/net/irda/af_irda.c index f121f7de2032..e57683d424f7 100644 --- a/net/irda/af_irda.c +++ b/net/irda/af_irda.c @@ -62,12 +62,12 @@ static int irda_create(struct socket *sock, int protocol); -static struct proto_ops irda_stream_ops; -static struct proto_ops irda_seqpacket_ops; -static struct proto_ops irda_dgram_ops; +static const struct proto_ops irda_stream_ops; +static const struct proto_ops irda_seqpacket_ops; +static const struct proto_ops irda_dgram_ops; #ifdef CONFIG_IRDA_ULTRA -static struct proto_ops irda_ultra_ops; +static const struct proto_ops irda_ultra_ops; #define ULTRA_MAX_DATA 382 #endif /* CONFIG_IRDA_ULTRA */ @@ -2464,7 +2464,7 @@ static struct net_proto_family irda_family_ops = { .owner = THIS_MODULE, }; -static struct proto_ops SOCKOPS_WRAPPED(irda_stream_ops) = { +static const struct proto_ops SOCKOPS_WRAPPED(irda_stream_ops) = { .family = PF_IRDA, .owner = THIS_MODULE, .release = irda_release, @@ -2485,7 +2485,7 @@ static struct proto_ops SOCKOPS_WRAPPED(irda_stream_ops) = { .sendpage = sock_no_sendpage, }; -static struct proto_ops SOCKOPS_WRAPPED(irda_seqpacket_ops) = { +static const struct proto_ops SOCKOPS_WRAPPED(irda_seqpacket_ops) = { .family = PF_IRDA, .owner = THIS_MODULE, .release = irda_release, @@ -2506,7 +2506,7 @@ static struct proto_ops SOCKOPS_WRAPPED(irda_seqpacket_ops) = { .sendpage = sock_no_sendpage, }; -static struct proto_ops SOCKOPS_WRAPPED(irda_dgram_ops) = { +static const struct proto_ops SOCKOPS_WRAPPED(irda_dgram_ops) = { .family = PF_IRDA, .owner = THIS_MODULE, .release = irda_release, @@ -2528,7 +2528,7 @@ static struct proto_ops SOCKOPS_WRAPPED(irda_dgram_ops) = { }; #ifdef CONFIG_IRDA_ULTRA -static struct proto_ops SOCKOPS_WRAPPED(irda_ultra_ops) = { +static const struct proto_ops SOCKOPS_WRAPPED(irda_ultra_ops) = { .family = PF_IRDA, .owner = THIS_MODULE, .release = irda_release, diff --git a/net/key/af_key.c b/net/key/af_key.c index d32f7791f1e4..52efd04cbedb 100644 --- a/net/key/af_key.c +++ b/net/key/af_key.c @@ -113,7 +113,7 @@ static __inline__ void pfkey_unlock_table(void) } -static struct proto_ops pfkey_ops; +static const struct proto_ops pfkey_ops; static void pfkey_insert(struct sock *sk) { @@ -3127,7 +3127,7 @@ out: return err; } -static struct proto_ops pfkey_ops = { +static const struct proto_ops pfkey_ops = { .family = PF_KEY, .owner = THIS_MODULE, /* Operations that make no sense on pfkey sockets. */ diff --git a/net/llc/af_llc.c b/net/llc/af_llc.c index b6d3df5c911c..9cf65f9d8902 100644 --- a/net/llc/af_llc.c +++ b/net/llc/af_llc.c @@ -36,7 +36,7 @@ static u16 llc_ui_sap_last_autoport = LLC_SAP_DYN_START; static u16 llc_ui_sap_link_no_max[256]; static struct sockaddr_llc llc_ui_addrnull; -static struct proto_ops llc_ui_ops; +static const struct proto_ops llc_ui_ops; static int llc_ui_wait_for_conn(struct sock *sk, long timeout); static int llc_ui_wait_for_disc(struct sock *sk, long timeout); @@ -1098,7 +1098,7 @@ static struct net_proto_family llc_ui_family_ops = { .owner = THIS_MODULE, }; -static struct proto_ops llc_ui_ops = { +static const struct proto_ops llc_ui_ops = { .family = PF_LLC, .owner = THIS_MODULE, .release = llc_ui_release, diff --git a/net/netlink/af_netlink.c b/net/netlink/af_netlink.c index 96020d7087e8..7849cac14d3a 100644 --- a/net/netlink/af_netlink.c +++ b/net/netlink/af_netlink.c @@ -293,7 +293,7 @@ static inline int nl_pid_hash_dilute(struct nl_pid_hash *hash, int len) return 0; } -static struct proto_ops netlink_ops; +static const struct proto_ops netlink_ops; static int netlink_insert(struct sock *sk, u32 pid) { @@ -1656,7 +1656,7 @@ int netlink_unregister_notifier(struct notifier_block *nb) return notifier_chain_unregister(&netlink_chain, nb); } -static struct proto_ops netlink_ops = { +static const struct proto_ops netlink_ops = { .family = PF_NETLINK, .owner = THIS_MODULE, .release = netlink_release, diff --git a/net/netrom/af_netrom.c b/net/netrom/af_netrom.c index e5d82d711cae..05b653c05971 100644 --- a/net/netrom/af_netrom.c +++ b/net/netrom/af_netrom.c @@ -63,7 +63,7 @@ static unsigned short circuit = 0x101; static HLIST_HEAD(nr_list); static DEFINE_SPINLOCK(nr_list_lock); -static struct proto_ops nr_proto_ops; +static const struct proto_ops nr_proto_ops; /* * Socket removal during an interrupt is now safe. @@ -1337,7 +1337,7 @@ static struct net_proto_family nr_family_ops = { .owner = THIS_MODULE, }; -static struct proto_ops nr_proto_ops = { +static const struct proto_ops nr_proto_ops = { .family = PF_NETROM, .owner = THIS_MODULE, .release = nr_release, diff --git a/net/packet/af_packet.c b/net/packet/af_packet.c index 3e2462760413..deda6fdb1e53 100644 --- a/net/packet/af_packet.c +++ b/net/packet/af_packet.c @@ -251,10 +251,10 @@ static void packet_sock_destruct(struct sock *sk) } -static struct proto_ops packet_ops; +static const struct proto_ops packet_ops; #ifdef CONFIG_SOCK_PACKET -static struct proto_ops packet_ops_spkt; +static const struct proto_ops packet_ops_spkt; static int packet_rcv_spkt(struct sk_buff *skb, struct net_device *dev, struct packet_type *pt, struct net_device *orig_dev) { @@ -1784,7 +1784,7 @@ out: #ifdef CONFIG_SOCK_PACKET -static struct proto_ops packet_ops_spkt = { +static const struct proto_ops packet_ops_spkt = { .family = PF_PACKET, .owner = THIS_MODULE, .release = packet_release, @@ -1806,7 +1806,7 @@ static struct proto_ops packet_ops_spkt = { }; #endif -static struct proto_ops packet_ops = { +static const struct proto_ops packet_ops = { .family = PF_PACKET, .owner = THIS_MODULE, .release = packet_release, diff --git a/net/sctp/ipv6.c b/net/sctp/ipv6.c index fa3be2b8fb5f..15c05165c905 100644 --- a/net/sctp/ipv6.c +++ b/net/sctp/ipv6.c @@ -866,7 +866,7 @@ static int sctp_inet6_supported_addrs(const struct sctp_sock *opt, return 2; } -static struct proto_ops inet6_seqpacket_ops = { +static const struct proto_ops inet6_seqpacket_ops = { .family = PF_INET6, .owner = THIS_MODULE, .release = inet6_release, diff --git a/net/sctp/protocol.c b/net/sctp/protocol.c index f775d78aa59d..d1b0747a5b9d 100644 --- a/net/sctp/protocol.c +++ b/net/sctp/protocol.c @@ -829,7 +829,7 @@ static struct notifier_block sctp_inetaddr_notifier = { }; /* Socket operations. */ -static struct proto_ops inet_seqpacket_ops = { +static const struct proto_ops inet_seqpacket_ops = { .family = PF_INET, .owner = THIS_MODULE, .release = inet_release, /* Needs to be wrapped... */ diff --git a/net/sunrpc/svcsock.c b/net/sunrpc/svcsock.c index c6a51911e71e..d68eba481291 100644 --- a/net/sunrpc/svcsock.c +++ b/net/sunrpc/svcsock.c @@ -758,7 +758,7 @@ svc_tcp_accept(struct svc_sock *svsk) struct svc_serv *serv = svsk->sk_server; struct socket *sock = svsk->sk_sock; struct socket *newsock; - struct proto_ops *ops; + const struct proto_ops *ops; struct svc_sock *newsvsk; int err, slen; diff --git a/net/unix/af_unix.c b/net/unix/af_unix.c index 04e850e04e3d..7d3fe6aebcdb 100644 --- a/net/unix/af_unix.c +++ b/net/unix/af_unix.c @@ -473,7 +473,7 @@ static int unix_dgram_connect(struct socket *, struct sockaddr *, static int unix_seqpacket_sendmsg(struct kiocb *, struct socket *, struct msghdr *, size_t); -static struct proto_ops unix_stream_ops = { +static const struct proto_ops unix_stream_ops = { .family = PF_UNIX, .owner = THIS_MODULE, .release = unix_release, @@ -494,7 +494,7 @@ static struct proto_ops unix_stream_ops = { .sendpage = sock_no_sendpage, }; -static struct proto_ops unix_dgram_ops = { +static const struct proto_ops unix_dgram_ops = { .family = PF_UNIX, .owner = THIS_MODULE, .release = unix_release, @@ -515,7 +515,7 @@ static struct proto_ops unix_dgram_ops = { .sendpage = sock_no_sendpage, }; -static struct proto_ops unix_seqpacket_ops = { +static const struct proto_ops unix_seqpacket_ops = { .family = PF_UNIX, .owner = THIS_MODULE, .release = unix_release, diff --git a/net/wanrouter/af_wanpipe.c b/net/wanrouter/af_wanpipe.c index 59fec59b2132..67948bf22dc4 100644 --- a/net/wanrouter/af_wanpipe.c +++ b/net/wanrouter/af_wanpipe.c @@ -181,7 +181,7 @@ struct wanpipe_opt #endif static int sk_count; -extern struct proto_ops wanpipe_ops; +extern const struct proto_ops wanpipe_ops; static unsigned long find_free_critical; static void wanpipe_unlink_driver(struct sock *sk); @@ -2546,7 +2546,7 @@ static int wanpipe_connect(struct socket *sock, struct sockaddr *uaddr, int addr return 0; } -struct proto_ops wanpipe_ops = { +const struct proto_ops wanpipe_ops = { .family = PF_WANPIPE, .owner = THIS_MODULE, .release = wanpipe_release, diff --git a/net/x25/af_x25.c b/net/x25/af_x25.c index 020d73cc8414..ca8b3b0b920d 100644 --- a/net/x25/af_x25.c +++ b/net/x25/af_x25.c @@ -64,7 +64,7 @@ int sysctl_x25_ack_holdback_timeout = X25_DEFAULT_T2; HLIST_HEAD(x25_list); DEFINE_RWLOCK(x25_list_lock); -static struct proto_ops x25_proto_ops; +static const struct proto_ops x25_proto_ops; static struct x25_address null_x25_address = {" "}; @@ -1391,7 +1391,7 @@ static struct net_proto_family x25_family_ops = { .owner = THIS_MODULE, }; -static struct proto_ops SOCKOPS_WRAPPED(x25_proto_ops) = { +static const struct proto_ops SOCKOPS_WRAPPED(x25_proto_ops) = { .family = AF_X25, .owner = THIS_MODULE, .release = x25_release, -- cgit v1.2.3 From 25995ff577675b58dbd848b7758e7bad87411947 Mon Sep 17 00:00:00 2001 From: Arnaldo Carvalho de Melo Date: Tue, 27 Dec 2005 02:42:22 -0200 Subject: [SOCK]: Introduce sk_receive_skb Its common enough to to justify that, TCP still can't use it as it has the prequeueing stuff, still to be made generic in the not so distant future :-) Signed-off-by: Arnaldo Carvalho de Melo Signed-off-by: David S. Miller --- drivers/net/pppoe.c | 22 ++-------------------- include/net/sock.h | 23 +++++++++++++++++++++++ net/dccp/ipv4.c | 23 ++--------------------- net/dccp/ipv6.c | 17 +---------------- net/decnet/dn_nsp_in.c | 17 +---------------- 5 files changed, 29 insertions(+), 73 deletions(-) (limited to 'include/net') diff --git a/drivers/net/pppoe.c b/drivers/net/pppoe.c index a842ecc60a34..71e303b28646 100644 --- a/drivers/net/pppoe.c +++ b/drivers/net/pppoe.c @@ -383,8 +383,6 @@ static int pppoe_rcv(struct sk_buff *skb, { struct pppoe_hdr *ph; struct pppox_sock *po; - struct sock *sk; - int ret; if (!pskb_may_pull(skb, sizeof(struct pppoe_hdr))) goto drop; @@ -395,24 +393,8 @@ static int pppoe_rcv(struct sk_buff *skb, ph = (struct pppoe_hdr *) skb->nh.raw; po = get_item((unsigned long) ph->sid, eth_hdr(skb)->h_source); - if (!po) - goto drop; - - sk = sk_pppox(po); - bh_lock_sock(sk); - - /* Socket state is unknown, must put skb into backlog. */ - if (sock_owned_by_user(sk) != 0) { - sk_add_backlog(sk, skb); - ret = NET_RX_SUCCESS; - } else { - ret = pppoe_rcv_core(sk, skb); - } - - bh_unlock_sock(sk); - sock_put(sk); - - return ret; + if (po != NULL) + return sk_receive_skb(sk_pppox(po), skb); drop: kfree_skb(skb); out: diff --git a/include/net/sock.h b/include/net/sock.h index 91d28957dc10..6961700ff3a0 100644 --- a/include/net/sock.h +++ b/include/net/sock.h @@ -926,6 +926,29 @@ static inline void sock_put(struct sock *sk) sk_free(sk); } +static inline int sk_receive_skb(struct sock *sk, struct sk_buff *skb) +{ + int rc = NET_RX_SUCCESS; + + if (sk_filter(sk, skb, 0)) + goto discard_and_relse; + + skb->dev = NULL; + + bh_lock_sock(sk); + if (!sock_owned_by_user(sk)) + rc = sk->sk_backlog_rcv(sk, skb); + else + sk_add_backlog(sk, skb); + bh_unlock_sock(sk); +out: + sock_put(sk); + return rc; +discard_and_relse: + kfree_skb(skb); + goto out; +} + /* Detach socket from process context. * Announce socket dead, detach it from wait queue and inode. * Note that parent inode held reference count on this struct sock, diff --git a/net/dccp/ipv4.c b/net/dccp/ipv4.c index c363051a7f16..99e8afa7ba1e 100644 --- a/net/dccp/ipv4.c +++ b/net/dccp/ipv4.c @@ -914,7 +914,6 @@ int dccp_v4_rcv(struct sk_buff *skb) { const struct dccp_hdr *dh; struct sock *sk; - int rc; /* Step 1: Check header basics: */ @@ -984,28 +983,10 @@ int dccp_v4_rcv(struct sk_buff *skb) goto do_time_wait; } - if (!xfrm4_policy_check(sk, XFRM_POLICY_IN, skb)) { - dccp_pr_debug("xfrm4_policy_check failed\n"); + if (!xfrm4_policy_check(sk, XFRM_POLICY_IN, skb)) goto discard_and_relse; - } - - if (sk_filter(sk, skb, 0)) { - dccp_pr_debug("sk_filter failed\n"); - goto discard_and_relse; - } - - skb->dev = NULL; - bh_lock_sock(sk); - rc = 0; - if (!sock_owned_by_user(sk)) - rc = dccp_v4_do_rcv(sk, skb); - else - sk_add_backlog(sk, skb); - bh_unlock_sock(sk); - - sock_put(sk); - return rc; + return sk_receive_skb(sk, skb); no_dccp_socket: if (!xfrm4_policy_check(NULL, XFRM_POLICY_IN, skb)) diff --git a/net/dccp/ipv6.c b/net/dccp/ipv6.c index 599b0be21515..2e194c8f9953 100644 --- a/net/dccp/ipv6.c +++ b/net/dccp/ipv6.c @@ -1032,7 +1032,6 @@ static int dccp_v6_rcv(struct sk_buff **pskb, unsigned int *nhoffp) const struct dccp_hdr *dh; struct sk_buff *skb = *pskb; struct sock *sk; - int rc; /* Step 1: Check header basics: */ @@ -1077,21 +1076,7 @@ static int dccp_v6_rcv(struct sk_buff **pskb, unsigned int *nhoffp) if (!xfrm6_policy_check(sk, XFRM_POLICY_IN, skb)) goto discard_and_relse; - if (sk_filter(sk, skb, 0)) - goto discard_and_relse; - - skb->dev = NULL; - - bh_lock_sock(sk); - rc = 0; - if (!sock_owned_by_user(sk)) - rc = dccp_v6_do_rcv(sk, skb); - else - sk_add_backlog(sk, skb); - bh_unlock_sock(sk); - - sock_put(sk); - return rc ? -1 : 0; + return sk_receive_skb(sk, skb) ? -1 : 0; no_dccp_socket: if (!xfrm6_policy_check(NULL, XFRM_POLICY_IN, skb)) diff --git a/net/decnet/dn_nsp_in.c b/net/decnet/dn_nsp_in.c index 369f25b60f3f..44bda85e678f 100644 --- a/net/decnet/dn_nsp_in.c +++ b/net/decnet/dn_nsp_in.c @@ -793,7 +793,6 @@ static int dn_nsp_rx_packet(struct sk_buff *skb) got_it: if (sk != NULL) { struct dn_scp *scp = DN_SK(sk); - int ret; /* Reset backoff */ scp->nsp_rxtshift = 0; @@ -807,21 +806,7 @@ got_it: goto free_out; } - bh_lock_sock(sk); - ret = NET_RX_SUCCESS; - if (decnet_debug_level & 8) - printk(KERN_DEBUG "NSP: 0x%02x 0x%02x 0x%04x 0x%04x %d\n", - (int)cb->rt_flags, (int)cb->nsp_flags, - (int)cb->src_port, (int)cb->dst_port, - !!sock_owned_by_user(sk)); - if (!sock_owned_by_user(sk)) - ret = dn_nsp_backlog_rcv(sk, skb); - else - sk_add_backlog(sk, skb); - bh_unlock_sock(sk); - sock_put(sk); - - return ret; + return sk_receive_skb(sk, skb); } return dn_nsp_no_socket(skb, reason); -- cgit v1.2.3 From 14c850212ed8f8cbb5972ad6b8812e08a0bc901c Mon Sep 17 00:00:00 2001 From: Arnaldo Carvalho de Melo Date: Tue, 27 Dec 2005 02:43:12 -0200 Subject: [INET_SOCK]: Move struct inet_sock & helper functions to net/inet_sock.h To help in reducing the number of include dependencies, several files were touched as they were getting needed headers indirectly for stuff they use. Thanks also to Alan Menegotto for pointing out that net/dccp/proto.c had linux/dccp.h include twice. Signed-off-by: Arnaldo Carvalho de Melo Signed-off-by: David S. Miller --- drivers/infiniband/ulp/ipoib/ipoib_main.c | 2 + drivers/infiniband/ulp/ipoib/ipoib_multicast.c | 2 + drivers/net/ns83820.c | 1 + drivers/net/sk98lin/skge.c | 1 + drivers/net/skge.c | 1 + drivers/net/tg3.c | 1 + fs/9p/trans_sock.c | 1 + fs/nfs/callback.c | 3 + include/linux/dccp.h | 3 +- include/linux/ip.h | 126 +--------------- include/linux/ipv6.h | 7 +- include/linux/udp.h | 6 +- include/net/atmclip.h | 2 +- include/net/dst.h | 1 + include/net/icmp.h | 9 +- include/net/ieee80211_crypt.h | 9 +- include/net/inet_connection_sock.h | 3 +- include/net/inet_ecn.h | 2 + include/net/inet_hashtables.h | 21 +-- include/net/inet_sock.h | 193 +++++++++++++++++++++++++ include/net/inet_timewait_sock.h | 2 +- include/net/ip.h | 17 ++- include/net/ip_fib.h | 2 + include/net/ip_vs.h | 12 +- include/net/ipv6.h | 3 + include/net/ndisc.h | 17 ++- include/net/neighbour.h | 2 +- include/net/pkt_act.h | 1 - include/net/raw.h | 2 + include/net/udp.h | 4 +- include/net/xfrm.h | 3 +- net/bridge/br_netfilter.c | 4 + net/bridge/netfilter/ebt_log.c | 1 + net/core/netpoll.c | 1 + net/dccp/ccid.h | 2 + net/dccp/ipv4.c | 1 + net/dccp/ipv6.c | 1 + net/dccp/output.c | 1 + net/dccp/proto.c | 3 +- net/econet/af_econet.c | 1 + net/ipv4/af_inet.c | 1 + net/ipv4/ah4.c | 1 + net/ipv4/arp.c | 1 + net/ipv4/devinet.c | 1 + net/ipv4/esp4.c | 1 + net/ipv4/fib_frontend.c | 1 + net/ipv4/fib_hash.c | 1 + net/ipv4/fib_rules.c | 1 + net/ipv4/fib_semantics.c | 2 + net/ipv4/icmp.c | 1 + net/ipv4/igmp.c | 2 + net/ipv4/ip_input.c | 1 + net/ipv4/ip_options.c | 1 + net/ipv4/ip_sockglue.c | 1 + net/ipv4/ipcomp.c | 1 + net/ipv4/ipconfig.c | 2 + net/ipv4/ipmr.c | 1 + net/ipv4/ipvs/ip_vs_conn.c | 2 + net/ipv4/ipvs/ip_vs_ctl.c | 1 + net/ipv4/ipvs/ip_vs_dh.c | 2 + net/ipv4/ipvs/ip_vs_est.c | 3 + net/ipv4/ipvs/ip_vs_lblc.c | 2 + net/ipv4/ipvs/ip_vs_lblcr.c | 2 + net/ipv4/ipvs/ip_vs_proto_ah.c | 2 + net/ipv4/ipvs/ip_vs_proto_esp.c | 2 + net/ipv4/ipvs/ip_vs_proto_udp.c | 3 + net/ipv4/ipvs/ip_vs_sh.c | 2 + net/ipv4/ipvs/ip_vs_sync.c | 2 + net/ipv4/netfilter/ip_conntrack_amanda.c | 2 + net/ipv4/netfilter/ip_conntrack_proto_gre.c | 1 + net/ipv4/netfilter/ip_conntrack_proto_udp.c | 1 + net/ipv4/netfilter/ip_conntrack_standalone.c | 1 + net/ipv4/netfilter/ip_nat_snmp_basic.c | 2 + net/ipv4/netfilter/ipt_MASQUERADE.c | 2 + net/ipv4/netfilter/ipt_physdev.c | 1 + net/ipv4/proc.c | 1 + net/ipv4/sysctl_net_ipv4.c | 1 + net/ipv4/udp.c | 1 + net/ipv6/ah6.c | 1 + net/ipv6/esp6.c | 1 + net/ipv6/ipcomp6.c | 1 + net/ipv6/netfilter/ip6_tables.c | 1 + net/ipv6/netfilter/ip6t_LOG.c | 1 + net/ipv6/netfilter/ip6t_ah.c | 1 + net/ipv6/netfilter/ip6t_esp.c | 1 + net/sched/sch_teql.c | 1 + net/sctp/protocol.c | 1 + 87 files changed, 355 insertions(+), 184 deletions(-) create mode 100644 include/net/inet_sock.h (limited to 'include/net') diff --git a/drivers/infiniband/ulp/ipoib/ipoib_main.c b/drivers/infiniband/ulp/ipoib/ipoib_main.c index 475d98fa9e26..780009c7eaa6 100644 --- a/drivers/infiniband/ulp/ipoib/ipoib_main.c +++ b/drivers/infiniband/ulp/ipoib/ipoib_main.c @@ -47,6 +47,8 @@ #include #include +#include + MODULE_AUTHOR("Roland Dreier"); MODULE_DESCRIPTION("IP-over-InfiniBand net driver"); MODULE_LICENSE("Dual BSD/GPL"); diff --git a/drivers/infiniband/ulp/ipoib/ipoib_multicast.c b/drivers/infiniband/ulp/ipoib/ipoib_multicast.c index ef3ee035bbc8..ed0c2ead8bc1 100644 --- a/drivers/infiniband/ulp/ipoib/ipoib_multicast.c +++ b/drivers/infiniband/ulp/ipoib/ipoib_multicast.c @@ -43,6 +43,8 @@ #include #include +#include + #include "ipoib.h" #ifdef CONFIG_INFINIBAND_IPOIB_DEBUG diff --git a/drivers/net/ns83820.c b/drivers/net/ns83820.c index f857ae94d261..b0c3b6ab6263 100644 --- a/drivers/net/ns83820.c +++ b/drivers/net/ns83820.c @@ -115,6 +115,7 @@ #include #include #include +#include #include #include diff --git a/drivers/net/sk98lin/skge.c b/drivers/net/sk98lin/skge.c index ae7343934758..e1a2d52cc1fe 100644 --- a/drivers/net/sk98lin/skge.c +++ b/drivers/net/sk98lin/skge.c @@ -107,6 +107,7 @@ #include "h/skversion.h" +#include #include #include #include diff --git a/drivers/net/skge.c b/drivers/net/skge.c index 00d683063c01..d8cc3aea032a 100644 --- a/drivers/net/skge.c +++ b/drivers/net/skge.c @@ -25,6 +25,7 @@ */ #include +#include #include #include #include diff --git a/drivers/net/tg3.c b/drivers/net/tg3.c index 2fc9893d69e1..59d916ccc810 100644 --- a/drivers/net/tg3.c +++ b/drivers/net/tg3.c @@ -24,6 +24,7 @@ #include #include #include +#include #include #include #include diff --git a/fs/9p/trans_sock.c b/fs/9p/trans_sock.c index a93c2bf94c33..6a9a75d40f73 100644 --- a/fs/9p/trans_sock.c +++ b/fs/9p/trans_sock.c @@ -26,6 +26,7 @@ */ #include +#include #include #include #include diff --git a/fs/nfs/callback.c b/fs/nfs/callback.c index f2ca782aba33..30cae3602867 100644 --- a/fs/nfs/callback.c +++ b/fs/nfs/callback.c @@ -14,6 +14,9 @@ #include #include #include + +#include + #include "nfs4_fs.h" #include "callback.h" diff --git a/include/linux/dccp.h b/include/linux/dccp.h index d0bdb499cf8d..088529f54965 100644 --- a/include/linux/dccp.h +++ b/include/linux/dccp.h @@ -192,10 +192,9 @@ enum { #include #include +#include #include -#include #include -#include enum dccp_state { DCCP_OPEN = TCP_ESTABLISHED, diff --git a/include/linux/ip.h b/include/linux/ip.h index 6ccc596c19c8..9e2eb9a602eb 100644 --- a/include/linux/ip.h +++ b/include/linux/ip.h @@ -16,6 +16,7 @@ */ #ifndef _LINUX_IP_H #define _LINUX_IP_H +#include #include #define IPTOS_TOS_MASK 0x1E @@ -78,131 +79,6 @@ #define IPOPT_TS_TSANDADDR 1 /* timestamps and addresses */ #define IPOPT_TS_PRESPEC 3 /* specified modules only */ -#ifdef __KERNEL__ -#include -#include -#include -#include -#include -#include - -struct ip_options { - __u32 faddr; /* Saved first hop address */ - unsigned char optlen; - unsigned char srr; - unsigned char rr; - unsigned char ts; - unsigned char is_setbyuser:1, /* Set by setsockopt? */ - is_data:1, /* Options in __data, rather than skb */ - is_strictroute:1, /* Strict source route */ - srr_is_hit:1, /* Packet destination addr was our one */ - is_changed:1, /* IP checksum more not valid */ - rr_needaddr:1, /* Need to record addr of outgoing dev */ - ts_needtime:1, /* Need to record timestamp */ - ts_needaddr:1; /* Need to record addr of outgoing dev */ - unsigned char router_alert; - unsigned char __pad1; - unsigned char __pad2; - unsigned char __data[0]; -}; - -#define optlength(opt) (sizeof(struct ip_options) + opt->optlen) - -struct inet_request_sock { - struct request_sock req; -#if defined(CONFIG_IPV6) || defined(CONFIG_IPV6_MODULE) - u16 inet6_rsk_offset; - /* 2 bytes hole, try to pack */ -#endif - u32 loc_addr; - u32 rmt_addr; - u16 rmt_port; - u16 snd_wscale : 4, - rcv_wscale : 4, - tstamp_ok : 1, - sack_ok : 1, - wscale_ok : 1, - ecn_ok : 1, - acked : 1; - struct ip_options *opt; -}; - -static inline struct inet_request_sock *inet_rsk(const struct request_sock *sk) -{ - return (struct inet_request_sock *)sk; -} - -struct ipv6_pinfo; - -struct inet_sock { - /* sk and pinet6 has to be the first two members of inet_sock */ - struct sock sk; -#if defined(CONFIG_IPV6) || defined(CONFIG_IPV6_MODULE) - struct ipv6_pinfo *pinet6; -#endif - /* Socket demultiplex comparisons on incoming packets. */ - __u32 daddr; /* Foreign IPv4 addr */ - __u32 rcv_saddr; /* Bound local IPv4 addr */ - __u16 dport; /* Destination port */ - __u16 num; /* Local port */ - __u32 saddr; /* Sending source */ - __s16 uc_ttl; /* Unicast TTL */ - __u16 cmsg_flags; - struct ip_options *opt; - __u16 sport; /* Source port */ - __u16 id; /* ID counter for DF pkts */ - __u8 tos; /* TOS */ - __u8 mc_ttl; /* Multicasting TTL */ - __u8 pmtudisc; - unsigned recverr : 1, - is_icsk : 1, /* inet_connection_sock? */ - freebind : 1, - hdrincl : 1, - mc_loop : 1; - int mc_index; /* Multicast device index */ - __u32 mc_addr; - struct ip_mc_socklist *mc_list; /* Group array */ - /* - * Following members are used to retain the infomation to build - * an ip header on each ip fragmentation while the socket is corked. - */ - struct { - unsigned int flags; - unsigned int fragsize; - struct ip_options *opt; - struct rtable *rt; - int length; /* Total length of all frames */ - u32 addr; - struct flowi fl; - } cork; -}; - -#define IPCORK_OPT 1 /* ip-options has been held in ipcork.opt */ -#define IPCORK_ALLFRAG 2 /* always fragment (for ipv6 for now) */ - -static inline struct inet_sock *inet_sk(const struct sock *sk) -{ - return (struct inet_sock *)sk; -} - -static inline void __inet_sk_copy_descendant(struct sock *sk_to, - const struct sock *sk_from, - const int ancestor_size) -{ - memcpy(inet_sk(sk_to) + 1, inet_sk(sk_from) + 1, - sk_from->sk_prot->obj_size - ancestor_size); -} -#if !(defined(CONFIG_IPV6) || defined(CONFIG_IPV6_MODULE)) -static inline void inet_sk_copy_descendant(struct sock *sk_to, - const struct sock *sk_from) -{ - __inet_sk_copy_descendant(sk_to, sk_from, sizeof(struct inet_sock)); -} -#endif -#endif - -extern int inet_sk_rebuild_header(struct sock *sk); - struct iphdr { #if defined(__LITTLE_ENDIAN_BITFIELD) __u8 ihl:4, diff --git a/include/linux/ipv6.h b/include/linux/ipv6.h index a0d04891fe12..93bbed5c6cf4 100644 --- a/include/linux/ipv6.h +++ b/include/linux/ipv6.h @@ -171,12 +171,13 @@ enum { }; #ifdef __KERNEL__ -#include /* struct sockaddr_in6 */ #include -#include /* struct ipv6_mc_socklist */ #include #include +#include /* struct ipv6_mc_socklist */ +#include + /* This structure contains results of exthdrs parsing as offsets from skb->nh. @@ -346,8 +347,6 @@ static inline void inet_sk_copy_descendant(struct sock *sk_to, #define __ipv6_only_sock(sk) (inet6_sk(sk)->ipv6only) #define ipv6_only_sock(sk) ((sk)->sk_family == PF_INET6 && __ipv6_only_sock(sk)) -#include - struct inet6_timewait_sock { struct in6_addr tw_v6_daddr; struct in6_addr tw_v6_rcv_saddr; diff --git a/include/linux/udp.h b/include/linux/udp.h index b60e0b4a25c4..85a55658831c 100644 --- a/include/linux/udp.h +++ b/include/linux/udp.h @@ -35,10 +35,10 @@ struct udphdr { #define UDP_ENCAP_ESPINUDP 2 /* draft-ietf-ipsec-udp-encaps-06 */ #ifdef __KERNEL__ - #include -#include -#include +#include + +#include struct udp_sock { /* inet_sock has to be the first member */ diff --git a/include/net/atmclip.h b/include/net/atmclip.h index 47048b1d179a..90fcc98e676f 100644 --- a/include/net/atmclip.h +++ b/include/net/atmclip.h @@ -7,7 +7,6 @@ #define _ATMCLIP_H #include -#include #include #include #include @@ -18,6 +17,7 @@ #define CLIP_VCC(vcc) ((struct clip_vcc *) ((vcc)->user_back)) #define NEIGH2ENTRY(neigh) ((struct atmarp_entry *) (neigh)->primary_key) +struct sk_buff; struct clip_vcc { struct atm_vcc *vcc; /* VCC descriptor */ diff --git a/include/net/dst.h b/include/net/dst.h index 6c196a5baf24..bee8b84d329d 100644 --- a/include/net/dst.h +++ b/include/net/dst.h @@ -9,6 +9,7 @@ #define _NET_DST_H #include +#include #include #include #include diff --git a/include/net/icmp.h b/include/net/icmp.h index 6cdebeee5f96..e7c3f20fbafc 100644 --- a/include/net/icmp.h +++ b/include/net/icmp.h @@ -20,12 +20,9 @@ #include #include -#include -#include -#include +#include #include -#include struct icmp_err { int errno; @@ -38,6 +35,10 @@ DECLARE_SNMP_STAT(struct icmp_mib, icmp_statistics); #define ICMP_INC_STATS_BH(field) SNMP_INC_STATS_BH(icmp_statistics, field) #define ICMP_INC_STATS_USER(field) SNMP_INC_STATS_USER(icmp_statistics, field) +struct dst_entry; +struct net_proto_family; +struct sk_buff; + extern void icmp_send(struct sk_buff *skb_in, int type, int code, u32 info); extern int icmp_rcv(struct sk_buff *skb); extern int icmp_ioctl(struct sock *sk, int cmd, unsigned long arg); diff --git a/include/net/ieee80211_crypt.h b/include/net/ieee80211_crypt.h index 225fc751d464..03b766afdc39 100644 --- a/include/net/ieee80211_crypt.h +++ b/include/net/ieee80211_crypt.h @@ -23,12 +23,17 @@ #ifndef IEEE80211_CRYPT_H #define IEEE80211_CRYPT_H -#include +#include +#include +#include enum { IEEE80211_CRYPTO_TKIP_COUNTERMEASURES = (1 << 0), }; +struct sk_buff; +struct module; + struct ieee80211_crypto_ops { const char *name; struct list_head list; @@ -87,6 +92,8 @@ struct ieee80211_crypt_data { atomic_t refcnt; }; +struct ieee80211_device; + int ieee80211_register_crypto_ops(struct ieee80211_crypto_ops *ops); int ieee80211_unregister_crypto_ops(struct ieee80211_crypto_ops *ops); struct ieee80211_crypto_ops *ieee80211_get_crypto_ops(const char *name); diff --git a/include/net/inet_connection_sock.h b/include/net/inet_connection_sock.h index 91888967d3e3..50234fa56a68 100644 --- a/include/net/inet_connection_sock.h +++ b/include/net/inet_connection_sock.h @@ -16,9 +16,10 @@ #define _INET_CONNECTION_SOCK_H #include -#include #include #include + +#include #include #define INET_CSK_DEBUG 1 diff --git a/include/net/inet_ecn.h b/include/net/inet_ecn.h index b0c47e2eccf1..d599c6bfbb86 100644 --- a/include/net/inet_ecn.h +++ b/include/net/inet_ecn.h @@ -3,6 +3,8 @@ #include #include + +#include #include enum { diff --git a/include/net/inet_hashtables.h b/include/net/inet_hashtables.h index c83baa79f66e..135d80fd658e 100644 --- a/include/net/inet_hashtables.h +++ b/include/net/inet_hashtables.h @@ -26,6 +26,7 @@ #include #include +#include #include #include #include @@ -128,26 +129,6 @@ struct inet_hashinfo { kmem_cache_t *bind_bucket_cachep; }; -static inline unsigned int inet_ehashfn(const __u32 laddr, const __u16 lport, - const __u32 faddr, const __u16 fport) -{ - unsigned int h = (laddr ^ lport) ^ (faddr ^ fport); - h ^= h >> 16; - h ^= h >> 8; - return h; -} - -static inline int inet_sk_ehashfn(const struct sock *sk) -{ - const struct inet_sock *inet = inet_sk(sk); - const __u32 laddr = inet->rcv_saddr; - const __u16 lport = inet->num; - const __u32 faddr = inet->daddr; - const __u16 fport = inet->dport; - - return inet_ehashfn(laddr, lport, faddr, fport); -} - static inline struct inet_ehash_bucket *inet_ehash_bucket( struct inet_hashinfo *hashinfo, unsigned int hash) diff --git a/include/net/inet_sock.h b/include/net/inet_sock.h new file mode 100644 index 000000000000..883eb529ef8e --- /dev/null +++ b/include/net/inet_sock.h @@ -0,0 +1,193 @@ +/* + * INET An implementation of the TCP/IP protocol suite for the LINUX + * operating system. INET is implemented using the BSD Socket + * interface as the means of communication with the user level. + * + * Definitions for inet_sock + * + * Authors: Many, reorganised here by + * Arnaldo Carvalho de Melo + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * as published by the Free Software Foundation; either version + * 2 of the License, or (at your option) any later version. + */ +#ifndef _INET_SOCK_H +#define _INET_SOCK_H + +#include + +#include +#include + +#include +#include +#include + +/** struct ip_options - IP Options + * + * @faddr - Saved first hop address + * @is_setbyuser - Set by setsockopt? + * @is_data - Options in __data, rather than skb + * @is_strictroute - Strict source route + * @srr_is_hit - Packet destination addr was our one + * @is_changed - IP checksum more not valid + * @rr_needaddr - Need to record addr of outgoing dev + * @ts_needtime - Need to record timestamp + * @ts_needaddr - Need to record addr of outgoing dev + */ +struct ip_options { + __u32 faddr; + unsigned char optlen; + unsigned char srr; + unsigned char rr; + unsigned char ts; + unsigned char is_setbyuser:1, + is_data:1, + is_strictroute:1, + srr_is_hit:1, + is_changed:1, + rr_needaddr:1, + ts_needtime:1, + ts_needaddr:1; + unsigned char router_alert; + unsigned char __pad1; + unsigned char __pad2; + unsigned char __data[0]; +}; + +#define optlength(opt) (sizeof(struct ip_options) + opt->optlen) + +struct inet_request_sock { + struct request_sock req; +#if defined(CONFIG_IPV6) || defined(CONFIG_IPV6_MODULE) + u16 inet6_rsk_offset; + /* 2 bytes hole, try to pack */ +#endif + u32 loc_addr; + u32 rmt_addr; + u16 rmt_port; + u16 snd_wscale : 4, + rcv_wscale : 4, + tstamp_ok : 1, + sack_ok : 1, + wscale_ok : 1, + ecn_ok : 1, + acked : 1; + struct ip_options *opt; +}; + +static inline struct inet_request_sock *inet_rsk(const struct request_sock *sk) +{ + return (struct inet_request_sock *)sk; +} + +struct ip_mc_socklist; +struct ipv6_pinfo; +struct rtable; + +/** struct inet_sock - representation of INET sockets + * + * @sk - ancestor class + * @pinet6 - pointer to IPv6 control block + * @daddr - Foreign IPv4 addr + * @rcv_saddr - Bound local IPv4 addr + * @dport - Destination port + * @num - Local port + * @saddr - Sending source + * @uc_ttl - Unicast TTL + * @sport - Source port + * @id - ID counter for DF pkts + * @tos - TOS + * @mc_ttl - Multicasting TTL + * @is_icsk - is this an inet_connection_sock? + * @mc_index - Multicast device index + * @mc_list - Group array + * @cork - info to build ip hdr on each ip frag while socket is corked + */ +struct inet_sock { + /* sk and pinet6 has to be the first two members of inet_sock */ + struct sock sk; +#if defined(CONFIG_IPV6) || defined(CONFIG_IPV6_MODULE) + struct ipv6_pinfo *pinet6; +#endif + /* Socket demultiplex comparisons on incoming packets. */ + __u32 daddr; + __u32 rcv_saddr; + __u16 dport; + __u16 num; + __u32 saddr; + __s16 uc_ttl; + __u16 cmsg_flags; + struct ip_options *opt; + __u16 sport; + __u16 id; + __u8 tos; + __u8 mc_ttl; + __u8 pmtudisc; + __u8 recverr:1, + is_icsk:1, + freebind:1, + hdrincl:1, + mc_loop:1; + int mc_index; + __u32 mc_addr; + struct ip_mc_socklist *mc_list; + struct { + unsigned int flags; + unsigned int fragsize; + struct ip_options *opt; + struct rtable *rt; + int length; /* Total length of all frames */ + u32 addr; + struct flowi fl; + } cork; +}; + +#define IPCORK_OPT 1 /* ip-options has been held in ipcork.opt */ +#define IPCORK_ALLFRAG 2 /* always fragment (for ipv6 for now) */ + +static inline struct inet_sock *inet_sk(const struct sock *sk) +{ + return (struct inet_sock *)sk; +} + +static inline void __inet_sk_copy_descendant(struct sock *sk_to, + const struct sock *sk_from, + const int ancestor_size) +{ + memcpy(inet_sk(sk_to) + 1, inet_sk(sk_from) + 1, + sk_from->sk_prot->obj_size - ancestor_size); +} +#if !(defined(CONFIG_IPV6) || defined(CONFIG_IPV6_MODULE)) +static inline void inet_sk_copy_descendant(struct sock *sk_to, + const struct sock *sk_from) +{ + __inet_sk_copy_descendant(sk_to, sk_from, sizeof(struct inet_sock)); +} +#endif + +extern int inet_sk_rebuild_header(struct sock *sk); + +static inline unsigned int inet_ehashfn(const __u32 laddr, const __u16 lport, + const __u32 faddr, const __u16 fport) +{ + unsigned int h = (laddr ^ lport) ^ (faddr ^ fport); + h ^= h >> 16; + h ^= h >> 8; + return h; +} + +static inline int inet_sk_ehashfn(const struct sock *sk) +{ + const struct inet_sock *inet = inet_sk(sk); + const __u32 laddr = inet->rcv_saddr; + const __u16 lport = inet->num; + const __u32 faddr = inet->daddr; + const __u16 fport = inet->dport; + + return inet_ehashfn(laddr, lport, faddr, fport); +} + +#endif /* _INET_SOCK_H */ diff --git a/include/net/inet_timewait_sock.h b/include/net/inet_timewait_sock.h index e396a65473d7..1da294c47522 100644 --- a/include/net/inet_timewait_sock.h +++ b/include/net/inet_timewait_sock.h @@ -17,13 +17,13 @@ #include -#include #include #include #include #include #include +#include #include #include #include diff --git a/include/net/ip.h b/include/net/ip.h index 4d6294ba038e..f7e7fd728b67 100644 --- a/include/net/ip.h +++ b/include/net/ip.h @@ -24,14 +24,10 @@ #include #include -#include #include #include -#include -#include -#include -#include -#include + +#include #include struct sock; @@ -75,6 +71,13 @@ extern rwlock_t ip_ra_lock; #define IP_FRAG_TIME (30 * HZ) /* fragment lifetime */ +struct msghdr; +struct net_device; +struct packet_type; +struct rtable; +struct sk_buff; +struct sockaddr; + extern void ip_mc_dropsocket(struct sock *); extern void ip_mc_dropdevice(struct net_device *dev); extern int igmp_mc_proc_init(void); @@ -184,6 +187,8 @@ extern int sysctl_ip_dynaddr; extern void ipfrag_init(void); #ifdef CONFIG_INET +#include + /* The function in 2.2 was invalid, producing wrong result for * check=0xFEFF. It was noticed by Arthur Skawina _year_ ago. --ANK(000625) */ static inline diff --git a/include/net/ip_fib.h b/include/net/ip_fib.h index 14de4ebd1211..e000fa2cd5f6 100644 --- a/include/net/ip_fib.h +++ b/include/net/ip_fib.h @@ -238,6 +238,8 @@ extern int fib_validate_source(u32 src, u32 dst, u8 tos, int oif, struct net_device *dev, u32 *spec_dst, u32 *itag); extern void fib_select_multipath(const struct flowi *flp, struct fib_result *res); +struct rtentry; + /* Exported by fib_semantics.c */ extern int ip_fib_check_default(u32 gw, struct net_device *dev); extern int fib_sync_down(u32 local, struct net_device *dev, int force); diff --git a/include/net/ip_vs.h b/include/net/ip_vs.h index 3b5559a023a4..7d2674fde19a 100644 --- a/include/net/ip_vs.h +++ b/include/net/ip_vs.h @@ -251,16 +251,15 @@ struct ip_vs_daemon_user { #include #include /* for struct list_head */ #include /* for struct rwlock_t */ -#include /* for struct sk_buff */ -#include /* for struct iphdr */ #include /* for struct atomic_t */ -#include /* for struct neighbour */ -#include /* for struct dst_entry */ -#include #include +#include +#include #ifdef CONFIG_IP_VS_DEBUG +#include + extern int ip_vs_get_debug_level(void); #define IP_VS_DBG(level, msg...) \ do { \ @@ -429,8 +428,11 @@ struct ip_vs_stats spinlock_t lock; /* spin lock */ }; +struct dst_entry; +struct iphdr; struct ip_vs_conn; struct ip_vs_app; +struct sk_buff; struct ip_vs_protocol { struct ip_vs_protocol *next; diff --git a/include/net/ipv6.h b/include/net/ipv6.h index 11a725662c36..860bbac4c4ee 100644 --- a/include/net/ipv6.h +++ b/include/net/ipv6.h @@ -541,6 +541,9 @@ extern int sysctl_ip6frag_secret_interval; extern const struct proto_ops inet6_stream_ops; extern const struct proto_ops inet6_dgram_ops; +struct group_source_req; +struct group_filter; + extern int ip6_mc_source(int add, int omode, struct sock *sk, struct group_source_req *pgsr); extern int ip6_mc_msfilter(struct sock *sk, struct group_filter *gsf); diff --git a/include/net/ndisc.h b/include/net/ndisc.h index f85d6e4b7442..bbac87eeb422 100644 --- a/include/net/ndisc.h +++ b/include/net/ndisc.h @@ -35,11 +35,20 @@ enum { #ifdef __KERNEL__ -#include -#include +#include +#include #include +#include +#include + #include -#include + +struct ctl_table; +struct file; +struct inet6_dev; +struct net_device; +struct net_proto_family; +struct sk_buff; extern struct neigh_table nd_tbl; @@ -108,7 +117,7 @@ extern int igmp6_event_report(struct sk_buff *skb); extern void igmp6_cleanup(void); #ifdef CONFIG_SYSCTL -extern int ndisc_ifinfo_sysctl_change(ctl_table *ctl, +extern int ndisc_ifinfo_sysctl_change(struct ctl_table *ctl, int write, struct file * filp, void __user *buffer, diff --git a/include/net/neighbour.h b/include/net/neighbour.h index 34c07731933d..6fa9ae190741 100644 --- a/include/net/neighbour.h +++ b/include/net/neighbour.h @@ -49,8 +49,8 @@ #ifdef __KERNEL__ #include -#include #include +#include #include #include diff --git a/include/net/pkt_act.h b/include/net/pkt_act.h index bd08964b72c0..b225d8472b7e 100644 --- a/include/net/pkt_act.h +++ b/include/net/pkt_act.h @@ -15,7 +15,6 @@ #include #include #include -#include #include #include #include diff --git a/include/net/raw.h b/include/net/raw.h index f47917469b12..e67b28a0248c 100644 --- a/include/net/raw.h +++ b/include/net/raw.h @@ -19,6 +19,8 @@ #include +#include + extern struct proto raw_prot; extern void raw_err(struct sock *, struct sk_buff *, u32 info); diff --git a/include/net/udp.h b/include/net/udp.h index 107b9d791a1f..766fba1369ce 100644 --- a/include/net/udp.h +++ b/include/net/udp.h @@ -22,9 +22,8 @@ #ifndef _UDP_H #define _UDP_H -#include -#include #include +#include #include #include #include @@ -62,6 +61,7 @@ static inline int udp_lport_inuse(u16 num) extern struct proto udp_prot; +struct sk_buff; extern void udp_err(struct sk_buff *, u32); diff --git a/include/net/xfrm.h b/include/net/xfrm.h index 487abca3ca6f..07d7b50cdd76 100644 --- a/include/net/xfrm.h +++ b/include/net/xfrm.h @@ -2,11 +2,12 @@ #define _NET_XFRM_H #include +#include #include #include #include #include -#include +#include #include #include #include diff --git a/net/bridge/br_netfilter.c b/net/bridge/br_netfilter.c index 23422bd53a5e..223f8270daee 100644 --- a/net/bridge/br_netfilter.c +++ b/net/bridge/br_netfilter.c @@ -26,6 +26,7 @@ #include #include #include +#include #include #include #include @@ -33,8 +34,11 @@ #include #include #include + #include #include +#include + #include #include #include "br_private.h" diff --git a/net/bridge/netfilter/ebt_log.c b/net/bridge/netfilter/ebt_log.c index c436e6c6242b..9f6e0193ae10 100644 --- a/net/bridge/netfilter/ebt_log.c +++ b/net/bridge/netfilter/ebt_log.c @@ -9,6 +9,7 @@ * */ +#include #include #include #include diff --git a/net/core/netpoll.c b/net/core/netpoll.c index 49424a42a2c0..281a632fa6a6 100644 --- a/net/core/netpoll.c +++ b/net/core/netpoll.c @@ -13,6 +13,7 @@ #include #include #include +#include #include #include #include diff --git a/net/dccp/ccid.h b/net/dccp/ccid.h index c37eeeaf5c6e..de681c6ad081 100644 --- a/net/dccp/ccid.h +++ b/net/dccp/ccid.h @@ -21,6 +21,8 @@ #define CCID_MAX 255 +struct tcp_info; + struct ccid { unsigned char ccid_id; const char *ccid_name; diff --git a/net/dccp/ipv4.c b/net/dccp/ipv4.c index 99e8afa7ba1e..3f244670764a 100644 --- a/net/dccp/ipv4.c +++ b/net/dccp/ipv4.c @@ -19,6 +19,7 @@ #include #include +#include #include #include #include diff --git a/net/dccp/ipv6.c b/net/dccp/ipv6.c index 2e194c8f9953..c609dc78f487 100644 --- a/net/dccp/ipv6.c +++ b/net/dccp/ipv6.c @@ -20,6 +20,7 @@ #include #include #include +#include #include #include #include diff --git a/net/dccp/output.c b/net/dccp/output.c index 95a3c2c6a3ce..efd7ffb903a1 100644 --- a/net/dccp/output.c +++ b/net/dccp/output.c @@ -15,6 +15,7 @@ #include #include +#include #include #include "ackvec.h" diff --git a/net/dccp/proto.c b/net/dccp/proto.c index e4e629ed9bf7..65b11ea90d85 100644 --- a/net/dccp/proto.c +++ b/net/dccp/proto.c @@ -24,7 +24,7 @@ #include #include -#include +#include #include #include #include @@ -34,7 +34,6 @@ #include #include #include -#include #include "ccid.h" #include "dccp.h" diff --git a/net/econet/af_econet.c b/net/econet/af_econet.c index ff58f49c8b4a..70fb2b88da65 100644 --- a/net/econet/af_econet.c +++ b/net/econet/af_econet.c @@ -31,6 +31,7 @@ #include #include #include +#include #include #include #include diff --git a/net/ipv4/af_inet.c b/net/ipv4/af_inet.c index 4ed8a814c6cb..36a6306ca5a3 100644 --- a/net/ipv4/af_inet.c +++ b/net/ipv4/af_inet.c @@ -93,6 +93,7 @@ #include #include #include +#include #include #include #include diff --git a/net/ipv4/ah4.c b/net/ipv4/ah4.c index 035ad2c9e1ba..aed537fa2c88 100644 --- a/net/ipv4/ah4.c +++ b/net/ipv4/ah4.c @@ -6,6 +6,7 @@ #include #include #include +#include #include diff --git a/net/ipv4/arp.c b/net/ipv4/arp.c index b425748f02d7..37432088fe6d 100644 --- a/net/ipv4/arp.c +++ b/net/ipv4/arp.c @@ -86,6 +86,7 @@ #include #include #include +#include #include #include #include diff --git a/net/ipv4/devinet.c b/net/ipv4/devinet.c index 04a6fe3e95a2..7b9bb28e2ee9 100644 --- a/net/ipv4/devinet.c +++ b/net/ipv4/devinet.c @@ -58,6 +58,7 @@ #endif #include +#include #include #include #include diff --git a/net/ipv4/esp4.c b/net/ipv4/esp4.c index 1b18ce66e7b7..73bfcae8af9c 100644 --- a/net/ipv4/esp4.c +++ b/net/ipv4/esp4.c @@ -9,6 +9,7 @@ #include #include #include +#include #include /* decapsulation data for use when post-processing */ diff --git a/net/ipv4/fib_frontend.c b/net/ipv4/fib_frontend.c index 19b1b984d687..18f5e509281a 100644 --- a/net/ipv4/fib_frontend.c +++ b/net/ipv4/fib_frontend.c @@ -30,6 +30,7 @@ #include #include #include +#include #include #include #include diff --git a/net/ipv4/fib_hash.c b/net/ipv4/fib_hash.c index 7ea0209cb169..e2890ec8159e 100644 --- a/net/ipv4/fib_hash.c +++ b/net/ipv4/fib_hash.c @@ -29,6 +29,7 @@ #include #include #include +#include #include #include #include diff --git a/net/ipv4/fib_rules.c b/net/ipv4/fib_rules.c index 0b298bbc1518..0dd4d06e456d 100644 --- a/net/ipv4/fib_rules.c +++ b/net/ipv4/fib_rules.c @@ -33,6 +33,7 @@ #include #include #include +#include #include #include #include diff --git a/net/ipv4/fib_semantics.c b/net/ipv4/fib_semantics.c index 6d2a6ac070e3..ef4724de7350 100644 --- a/net/ipv4/fib_semantics.c +++ b/net/ipv4/fib_semantics.c @@ -29,6 +29,7 @@ #include #include #include +#include #include #include #include @@ -36,6 +37,7 @@ #include #include +#include #include #include #include diff --git a/net/ipv4/icmp.c b/net/ipv4/icmp.c index 92e23b2ad4d2..be5a519cd2f8 100644 --- a/net/ipv4/icmp.c +++ b/net/ipv4/icmp.c @@ -73,6 +73,7 @@ #include #include #include +#include #include #include #include diff --git a/net/ipv4/igmp.c b/net/ipv4/igmp.c index 4a195c724f01..34758118c10c 100644 --- a/net/ipv4/igmp.c +++ b/net/ipv4/igmp.c @@ -91,6 +91,8 @@ #include #include #include + +#include #include #include #include diff --git a/net/ipv4/ip_input.c b/net/ipv4/ip_input.c index 473d0f2b2e0d..e45846ae570b 100644 --- a/net/ipv4/ip_input.c +++ b/net/ipv4/ip_input.c @@ -128,6 +128,7 @@ #include #include #include +#include #include #include diff --git a/net/ipv4/ip_options.c b/net/ipv4/ip_options.c index dbe12da8d8b3..d3f6c468faf4 100644 --- a/net/ipv4/ip_options.c +++ b/net/ipv4/ip_options.c @@ -22,6 +22,7 @@ #include #include #include +#include /* * Write options to IP header, record destination address to diff --git a/net/ipv4/ip_sockglue.c b/net/ipv4/ip_sockglue.c index add019c746f8..6986e11d65cc 100644 --- a/net/ipv4/ip_sockglue.c +++ b/net/ipv4/ip_sockglue.c @@ -25,6 +25,7 @@ #include #include #include +#include #include #include #include diff --git a/net/ipv4/ipcomp.c b/net/ipv4/ipcomp.c index fc718df17b40..d64e2ec8da7b 100644 --- a/net/ipv4/ipcomp.c +++ b/net/ipv4/ipcomp.c @@ -28,6 +28,7 @@ #include #include #include +#include struct ipcomp_tfms { struct list_head list; diff --git a/net/ipv4/ipconfig.c b/net/ipv4/ipconfig.c index e8674baaa8d9..bb3613ec448c 100644 --- a/net/ipv4/ipconfig.c +++ b/net/ipv4/ipconfig.c @@ -42,6 +42,7 @@ #include #include #include +#include #include #include #include @@ -58,6 +59,7 @@ #include #include #include +#include #include #include diff --git a/net/ipv4/ipmr.c b/net/ipv4/ipmr.c index 302b7eb507c9..caa3b7d2e48a 100644 --- a/net/ipv4/ipmr.c +++ b/net/ipv4/ipmr.c @@ -52,6 +52,7 @@ #include #include #include +#include #include #include #include diff --git a/net/ipv4/ipvs/ip_vs_conn.c b/net/ipv4/ipvs/ip_vs_conn.c index 2a3a8c59c655..d35cea31cb55 100644 --- a/net/ipv4/ipvs/ip_vs_conn.c +++ b/net/ipv4/ipvs/ip_vs_conn.c @@ -24,7 +24,9 @@ * */ +#include #include +#include #include #include /* for proc_net_* */ #include diff --git a/net/ipv4/ipvs/ip_vs_ctl.c b/net/ipv4/ipvs/ip_vs_ctl.c index 9bdcf31b760e..fe2c39d2a002 100644 --- a/net/ipv4/ipvs/ip_vs_ctl.c +++ b/net/ipv4/ipvs/ip_vs_ctl.c @@ -35,6 +35,7 @@ #include #include +#include #include #include diff --git a/net/ipv4/ipvs/ip_vs_dh.c b/net/ipv4/ipvs/ip_vs_dh.c index f3bc320dce93..9fee19c4c617 100644 --- a/net/ipv4/ipvs/ip_vs_dh.c +++ b/net/ipv4/ipvs/ip_vs_dh.c @@ -37,8 +37,10 @@ * */ +#include #include #include +#include #include diff --git a/net/ipv4/ipvs/ip_vs_est.c b/net/ipv4/ipvs/ip_vs_est.c index 67b3e2fc1fa1..e7004741ac73 100644 --- a/net/ipv4/ipvs/ip_vs_est.c +++ b/net/ipv4/ipvs/ip_vs_est.c @@ -13,7 +13,10 @@ * Changes: * */ +#include #include +#include +#include #include #include diff --git a/net/ipv4/ipvs/ip_vs_lblc.c b/net/ipv4/ipvs/ip_vs_lblc.c index b6dad7e3710c..6e5cb92a5c83 100644 --- a/net/ipv4/ipvs/ip_vs_lblc.c +++ b/net/ipv4/ipvs/ip_vs_lblc.c @@ -41,8 +41,10 @@ * me to write this module. */ +#include #include #include +#include /* for sysctl */ #include diff --git a/net/ipv4/ipvs/ip_vs_lblcr.c b/net/ipv4/ipvs/ip_vs_lblcr.c index 8c78ef76c121..32ba37ba72d8 100644 --- a/net/ipv4/ipvs/ip_vs_lblcr.c +++ b/net/ipv4/ipvs/ip_vs_lblcr.c @@ -39,8 +39,10 @@ * */ +#include #include #include +#include /* for sysctl */ #include diff --git a/net/ipv4/ipvs/ip_vs_proto_ah.c b/net/ipv4/ipvs/ip_vs_proto_ah.c index 453e94a0bbd7..8b0505b09317 100644 --- a/net/ipv4/ipvs/ip_vs_proto_ah.c +++ b/net/ipv4/ipvs/ip_vs_proto_ah.c @@ -12,6 +12,8 @@ * */ +#include +#include #include #include #include diff --git a/net/ipv4/ipvs/ip_vs_proto_esp.c b/net/ipv4/ipvs/ip_vs_proto_esp.c index 478e5c7c7e8e..c36ccf057a19 100644 --- a/net/ipv4/ipvs/ip_vs_proto_esp.c +++ b/net/ipv4/ipvs/ip_vs_proto_esp.c @@ -12,6 +12,8 @@ * */ +#include +#include #include #include #include diff --git a/net/ipv4/ipvs/ip_vs_proto_udp.c b/net/ipv4/ipvs/ip_vs_proto_udp.c index 8ae5f2e0aefa..89d9175d8f28 100644 --- a/net/ipv4/ipvs/ip_vs_proto_udp.c +++ b/net/ipv4/ipvs/ip_vs_proto_udp.c @@ -15,8 +15,11 @@ * */ +#include +#include #include #include +#include #include diff --git a/net/ipv4/ipvs/ip_vs_sh.c b/net/ipv4/ipvs/ip_vs_sh.c index 6f7c50e44a39..7775e6cc68be 100644 --- a/net/ipv4/ipvs/ip_vs_sh.c +++ b/net/ipv4/ipvs/ip_vs_sh.c @@ -34,8 +34,10 @@ * */ +#include #include #include +#include #include diff --git a/net/ipv4/ipvs/ip_vs_sync.c b/net/ipv4/ipvs/ip_vs_sync.c index 2e5ced3d8062..1bca714bda3d 100644 --- a/net/ipv4/ipvs/ip_vs_sync.c +++ b/net/ipv4/ipvs/ip_vs_sync.c @@ -21,12 +21,14 @@ #include #include +#include #include #include #include #include #include #include /* for ip_mc_join_group */ +#include #include #include diff --git a/net/ipv4/netfilter/ip_conntrack_amanda.c b/net/ipv4/netfilter/ip_conntrack_amanda.c index e52847fa10f5..0366eedb4d70 100644 --- a/net/ipv4/netfilter/ip_conntrack_amanda.c +++ b/net/ipv4/netfilter/ip_conntrack_amanda.c @@ -18,11 +18,13 @@ * */ +#include #include #include #include #include #include +#include #include #include diff --git a/net/ipv4/netfilter/ip_conntrack_proto_gre.c b/net/ipv4/netfilter/ip_conntrack_proto_gre.c index 744abb9d377a..57956dee60c8 100644 --- a/net/ipv4/netfilter/ip_conntrack_proto_gre.c +++ b/net/ipv4/netfilter/ip_conntrack_proto_gre.c @@ -31,6 +31,7 @@ #include #include #include +#include static DEFINE_RWLOCK(ip_ct_gre_lock); #define ASSERT_READ_LOCK(x) diff --git a/net/ipv4/netfilter/ip_conntrack_proto_udp.c b/net/ipv4/netfilter/ip_conntrack_proto_udp.c index f2dcac7c7660..46becbe4fe58 100644 --- a/net/ipv4/netfilter/ip_conntrack_proto_udp.c +++ b/net/ipv4/netfilter/ip_conntrack_proto_udp.c @@ -11,6 +11,7 @@ #include #include #include +#include #include #include #include diff --git a/net/ipv4/netfilter/ip_conntrack_standalone.c b/net/ipv4/netfilter/ip_conntrack_standalone.c index dd476b191f4b..a88bcc551244 100644 --- a/net/ipv4/netfilter/ip_conntrack_standalone.c +++ b/net/ipv4/netfilter/ip_conntrack_standalone.c @@ -27,6 +27,7 @@ #endif #include #include +#include #define ASSERT_READ_LOCK(x) #define ASSERT_WRITE_LOCK(x) diff --git a/net/ipv4/netfilter/ip_nat_snmp_basic.c b/net/ipv4/netfilter/ip_nat_snmp_basic.c index 8acb7ed40b47..4f95d477805c 100644 --- a/net/ipv4/netfilter/ip_nat_snmp_basic.c +++ b/net/ipv4/netfilter/ip_nat_snmp_basic.c @@ -44,6 +44,7 @@ * */ #include +#include #include #include #include @@ -53,6 +54,7 @@ #include #include #include +#include #include #include #include diff --git a/net/ipv4/netfilter/ipt_MASQUERADE.c b/net/ipv4/netfilter/ipt_MASQUERADE.c index 275a174c6fe6..27860510ca6d 100644 --- a/net/ipv4/netfilter/ipt_MASQUERADE.c +++ b/net/ipv4/netfilter/ipt_MASQUERADE.c @@ -11,6 +11,7 @@ #include #include +#include #include #include #include @@ -18,6 +19,7 @@ #include #include #include +#include #include #include #include diff --git a/net/ipv4/netfilter/ipt_physdev.c b/net/ipv4/netfilter/ipt_physdev.c index 1a53924041fc..03f554857a4d 100644 --- a/net/ipv4/netfilter/ipt_physdev.c +++ b/net/ipv4/netfilter/ipt_physdev.c @@ -9,6 +9,7 @@ */ #include +#include #include #include #include diff --git a/net/ipv4/proc.c b/net/ipv4/proc.c index 0d7dc668db46..39d49dc333a7 100644 --- a/net/ipv4/proc.c +++ b/net/ipv4/proc.c @@ -38,6 +38,7 @@ #include #include #include +#include #include #include #include diff --git a/net/ipv4/sysctl_net_ipv4.c b/net/ipv4/sysctl_net_ipv4.c index dbf82955aabe..16984d4a8a06 100644 --- a/net/ipv4/sysctl_net_ipv4.c +++ b/net/ipv4/sysctl_net_ipv4.c @@ -12,6 +12,7 @@ #include #include #include +#include #include #include #include diff --git a/net/ipv4/udp.c b/net/ipv4/udp.c index 67c036384e77..223abaa72bc5 100644 --- a/net/ipv4/udp.c +++ b/net/ipv4/udp.c @@ -86,6 +86,7 @@ #include #include #include +#include #include #include #include diff --git a/net/ipv6/ah6.c b/net/ipv6/ah6.c index f3629730eb15..13cc7f895583 100644 --- a/net/ipv6/ah6.c +++ b/net/ipv6/ah6.c @@ -33,6 +33,7 @@ #include #include #include +#include #include #include diff --git a/net/ipv6/esp6.c b/net/ipv6/esp6.c index 8bfbe9970793..6de8ee1a5ad9 100644 --- a/net/ipv6/esp6.c +++ b/net/ipv6/esp6.c @@ -36,6 +36,7 @@ #include #include #include +#include #include static int esp6_output(struct xfrm_state *x, struct sk_buff *skb) diff --git a/net/ipv6/ipcomp6.c b/net/ipv6/ipcomp6.c index 55917fb17094..626dd39685f2 100644 --- a/net/ipv6/ipcomp6.c +++ b/net/ipv6/ipcomp6.c @@ -47,6 +47,7 @@ #include #include #include +#include #include #include diff --git a/net/ipv6/netfilter/ip6_tables.c b/net/ipv6/netfilter/ip6_tables.c index dd80020d8740..ea43ef1d94a7 100644 --- a/net/ipv6/netfilter/ip6_tables.c +++ b/net/ipv6/netfilter/ip6_tables.c @@ -15,6 +15,7 @@ * - new extension header parser code */ #include +#include #include #include #include diff --git a/net/ipv6/netfilter/ip6t_LOG.c b/net/ipv6/netfilter/ip6t_LOG.c index 0cd1d1bd9033..ae4653bfd654 100644 --- a/net/ipv6/netfilter/ip6t_LOG.c +++ b/net/ipv6/netfilter/ip6t_LOG.c @@ -13,6 +13,7 @@ #include #include #include +#include #include #include #include diff --git a/net/ipv6/netfilter/ip6t_ah.c b/net/ipv6/netfilter/ip6t_ah.c index dde37793d20b..268918d5deea 100644 --- a/net/ipv6/netfilter/ip6t_ah.c +++ b/net/ipv6/netfilter/ip6t_ah.c @@ -9,6 +9,7 @@ #include #include +#include #include #include #include diff --git a/net/ipv6/netfilter/ip6t_esp.c b/net/ipv6/netfilter/ip6t_esp.c index 24bc0cde43a1..65937de1b58c 100644 --- a/net/ipv6/netfilter/ip6t_esp.c +++ b/net/ipv6/netfilter/ip6t_esp.c @@ -9,6 +9,7 @@ #include #include +#include #include #include #include diff --git a/net/sched/sch_teql.c b/net/sched/sch_teql.c index 6cf0342706b5..c4a2a8c4c339 100644 --- a/net/sched/sch_teql.c +++ b/net/sched/sch_teql.c @@ -22,6 +22,7 @@ #include #include #include +#include #include #include #include diff --git a/net/sctp/protocol.c b/net/sctp/protocol.c index d1b0747a5b9d..de693b43c8ea 100644 --- a/net/sctp/protocol.c +++ b/net/sctp/protocol.c @@ -54,6 +54,7 @@ #include #include #include +#include #include #include #include -- cgit v1.2.3 From 8639a11e23d9eb0a6ceac2feed27acdfbb158f95 Mon Sep 17 00:00:00 2001 From: Arnaldo Carvalho de Melo Date: Tue, 27 Dec 2005 15:17:57 -0200 Subject: [TCP]: Don't use __constant_htonl for a non const arg Signed-off-by: Arnaldo Carvalho de Melo Signed-off-by: David S. Miller --- include/net/tcp.h | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) (limited to 'include/net') diff --git a/include/net/tcp.h b/include/net/tcp.h index 176221cd0cce..369930497401 100644 --- a/include/net/tcp.h +++ b/include/net/tcp.h @@ -994,11 +994,11 @@ static __inline__ void tcp_build_and_update_options(__u32 *ptr, struct tcp_sock struct tcp_sack_block *sp = tp->rx_opt.dsack ? tp->duplicate_sack : tp->selective_acks; int this_sack; - *ptr++ = __constant_htonl((TCPOPT_NOP << 24) | - (TCPOPT_NOP << 16) | - (TCPOPT_SACK << 8) | - (TCPOLEN_SACK_BASE + - (tp->rx_opt.eff_sacks * TCPOLEN_SACK_PERBLOCK))); + *ptr++ = htonl((TCPOPT_NOP << 24) | + (TCPOPT_NOP << 16) | + (TCPOPT_SACK << 8) | + (TCPOLEN_SACK_BASE + (tp->rx_opt.eff_sacks * + TCPOLEN_SACK_PERBLOCK))); for(this_sack = 0; this_sack < tp->rx_opt.eff_sacks; this_sack++) { *ptr++ = htonl(sp[this_sack].start_seq); *ptr++ = htonl(sp[this_sack].end_seq); -- cgit v1.2.3 From fd19f329a32bdc4eb07885e0b3889567cfe00aa7 Mon Sep 17 00:00:00 2001 From: Benjamin LaHaise Date: Tue, 3 Jan 2006 14:10:46 -0800 Subject: [AF_UNIX]: Convert to use a spinlock instead of rwlock From: Benjamin LaHaise In af_unix, a rwlock is used to protect internal state. At least on my P4 with HT it is faster to use a spinlock due to the simpler memory barrier used to unlock. This patch raises bw_unix to ~690K/s. Signed-off-by: David S. Miller --- include/net/af_unix.h | 10 +++++----- net/unix/af_unix.c | 2 +- 2 files changed, 6 insertions(+), 6 deletions(-) (limited to 'include/net') diff --git a/include/net/af_unix.h b/include/net/af_unix.h index 3f302ae98c03..bfc1779fc753 100644 --- a/include/net/af_unix.h +++ b/include/net/af_unix.h @@ -58,10 +58,10 @@ struct unix_skb_parms { #define UNIXCB(skb) (*(struct unix_skb_parms*)&((skb)->cb)) #define UNIXCREDS(skb) (&UNIXCB((skb)).creds) -#define unix_state_rlock(s) read_lock(&unix_sk(s)->lock) -#define unix_state_runlock(s) read_unlock(&unix_sk(s)->lock) -#define unix_state_wlock(s) write_lock(&unix_sk(s)->lock) -#define unix_state_wunlock(s) write_unlock(&unix_sk(s)->lock) +#define unix_state_rlock(s) spin_lock(&unix_sk(s)->lock) +#define unix_state_runlock(s) spin_unlock(&unix_sk(s)->lock) +#define unix_state_wlock(s) spin_lock(&unix_sk(s)->lock) +#define unix_state_wunlock(s) spin_unlock(&unix_sk(s)->lock) #ifdef __KERNEL__ /* The AF_UNIX socket */ @@ -76,7 +76,7 @@ struct unix_sock { struct sock *other; struct sock *gc_tree; atomic_t inflight; - rwlock_t lock; + spinlock_t lock; wait_queue_head_t peer_wait; }; #define unix_sk(__sk) ((struct unix_sock *)__sk) diff --git a/net/unix/af_unix.c b/net/unix/af_unix.c index 7d3fe6aebcdb..1ddd36d50091 100644 --- a/net/unix/af_unix.c +++ b/net/unix/af_unix.c @@ -564,7 +564,7 @@ static struct sock * unix_create1(struct socket *sock) u = unix_sk(sk); u->dentry = NULL; u->mnt = NULL; - rwlock_init(&u->lock); + spin_lock_init(&u->lock); atomic_set(&u->inflight, sock ? 0 : -1); init_MUTEX(&u->readsem); /* single task reading lock */ init_waitqueue_head(&u->peer_wait); -- cgit v1.2.3 From b461d2f2188c1c578ed651e4cdf608be7a993cd4 Mon Sep 17 00:00:00 2001 From: Per Liden Date: Tue, 3 Jan 2006 14:13:29 -0800 Subject: [NETLINK] genetlink: fix cmd type in genl_ops to be consistent to u8 Signed-off-by: Per Liden ACKed-by: Jamal Hadi Salim Signed-off-by: David S. Miller --- include/net/genetlink.h | 2 +- net/netlink/genetlink.c | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) (limited to 'include/net') diff --git a/include/net/genetlink.h b/include/net/genetlink.h index 52d8b1a73d52..c5b96b2b8155 100644 --- a/include/net/genetlink.h +++ b/include/net/genetlink.h @@ -60,7 +60,7 @@ struct genl_info */ struct genl_ops { - unsigned int cmd; + u8 cmd; unsigned int flags; struct nla_policy *policy; int (*doit)(struct sk_buff *skb, diff --git a/net/netlink/genetlink.c b/net/netlink/genetlink.c index 287cfcc56951..3b1378498d50 100644 --- a/net/netlink/genetlink.c +++ b/net/netlink/genetlink.c @@ -441,7 +441,7 @@ errout: } static struct sk_buff *ctrl_build_msg(struct genl_family *family, u32 pid, - int seq, int cmd) + int seq, u8 cmd) { struct sk_buff *skb; int err; -- cgit v1.2.3 From 40efc6fa179f440a008333ea98f701bc35a1f97f Mon Sep 17 00:00:00 2001 From: Stephen Hemminger Date: Tue, 3 Jan 2006 16:03:49 -0800 Subject: [TCP]: less inline's TCP inline usage cleanup: * get rid of inline in several places * replace __inline__ with inline where possible * move functions used in one file out of tcp.h * let compiler decide on used once cases On x86_64: text data bss dec hex filename 3594701 648348 567400 4810449 4966d1 vmlinux.orig 3593133 648580 567400 4809113 496199 vmlinux On sparc64: text data bss dec hex filename 2538278 406152 530392 3474822 350586 vmlinux.ORIG 2536382 406384 530392 3473158 34ff06 vmlinux Signed-off-by: Stephen Hemminger Signed-off-by: David S. Miller --- include/net/tcp.h | 193 +++++++------------------------------------------- net/ipv4/tcp_cong.c | 28 ++++++++ net/ipv4/tcp_input.c | 82 +++++++++++++++------ net/ipv4/tcp_ipv4.c | 9 ++- net/ipv4/tcp_output.c | 87 ++++++++++++++++++++--- 5 files changed, 198 insertions(+), 201 deletions(-) (limited to 'include/net') diff --git a/include/net/tcp.h b/include/net/tcp.h index 369930497401..77f21c65bbca 100644 --- a/include/net/tcp.h +++ b/include/net/tcp.h @@ -445,34 +445,16 @@ typedef int (*sk_read_actor_t)(read_descriptor_t *, struct sk_buff *, extern int tcp_read_sock(struct sock *sk, read_descriptor_t *desc, sk_read_actor_t recv_actor); -/* Initialize RCV_MSS value. - * RCV_MSS is an our guess about MSS used by the peer. - * We haven't any direct information about the MSS. - * It's better to underestimate the RCV_MSS rather than overestimate. - * Overestimations make us ACKing less frequently than needed. - * Underestimations are more easy to detect and fix by tcp_measure_rcv_mss(). - */ +extern void tcp_initialize_rcv_mss(struct sock *sk); -static inline void tcp_initialize_rcv_mss(struct sock *sk) -{ - struct tcp_sock *tp = tcp_sk(sk); - unsigned int hint = min_t(unsigned int, tp->advmss, tp->mss_cache); - - hint = min(hint, tp->rcv_wnd/2); - hint = min(hint, TCP_MIN_RCVMSS); - hint = max(hint, TCP_MIN_MSS); - - inet_csk(sk)->icsk_ack.rcv_mss = hint; -} - -static __inline__ void __tcp_fast_path_on(struct tcp_sock *tp, u32 snd_wnd) +static inline void __tcp_fast_path_on(struct tcp_sock *tp, u32 snd_wnd) { tp->pred_flags = htonl((tp->tcp_header_len << 26) | ntohl(TCP_FLAG_ACK) | snd_wnd); } -static __inline__ void tcp_fast_path_on(struct tcp_sock *tp) +static inline void tcp_fast_path_on(struct tcp_sock *tp) { __tcp_fast_path_on(tp, tp->snd_wnd >> tp->rx_opt.snd_wscale); } @@ -490,7 +472,7 @@ static inline void tcp_fast_path_check(struct sock *sk, struct tcp_sock *tp) * Rcv_nxt can be after the window if our peer push more data * than the offered window. */ -static __inline__ u32 tcp_receive_window(const struct tcp_sock *tp) +static inline u32 tcp_receive_window(const struct tcp_sock *tp) { s32 win = tp->rcv_wup + tp->rcv_wnd - tp->rcv_nxt; @@ -662,6 +644,7 @@ extern void tcp_cleanup_congestion_control(struct sock *sk); extern int tcp_set_default_congestion_control(const char *name); extern void tcp_get_default_congestion_control(char *name); extern int tcp_set_congestion_control(struct sock *sk, const char *name); +extern void tcp_slow_start(struct tcp_sock *tp); extern struct tcp_congestion_ops tcp_init_congestion_ops; extern u32 tcp_reno_ssthresh(struct sock *sk); @@ -701,7 +684,7 @@ static inline void tcp_ca_event(struct sock *sk, const enum tcp_ca_event event) * "Packets left network, but not honestly ACKed yet" PLUS * "Packets fast retransmitted" */ -static __inline__ unsigned int tcp_packets_in_flight(const struct tcp_sock *tp) +static inline unsigned int tcp_packets_in_flight(const struct tcp_sock *tp) { return (tp->packets_out - tp->left_out + tp->retrans_out); } @@ -721,33 +704,6 @@ static inline __u32 tcp_current_ssthresh(const struct sock *sk) (tp->snd_cwnd >> 2))); } -/* - * Linear increase during slow start - */ -static inline void tcp_slow_start(struct tcp_sock *tp) -{ - if (sysctl_tcp_abc) { - /* RFC3465: Slow Start - * TCP sender SHOULD increase cwnd by the number of - * previously unacknowledged bytes ACKed by each incoming - * acknowledgment, provided the increase is not more than L - */ - if (tp->bytes_acked < tp->mss_cache) - return; - - /* We MAY increase by 2 if discovered delayed ack */ - if (sysctl_tcp_abc > 1 && tp->bytes_acked > 2*tp->mss_cache) { - if (tp->snd_cwnd < tp->snd_cwnd_clamp) - tp->snd_cwnd++; - } - } - tp->bytes_acked = 0; - - if (tp->snd_cwnd < tp->snd_cwnd_clamp) - tp->snd_cwnd++; -} - - static inline void tcp_sync_left_out(struct tcp_sock *tp) { if (tp->rx_opt.sack_ok && @@ -756,34 +712,7 @@ static inline void tcp_sync_left_out(struct tcp_sock *tp) tp->left_out = tp->sacked_out + tp->lost_out; } -/* Set slow start threshold and cwnd not falling to slow start */ -static inline void __tcp_enter_cwr(struct sock *sk) -{ - const struct inet_connection_sock *icsk = inet_csk(sk); - struct tcp_sock *tp = tcp_sk(sk); - - tp->undo_marker = 0; - tp->snd_ssthresh = icsk->icsk_ca_ops->ssthresh(sk); - tp->snd_cwnd = min(tp->snd_cwnd, - tcp_packets_in_flight(tp) + 1U); - tp->snd_cwnd_cnt = 0; - tp->high_seq = tp->snd_nxt; - tp->snd_cwnd_stamp = tcp_time_stamp; - TCP_ECN_queue_cwr(tp); -} - -static inline void tcp_enter_cwr(struct sock *sk) -{ - struct tcp_sock *tp = tcp_sk(sk); - - tp->prior_ssthresh = 0; - tp->bytes_acked = 0; - if (inet_csk(sk)->icsk_ca_state < TCP_CA_CWR) { - __tcp_enter_cwr(sk); - tcp_set_ca_state(sk, TCP_CA_CWR); - } -} - +extern void tcp_enter_cwr(struct sock *sk); extern __u32 tcp_init_cwnd(struct tcp_sock *tp, struct dst_entry *dst); /* Slow start with delack produces 3 packets of burst, so that @@ -815,14 +744,14 @@ static inline int tcp_is_cwnd_limited(const struct sock *sk, u32 in_flight) return left <= tcp_max_burst(tp); } -static __inline__ void tcp_minshall_update(struct tcp_sock *tp, int mss, - const struct sk_buff *skb) +static inline void tcp_minshall_update(struct tcp_sock *tp, int mss, + const struct sk_buff *skb) { if (skb->len < mss) tp->snd_sml = TCP_SKB_CB(skb)->end_seq; } -static __inline__ void tcp_check_probe_timer(struct sock *sk, struct tcp_sock *tp) +static inline void tcp_check_probe_timer(struct sock *sk, struct tcp_sock *tp) { const struct inet_connection_sock *icsk = inet_csk(sk); if (!tp->packets_out && !icsk->icsk_pending) @@ -830,18 +759,18 @@ static __inline__ void tcp_check_probe_timer(struct sock *sk, struct tcp_sock *t icsk->icsk_rto, TCP_RTO_MAX); } -static __inline__ void tcp_push_pending_frames(struct sock *sk, - struct tcp_sock *tp) +static inline void tcp_push_pending_frames(struct sock *sk, + struct tcp_sock *tp) { __tcp_push_pending_frames(sk, tp, tcp_current_mss(sk, 1), tp->nonagle); } -static __inline__ void tcp_init_wl(struct tcp_sock *tp, u32 ack, u32 seq) +static inline void tcp_init_wl(struct tcp_sock *tp, u32 ack, u32 seq) { tp->snd_wl1 = seq; } -static __inline__ void tcp_update_wl(struct tcp_sock *tp, u32 ack, u32 seq) +static inline void tcp_update_wl(struct tcp_sock *tp, u32 ack, u32 seq) { tp->snd_wl1 = seq; } @@ -849,19 +778,19 @@ static __inline__ void tcp_update_wl(struct tcp_sock *tp, u32 ack, u32 seq) /* * Calculate(/check) TCP checksum */ -static __inline__ u16 tcp_v4_check(struct tcphdr *th, int len, - unsigned long saddr, unsigned long daddr, - unsigned long base) +static inline u16 tcp_v4_check(struct tcphdr *th, int len, + unsigned long saddr, unsigned long daddr, + unsigned long base) { return csum_tcpudp_magic(saddr,daddr,len,IPPROTO_TCP,base); } -static __inline__ int __tcp_checksum_complete(struct sk_buff *skb) +static inline int __tcp_checksum_complete(struct sk_buff *skb) { return __skb_checksum_complete(skb); } -static __inline__ int tcp_checksum_complete(struct sk_buff *skb) +static inline int tcp_checksum_complete(struct sk_buff *skb) { return skb->ip_summed != CHECKSUM_UNNECESSARY && __tcp_checksum_complete(skb); @@ -869,7 +798,7 @@ static __inline__ int tcp_checksum_complete(struct sk_buff *skb) /* Prequeue for VJ style copy to user, combined with checksumming. */ -static __inline__ void tcp_prequeue_init(struct tcp_sock *tp) +static inline void tcp_prequeue_init(struct tcp_sock *tp) { tp->ucopy.task = NULL; tp->ucopy.len = 0; @@ -885,7 +814,7 @@ static __inline__ void tcp_prequeue_init(struct tcp_sock *tp) * * NOTE: is this not too big to inline? */ -static __inline__ int tcp_prequeue(struct sock *sk, struct sk_buff *skb) +static inline int tcp_prequeue(struct sock *sk, struct sk_buff *skb) { struct tcp_sock *tp = tcp_sk(sk); @@ -926,7 +855,7 @@ static const char *statename[]={ }; #endif -static __inline__ void tcp_set_state(struct sock *sk, int state) +static inline void tcp_set_state(struct sock *sk, int state) { int oldstate = sk->sk_state; @@ -960,7 +889,7 @@ static __inline__ void tcp_set_state(struct sock *sk, int state) #endif } -static __inline__ void tcp_done(struct sock *sk) +static inline void tcp_done(struct sock *sk) { tcp_set_state(sk, TCP_CLOSE); tcp_clear_xmit_timers(sk); @@ -973,81 +902,13 @@ static __inline__ void tcp_done(struct sock *sk) inet_csk_destroy_sock(sk); } -static __inline__ void tcp_sack_reset(struct tcp_options_received *rx_opt) +static inline void tcp_sack_reset(struct tcp_options_received *rx_opt) { rx_opt->dsack = 0; rx_opt->eff_sacks = 0; rx_opt->num_sacks = 0; } -static __inline__ void tcp_build_and_update_options(__u32 *ptr, struct tcp_sock *tp, __u32 tstamp) -{ - if (tp->rx_opt.tstamp_ok) { - *ptr++ = __constant_htonl((TCPOPT_NOP << 24) | - (TCPOPT_NOP << 16) | - (TCPOPT_TIMESTAMP << 8) | - TCPOLEN_TIMESTAMP); - *ptr++ = htonl(tstamp); - *ptr++ = htonl(tp->rx_opt.ts_recent); - } - if (tp->rx_opt.eff_sacks) { - struct tcp_sack_block *sp = tp->rx_opt.dsack ? tp->duplicate_sack : tp->selective_acks; - int this_sack; - - *ptr++ = htonl((TCPOPT_NOP << 24) | - (TCPOPT_NOP << 16) | - (TCPOPT_SACK << 8) | - (TCPOLEN_SACK_BASE + (tp->rx_opt.eff_sacks * - TCPOLEN_SACK_PERBLOCK))); - for(this_sack = 0; this_sack < tp->rx_opt.eff_sacks; this_sack++) { - *ptr++ = htonl(sp[this_sack].start_seq); - *ptr++ = htonl(sp[this_sack].end_seq); - } - if (tp->rx_opt.dsack) { - tp->rx_opt.dsack = 0; - tp->rx_opt.eff_sacks--; - } - } -} - -/* Construct a tcp options header for a SYN or SYN_ACK packet. - * If this is every changed make sure to change the definition of - * MAX_SYN_SIZE to match the new maximum number of options that you - * can generate. - */ -static inline void tcp_syn_build_options(__u32 *ptr, int mss, int ts, int sack, - int offer_wscale, int wscale, __u32 tstamp, __u32 ts_recent) -{ - /* We always get an MSS option. - * The option bytes which will be seen in normal data - * packets should timestamps be used, must be in the MSS - * advertised. But we subtract them from tp->mss_cache so - * that calculations in tcp_sendmsg are simpler etc. - * So account for this fact here if necessary. If we - * don't do this correctly, as a receiver we won't - * recognize data packets as being full sized when we - * should, and thus we won't abide by the delayed ACK - * rules correctly. - * SACKs don't matter, we never delay an ACK when we - * have any of those going out. - */ - *ptr++ = htonl((TCPOPT_MSS << 24) | (TCPOLEN_MSS << 16) | mss); - if (ts) { - if(sack) - *ptr++ = __constant_htonl((TCPOPT_SACK_PERM << 24) | (TCPOLEN_SACK_PERM << 16) | - (TCPOPT_TIMESTAMP << 8) | TCPOLEN_TIMESTAMP); - else - *ptr++ = __constant_htonl((TCPOPT_NOP << 24) | (TCPOPT_NOP << 16) | - (TCPOPT_TIMESTAMP << 8) | TCPOLEN_TIMESTAMP); - *ptr++ = htonl(tstamp); /* TSVAL */ - *ptr++ = htonl(ts_recent); /* TSECR */ - } else if(sack) - *ptr++ = __constant_htonl((TCPOPT_NOP << 24) | (TCPOPT_NOP << 16) | - (TCPOPT_SACK_PERM << 8) | TCPOLEN_SACK_PERM); - if (offer_wscale) - *ptr++ = htonl((TCPOPT_NOP << 24) | (TCPOPT_WINDOW << 16) | (TCPOLEN_WINDOW << 8) | (wscale)); -} - /* Determine a window scaling and initial window to offer. */ extern void tcp_select_initial_window(int __space, __u32 mss, __u32 *rcv_wnd, __u32 *window_clamp, @@ -1072,9 +933,9 @@ static inline int tcp_full_space(const struct sock *sk) return tcp_win_from_space(sk->sk_rcvbuf); } -static __inline__ void tcp_openreq_init(struct request_sock *req, - struct tcp_options_received *rx_opt, - struct sk_buff *skb) +static inline void tcp_openreq_init(struct request_sock *req, + struct tcp_options_received *rx_opt, + struct sk_buff *skb) { struct inet_request_sock *ireq = inet_rsk(req); diff --git a/net/ipv4/tcp_cong.c b/net/ipv4/tcp_cong.c index c7cc62c8dc12..e688c687d62d 100644 --- a/net/ipv4/tcp_cong.c +++ b/net/ipv4/tcp_cong.c @@ -174,6 +174,34 @@ int tcp_set_congestion_control(struct sock *sk, const char *name) return err; } + +/* + * Linear increase during slow start + */ +void tcp_slow_start(struct tcp_sock *tp) +{ + if (sysctl_tcp_abc) { + /* RFC3465: Slow Start + * TCP sender SHOULD increase cwnd by the number of + * previously unacknowledged bytes ACKed by each incoming + * acknowledgment, provided the increase is not more than L + */ + if (tp->bytes_acked < tp->mss_cache) + return; + + /* We MAY increase by 2 if discovered delayed ack */ + if (sysctl_tcp_abc > 1 && tp->bytes_acked > 2*tp->mss_cache) { + if (tp->snd_cwnd < tp->snd_cwnd_clamp) + tp->snd_cwnd++; + } + } + tp->bytes_acked = 0; + + if (tp->snd_cwnd < tp->snd_cwnd_clamp) + tp->snd_cwnd++; +} +EXPORT_SYMBOL_GPL(tcp_slow_start); + /* * TCP Reno congestion control * This is special case used for fallback as well. diff --git a/net/ipv4/tcp_input.c b/net/ipv4/tcp_input.c index 981d1203b152..0a461232329f 100644 --- a/net/ipv4/tcp_input.c +++ b/net/ipv4/tcp_input.c @@ -115,8 +115,8 @@ int sysctl_tcp_abc = 1; /* Adapt the MSS value used to make delayed ack decision to the * real world. */ -static inline void tcp_measure_rcv_mss(struct sock *sk, - const struct sk_buff *skb) +static void tcp_measure_rcv_mss(struct sock *sk, + const struct sk_buff *skb) { struct inet_connection_sock *icsk = inet_csk(sk); const unsigned int lss = icsk->icsk_ack.last_seg_size; @@ -246,8 +246,8 @@ static int __tcp_grow_window(const struct sock *sk, struct tcp_sock *tp, return 0; } -static inline void tcp_grow_window(struct sock *sk, struct tcp_sock *tp, - struct sk_buff *skb) +static void tcp_grow_window(struct sock *sk, struct tcp_sock *tp, + struct sk_buff *skb) { /* Check #1 */ if (tp->rcv_ssthresh < tp->window_clamp && @@ -341,6 +341,26 @@ static void tcp_clamp_window(struct sock *sk, struct tcp_sock *tp) tp->rcv_ssthresh = min(tp->window_clamp, 2U*tp->advmss); } + +/* Initialize RCV_MSS value. + * RCV_MSS is an our guess about MSS used by the peer. + * We haven't any direct information about the MSS. + * It's better to underestimate the RCV_MSS rather than overestimate. + * Overestimations make us ACKing less frequently than needed. + * Underestimations are more easy to detect and fix by tcp_measure_rcv_mss(). + */ +void tcp_initialize_rcv_mss(struct sock *sk) +{ + struct tcp_sock *tp = tcp_sk(sk); + unsigned int hint = min_t(unsigned int, tp->advmss, tp->mss_cache); + + hint = min(hint, tp->rcv_wnd/2); + hint = min(hint, TCP_MIN_RCVMSS); + hint = max(hint, TCP_MIN_MSS); + + inet_csk(sk)->icsk_ack.rcv_mss = hint; +} + /* Receiver "autotuning" code. * * The algorithm for RTT estimation w/o timestamps is based on @@ -735,6 +755,27 @@ __u32 tcp_init_cwnd(struct tcp_sock *tp, struct dst_entry *dst) return min_t(__u32, cwnd, tp->snd_cwnd_clamp); } +/* Set slow start threshold and cwnd not falling to slow start */ +void tcp_enter_cwr(struct sock *sk) +{ + struct tcp_sock *tp = tcp_sk(sk); + + tp->prior_ssthresh = 0; + tp->bytes_acked = 0; + if (inet_csk(sk)->icsk_ca_state < TCP_CA_CWR) { + tp->undo_marker = 0; + tp->snd_ssthresh = inet_csk(sk)->icsk_ca_ops->ssthresh(sk); + tp->snd_cwnd = min(tp->snd_cwnd, + tcp_packets_in_flight(tp) + 1U); + tp->snd_cwnd_cnt = 0; + tp->high_seq = tp->snd_nxt; + tp->snd_cwnd_stamp = tcp_time_stamp; + TCP_ECN_queue_cwr(tp); + + tcp_set_ca_state(sk, TCP_CA_CWR); + } +} + /* Initialize metrics on socket. */ static void tcp_init_metrics(struct sock *sk) @@ -2070,8 +2111,8 @@ static inline void tcp_ack_update_rtt(struct sock *sk, const int flag, tcp_ack_no_tstamp(sk, seq_rtt, flag); } -static inline void tcp_cong_avoid(struct sock *sk, u32 ack, u32 rtt, - u32 in_flight, int good) +static void tcp_cong_avoid(struct sock *sk, u32 ack, u32 rtt, + u32 in_flight, int good) { const struct inet_connection_sock *icsk = inet_csk(sk); icsk->icsk_ca_ops->cong_avoid(sk, ack, rtt, in_flight, good); @@ -2082,7 +2123,7 @@ static inline void tcp_cong_avoid(struct sock *sk, u32 ack, u32 rtt, * RFC2988 recommends to restart timer to now+rto. */ -static inline void tcp_ack_packets_out(struct sock *sk, struct tcp_sock *tp) +static void tcp_ack_packets_out(struct sock *sk, struct tcp_sock *tp) { if (!tp->packets_out) { inet_csk_clear_xmit_timer(sk, ICSK_TIME_RETRANS); @@ -2147,7 +2188,7 @@ static int tcp_tso_acked(struct sock *sk, struct sk_buff *skb, return acked; } -static inline u32 tcp_usrtt(const struct sk_buff *skb) +static u32 tcp_usrtt(const struct sk_buff *skb) { struct timeval tv, now; @@ -2583,8 +2624,8 @@ void tcp_parse_options(struct sk_buff *skb, struct tcp_options_received *opt_rx, /* Fast parse options. This hopes to only see timestamps. * If it is wrong it falls back on tcp_parse_options(). */ -static inline int tcp_fast_parse_options(struct sk_buff *skb, struct tcphdr *th, - struct tcp_sock *tp) +static int tcp_fast_parse_options(struct sk_buff *skb, struct tcphdr *th, + struct tcp_sock *tp) { if (th->doff == sizeof(struct tcphdr)>>2) { tp->rx_opt.saw_tstamp = 0; @@ -2804,8 +2845,7 @@ static void tcp_fin(struct sk_buff *skb, struct sock *sk, struct tcphdr *th) } } -static __inline__ int -tcp_sack_extend(struct tcp_sack_block *sp, u32 seq, u32 end_seq) +static inline int tcp_sack_extend(struct tcp_sack_block *sp, u32 seq, u32 end_seq) { if (!after(seq, sp->end_seq) && !after(sp->start_seq, end_seq)) { if (before(seq, sp->start_seq)) @@ -2817,7 +2857,7 @@ tcp_sack_extend(struct tcp_sack_block *sp, u32 seq, u32 end_seq) return 0; } -static inline void tcp_dsack_set(struct tcp_sock *tp, u32 seq, u32 end_seq) +static void tcp_dsack_set(struct tcp_sock *tp, u32 seq, u32 end_seq) { if (tp->rx_opt.sack_ok && sysctl_tcp_dsack) { if (before(seq, tp->rcv_nxt)) @@ -2832,7 +2872,7 @@ static inline void tcp_dsack_set(struct tcp_sock *tp, u32 seq, u32 end_seq) } } -static inline void tcp_dsack_extend(struct tcp_sock *tp, u32 seq, u32 end_seq) +static void tcp_dsack_extend(struct tcp_sock *tp, u32 seq, u32 end_seq) { if (!tp->rx_opt.dsack) tcp_dsack_set(tp, seq, end_seq); @@ -2890,7 +2930,7 @@ static void tcp_sack_maybe_coalesce(struct tcp_sock *tp) } } -static __inline__ void tcp_sack_swap(struct tcp_sack_block *sack1, struct tcp_sack_block *sack2) +static inline void tcp_sack_swap(struct tcp_sack_block *sack1, struct tcp_sack_block *sack2) { __u32 tmp; @@ -3455,7 +3495,7 @@ void tcp_cwnd_application_limited(struct sock *sk) tp->snd_cwnd_stamp = tcp_time_stamp; } -static inline int tcp_should_expand_sndbuf(struct sock *sk, struct tcp_sock *tp) +static int tcp_should_expand_sndbuf(struct sock *sk, struct tcp_sock *tp) { /* If the user specified a specific send buffer setting, do * not modify it. @@ -3502,7 +3542,7 @@ static void tcp_new_space(struct sock *sk) sk->sk_write_space(sk); } -static inline void tcp_check_space(struct sock *sk) +static void tcp_check_space(struct sock *sk) { if (sock_flag(sk, SOCK_QUEUE_SHRUNK)) { sock_reset_flag(sk, SOCK_QUEUE_SHRUNK); @@ -3512,7 +3552,7 @@ static inline void tcp_check_space(struct sock *sk) } } -static __inline__ void tcp_data_snd_check(struct sock *sk, struct tcp_sock *tp) +static inline void tcp_data_snd_check(struct sock *sk, struct tcp_sock *tp) { tcp_push_pending_frames(sk, tp); tcp_check_space(sk); @@ -3544,7 +3584,7 @@ static void __tcp_ack_snd_check(struct sock *sk, int ofo_possible) } } -static __inline__ void tcp_ack_snd_check(struct sock *sk) +static inline void tcp_ack_snd_check(struct sock *sk) { if (!inet_csk_ack_scheduled(sk)) { /* We sent a data segment already. */ @@ -3692,8 +3732,7 @@ static int __tcp_checksum_complete_user(struct sock *sk, struct sk_buff *skb) return result; } -static __inline__ int -tcp_checksum_complete_user(struct sock *sk, struct sk_buff *skb) +static inline int tcp_checksum_complete_user(struct sock *sk, struct sk_buff *skb) { return skb->ip_summed != CHECKSUM_UNNECESSARY && __tcp_checksum_complete_user(sk, skb); @@ -4474,3 +4513,4 @@ EXPORT_SYMBOL(sysctl_tcp_abc); EXPORT_SYMBOL(tcp_parse_options); EXPORT_SYMBOL(tcp_rcv_established); EXPORT_SYMBOL(tcp_rcv_state_process); +EXPORT_SYMBOL(tcp_initialize_rcv_mss); diff --git a/net/ipv4/tcp_ipv4.c b/net/ipv4/tcp_ipv4.c index 9b62d80bb20f..5c70493dff02 100644 --- a/net/ipv4/tcp_ipv4.c +++ b/net/ipv4/tcp_ipv4.c @@ -270,8 +270,7 @@ failure: /* * This routine does path mtu discovery as defined in RFC1191. */ -static inline void do_pmtu_discovery(struct sock *sk, struct iphdr *iph, - u32 mtu) +static void do_pmtu_discovery(struct sock *sk, struct iphdr *iph, u32 mtu) { struct dst_entry *dst; struct inet_sock *inet = inet_sk(sk); @@ -662,7 +661,7 @@ static void tcp_v4_reqsk_destructor(struct request_sock *req) kfree(inet_rsk(req)->opt); } -static inline void syn_flood_warning(struct sk_buff *skb) +static void syn_flood_warning(struct sk_buff *skb) { static unsigned long warntime; @@ -677,8 +676,8 @@ static inline void syn_flood_warning(struct sk_buff *skb) /* * Save and compile IPv4 options into the request_sock if needed. */ -static inline struct ip_options *tcp_v4_save_options(struct sock *sk, - struct sk_buff *skb) +static struct ip_options *tcp_v4_save_options(struct sock *sk, + struct sk_buff *skb) { struct ip_options *opt = &(IPCB(skb)->opt); struct ip_options *dopt = NULL; diff --git a/net/ipv4/tcp_output.c b/net/ipv4/tcp_output.c index 3a0a914de917..a7623ead39a8 100644 --- a/net/ipv4/tcp_output.c +++ b/net/ipv4/tcp_output.c @@ -51,8 +51,8 @@ int sysctl_tcp_retrans_collapse = 1; */ int sysctl_tcp_tso_win_divisor = 3; -static inline void update_send_head(struct sock *sk, struct tcp_sock *tp, - struct sk_buff *skb) +static void update_send_head(struct sock *sk, struct tcp_sock *tp, + struct sk_buff *skb) { sk->sk_send_head = skb->next; if (sk->sk_send_head == (struct sk_buff *)&sk->sk_write_queue) @@ -124,8 +124,8 @@ static void tcp_cwnd_restart(struct sock *sk, struct dst_entry *dst) tp->snd_cwnd_used = 0; } -static inline void tcp_event_data_sent(struct tcp_sock *tp, - struct sk_buff *skb, struct sock *sk) +static void tcp_event_data_sent(struct tcp_sock *tp, + struct sk_buff *skb, struct sock *sk) { struct inet_connection_sock *icsk = inet_csk(sk); const u32 now = tcp_time_stamp; @@ -142,7 +142,7 @@ static inline void tcp_event_data_sent(struct tcp_sock *tp, icsk->icsk_ack.pingpong = 1; } -static __inline__ void tcp_event_ack_sent(struct sock *sk, unsigned int pkts) +static inline void tcp_event_ack_sent(struct sock *sk, unsigned int pkts) { tcp_dec_quickack_mode(sk, pkts); inet_csk_clear_xmit_timer(sk, ICSK_TIME_DACK); @@ -212,7 +212,7 @@ void tcp_select_initial_window(int __space, __u32 mss, * value can be stuffed directly into th->window for an outgoing * frame. */ -static __inline__ u16 tcp_select_window(struct sock *sk) +static u16 tcp_select_window(struct sock *sk) { struct tcp_sock *tp = tcp_sk(sk); u32 cur_win = tcp_receive_window(tp); @@ -250,6 +250,75 @@ static __inline__ u16 tcp_select_window(struct sock *sk) return new_win; } +static void tcp_build_and_update_options(__u32 *ptr, struct tcp_sock *tp, + __u32 tstamp) +{ + if (tp->rx_opt.tstamp_ok) { + *ptr++ = __constant_htonl((TCPOPT_NOP << 24) | + (TCPOPT_NOP << 16) | + (TCPOPT_TIMESTAMP << 8) | + TCPOLEN_TIMESTAMP); + *ptr++ = htonl(tstamp); + *ptr++ = htonl(tp->rx_opt.ts_recent); + } + if (tp->rx_opt.eff_sacks) { + struct tcp_sack_block *sp = tp->rx_opt.dsack ? tp->duplicate_sack : tp->selective_acks; + int this_sack; + + *ptr++ = htonl((TCPOPT_NOP << 24) | + (TCPOPT_NOP << 16) | + (TCPOPT_SACK << 8) | + (TCPOLEN_SACK_BASE + (tp->rx_opt.eff_sacks * + TCPOLEN_SACK_PERBLOCK))); + for(this_sack = 0; this_sack < tp->rx_opt.eff_sacks; this_sack++) { + *ptr++ = htonl(sp[this_sack].start_seq); + *ptr++ = htonl(sp[this_sack].end_seq); + } + if (tp->rx_opt.dsack) { + tp->rx_opt.dsack = 0; + tp->rx_opt.eff_sacks--; + } + } +} + +/* Construct a tcp options header for a SYN or SYN_ACK packet. + * If this is every changed make sure to change the definition of + * MAX_SYN_SIZE to match the new maximum number of options that you + * can generate. + */ +static void tcp_syn_build_options(__u32 *ptr, int mss, int ts, int sack, + int offer_wscale, int wscale, __u32 tstamp, + __u32 ts_recent) +{ + /* We always get an MSS option. + * The option bytes which will be seen in normal data + * packets should timestamps be used, must be in the MSS + * advertised. But we subtract them from tp->mss_cache so + * that calculations in tcp_sendmsg are simpler etc. + * So account for this fact here if necessary. If we + * don't do this correctly, as a receiver we won't + * recognize data packets as being full sized when we + * should, and thus we won't abide by the delayed ACK + * rules correctly. + * SACKs don't matter, we never delay an ACK when we + * have any of those going out. + */ + *ptr++ = htonl((TCPOPT_MSS << 24) | (TCPOLEN_MSS << 16) | mss); + if (ts) { + if(sack) + *ptr++ = __constant_htonl((TCPOPT_SACK_PERM << 24) | (TCPOLEN_SACK_PERM << 16) | + (TCPOPT_TIMESTAMP << 8) | TCPOLEN_TIMESTAMP); + else + *ptr++ = __constant_htonl((TCPOPT_NOP << 24) | (TCPOPT_NOP << 16) | + (TCPOPT_TIMESTAMP << 8) | TCPOLEN_TIMESTAMP); + *ptr++ = htonl(tstamp); /* TSVAL */ + *ptr++ = htonl(ts_recent); /* TSECR */ + } else if(sack) + *ptr++ = __constant_htonl((TCPOPT_NOP << 24) | (TCPOPT_NOP << 16) | + (TCPOPT_SACK_PERM << 8) | TCPOLEN_SACK_PERM); + if (offer_wscale) + *ptr++ = htonl((TCPOPT_NOP << 24) | (TCPOPT_WINDOW << 16) | (TCPOLEN_WINDOW << 8) | (wscale)); +} /* This routine actually transmits TCP packets queued in by * tcp_do_sendmsg(). This is used by both the initial @@ -724,7 +793,7 @@ unsigned int tcp_current_mss(struct sock *sk, int large_allowed) /* Congestion window validation. (RFC2861) */ -static inline void tcp_cwnd_validate(struct sock *sk, struct tcp_sock *tp) +static void tcp_cwnd_validate(struct sock *sk, struct tcp_sock *tp) { __u32 packets_out = tp->packets_out; @@ -773,7 +842,7 @@ static inline unsigned int tcp_cwnd_test(struct tcp_sock *tp, struct sk_buff *sk /* This must be invoked the first time we consider transmitting * SKB onto the wire. */ -static inline int tcp_init_tso_segs(struct sock *sk, struct sk_buff *skb, unsigned int mss_now) +static int tcp_init_tso_segs(struct sock *sk, struct sk_buff *skb, unsigned int mss_now) { int tso_segs = tcp_skb_pcount(skb); @@ -1794,7 +1863,7 @@ struct sk_buff * tcp_make_synack(struct sock *sk, struct dst_entry *dst, /* * Do all connect socket setups that can be done AF independent. */ -static inline void tcp_connect_init(struct sock *sk) +static void tcp_connect_init(struct sock *sk) { struct dst_entry *dst = __sk_dst_get(sk); struct tcp_sock *tp = tcp_sk(sk); -- cgit v1.2.3 From c1d10adb4a521de5760112853f42aaeefcec96eb Mon Sep 17 00:00:00 2001 From: Pablo Neira Ayuso Date: Thu, 5 Jan 2006 12:19:05 -0800 Subject: [NETFILTER]: Add ctnetlink port for nf_conntrack Signed-off-by: Pablo Neira Ayuso Signed-off-by: Patrick McHardy Signed-off-by: David S. Miller --- include/linux/netfilter/nfnetlink_conntrack.h | 3 + include/net/netfilter/nf_conntrack.h | 31 + include/net/netfilter/nf_conntrack_helper.h | 2 + include/net/netfilter/nf_conntrack_l3proto.h | 15 +- include/net/netfilter/nf_conntrack_protocol.h | 26 +- net/ipv4/netfilter/nf_conntrack_l3proto_ipv4.c | 47 + net/ipv4/netfilter/nf_conntrack_proto_icmp.c | 97 +- net/ipv6/netfilter/nf_conntrack_l3proto_ipv6.c | 47 + net/ipv6/netfilter/nf_conntrack_proto_icmpv6.c | 77 +- net/netfilter/Kconfig | 7 + net/netfilter/Makefile | 3 + net/netfilter/nf_conntrack_core.c | 232 +++- net/netfilter/nf_conntrack_netlink.c | 1642 ++++++++++++++++++++++++ net/netfilter/nf_conntrack_proto_tcp.c | 71 + net/netfilter/nf_conntrack_proto_udp.c | 10 + net/netfilter/nf_conntrack_standalone.c | 42 +- 16 files changed, 2289 insertions(+), 63 deletions(-) create mode 100644 net/netfilter/nf_conntrack_netlink.c (limited to 'include/net') diff --git a/include/linux/netfilter/nfnetlink_conntrack.h b/include/linux/netfilter/nfnetlink_conntrack.h index 116fcaced909..b8e9a5b6fb1e 100644 --- a/include/linux/netfilter/nfnetlink_conntrack.h +++ b/include/linux/netfilter/nfnetlink_conntrack.h @@ -64,6 +64,9 @@ enum ctattr_l4proto { CTA_PROTO_ICMP_ID, CTA_PROTO_ICMP_TYPE, CTA_PROTO_ICMP_CODE, + CTA_PROTO_ICMPV6_ID, + CTA_PROTO_ICMPV6_TYPE, + CTA_PROTO_ICMPV6_CODE, __CTA_PROTO_MAX }; #define CTA_PROTO_MAX (__CTA_PROTO_MAX - 1) diff --git a/include/net/netfilter/nf_conntrack.h b/include/net/netfilter/nf_conntrack.h index cc4825610795..64b82b74a650 100644 --- a/include/net/netfilter/nf_conntrack.h +++ b/include/net/netfilter/nf_conntrack.h @@ -94,6 +94,9 @@ struct nf_conn /* Current number of expected connections */ unsigned int expecting; + /* Unique ID that identifies this conntrack*/ + unsigned int id; + /* Helper. if any */ struct nf_conntrack_helper *helper; @@ -140,6 +143,9 @@ struct nf_conntrack_expect /* Usage count. */ atomic_t use; + /* Unique ID */ + unsigned int id; + /* Flags */ unsigned int flags; @@ -190,6 +196,31 @@ static inline void nf_ct_put(struct nf_conn *ct) nf_conntrack_put(&ct->ct_general); } +extern struct nf_conntrack_tuple_hash * +__nf_conntrack_find(const struct nf_conntrack_tuple *tuple, + const struct nf_conn *ignored_conntrack); + +extern void nf_conntrack_hash_insert(struct nf_conn *ct); + +extern struct nf_conntrack_expect * +__nf_conntrack_expect_find(const struct nf_conntrack_tuple *tuple); + +extern struct nf_conntrack_expect * +nf_conntrack_expect_find(const struct nf_conntrack_tuple *tuple); + +extern void nf_ct_unlink_expect(struct nf_conntrack_expect *exp); + +extern void nf_ct_remove_expectations(struct nf_conn *ct); + +extern void nf_conntrack_flush(void); + +extern struct nf_conntrack_helper * +nf_ct_helper_find_get( const struct nf_conntrack_tuple *tuple); +extern void nf_ct_helper_put(struct nf_conntrack_helper *helper); + +extern struct nf_conntrack_helper * +__nf_conntrack_helper_find_byname(const char *name); + /* call to create an explicit dependency on nf_conntrack. */ extern void need_nf_conntrack(void); diff --git a/include/net/netfilter/nf_conntrack_helper.h b/include/net/netfilter/nf_conntrack_helper.h index 5a66b2a3a623..86ec8174ad02 100644 --- a/include/net/netfilter/nf_conntrack_helper.h +++ b/include/net/netfilter/nf_conntrack_helper.h @@ -33,6 +33,8 @@ struct nf_conntrack_helper unsigned int protoff, struct nf_conn *ct, enum ip_conntrack_info conntrackinfo); + + int (*to_nfattr)(struct sk_buff *skb, const struct nf_conn *ct); }; extern int nf_conntrack_helper_register(struct nf_conntrack_helper *); diff --git a/include/net/netfilter/nf_conntrack_l3proto.h b/include/net/netfilter/nf_conntrack_l3proto.h index 01663e5b33df..67856eb93b43 100644 --- a/include/net/netfilter/nf_conntrack_l3proto.h +++ b/include/net/netfilter/nf_conntrack_l3proto.h @@ -14,6 +14,8 @@ #include #include +struct nfattr; + struct nf_conntrack_l3proto { /* Next pointer. */ @@ -70,6 +72,12 @@ struct nf_conntrack_l3proto u_int32_t (*get_features)(const struct nf_conntrack_tuple *tuple); + int (*tuple_to_nfattr)(struct sk_buff *skb, + const struct nf_conntrack_tuple *t); + + int (*nfattr_to_tuple)(struct nfattr *tb[], + struct nf_conntrack_tuple *t); + /* Module (if any) which this is connected to. */ struct module *me; }; @@ -81,11 +89,16 @@ extern int nf_conntrack_l3proto_register(struct nf_conntrack_l3proto *proto); extern void nf_conntrack_l3proto_unregister(struct nf_conntrack_l3proto *proto); static inline struct nf_conntrack_l3proto * -nf_ct_find_l3proto(u_int16_t l3proto) +__nf_ct_l3proto_find(u_int16_t l3proto) { return nf_ct_l3protos[l3proto]; } +extern struct nf_conntrack_l3proto * +nf_ct_l3proto_find_get(u_int16_t l3proto); + +extern void nf_ct_l3proto_put(struct nf_conntrack_l3proto *p); + /* Existing built-in protocols */ extern struct nf_conntrack_l3proto nf_conntrack_l3proto_ipv4; extern struct nf_conntrack_l3proto nf_conntrack_l3proto_ipv6; diff --git a/include/net/netfilter/nf_conntrack_protocol.h b/include/net/netfilter/nf_conntrack_protocol.h index b3afda35397a..1f33737fcea5 100644 --- a/include/net/netfilter/nf_conntrack_protocol.h +++ b/include/net/netfilter/nf_conntrack_protocol.h @@ -12,6 +12,7 @@ #include struct seq_file; +struct nfattr; struct nf_conntrack_protocol { @@ -66,6 +67,18 @@ struct nf_conntrack_protocol enum ip_conntrack_info *ctinfo, int pf, unsigned int hooknum); + /* convert protoinfo to nfnetink attributes */ + int (*to_nfattr)(struct sk_buff *skb, struct nfattr *nfa, + const struct nf_conn *ct); + + /* convert nfnetlink attributes to protoinfo */ + int (*from_nfattr)(struct nfattr *tb[], struct nf_conn *ct); + + int (*tuple_to_nfattr)(struct sk_buff *skb, + const struct nf_conntrack_tuple *t); + int (*nfattr_to_tuple)(struct nfattr *tb[], + struct nf_conntrack_tuple *t); + /* Module (if any) which this is connected to. */ struct module *me; }; @@ -80,12 +93,23 @@ extern struct nf_conntrack_protocol nf_conntrack_generic_protocol; extern struct nf_conntrack_protocol **nf_ct_protos[PF_MAX]; extern struct nf_conntrack_protocol * -nf_ct_find_proto(u_int16_t l3proto, u_int8_t protocol); +__nf_ct_proto_find(u_int16_t l3proto, u_int8_t protocol); + +extern struct nf_conntrack_protocol * +nf_ct_proto_find_get(u_int16_t l3proto, u_int8_t protocol); + +extern void nf_ct_proto_put(struct nf_conntrack_protocol *p); /* Protocol registration. */ extern int nf_conntrack_protocol_register(struct nf_conntrack_protocol *proto); extern void nf_conntrack_protocol_unregister(struct nf_conntrack_protocol *proto); +/* Generic netlink helpers */ +extern int nf_ct_port_tuple_to_nfattr(struct sk_buff *skb, + const struct nf_conntrack_tuple *tuple); +extern int nf_ct_port_nfattr_to_tuple(struct nfattr *tb[], + struct nf_conntrack_tuple *t); + /* Log invalid packets */ extern unsigned int nf_ct_log_invalid; diff --git a/net/ipv4/netfilter/nf_conntrack_l3proto_ipv4.c b/net/ipv4/netfilter/nf_conntrack_l3proto_ipv4.c index 8202c1c0afad..385867efd481 100644 --- a/net/ipv4/netfilter/nf_conntrack_l3proto_ipv4.c +++ b/net/ipv4/netfilter/nf_conntrack_l3proto_ipv4.c @@ -392,6 +392,48 @@ getorigdst(struct sock *sk, int optval, void __user *user, int *len) return -ENOENT; } +#if defined(CONFIG_NF_CT_NETLINK) || \ + defined(CONFIG_NF_CT_NETLINK_MODULE) + +#include +#include + +static int ipv4_tuple_to_nfattr(struct sk_buff *skb, + const struct nf_conntrack_tuple *tuple) +{ + NFA_PUT(skb, CTA_IP_V4_SRC, sizeof(u_int32_t), + &tuple->src.u3.ip); + NFA_PUT(skb, CTA_IP_V4_DST, sizeof(u_int32_t), + &tuple->dst.u3.ip); + return 0; + +nfattr_failure: + return -1; +} + +static const size_t cta_min_ip[CTA_IP_MAX] = { + [CTA_IP_V4_SRC-1] = sizeof(u_int32_t), + [CTA_IP_V4_DST-1] = sizeof(u_int32_t), +}; + +static int ipv4_nfattr_to_tuple(struct nfattr *tb[], + struct nf_conntrack_tuple *t) +{ + if (!tb[CTA_IP_V4_SRC-1] || !tb[CTA_IP_V4_DST-1]) + return -EINVAL; + + if (nfattr_bad_size(tb, CTA_IP_MAX, cta_min_ip)) + return -EINVAL; + + t->src.u3.ip = + *(u_int32_t *)NFA_DATA(tb[CTA_IP_V4_SRC-1]); + t->dst.u3.ip = + *(u_int32_t *)NFA_DATA(tb[CTA_IP_V4_DST-1]); + + return 0; +} +#endif + static struct nf_sockopt_ops so_getorigdst = { .pf = PF_INET, .get_optmin = SO_ORIGINAL_DST, @@ -408,6 +450,11 @@ struct nf_conntrack_l3proto nf_conntrack_l3proto_ipv4 = { .print_conntrack = ipv4_print_conntrack, .prepare = ipv4_prepare, .get_features = ipv4_get_features, +#if defined(CONFIG_NF_CT_NETLINK) || \ + defined(CONFIG_NF_CT_NETLINK_MODULE) + .tuple_to_nfattr = ipv4_tuple_to_nfattr, + .nfattr_to_tuple = ipv4_nfattr_to_tuple, +#endif .me = THIS_MODULE, }; diff --git a/net/ipv4/netfilter/nf_conntrack_proto_icmp.c b/net/ipv4/netfilter/nf_conntrack_proto_icmp.c index 7ddb5c08f7b8..52dc175be39a 100644 --- a/net/ipv4/netfilter/nf_conntrack_proto_icmp.c +++ b/net/ipv4/netfilter/nf_conntrack_proto_icmp.c @@ -50,20 +50,21 @@ static int icmp_pkt_to_tuple(const struct sk_buff *skb, return 1; } +/* Add 1; spaces filled with 0. */ +static const u_int8_t invmap[] = { + [ICMP_ECHO] = ICMP_ECHOREPLY + 1, + [ICMP_ECHOREPLY] = ICMP_ECHO + 1, + [ICMP_TIMESTAMP] = ICMP_TIMESTAMPREPLY + 1, + [ICMP_TIMESTAMPREPLY] = ICMP_TIMESTAMP + 1, + [ICMP_INFO_REQUEST] = ICMP_INFO_REPLY + 1, + [ICMP_INFO_REPLY] = ICMP_INFO_REQUEST + 1, + [ICMP_ADDRESS] = ICMP_ADDRESSREPLY + 1, + [ICMP_ADDRESSREPLY] = ICMP_ADDRESS + 1 +}; + static int icmp_invert_tuple(struct nf_conntrack_tuple *tuple, const struct nf_conntrack_tuple *orig) { - /* Add 1; spaces filled with 0. */ - static u_int8_t invmap[] - = { [ICMP_ECHO] = ICMP_ECHOREPLY + 1, - [ICMP_ECHOREPLY] = ICMP_ECHO + 1, - [ICMP_TIMESTAMP] = ICMP_TIMESTAMPREPLY + 1, - [ICMP_TIMESTAMPREPLY] = ICMP_TIMESTAMP + 1, - [ICMP_INFO_REQUEST] = ICMP_INFO_REPLY + 1, - [ICMP_INFO_REPLY] = ICMP_INFO_REQUEST + 1, - [ICMP_ADDRESS] = ICMP_ADDRESSREPLY + 1, - [ICMP_ADDRESSREPLY] = ICMP_ADDRESS + 1}; - if (orig->dst.u.icmp.type >= sizeof(invmap) || !invmap[orig->dst.u.icmp.type]) return 0; @@ -120,11 +121,12 @@ static int icmp_packet(struct nf_conn *ct, static int icmp_new(struct nf_conn *conntrack, const struct sk_buff *skb, unsigned int dataoff) { - static u_int8_t valid_new[] - = { [ICMP_ECHO] = 1, - [ICMP_TIMESTAMP] = 1, - [ICMP_INFO_REQUEST] = 1, - [ICMP_ADDRESS] = 1 }; + static const u_int8_t valid_new[] = { + [ICMP_ECHO] = 1, + [ICMP_TIMESTAMP] = 1, + [ICMP_INFO_REQUEST] = 1, + [ICMP_ADDRESS] = 1 + }; if (conntrack->tuplehash[0].tuple.dst.u.icmp.type >= sizeof(valid_new) || !valid_new[conntrack->tuplehash[0].tuple.dst.u.icmp.type]) { @@ -168,7 +170,7 @@ icmp_error_message(struct sk_buff *skb, return -NF_ACCEPT; } - innerproto = nf_ct_find_proto(PF_INET, inside->ip.protocol); + innerproto = __nf_ct_proto_find(PF_INET, inside->ip.protocol); dataoff = skb->nh.iph->ihl*4 + sizeof(inside->icmp); /* Are they talking about one of our connections? */ if (!nf_ct_get_tuple(skb, dataoff, dataoff + inside->ip.ihl*4, PF_INET, @@ -281,6 +283,60 @@ checksum_skipped: return icmp_error_message(skb, ctinfo, hooknum); } +#if defined(CONFIG_NF_CT_NETLINK) || \ + defined(CONFIG_NF_CT_NETLINK_MODULE) + +#include +#include + +static int icmp_tuple_to_nfattr(struct sk_buff *skb, + const struct nf_conntrack_tuple *t) +{ + NFA_PUT(skb, CTA_PROTO_ICMP_ID, sizeof(u_int16_t), + &t->src.u.icmp.id); + NFA_PUT(skb, CTA_PROTO_ICMP_TYPE, sizeof(u_int8_t), + &t->dst.u.icmp.type); + NFA_PUT(skb, CTA_PROTO_ICMP_CODE, sizeof(u_int8_t), + &t->dst.u.icmp.code); + + return 0; + +nfattr_failure: + return -1; +} + +static const size_t cta_min_proto[CTA_PROTO_MAX] = { + [CTA_PROTO_ICMP_TYPE-1] = sizeof(u_int8_t), + [CTA_PROTO_ICMP_CODE-1] = sizeof(u_int8_t), + [CTA_PROTO_ICMP_ID-1] = sizeof(u_int16_t) +}; + +static int icmp_nfattr_to_tuple(struct nfattr *tb[], + struct nf_conntrack_tuple *tuple) +{ + if (!tb[CTA_PROTO_ICMP_TYPE-1] + || !tb[CTA_PROTO_ICMP_CODE-1] + || !tb[CTA_PROTO_ICMP_ID-1]) + return -EINVAL; + + if (nfattr_bad_size(tb, CTA_PROTO_MAX, cta_min_proto)) + return -EINVAL; + + tuple->dst.u.icmp.type = + *(u_int8_t *)NFA_DATA(tb[CTA_PROTO_ICMP_TYPE-1]); + tuple->dst.u.icmp.code = + *(u_int8_t *)NFA_DATA(tb[CTA_PROTO_ICMP_CODE-1]); + tuple->src.u.icmp.id = + *(u_int16_t *)NFA_DATA(tb[CTA_PROTO_ICMP_ID-1]); + + if (tuple->dst.u.icmp.type >= sizeof(invmap) + || !invmap[tuple->dst.u.icmp.type]) + return -EINVAL; + + return 0; +} +#endif + struct nf_conntrack_protocol nf_conntrack_protocol_icmp = { .list = { NULL, NULL }, @@ -295,7 +351,12 @@ struct nf_conntrack_protocol nf_conntrack_protocol_icmp = .new = icmp_new, .error = icmp_error, .destroy = NULL, - .me = NULL + .me = NULL, +#if defined(CONFIG_NF_CT_NETLINK) || \ + defined(CONFIG_NF_CT_NETLINK_MODULE) + .tuple_to_nfattr = icmp_tuple_to_nfattr, + .nfattr_to_tuple = icmp_nfattr_to_tuple, +#endif }; EXPORT_SYMBOL(nf_conntrack_protocol_icmp); diff --git a/net/ipv6/netfilter/nf_conntrack_l3proto_ipv6.c b/net/ipv6/netfilter/nf_conntrack_l3proto_ipv6.c index 753a3ae8502b..704fbbe74874 100644 --- a/net/ipv6/netfilter/nf_conntrack_l3proto_ipv6.c +++ b/net/ipv6/netfilter/nf_conntrack_l3proto_ipv6.c @@ -401,6 +401,48 @@ static ctl_table nf_ct_net_table[] = { }; #endif +#if defined(CONFIG_NF_CT_NETLINK) || \ + defined(CONFIG_NF_CT_NETLINK_MODULE) + +#include +#include + +static int ipv6_tuple_to_nfattr(struct sk_buff *skb, + const struct nf_conntrack_tuple *tuple) +{ + NFA_PUT(skb, CTA_IP_V6_SRC, sizeof(u_int32_t) * 4, + &tuple->src.u3.ip6); + NFA_PUT(skb, CTA_IP_V6_DST, sizeof(u_int32_t) * 4, + &tuple->dst.u3.ip6); + return 0; + +nfattr_failure: + return -1; +} + +static const size_t cta_min_ip[CTA_IP_MAX] = { + [CTA_IP_V6_SRC-1] = sizeof(u_int32_t)*4, + [CTA_IP_V6_DST-1] = sizeof(u_int32_t)*4, +}; + +static int ipv6_nfattr_to_tuple(struct nfattr *tb[], + struct nf_conntrack_tuple *t) +{ + if (!tb[CTA_IP_V6_SRC-1] || !tb[CTA_IP_V6_DST-1]) + return -EINVAL; + + if (nfattr_bad_size(tb, CTA_IP_MAX, cta_min_ip)) + return -EINVAL; + + memcpy(&t->src.u3.ip6, NFA_DATA(tb[CTA_IP_V6_SRC-1]), + sizeof(u_int32_t) * 4); + memcpy(&t->dst.u3.ip6, NFA_DATA(tb[CTA_IP_V6_DST-1]), + sizeof(u_int32_t) * 4); + + return 0; +} +#endif + struct nf_conntrack_l3proto nf_conntrack_l3proto_ipv6 = { .l3proto = PF_INET6, .name = "ipv6", @@ -409,6 +451,11 @@ struct nf_conntrack_l3proto nf_conntrack_l3proto_ipv6 = { .print_tuple = ipv6_print_tuple, .print_conntrack = ipv6_print_conntrack, .prepare = ipv6_prepare, +#if defined(CONFIG_NF_CT_NETLINK) || \ + defined(CONFIG_NF_CT_NETLINK_MODULE) + .tuple_to_nfattr = ipv6_tuple_to_nfattr, + .nfattr_to_tuple = ipv6_nfattr_to_tuple, +#endif .get_features = ipv6_get_features, .me = THIS_MODULE, }; diff --git a/net/ipv6/netfilter/nf_conntrack_proto_icmpv6.c b/net/ipv6/netfilter/nf_conntrack_proto_icmpv6.c index a7e03cfacd06..09945c333055 100644 --- a/net/ipv6/netfilter/nf_conntrack_proto_icmpv6.c +++ b/net/ipv6/netfilter/nf_conntrack_proto_icmpv6.c @@ -57,17 +57,17 @@ static int icmpv6_pkt_to_tuple(const struct sk_buff *skb, return 1; } +/* Add 1; spaces filled with 0. */ +static u_int8_t invmap[] = { + [ICMPV6_ECHO_REQUEST - 128] = ICMPV6_ECHO_REPLY + 1, + [ICMPV6_ECHO_REPLY - 128] = ICMPV6_ECHO_REQUEST + 1, + [ICMPV6_NI_QUERY - 128] = ICMPV6_NI_QUERY + 1, + [ICMPV6_NI_REPLY - 128] = ICMPV6_NI_REPLY +1 +}; + static int icmpv6_invert_tuple(struct nf_conntrack_tuple *tuple, const struct nf_conntrack_tuple *orig) { - /* Add 1; spaces filled with 0. */ - static u_int8_t invmap[] = { - [ICMPV6_ECHO_REQUEST - 128] = ICMPV6_ECHO_REPLY + 1, - [ICMPV6_ECHO_REPLY - 128] = ICMPV6_ECHO_REQUEST + 1, - [ICMPV6_NI_QUERY - 128] = ICMPV6_NI_QUERY + 1, - [ICMPV6_NI_REPLY - 128] = ICMPV6_NI_REPLY +1 - }; - int type = orig->dst.u.icmp.type - 128; if (type < 0 || type >= sizeof(invmap) || !invmap[type]) return 0; @@ -185,7 +185,7 @@ icmpv6_error_message(struct sk_buff *skb, return -NF_ACCEPT; } - inproto = nf_ct_find_proto(PF_INET6, inprotonum); + inproto = __nf_ct_proto_find(PF_INET6, inprotonum); /* Are they talking about one of our connections? */ if (!nf_ct_get_tuple(skb, inip6off, inprotoff, PF_INET6, inprotonum, @@ -255,6 +255,60 @@ skipped: return icmpv6_error_message(skb, dataoff, ctinfo, hooknum); } +#if defined(CONFIG_NF_CT_NETLINK) || \ + defined(CONFIG_NF_CT_NETLINK_MODULE) + +#include +#include +static int icmpv6_tuple_to_nfattr(struct sk_buff *skb, + const struct nf_conntrack_tuple *t) +{ + NFA_PUT(skb, CTA_PROTO_ICMPV6_ID, sizeof(u_int16_t), + &t->src.u.icmp.id); + NFA_PUT(skb, CTA_PROTO_ICMPV6_TYPE, sizeof(u_int8_t), + &t->dst.u.icmp.type); + NFA_PUT(skb, CTA_PROTO_ICMPV6_CODE, sizeof(u_int8_t), + &t->dst.u.icmp.code); + + return 0; + +nfattr_failure: + return -1; +} + +static const size_t cta_min_proto[CTA_PROTO_MAX] = { + [CTA_PROTO_ICMPV6_TYPE-1] = sizeof(u_int8_t), + [CTA_PROTO_ICMPV6_CODE-1] = sizeof(u_int8_t), + [CTA_PROTO_ICMPV6_ID-1] = sizeof(u_int16_t) +}; + +static int icmpv6_nfattr_to_tuple(struct nfattr *tb[], + struct nf_conntrack_tuple *tuple) +{ + if (!tb[CTA_PROTO_ICMPV6_TYPE-1] + || !tb[CTA_PROTO_ICMPV6_CODE-1] + || !tb[CTA_PROTO_ICMPV6_ID-1]) + return -EINVAL; + + if (nfattr_bad_size(tb, CTA_PROTO_MAX, cta_min_proto)) + return -EINVAL; + + tuple->dst.u.icmp.type = + *(u_int8_t *)NFA_DATA(tb[CTA_PROTO_ICMPV6_TYPE-1]); + tuple->dst.u.icmp.code = + *(u_int8_t *)NFA_DATA(tb[CTA_PROTO_ICMPV6_CODE-1]); + tuple->src.u.icmp.id = + *(u_int16_t *)NFA_DATA(tb[CTA_PROTO_ICMPV6_ID-1]); + + if (tuple->dst.u.icmp.type < 128 + || tuple->dst.u.icmp.type - 128 >= sizeof(invmap) + || !invmap[tuple->dst.u.icmp.type - 128]) + return -EINVAL; + + return 0; +} +#endif + struct nf_conntrack_protocol nf_conntrack_protocol_icmpv6 = { .l3proto = PF_INET6, @@ -267,6 +321,11 @@ struct nf_conntrack_protocol nf_conntrack_protocol_icmpv6 = .packet = icmpv6_packet, .new = icmpv6_new, .error = icmpv6_error, +#if defined(CONFIG_NF_CT_NETLINK) || \ + defined(CONFIG_NF_CT_NETLINK_MODULE) + .tuple_to_nfattr = icmpv6_tuple_to_nfattr, + .nfattr_to_tuple = icmpv6_nfattr_to_tuple, +#endif }; EXPORT_SYMBOL(nf_conntrack_protocol_icmpv6); diff --git a/net/netfilter/Kconfig b/net/netfilter/Kconfig index 794c41d19b28..7d55f9cbd853 100644 --- a/net/netfilter/Kconfig +++ b/net/netfilter/Kconfig @@ -95,4 +95,11 @@ config NF_CONNTRACK_FTP To compile it as a module, choose M here. If unsure, say N. +config NF_CT_NETLINK + tristate 'Connection tracking netlink interface (EXPERIMENTAL)' + depends on EXPERIMENTAL && NF_CONNTRACK && NETFILTER_NETLINK + depends on NF_CONNTRACK!=y || NETFILTER_NETLINK!=m + help + This option enables support for a netlink-based userspace interface + endmenu diff --git a/net/netfilter/Makefile b/net/netfilter/Makefile index 55f019ad2c08..cb2183145c37 100644 --- a/net/netfilter/Makefile +++ b/net/netfilter/Makefile @@ -13,3 +13,6 @@ obj-$(CONFIG_NF_CONNTRACK_FTP) += nf_conntrack_ftp.o # SCTP protocol connection tracking obj-$(CONFIG_NF_CT_PROTO_SCTP) += nf_conntrack_proto_sctp.o + +# netlink interface for nf_conntrack +obj-$(CONFIG_NF_CT_NETLINK) += nf_conntrack_netlink.o diff --git a/net/netfilter/nf_conntrack_core.c b/net/netfilter/nf_conntrack_core.c index 0c5b01d732d8..62bb509f05d4 100644 --- a/net/netfilter/nf_conntrack_core.c +++ b/net/netfilter/nf_conntrack_core.c @@ -82,6 +82,8 @@ unsigned int nf_ct_log_invalid; static LIST_HEAD(unconfirmed); static int nf_conntrack_vmalloc; +static unsigned int nf_conntrack_next_id = 1; +static unsigned int nf_conntrack_expect_next_id = 1; #ifdef CONFIG_NF_CONNTRACK_EVENTS struct notifier_block *nf_conntrack_chain; struct notifier_block *nf_conntrack_expect_chain; @@ -184,7 +186,7 @@ DECLARE_MUTEX(nf_ct_cache_mutex); extern struct nf_conntrack_protocol nf_conntrack_generic_protocol; struct nf_conntrack_protocol * -nf_ct_find_proto(u_int16_t l3proto, u_int8_t protocol) +__nf_ct_proto_find(u_int16_t l3proto, u_int8_t protocol) { if (unlikely(nf_ct_protos[l3proto] == NULL)) return &nf_conntrack_generic_protocol; @@ -192,6 +194,50 @@ nf_ct_find_proto(u_int16_t l3proto, u_int8_t protocol) return nf_ct_protos[l3proto][protocol]; } +/* this is guaranteed to always return a valid protocol helper, since + * it falls back to generic_protocol */ +struct nf_conntrack_protocol * +nf_ct_proto_find_get(u_int16_t l3proto, u_int8_t protocol) +{ + struct nf_conntrack_protocol *p; + + preempt_disable(); + p = __nf_ct_proto_find(l3proto, protocol); + if (p) { + if (!try_module_get(p->me)) + p = &nf_conntrack_generic_protocol; + } + preempt_enable(); + + return p; +} + +void nf_ct_proto_put(struct nf_conntrack_protocol *p) +{ + module_put(p->me); +} + +struct nf_conntrack_l3proto * +nf_ct_l3proto_find_get(u_int16_t l3proto) +{ + struct nf_conntrack_l3proto *p; + + preempt_disable(); + p = __nf_ct_l3proto_find(l3proto); + if (p) { + if (!try_module_get(p->me)) + p = &nf_conntrack_generic_l3proto; + } + preempt_enable(); + + return p; +} + +void nf_ct_l3proto_put(struct nf_conntrack_l3proto *p) +{ + module_put(p->me); +} + static int nf_conntrack_hash_rnd_initted; static unsigned int nf_conntrack_hash_rnd; @@ -384,7 +430,7 @@ nf_ct_invert_tuple(struct nf_conntrack_tuple *inverse, } /* nf_conntrack_expect helper functions */ -static void nf_ct_unlink_expect(struct nf_conntrack_expect *exp) +void nf_ct_unlink_expect(struct nf_conntrack_expect *exp) { ASSERT_WRITE_LOCK(&nf_conntrack_lock); NF_CT_ASSERT(!timer_pending(&exp->timeout)); @@ -404,6 +450,33 @@ static void expectation_timed_out(unsigned long ul_expect) nf_conntrack_expect_put(exp); } +struct nf_conntrack_expect * +__nf_conntrack_expect_find(const struct nf_conntrack_tuple *tuple) +{ + struct nf_conntrack_expect *i; + + list_for_each_entry(i, &nf_conntrack_expect_list, list) { + if (nf_ct_tuple_mask_cmp(tuple, &i->tuple, &i->mask)) { + atomic_inc(&i->use); + return i; + } + } + return NULL; +} + +/* Just find a expectation corresponding to a tuple. */ +struct nf_conntrack_expect * +nf_conntrack_expect_find(const struct nf_conntrack_tuple *tuple) +{ + struct nf_conntrack_expect *i; + + read_lock_bh(&nf_conntrack_lock); + i = __nf_conntrack_expect_find(tuple); + read_unlock_bh(&nf_conntrack_lock); + + return i; +} + /* If an expectation for this connection is found, it gets delete from * global list then returned. */ static struct nf_conntrack_expect * @@ -432,7 +505,7 @@ find_expectation(const struct nf_conntrack_tuple *tuple) } /* delete all expectations for this conntrack */ -static void remove_expectations(struct nf_conn *ct) +void nf_ct_remove_expectations(struct nf_conn *ct) { struct nf_conntrack_expect *i, *tmp; @@ -462,7 +535,7 @@ clean_from_lists(struct nf_conn *ct) LIST_DELETE(&nf_conntrack_hash[hr], &ct->tuplehash[IP_CT_DIR_REPLY]); /* Destroy all pending expectations */ - remove_expectations(ct); + nf_ct_remove_expectations(ct); } static void @@ -482,12 +555,11 @@ destroy_conntrack(struct nf_conntrack *nfct) /* To make sure we don't get any weird locking issues here: * destroy_conntrack() MUST NOT be called with a write lock * to nf_conntrack_lock!!! -HW */ - l3proto = nf_ct_find_l3proto(ct->tuplehash[IP_CT_DIR_REPLY].tuple.src.l3num); + l3proto = __nf_ct_l3proto_find(ct->tuplehash[IP_CT_DIR_REPLY].tuple.src.l3num); if (l3proto && l3proto->destroy) l3proto->destroy(ct); - proto = nf_ct_find_proto(ct->tuplehash[IP_CT_DIR_REPLY].tuple.src.l3num, - ct->tuplehash[IP_CT_DIR_REPLY].tuple.dst.protonum); + proto = __nf_ct_proto_find(ct->tuplehash[IP_CT_DIR_REPLY].tuple.src.l3num, ct->tuplehash[IP_CT_DIR_REPLY].tuple.dst.protonum); if (proto && proto->destroy) proto->destroy(ct); @@ -499,7 +571,7 @@ destroy_conntrack(struct nf_conntrack *nfct) * except TFTP can create an expectation on the first packet, * before connection is in the list, so we need to clean here, * too. */ - remove_expectations(ct); + nf_ct_remove_expectations(ct); /* We overload first tuple to link into unconfirmed list. */ if (!nf_ct_is_confirmed(ct)) { @@ -540,7 +612,7 @@ conntrack_tuple_cmp(const struct nf_conntrack_tuple_hash *i, && nf_ct_tuple_equal(tuple, &i->tuple); } -static struct nf_conntrack_tuple_hash * +struct nf_conntrack_tuple_hash * __nf_conntrack_find(const struct nf_conntrack_tuple *tuple, const struct nf_conn *ignored_conntrack) { @@ -575,6 +647,29 @@ nf_conntrack_find_get(const struct nf_conntrack_tuple *tuple, return h; } +static void __nf_conntrack_hash_insert(struct nf_conn *ct, + unsigned int hash, + unsigned int repl_hash) +{ + ct->id = ++nf_conntrack_next_id; + list_prepend(&nf_conntrack_hash[hash], + &ct->tuplehash[IP_CT_DIR_ORIGINAL].list); + list_prepend(&nf_conntrack_hash[repl_hash], + &ct->tuplehash[IP_CT_DIR_REPLY].list); +} + +void nf_conntrack_hash_insert(struct nf_conn *ct) +{ + unsigned int hash, repl_hash; + + hash = hash_conntrack(&ct->tuplehash[IP_CT_DIR_ORIGINAL].tuple); + repl_hash = hash_conntrack(&ct->tuplehash[IP_CT_DIR_REPLY].tuple); + + write_lock_bh(&nf_conntrack_lock); + __nf_conntrack_hash_insert(ct, hash, repl_hash); + write_unlock_bh(&nf_conntrack_lock); +} + /* Confirm a connection given skb; places it in hash table */ int __nf_conntrack_confirm(struct sk_buff **pskb) @@ -621,10 +716,7 @@ __nf_conntrack_confirm(struct sk_buff **pskb) /* Remove from unconfirmed list */ list_del(&ct->tuplehash[IP_CT_DIR_ORIGINAL].list); - list_prepend(&nf_conntrack_hash[hash], - &ct->tuplehash[IP_CT_DIR_ORIGINAL]); - list_prepend(&nf_conntrack_hash[repl_hash], - &ct->tuplehash[IP_CT_DIR_REPLY]); + __nf_conntrack_hash_insert(ct, hash, repl_hash); /* Timer relative to confirmation time, not original setting time, otherwise we'd get timer wrap in weird delay cases. */ @@ -708,13 +800,41 @@ static inline int helper_cmp(const struct nf_conntrack_helper *i, } static struct nf_conntrack_helper * -nf_ct_find_helper(const struct nf_conntrack_tuple *tuple) +__nf_ct_helper_find(const struct nf_conntrack_tuple *tuple) { return LIST_FIND(&helpers, helper_cmp, struct nf_conntrack_helper *, tuple); } +struct nf_conntrack_helper * +nf_ct_helper_find_get( const struct nf_conntrack_tuple *tuple) +{ + struct nf_conntrack_helper *helper; + + /* need nf_conntrack_lock to assure that helper exists until + * try_module_get() is called */ + read_lock_bh(&nf_conntrack_lock); + + helper = __nf_ct_helper_find(tuple); + if (helper) { + /* need to increase module usage count to assure helper will + * not go away while the caller is e.g. busy putting a + * conntrack in the hash that uses the helper */ + if (!try_module_get(helper->me)) + helper = NULL; + } + + read_unlock_bh(&nf_conntrack_lock); + + return helper; +} + +void nf_ct_helper_put(struct nf_conntrack_helper *helper) +{ + module_put(helper->me); +} + static struct nf_conn * __nf_conntrack_alloc(const struct nf_conntrack_tuple *orig, const struct nf_conntrack_tuple *repl, @@ -744,7 +864,7 @@ __nf_conntrack_alloc(const struct nf_conntrack_tuple *orig, /* find features needed by this conntrack. */ features = l3proto->get_features(orig); read_lock_bh(&nf_conntrack_lock); - if (nf_ct_find_helper(repl) != NULL) + if (__nf_ct_helper_find(repl) != NULL) features |= NF_CT_F_HELP; read_unlock_bh(&nf_conntrack_lock); @@ -794,7 +914,7 @@ struct nf_conn *nf_conntrack_alloc(const struct nf_conntrack_tuple *orig, { struct nf_conntrack_l3proto *l3proto; - l3proto = nf_ct_find_l3proto(orig->src.l3num); + l3proto = __nf_ct_l3proto_find(orig->src.l3num); return __nf_conntrack_alloc(orig, repl, l3proto); } @@ -853,7 +973,7 @@ init_conntrack(const struct nf_conntrack_tuple *tuple, nf_conntrack_get(&conntrack->master->ct_general); NF_CT_STAT_INC(expect_new); } else { - conntrack->helper = nf_ct_find_helper(&repl_tuple); + conntrack->helper = __nf_ct_helper_find(&repl_tuple); NF_CT_STAT_INC(new); } @@ -947,13 +1067,13 @@ nf_conntrack_in(int pf, unsigned int hooknum, struct sk_buff **pskb) return NF_ACCEPT; } - l3proto = nf_ct_find_l3proto((u_int16_t)pf); + l3proto = __nf_ct_l3proto_find((u_int16_t)pf); if ((ret = l3proto->prepare(pskb, hooknum, &dataoff, &protonum)) <= 0) { DEBUGP("not prepared to track yet or error occured\n"); return -ret; } - proto = nf_ct_find_proto((u_int16_t)pf, protonum); + proto = __nf_ct_proto_find((u_int16_t)pf, protonum); /* It may be an special packet, error, unclean... * inverse of the return code tells to the netfilter @@ -1002,9 +1122,9 @@ int nf_ct_invert_tuplepr(struct nf_conntrack_tuple *inverse, const struct nf_conntrack_tuple *orig) { return nf_ct_invert_tuple(inverse, orig, - nf_ct_find_l3proto(orig->src.l3num), - nf_ct_find_proto(orig->src.l3num, - orig->dst.protonum)); + __nf_ct_l3proto_find(orig->src.l3num), + __nf_ct_proto_find(orig->src.l3num, + orig->dst.protonum)); } /* Would two expected things clash? */ @@ -1096,6 +1216,7 @@ static void nf_conntrack_expect_insert(struct nf_conntrack_expect *exp) exp->timeout.expires = jiffies + exp->master->helper->timeout * HZ; add_timer(&exp->timeout); + exp->id = ++nf_conntrack_expect_next_id; atomic_inc(&exp->use); NF_CT_STAT_INC(expect_create); } @@ -1176,7 +1297,7 @@ void nf_conntrack_alter_reply(struct nf_conn *conntrack, conntrack->tuplehash[IP_CT_DIR_REPLY].tuple = *newreply; if (!conntrack->master && conntrack->expecting == 0) - conntrack->helper = nf_ct_find_helper(newreply); + conntrack->helper = __nf_ct_helper_find(newreply); write_unlock_bh(&nf_conntrack_lock); } @@ -1201,6 +1322,19 @@ int nf_conntrack_helper_register(struct nf_conntrack_helper *me) return 0; } +struct nf_conntrack_helper * +__nf_conntrack_helper_find_byname(const char *name) +{ + struct nf_conntrack_helper *h; + + list_for_each_entry(h, &helpers, list) { + if (!strcmp(h->name, name)) + return h; + } + + return NULL; +} + static inline int unhelp(struct nf_conntrack_tuple_hash *i, const struct nf_conntrack_helper *me) { @@ -1284,6 +1418,51 @@ void __nf_ct_refresh_acct(struct nf_conn *ct, nf_conntrack_event_cache(event, skb); } +#if defined(CONFIG_NF_CT_NETLINK) || \ + defined(CONFIG_NF_CT_NETLINK_MODULE) + +#include +#include + +/* Generic function for tcp/udp/sctp/dccp and alike. This needs to be + * in ip_conntrack_core, since we don't want the protocols to autoload + * or depend on ctnetlink */ +int nf_ct_port_tuple_to_nfattr(struct sk_buff *skb, + const struct nf_conntrack_tuple *tuple) +{ + NFA_PUT(skb, CTA_PROTO_SRC_PORT, sizeof(u_int16_t), + &tuple->src.u.tcp.port); + NFA_PUT(skb, CTA_PROTO_DST_PORT, sizeof(u_int16_t), + &tuple->dst.u.tcp.port); + return 0; + +nfattr_failure: + return -1; +} + +static const size_t cta_min_proto[CTA_PROTO_MAX] = { + [CTA_PROTO_SRC_PORT-1] = sizeof(u_int16_t), + [CTA_PROTO_DST_PORT-1] = sizeof(u_int16_t) +}; + +int nf_ct_port_nfattr_to_tuple(struct nfattr *tb[], + struct nf_conntrack_tuple *t) +{ + if (!tb[CTA_PROTO_SRC_PORT-1] || !tb[CTA_PROTO_DST_PORT-1]) + return -EINVAL; + + if (nfattr_bad_size(tb, CTA_PROTO_MAX, cta_min_proto)) + return -EINVAL; + + t->src.u.tcp.port = + *(u_int16_t *)NFA_DATA(tb[CTA_PROTO_SRC_PORT-1]); + t->dst.u.tcp.port = + *(u_int16_t *)NFA_DATA(tb[CTA_PROTO_DST_PORT-1]); + + return 0; +} +#endif + /* Used by ipt_REJECT and ip6t_REJECT. */ void __nf_conntrack_attach(struct sk_buff *nskb, struct sk_buff *skb) { @@ -1366,6 +1545,11 @@ static void free_conntrack_hash(struct list_head *hash, int vmalloced, int size) get_order(sizeof(struct list_head) * size)); } +void nf_conntrack_flush() +{ + nf_ct_iterate_cleanup(kill_all, NULL); +} + /* Mishearing the voices in his head, our hero wonders how he's supposed to kill the mall. */ void nf_conntrack_cleanup(void) @@ -1379,7 +1563,7 @@ void nf_conntrack_cleanup(void) nf_ct_event_cache_flush(); i_see_dead_people: - nf_ct_iterate_cleanup(kill_all, NULL); + nf_conntrack_flush(); if (atomic_read(&nf_conntrack_count) != 0) { schedule(); goto i_see_dead_people; diff --git a/net/netfilter/nf_conntrack_netlink.c b/net/netfilter/nf_conntrack_netlink.c new file mode 100644 index 000000000000..4f2e50952a12 --- /dev/null +++ b/net/netfilter/nf_conntrack_netlink.c @@ -0,0 +1,1642 @@ +/* Connection tracking via netlink socket. Allows for user space + * protocol helpers and general trouble making from userspace. + * + * (C) 2001 by Jay Schulist + * (C) 2002-2005 by Harald Welte + * (C) 2003 by Patrick Mchardy + * (C) 2005 by Pablo Neira Ayuso + * + * I've reworked this stuff to use attributes instead of conntrack + * structures. 5.44 am. I need more tea. --pablo 05/07/11. + * + * Initial connection tracking via netlink development funded and + * generally made possible by Network Robots, Inc. (www.networkrobots.com) + * + * Further development of this code funded by Astaro AG (http://www.astaro.com) + * + * This software may be used and distributed according to the terms + * of the GNU General Public License, incorporated herein by reference. + * + * Derived from ip_conntrack_netlink.c: Port by Pablo Neira Ayuso (05/11/14) + */ + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include +#include +#include +#include +#include +#include +#include + +#include +#include + +MODULE_LICENSE("GPL"); + +static char __initdata version[] = "0.92"; + +#if 0 +#define DEBUGP printk +#else +#define DEBUGP(format, args...) +#endif + + +static inline int +ctnetlink_dump_tuples_proto(struct sk_buff *skb, + const struct nf_conntrack_tuple *tuple) +{ + struct nf_conntrack_protocol *proto; + int ret = 0; + + NFA_PUT(skb, CTA_PROTO_NUM, sizeof(u_int8_t), &tuple->dst.protonum); + + /* If no protocol helper is found, this function will return the + * generic protocol helper, so proto won't *ever* be NULL */ + proto = nf_ct_proto_find_get(tuple->src.l3num, tuple->dst.protonum); + if (likely(proto->tuple_to_nfattr)) + ret = proto->tuple_to_nfattr(skb, tuple); + + nf_ct_proto_put(proto); + + return ret; + +nfattr_failure: + return -1; +} + +static inline int +ctnetlink_dump_tuples(struct sk_buff *skb, + const struct nf_conntrack_tuple *tuple) +{ + struct nfattr *nest_parms; + struct nf_conntrack_l3proto *l3proto; + int ret = 0; + + l3proto = nf_ct_l3proto_find_get(tuple->src.l3num); + + nest_parms = NFA_NEST(skb, CTA_TUPLE_IP); + if (likely(l3proto->tuple_to_nfattr)) + ret = l3proto->tuple_to_nfattr(skb, tuple); + NFA_NEST_END(skb, nest_parms); + + nf_ct_l3proto_put(l3proto); + + if (unlikely(ret < 0)) + return ret; + + nest_parms = NFA_NEST(skb, CTA_TUPLE_PROTO); + ret = ctnetlink_dump_tuples_proto(skb, tuple); + NFA_NEST_END(skb, nest_parms); + + return ret; + +nfattr_failure: + return -1; +} + +static inline int +ctnetlink_dump_status(struct sk_buff *skb, const struct nf_conn *ct) +{ + u_int32_t status = htonl((u_int32_t) ct->status); + NFA_PUT(skb, CTA_STATUS, sizeof(status), &status); + return 0; + +nfattr_failure: + return -1; +} + +static inline int +ctnetlink_dump_timeout(struct sk_buff *skb, const struct nf_conn *ct) +{ + long timeout_l = ct->timeout.expires - jiffies; + u_int32_t timeout; + + if (timeout_l < 0) + timeout = 0; + else + timeout = htonl(timeout_l / HZ); + + NFA_PUT(skb, CTA_TIMEOUT, sizeof(timeout), &timeout); + return 0; + +nfattr_failure: + return -1; +} + +static inline int +ctnetlink_dump_protoinfo(struct sk_buff *skb, const struct nf_conn *ct) +{ + struct nf_conntrack_protocol *proto = nf_ct_proto_find_get(ct->tuplehash[IP_CT_DIR_ORIGINAL].tuple.src.l3num, ct->tuplehash[IP_CT_DIR_ORIGINAL].tuple.dst.protonum); + struct nfattr *nest_proto; + int ret; + + if (!proto->to_nfattr) { + nf_ct_proto_put(proto); + return 0; + } + + nest_proto = NFA_NEST(skb, CTA_PROTOINFO); + + ret = proto->to_nfattr(skb, nest_proto, ct); + + nf_ct_proto_put(proto); + + NFA_NEST_END(skb, nest_proto); + + return ret; + +nfattr_failure: + return -1; +} + +static inline int +ctnetlink_dump_helpinfo(struct sk_buff *skb, const struct nf_conn *ct) +{ + struct nfattr *nest_helper; + + if (!ct->helper) + return 0; + + nest_helper = NFA_NEST(skb, CTA_HELP); + NFA_PUT(skb, CTA_HELP_NAME, strlen(ct->helper->name), ct->helper->name); + + if (ct->helper->to_nfattr) + ct->helper->to_nfattr(skb, ct); + + NFA_NEST_END(skb, nest_helper); + + return 0; + +nfattr_failure: + return -1; +} + +#ifdef CONFIG_NF_CT_ACCT +static inline int +ctnetlink_dump_counters(struct sk_buff *skb, const struct nf_conn *ct, + enum ip_conntrack_dir dir) +{ + enum ctattr_type type = dir ? CTA_COUNTERS_REPLY: CTA_COUNTERS_ORIG; + struct nfattr *nest_count = NFA_NEST(skb, type); + u_int32_t tmp; + + tmp = htonl(ct->counters[dir].packets); + NFA_PUT(skb, CTA_COUNTERS32_PACKETS, sizeof(u_int32_t), &tmp); + + tmp = htonl(ct->counters[dir].bytes); + NFA_PUT(skb, CTA_COUNTERS32_BYTES, sizeof(u_int32_t), &tmp); + + NFA_NEST_END(skb, nest_count); + + return 0; + +nfattr_failure: + return -1; +} +#else +#define ctnetlink_dump_counters(a, b, c) (0) +#endif + +#ifdef CONFIG_NF_CONNTRACK_MARK +static inline int +ctnetlink_dump_mark(struct sk_buff *skb, const struct nf_conn *ct) +{ + u_int32_t mark = htonl(ct->mark); + + NFA_PUT(skb, CTA_MARK, sizeof(u_int32_t), &mark); + return 0; + +nfattr_failure: + return -1; +} +#else +#define ctnetlink_dump_mark(a, b) (0) +#endif + +static inline int +ctnetlink_dump_id(struct sk_buff *skb, const struct nf_conn *ct) +{ + u_int32_t id = htonl(ct->id); + NFA_PUT(skb, CTA_ID, sizeof(u_int32_t), &id); + return 0; + +nfattr_failure: + return -1; +} + +static inline int +ctnetlink_dump_use(struct sk_buff *skb, const struct nf_conn *ct) +{ + u_int32_t use = htonl(atomic_read(&ct->ct_general.use)); + + NFA_PUT(skb, CTA_USE, sizeof(u_int32_t), &use); + return 0; + +nfattr_failure: + return -1; +} + +#define tuple(ct, dir) (&(ct)->tuplehash[dir].tuple) + +static int +ctnetlink_fill_info(struct sk_buff *skb, u32 pid, u32 seq, + int event, int nowait, + const struct nf_conn *ct) +{ + struct nlmsghdr *nlh; + struct nfgenmsg *nfmsg; + struct nfattr *nest_parms; + unsigned char *b; + + b = skb->tail; + + event |= NFNL_SUBSYS_CTNETLINK << 8; + nlh = NLMSG_PUT(skb, pid, seq, event, sizeof(struct nfgenmsg)); + nfmsg = NLMSG_DATA(nlh); + + nlh->nlmsg_flags = (nowait && pid) ? NLM_F_MULTI : 0; + nfmsg->nfgen_family = + ct->tuplehash[IP_CT_DIR_ORIGINAL].tuple.src.l3num; + nfmsg->version = NFNETLINK_V0; + nfmsg->res_id = 0; + + nest_parms = NFA_NEST(skb, CTA_TUPLE_ORIG); + if (ctnetlink_dump_tuples(skb, tuple(ct, IP_CT_DIR_ORIGINAL)) < 0) + goto nfattr_failure; + NFA_NEST_END(skb, nest_parms); + + nest_parms = NFA_NEST(skb, CTA_TUPLE_REPLY); + if (ctnetlink_dump_tuples(skb, tuple(ct, IP_CT_DIR_REPLY)) < 0) + goto nfattr_failure; + NFA_NEST_END(skb, nest_parms); + + if (ctnetlink_dump_status(skb, ct) < 0 || + ctnetlink_dump_timeout(skb, ct) < 0 || + ctnetlink_dump_counters(skb, ct, IP_CT_DIR_ORIGINAL) < 0 || + ctnetlink_dump_counters(skb, ct, IP_CT_DIR_REPLY) < 0 || + ctnetlink_dump_protoinfo(skb, ct) < 0 || + ctnetlink_dump_helpinfo(skb, ct) < 0 || + ctnetlink_dump_mark(skb, ct) < 0 || + ctnetlink_dump_id(skb, ct) < 0 || + ctnetlink_dump_use(skb, ct) < 0) + goto nfattr_failure; + + nlh->nlmsg_len = skb->tail - b; + return skb->len; + +nlmsg_failure: +nfattr_failure: + skb_trim(skb, b - skb->data); + return -1; +} + +#ifdef CONFIG_NF_CONNTRACK_EVENTS +static int ctnetlink_conntrack_event(struct notifier_block *this, + unsigned long events, void *ptr) +{ + struct nlmsghdr *nlh; + struct nfgenmsg *nfmsg; + struct nfattr *nest_parms; + struct nf_conn *ct = (struct nf_conn *)ptr; + struct sk_buff *skb; + unsigned int type; + unsigned char *b; + unsigned int flags = 0, group; + + /* ignore our fake conntrack entry */ + if (ct == &nf_conntrack_untracked) + return NOTIFY_DONE; + + if (events & IPCT_DESTROY) { + type = IPCTNL_MSG_CT_DELETE; + group = NFNLGRP_CONNTRACK_DESTROY; + } else if (events & (IPCT_NEW | IPCT_RELATED)) { + type = IPCTNL_MSG_CT_NEW; + flags = NLM_F_CREATE|NLM_F_EXCL; + /* dump everything */ + events = ~0UL; + group = NFNLGRP_CONNTRACK_NEW; + } else if (events & (IPCT_STATUS | + IPCT_PROTOINFO | + IPCT_HELPER | + IPCT_HELPINFO | + IPCT_NATINFO)) { + type = IPCTNL_MSG_CT_NEW; + group = NFNLGRP_CONNTRACK_UPDATE; + } else + return NOTIFY_DONE; + + /* FIXME: Check if there are any listeners before, don't hurt performance */ + + skb = alloc_skb(NLMSG_GOODSIZE, GFP_ATOMIC); + if (!skb) + return NOTIFY_DONE; + + b = skb->tail; + + type |= NFNL_SUBSYS_CTNETLINK << 8; + nlh = NLMSG_PUT(skb, 0, 0, type, sizeof(struct nfgenmsg)); + nfmsg = NLMSG_DATA(nlh); + + nlh->nlmsg_flags = flags; + nfmsg->nfgen_family = ct->tuplehash[IP_CT_DIR_ORIGINAL].tuple.src.l3num; + nfmsg->version = NFNETLINK_V0; + nfmsg->res_id = 0; + + nest_parms = NFA_NEST(skb, CTA_TUPLE_ORIG); + if (ctnetlink_dump_tuples(skb, tuple(ct, IP_CT_DIR_ORIGINAL)) < 0) + goto nfattr_failure; + NFA_NEST_END(skb, nest_parms); + + nest_parms = NFA_NEST(skb, CTA_TUPLE_REPLY); + if (ctnetlink_dump_tuples(skb, tuple(ct, IP_CT_DIR_REPLY)) < 0) + goto nfattr_failure; + NFA_NEST_END(skb, nest_parms); + + /* NAT stuff is now a status flag */ + if ((events & IPCT_STATUS || events & IPCT_NATINFO) + && ctnetlink_dump_status(skb, ct) < 0) + goto nfattr_failure; + if (events & IPCT_REFRESH + && ctnetlink_dump_timeout(skb, ct) < 0) + goto nfattr_failure; + if (events & IPCT_PROTOINFO + && ctnetlink_dump_protoinfo(skb, ct) < 0) + goto nfattr_failure; + if (events & IPCT_HELPINFO + && ctnetlink_dump_helpinfo(skb, ct) < 0) + goto nfattr_failure; + + if (ctnetlink_dump_counters(skb, ct, IP_CT_DIR_ORIGINAL) < 0 || + ctnetlink_dump_counters(skb, ct, IP_CT_DIR_REPLY) < 0) + goto nfattr_failure; + + nlh->nlmsg_len = skb->tail - b; + nfnetlink_send(skb, 0, group, 0); + return NOTIFY_DONE; + +nlmsg_failure: +nfattr_failure: + kfree_skb(skb); + return NOTIFY_DONE; +} +#endif /* CONFIG_NF_CONNTRACK_EVENTS */ + +static int ctnetlink_done(struct netlink_callback *cb) +{ + DEBUGP("entered %s\n", __FUNCTION__); + return 0; +} + +static int +ctnetlink_dump_table(struct sk_buff *skb, struct netlink_callback *cb) +{ + struct nf_conn *ct = NULL; + struct nf_conntrack_tuple_hash *h; + struct list_head *i; + u_int32_t *id = (u_int32_t *) &cb->args[1]; + + DEBUGP("entered %s, last bucket=%lu id=%u\n", __FUNCTION__, + cb->args[0], *id); + + read_lock_bh(&nf_conntrack_lock); + for (; cb->args[0] < nf_conntrack_htable_size; cb->args[0]++, *id = 0) { + list_for_each_prev(i, &nf_conntrack_hash[cb->args[0]]) { + h = (struct nf_conntrack_tuple_hash *) i; + if (DIRECTION(h) != IP_CT_DIR_ORIGINAL) + continue; + ct = nf_ct_tuplehash_to_ctrack(h); + if (ct->id <= *id) + continue; + if (ctnetlink_fill_info(skb, NETLINK_CB(cb->skb).pid, + cb->nlh->nlmsg_seq, + IPCTNL_MSG_CT_NEW, + 1, ct) < 0) + goto out; + *id = ct->id; + } + } +out: + read_unlock_bh(&nf_conntrack_lock); + + DEBUGP("leaving, last bucket=%lu id=%u\n", cb->args[0], *id); + + return skb->len; +} + +#ifdef CONFIG_NF_CT_ACCT +static int +ctnetlink_dump_table_w(struct sk_buff *skb, struct netlink_callback *cb) +{ + struct nf_conn *ct = NULL; + struct nf_conntrack_tuple_hash *h; + struct list_head *i; + u_int32_t *id = (u_int32_t *) &cb->args[1]; + + DEBUGP("entered %s, last bucket=%u id=%u\n", __FUNCTION__, + cb->args[0], *id); + + write_lock_bh(&nf_conntrack_lock); + for (; cb->args[0] < nf_conntrack_htable_size; cb->args[0]++, *id = 0) { + list_for_each_prev(i, &nf_conntrack_hash[cb->args[0]]) { + h = (struct nf_conntrack_tuple_hash *) i; + if (DIRECTION(h) != IP_CT_DIR_ORIGINAL) + continue; + ct = nf_ct_tuplehash_to_ctrack(h); + if (ct->id <= *id) + continue; + if (ctnetlink_fill_info(skb, NETLINK_CB(cb->skb).pid, + cb->nlh->nlmsg_seq, + IPCTNL_MSG_CT_NEW, + 1, ct) < 0) + goto out; + *id = ct->id; + + memset(&ct->counters, 0, sizeof(ct->counters)); + } + } +out: + write_unlock_bh(&nf_conntrack_lock); + + DEBUGP("leaving, last bucket=%lu id=%u\n", cb->args[0], *id); + + return skb->len; +} +#endif + +static inline int +ctnetlink_parse_tuple_ip(struct nfattr *attr, struct nf_conntrack_tuple *tuple) +{ + struct nfattr *tb[CTA_IP_MAX]; + struct nf_conntrack_l3proto *l3proto; + int ret = 0; + + DEBUGP("entered %s\n", __FUNCTION__); + + nfattr_parse_nested(tb, CTA_IP_MAX, attr); + + l3proto = nf_ct_l3proto_find_get(tuple->src.l3num); + + if (likely(l3proto->nfattr_to_tuple)) + ret = l3proto->nfattr_to_tuple(tb, tuple); + + nf_ct_l3proto_put(l3proto); + + DEBUGP("leaving\n"); + + return ret; +} + +static const size_t cta_min_proto[CTA_PROTO_MAX] = { + [CTA_PROTO_NUM-1] = sizeof(u_int8_t), +}; + +static inline int +ctnetlink_parse_tuple_proto(struct nfattr *attr, + struct nf_conntrack_tuple *tuple) +{ + struct nfattr *tb[CTA_PROTO_MAX]; + struct nf_conntrack_protocol *proto; + int ret = 0; + + DEBUGP("entered %s\n", __FUNCTION__); + + nfattr_parse_nested(tb, CTA_PROTO_MAX, attr); + + if (nfattr_bad_size(tb, CTA_PROTO_MAX, cta_min_proto)) + return -EINVAL; + + if (!tb[CTA_PROTO_NUM-1]) + return -EINVAL; + tuple->dst.protonum = *(u_int8_t *)NFA_DATA(tb[CTA_PROTO_NUM-1]); + + proto = nf_ct_proto_find_get(tuple->src.l3num, tuple->dst.protonum); + + if (likely(proto->nfattr_to_tuple)) + ret = proto->nfattr_to_tuple(tb, tuple); + + nf_ct_proto_put(proto); + + return ret; +} + +static inline int +ctnetlink_parse_tuple(struct nfattr *cda[], struct nf_conntrack_tuple *tuple, + enum ctattr_tuple type, u_int8_t l3num) +{ + struct nfattr *tb[CTA_TUPLE_MAX]; + int err; + + DEBUGP("entered %s\n", __FUNCTION__); + + memset(tuple, 0, sizeof(*tuple)); + + nfattr_parse_nested(tb, CTA_TUPLE_MAX, cda[type-1]); + + if (!tb[CTA_TUPLE_IP-1]) + return -EINVAL; + + tuple->src.l3num = l3num; + + err = ctnetlink_parse_tuple_ip(tb[CTA_TUPLE_IP-1], tuple); + if (err < 0) + return err; + + if (!tb[CTA_TUPLE_PROTO-1]) + return -EINVAL; + + err = ctnetlink_parse_tuple_proto(tb[CTA_TUPLE_PROTO-1], tuple); + if (err < 0) + return err; + + /* orig and expect tuples get DIR_ORIGINAL */ + if (type == CTA_TUPLE_REPLY) + tuple->dst.dir = IP_CT_DIR_REPLY; + else + tuple->dst.dir = IP_CT_DIR_ORIGINAL; + + NF_CT_DUMP_TUPLE(tuple); + + DEBUGP("leaving\n"); + + return 0; +} + +#ifdef CONFIG_IP_NF_NAT_NEEDED +static const size_t cta_min_protonat[CTA_PROTONAT_MAX] = { + [CTA_PROTONAT_PORT_MIN-1] = sizeof(u_int16_t), + [CTA_PROTONAT_PORT_MAX-1] = sizeof(u_int16_t), +}; + +static int ctnetlink_parse_nat_proto(struct nfattr *attr, + const struct nf_conn *ct, + struct ip_nat_range *range) +{ + struct nfattr *tb[CTA_PROTONAT_MAX]; + struct ip_nat_protocol *npt; + + DEBUGP("entered %s\n", __FUNCTION__); + + nfattr_parse_nested(tb, CTA_PROTONAT_MAX, attr); + + if (nfattr_bad_size(tb, CTA_PROTONAT_MAX, cta_min_protonat)) + return -EINVAL; + + npt = ip_nat_proto_find_get(ct->tuplehash[IP_CT_DIR_ORIGINAL].tuple.dst.protonum); + + if (!npt->nfattr_to_range) { + ip_nat_proto_put(npt); + return 0; + } + + /* nfattr_to_range returns 1 if it parsed, 0 if not, neg. on error */ + if (npt->nfattr_to_range(tb, range) > 0) + range->flags |= IP_NAT_RANGE_PROTO_SPECIFIED; + + ip_nat_proto_put(npt); + + DEBUGP("leaving\n"); + return 0; +} + +static const size_t cta_min_nat[CTA_NAT_MAX] = { + [CTA_NAT_MINIP-1] = sizeof(u_int32_t), + [CTA_NAT_MAXIP-1] = sizeof(u_int32_t), +}; + +static inline int +ctnetlink_parse_nat(struct nfattr *cda[], + const struct nf_conn *ct, struct ip_nat_range *range) +{ + struct nfattr *tb[CTA_NAT_MAX]; + int err; + + DEBUGP("entered %s\n", __FUNCTION__); + + memset(range, 0, sizeof(*range)); + + nfattr_parse_nested(tb, CTA_NAT_MAX, cda[CTA_NAT-1]); + + if (nfattr_bad_size(tb, CTA_NAT_MAX, cta_min_nat)) + return -EINVAL; + + if (tb[CTA_NAT_MINIP-1]) + range->min_ip = *(u_int32_t *)NFA_DATA(tb[CTA_NAT_MINIP-1]); + + if (!tb[CTA_NAT_MAXIP-1]) + range->max_ip = range->min_ip; + else + range->max_ip = *(u_int32_t *)NFA_DATA(tb[CTA_NAT_MAXIP-1]); + + if (range->min_ip) + range->flags |= IP_NAT_RANGE_MAP_IPS; + + if (!tb[CTA_NAT_PROTO-1]) + return 0; + + err = ctnetlink_parse_nat_proto(tb[CTA_NAT_PROTO-1], ct, range); + if (err < 0) + return err; + + DEBUGP("leaving\n"); + return 0; +} +#endif + +static inline int +ctnetlink_parse_help(struct nfattr *attr, char **helper_name) +{ + struct nfattr *tb[CTA_HELP_MAX]; + + DEBUGP("entered %s\n", __FUNCTION__); + + nfattr_parse_nested(tb, CTA_HELP_MAX, attr); + + if (!tb[CTA_HELP_NAME-1]) + return -EINVAL; + + *helper_name = NFA_DATA(tb[CTA_HELP_NAME-1]); + + return 0; +} + +static const size_t cta_min[CTA_MAX] = { + [CTA_STATUS-1] = sizeof(u_int32_t), + [CTA_TIMEOUT-1] = sizeof(u_int32_t), + [CTA_MARK-1] = sizeof(u_int32_t), + [CTA_USE-1] = sizeof(u_int32_t), + [CTA_ID-1] = sizeof(u_int32_t) +}; + +static int +ctnetlink_del_conntrack(struct sock *ctnl, struct sk_buff *skb, + struct nlmsghdr *nlh, struct nfattr *cda[], int *errp) +{ + struct nf_conntrack_tuple_hash *h; + struct nf_conntrack_tuple tuple; + struct nf_conn *ct; + struct nfgenmsg *nfmsg = NLMSG_DATA(nlh); + u_int8_t u3 = nfmsg->nfgen_family; + int err = 0; + + DEBUGP("entered %s\n", __FUNCTION__); + + if (nfattr_bad_size(cda, CTA_MAX, cta_min)) + return -EINVAL; + + if (cda[CTA_TUPLE_ORIG-1]) + err = ctnetlink_parse_tuple(cda, &tuple, CTA_TUPLE_ORIG, u3); + else if (cda[CTA_TUPLE_REPLY-1]) + err = ctnetlink_parse_tuple(cda, &tuple, CTA_TUPLE_REPLY, u3); + else { + /* Flush the whole table */ + nf_conntrack_flush(); + return 0; + } + + if (err < 0) + return err; + + h = nf_conntrack_find_get(&tuple, NULL); + if (!h) { + DEBUGP("tuple not found in conntrack hash\n"); + return -ENOENT; + } + + ct = nf_ct_tuplehash_to_ctrack(h); + + if (cda[CTA_ID-1]) { + u_int32_t id = ntohl(*(u_int32_t *)NFA_DATA(cda[CTA_ID-1])); + if (ct->id != id) { + nf_ct_put(ct); + return -ENOENT; + } + } + if (del_timer(&ct->timeout)) + ct->timeout.function((unsigned long)ct); + + nf_ct_put(ct); + DEBUGP("leaving\n"); + + return 0; +} + +static int +ctnetlink_get_conntrack(struct sock *ctnl, struct sk_buff *skb, + struct nlmsghdr *nlh, struct nfattr *cda[], int *errp) +{ + struct nf_conntrack_tuple_hash *h; + struct nf_conntrack_tuple tuple; + struct nf_conn *ct; + struct sk_buff *skb2 = NULL; + struct nfgenmsg *nfmsg = NLMSG_DATA(nlh); + u_int8_t u3 = nfmsg->nfgen_family; + int err = 0; + + DEBUGP("entered %s\n", __FUNCTION__); + + if (nlh->nlmsg_flags & NLM_F_DUMP) { + u32 rlen; + + if (nfmsg->nfgen_family != AF_INET) + return -EAFNOSUPPORT; + + if (NFNL_MSG_TYPE(nlh->nlmsg_type) == + IPCTNL_MSG_CT_GET_CTRZERO) { +#ifdef CONFIG_NF_CT_ACCT + if ((*errp = netlink_dump_start(ctnl, skb, nlh, + ctnetlink_dump_table_w, + ctnetlink_done)) != 0) + return -EINVAL; +#else + return -ENOTSUPP; +#endif + } else { + if ((*errp = netlink_dump_start(ctnl, skb, nlh, + ctnetlink_dump_table, + ctnetlink_done)) != 0) + return -EINVAL; + } + + rlen = NLMSG_ALIGN(nlh->nlmsg_len); + if (rlen > skb->len) + rlen = skb->len; + skb_pull(skb, rlen); + return 0; + } + + if (nfattr_bad_size(cda, CTA_MAX, cta_min)) + return -EINVAL; + + if (cda[CTA_TUPLE_ORIG-1]) + err = ctnetlink_parse_tuple(cda, &tuple, CTA_TUPLE_ORIG, u3); + else if (cda[CTA_TUPLE_REPLY-1]) + err = ctnetlink_parse_tuple(cda, &tuple, CTA_TUPLE_REPLY, u3); + else + return -EINVAL; + + if (err < 0) + return err; + + h = nf_conntrack_find_get(&tuple, NULL); + if (!h) { + DEBUGP("tuple not found in conntrack hash"); + return -ENOENT; + } + DEBUGP("tuple found\n"); + ct = nf_ct_tuplehash_to_ctrack(h); + + err = -ENOMEM; + skb2 = alloc_skb(NLMSG_GOODSIZE, GFP_KERNEL); + if (!skb2) { + nf_ct_put(ct); + return -ENOMEM; + } + NETLINK_CB(skb2).dst_pid = NETLINK_CB(skb).pid; + + err = ctnetlink_fill_info(skb2, NETLINK_CB(skb).pid, nlh->nlmsg_seq, + IPCTNL_MSG_CT_NEW, 1, ct); + nf_ct_put(ct); + if (err <= 0) + goto free; + + err = netlink_unicast(ctnl, skb2, NETLINK_CB(skb).pid, MSG_DONTWAIT); + if (err < 0) + goto out; + + DEBUGP("leaving\n"); + return 0; + +free: + kfree_skb(skb2); +out: + return err; +} + +static inline int +ctnetlink_change_status(struct nf_conn *ct, struct nfattr *cda[]) +{ + unsigned long d; + unsigned status = ntohl(*(u_int32_t *)NFA_DATA(cda[CTA_STATUS-1])); + d = ct->status ^ status; + + if (d & (IPS_EXPECTED|IPS_CONFIRMED|IPS_DYING)) + /* unchangeable */ + return -EINVAL; + + if (d & IPS_SEEN_REPLY && !(status & IPS_SEEN_REPLY)) + /* SEEN_REPLY bit can only be set */ + return -EINVAL; + + + if (d & IPS_ASSURED && !(status & IPS_ASSURED)) + /* ASSURED bit can only be set */ + return -EINVAL; + + if (cda[CTA_NAT-1]) { +#ifndef CONFIG_IP_NF_NAT_NEEDED + return -EINVAL; +#else + unsigned int hooknum; + struct ip_nat_range range; + + if (ctnetlink_parse_nat(cda, ct, &range) < 0) + return -EINVAL; + + DEBUGP("NAT: %u.%u.%u.%u-%u.%u.%u.%u:%u-%u\n", + NIPQUAD(range.min_ip), NIPQUAD(range.max_ip), + htons(range.min.all), htons(range.max.all)); + + /* This is tricky but it works. ip_nat_setup_info needs the + * hook number as parameter, so let's do the correct + * conversion and run away */ + if (status & IPS_SRC_NAT_DONE) + hooknum = NF_IP_POST_ROUTING; /* IP_NAT_MANIP_SRC */ + else if (status & IPS_DST_NAT_DONE) + hooknum = NF_IP_PRE_ROUTING; /* IP_NAT_MANIP_DST */ + else + return -EINVAL; /* Missing NAT flags */ + + DEBUGP("NAT status: %lu\n", + status & (IPS_NAT_MASK | IPS_NAT_DONE_MASK)); + + if (ip_nat_initialized(ct, HOOK2MANIP(hooknum))) + return -EEXIST; + ip_nat_setup_info(ct, &range, hooknum); + + DEBUGP("NAT status after setup_info: %lu\n", + ct->status & (IPS_NAT_MASK | IPS_NAT_DONE_MASK)); +#endif + } + + /* Be careful here, modifying NAT bits can screw up things, + * so don't let users modify them directly if they don't pass + * ip_nat_range. */ + ct->status |= status & ~(IPS_NAT_DONE_MASK | IPS_NAT_MASK); + return 0; +} + + +static inline int +ctnetlink_change_helper(struct nf_conn *ct, struct nfattr *cda[]) +{ + struct nf_conntrack_helper *helper; + char *helpname; + int err; + + DEBUGP("entered %s\n", __FUNCTION__); + + /* don't change helper of sibling connections */ + if (ct->master) + return -EINVAL; + + err = ctnetlink_parse_help(cda[CTA_HELP-1], &helpname); + if (err < 0) + return err; + + helper = __nf_conntrack_helper_find_byname(helpname); + if (!helper) { + if (!strcmp(helpname, "")) + helper = NULL; + else + return -EINVAL; + } + + if (ct->helper) { + if (!helper) { + /* we had a helper before ... */ + nf_ct_remove_expectations(ct); + ct->helper = NULL; + } else { + /* need to zero data of old helper */ + memset(&ct->help, 0, sizeof(ct->help)); + } + } + + ct->helper = helper; + + return 0; +} + +static inline int +ctnetlink_change_timeout(struct nf_conn *ct, struct nfattr *cda[]) +{ + u_int32_t timeout = ntohl(*(u_int32_t *)NFA_DATA(cda[CTA_TIMEOUT-1])); + + if (!del_timer(&ct->timeout)) + return -ETIME; + + ct->timeout.expires = jiffies + timeout * HZ; + add_timer(&ct->timeout); + + return 0; +} + +static inline int +ctnetlink_change_protoinfo(struct nf_conn *ct, struct nfattr *cda[]) +{ + struct nfattr *tb[CTA_PROTOINFO_MAX], *attr = cda[CTA_PROTOINFO-1]; + struct nf_conntrack_protocol *proto; + u_int16_t npt = ct->tuplehash[IP_CT_DIR_ORIGINAL].tuple.dst.protonum; + u_int16_t l3num = ct->tuplehash[IP_CT_DIR_ORIGINAL].tuple.src.l3num; + int err = 0; + + nfattr_parse_nested(tb, CTA_PROTOINFO_MAX, attr); + + proto = nf_ct_proto_find_get(l3num, npt); + + if (proto->from_nfattr) + err = proto->from_nfattr(tb, ct); + nf_ct_proto_put(proto); + + return err; +} + +static int +ctnetlink_change_conntrack(struct nf_conn *ct, struct nfattr *cda[]) +{ + int err; + + DEBUGP("entered %s\n", __FUNCTION__); + + if (cda[CTA_HELP-1]) { + err = ctnetlink_change_helper(ct, cda); + if (err < 0) + return err; + } + + if (cda[CTA_TIMEOUT-1]) { + err = ctnetlink_change_timeout(ct, cda); + if (err < 0) + return err; + } + + if (cda[CTA_STATUS-1]) { + err = ctnetlink_change_status(ct, cda); + if (err < 0) + return err; + } + + if (cda[CTA_PROTOINFO-1]) { + err = ctnetlink_change_protoinfo(ct, cda); + if (err < 0) + return err; + } + +#if defined(CONFIG_IP_NF_CONNTRACK_MARK) + if (cda[CTA_MARK-1]) + ct->mark = ntohl(*(u_int32_t *)NFA_DATA(cda[CTA_MARK-1])); +#endif + + DEBUGP("all done\n"); + return 0; +} + +static int +ctnetlink_create_conntrack(struct nfattr *cda[], + struct nf_conntrack_tuple *otuple, + struct nf_conntrack_tuple *rtuple) +{ + struct nf_conn *ct; + int err = -EINVAL; + + DEBUGP("entered %s\n", __FUNCTION__); + + ct = nf_conntrack_alloc(otuple, rtuple); + if (ct == NULL || IS_ERR(ct)) + return -ENOMEM; + + if (!cda[CTA_TIMEOUT-1]) + goto err; + ct->timeout.expires = ntohl(*(u_int32_t *)NFA_DATA(cda[CTA_TIMEOUT-1])); + + ct->timeout.expires = jiffies + ct->timeout.expires * HZ; + ct->status |= IPS_CONFIRMED; + + err = ctnetlink_change_status(ct, cda); + if (err < 0) + goto err; + + if (cda[CTA_PROTOINFO-1]) { + err = ctnetlink_change_protoinfo(ct, cda); + if (err < 0) + return err; + } + +#if defined(CONFIG_IP_NF_CONNTRACK_MARK) + if (cda[CTA_MARK-1]) + ct->mark = ntohl(*(u_int32_t *)NFA_DATA(cda[CTA_MARK-1])); +#endif + + ct->helper = nf_ct_helper_find_get(rtuple); + + add_timer(&ct->timeout); + nf_conntrack_hash_insert(ct); + + if (ct->helper) + nf_ct_helper_put(ct->helper); + + DEBUGP("conntrack with id %u inserted\n", ct->id); + return 0; + +err: + nf_conntrack_free(ct); + return err; +} + +static int +ctnetlink_new_conntrack(struct sock *ctnl, struct sk_buff *skb, + struct nlmsghdr *nlh, struct nfattr *cda[], int *errp) +{ + struct nf_conntrack_tuple otuple, rtuple; + struct nf_conntrack_tuple_hash *h = NULL; + struct nfgenmsg *nfmsg = NLMSG_DATA(nlh); + u_int8_t u3 = nfmsg->nfgen_family; + int err = 0; + + DEBUGP("entered %s\n", __FUNCTION__); + + if (nfattr_bad_size(cda, CTA_MAX, cta_min)) + return -EINVAL; + + if (cda[CTA_TUPLE_ORIG-1]) { + err = ctnetlink_parse_tuple(cda, &otuple, CTA_TUPLE_ORIG, u3); + if (err < 0) + return err; + } + + if (cda[CTA_TUPLE_REPLY-1]) { + err = ctnetlink_parse_tuple(cda, &rtuple, CTA_TUPLE_REPLY, u3); + if (err < 0) + return err; + } + + write_lock_bh(&nf_conntrack_lock); + if (cda[CTA_TUPLE_ORIG-1]) + h = __nf_conntrack_find(&otuple, NULL); + else if (cda[CTA_TUPLE_REPLY-1]) + h = __nf_conntrack_find(&rtuple, NULL); + + if (h == NULL) { + write_unlock_bh(&nf_conntrack_lock); + DEBUGP("no such conntrack, create new\n"); + err = -ENOENT; + if (nlh->nlmsg_flags & NLM_F_CREATE) + err = ctnetlink_create_conntrack(cda, &otuple, &rtuple); + return err; + } + /* implicit 'else' */ + + /* we only allow nat config for new conntracks */ + if (cda[CTA_NAT-1]) { + err = -EINVAL; + goto out_unlock; + } + + /* We manipulate the conntrack inside the global conntrack table lock, + * so there's no need to increase the refcount */ + DEBUGP("conntrack found\n"); + err = -EEXIST; + if (!(nlh->nlmsg_flags & NLM_F_EXCL)) + err = ctnetlink_change_conntrack(nf_ct_tuplehash_to_ctrack(h), cda); + +out_unlock: + write_unlock_bh(&nf_conntrack_lock); + return err; +} + +/*********************************************************************** + * EXPECT + ***********************************************************************/ + +static inline int +ctnetlink_exp_dump_tuple(struct sk_buff *skb, + const struct nf_conntrack_tuple *tuple, + enum ctattr_expect type) +{ + struct nfattr *nest_parms = NFA_NEST(skb, type); + + if (ctnetlink_dump_tuples(skb, tuple) < 0) + goto nfattr_failure; + + NFA_NEST_END(skb, nest_parms); + + return 0; + +nfattr_failure: + return -1; +} + +static inline int +ctnetlink_exp_dump_expect(struct sk_buff *skb, + const struct nf_conntrack_expect *exp) +{ + struct nf_conn *master = exp->master; + u_int32_t timeout = htonl((exp->timeout.expires - jiffies) / HZ); + u_int32_t id = htonl(exp->id); + + if (ctnetlink_exp_dump_tuple(skb, &exp->tuple, CTA_EXPECT_TUPLE) < 0) + goto nfattr_failure; + if (ctnetlink_exp_dump_tuple(skb, &exp->mask, CTA_EXPECT_MASK) < 0) + goto nfattr_failure; + if (ctnetlink_exp_dump_tuple(skb, + &master->tuplehash[IP_CT_DIR_ORIGINAL].tuple, + CTA_EXPECT_MASTER) < 0) + goto nfattr_failure; + + NFA_PUT(skb, CTA_EXPECT_TIMEOUT, sizeof(timeout), &timeout); + NFA_PUT(skb, CTA_EXPECT_ID, sizeof(u_int32_t), &id); + + return 0; + +nfattr_failure: + return -1; +} + +static int +ctnetlink_exp_fill_info(struct sk_buff *skb, u32 pid, u32 seq, + int event, + int nowait, + const struct nf_conntrack_expect *exp) +{ + struct nlmsghdr *nlh; + struct nfgenmsg *nfmsg; + unsigned char *b; + + b = skb->tail; + + event |= NFNL_SUBSYS_CTNETLINK_EXP << 8; + nlh = NLMSG_PUT(skb, pid, seq, event, sizeof(struct nfgenmsg)); + nfmsg = NLMSG_DATA(nlh); + + nlh->nlmsg_flags = (nowait && pid) ? NLM_F_MULTI : 0; + nfmsg->nfgen_family = exp->tuple.src.l3num; + nfmsg->version = NFNETLINK_V0; + nfmsg->res_id = 0; + + if (ctnetlink_exp_dump_expect(skb, exp) < 0) + goto nfattr_failure; + + nlh->nlmsg_len = skb->tail - b; + return skb->len; + +nlmsg_failure: +nfattr_failure: + skb_trim(skb, b - skb->data); + return -1; +} + +#ifdef CONFIG_NF_CONNTRACK_EVENTS +static int ctnetlink_expect_event(struct notifier_block *this, + unsigned long events, void *ptr) +{ + struct nlmsghdr *nlh; + struct nfgenmsg *nfmsg; + struct nf_conntrack_expect *exp = (struct nf_conntrack_expect *)ptr; + struct sk_buff *skb; + unsigned int type; + unsigned char *b; + int flags = 0; + + if (events & IPEXP_NEW) { + type = IPCTNL_MSG_EXP_NEW; + flags = NLM_F_CREATE|NLM_F_EXCL; + } else + return NOTIFY_DONE; + + skb = alloc_skb(NLMSG_GOODSIZE, GFP_ATOMIC); + if (!skb) + return NOTIFY_DONE; + + b = skb->tail; + + type |= NFNL_SUBSYS_CTNETLINK << 8; + nlh = NLMSG_PUT(skb, 0, 0, type, sizeof(struct nfgenmsg)); + nfmsg = NLMSG_DATA(nlh); + + nlh->nlmsg_flags = flags; + nfmsg->nfgen_family = exp->tuple.src.l3num; + nfmsg->version = NFNETLINK_V0; + nfmsg->res_id = 0; + + if (ctnetlink_exp_dump_expect(skb, exp) < 0) + goto nfattr_failure; + + nlh->nlmsg_len = skb->tail - b; + nfnetlink_send(skb, 0, NFNLGRP_CONNTRACK_EXP_NEW, 0); + return NOTIFY_DONE; + +nlmsg_failure: +nfattr_failure: + kfree_skb(skb); + return NOTIFY_DONE; +} +#endif + +static int +ctnetlink_exp_dump_table(struct sk_buff *skb, struct netlink_callback *cb) +{ + struct nf_conntrack_expect *exp = NULL; + struct list_head *i; + u_int32_t *id = (u_int32_t *) &cb->args[0]; + + DEBUGP("entered %s, last id=%llu\n", __FUNCTION__, *id); + + read_lock_bh(&nf_conntrack_lock); + list_for_each_prev(i, &nf_conntrack_expect_list) { + exp = (struct nf_conntrack_expect *) i; + if (exp->id <= *id) + continue; + if (ctnetlink_exp_fill_info(skb, NETLINK_CB(cb->skb).pid, + cb->nlh->nlmsg_seq, + IPCTNL_MSG_EXP_NEW, + 1, exp) < 0) + goto out; + *id = exp->id; + } +out: + read_unlock_bh(&nf_conntrack_lock); + + DEBUGP("leaving, last id=%llu\n", *id); + + return skb->len; +} + +static const size_t cta_min_exp[CTA_EXPECT_MAX] = { + [CTA_EXPECT_TIMEOUT-1] = sizeof(u_int32_t), + [CTA_EXPECT_ID-1] = sizeof(u_int32_t) +}; + +static int +ctnetlink_get_expect(struct sock *ctnl, struct sk_buff *skb, + struct nlmsghdr *nlh, struct nfattr *cda[], int *errp) +{ + struct nf_conntrack_tuple tuple; + struct nf_conntrack_expect *exp; + struct sk_buff *skb2; + struct nfgenmsg *nfmsg = NLMSG_DATA(nlh); + u_int8_t u3 = nfmsg->nfgen_family; + int err = 0; + + DEBUGP("entered %s\n", __FUNCTION__); + + if (nfattr_bad_size(cda, CTA_EXPECT_MAX, cta_min_exp)) + return -EINVAL; + + if (nlh->nlmsg_flags & NLM_F_DUMP) { + u32 rlen; + + if (nfmsg->nfgen_family != AF_INET) + return -EAFNOSUPPORT; + + if ((*errp = netlink_dump_start(ctnl, skb, nlh, + ctnetlink_exp_dump_table, + ctnetlink_done)) != 0) + return -EINVAL; + rlen = NLMSG_ALIGN(nlh->nlmsg_len); + if (rlen > skb->len) + rlen = skb->len; + skb_pull(skb, rlen); + return 0; + } + + if (cda[CTA_EXPECT_MASTER-1]) + err = ctnetlink_parse_tuple(cda, &tuple, CTA_EXPECT_MASTER, u3); + else + return -EINVAL; + + if (err < 0) + return err; + + exp = nf_conntrack_expect_find(&tuple); + if (!exp) + return -ENOENT; + + if (cda[CTA_EXPECT_ID-1]) { + u_int32_t id = *(u_int32_t *)NFA_DATA(cda[CTA_EXPECT_ID-1]); + if (exp->id != ntohl(id)) { + nf_conntrack_expect_put(exp); + return -ENOENT; + } + } + + err = -ENOMEM; + skb2 = alloc_skb(NLMSG_GOODSIZE, GFP_KERNEL); + if (!skb2) + goto out; + NETLINK_CB(skb2).dst_pid = NETLINK_CB(skb).pid; + + err = ctnetlink_exp_fill_info(skb2, NETLINK_CB(skb).pid, + nlh->nlmsg_seq, IPCTNL_MSG_EXP_NEW, + 1, exp); + if (err <= 0) + goto free; + + nf_conntrack_expect_put(exp); + + return netlink_unicast(ctnl, skb2, NETLINK_CB(skb).pid, MSG_DONTWAIT); + +free: + kfree_skb(skb2); +out: + nf_conntrack_expect_put(exp); + return err; +} + +static int +ctnetlink_del_expect(struct sock *ctnl, struct sk_buff *skb, + struct nlmsghdr *nlh, struct nfattr *cda[], int *errp) +{ + struct nf_conntrack_expect *exp, *tmp; + struct nf_conntrack_tuple tuple; + struct nf_conntrack_helper *h; + struct nfgenmsg *nfmsg = NLMSG_DATA(nlh); + u_int8_t u3 = nfmsg->nfgen_family; + int err; + + if (nfattr_bad_size(cda, CTA_EXPECT_MAX, cta_min_exp)) + return -EINVAL; + + if (cda[CTA_EXPECT_TUPLE-1]) { + /* delete a single expect by tuple */ + err = ctnetlink_parse_tuple(cda, &tuple, CTA_EXPECT_TUPLE, u3); + if (err < 0) + return err; + + /* bump usage count to 2 */ + exp = nf_conntrack_expect_find(&tuple); + if (!exp) + return -ENOENT; + + if (cda[CTA_EXPECT_ID-1]) { + u_int32_t id = + *(u_int32_t *)NFA_DATA(cda[CTA_EXPECT_ID-1]); + if (exp->id != ntohl(id)) { + nf_conntrack_expect_put(exp); + return -ENOENT; + } + } + + /* after list removal, usage count == 1 */ + nf_conntrack_unexpect_related(exp); + /* have to put what we 'get' above. + * after this line usage count == 0 */ + nf_conntrack_expect_put(exp); + } else if (cda[CTA_EXPECT_HELP_NAME-1]) { + char *name = NFA_DATA(cda[CTA_EXPECT_HELP_NAME-1]); + + /* delete all expectations for this helper */ + write_lock_bh(&nf_conntrack_lock); + h = __nf_conntrack_helper_find_byname(name); + if (!h) { + write_unlock_bh(&nf_conntrack_lock); + return -EINVAL; + } + list_for_each_entry_safe(exp, tmp, &nf_conntrack_expect_list, + list) { + if (exp->master->helper == h + && del_timer(&exp->timeout)) { + nf_ct_unlink_expect(exp); + nf_conntrack_expect_put(exp); + } + } + write_unlock_bh(&nf_conntrack_lock); + } else { + /* This basically means we have to flush everything*/ + write_lock_bh(&nf_conntrack_lock); + list_for_each_entry_safe(exp, tmp, &nf_conntrack_expect_list, + list) { + if (del_timer(&exp->timeout)) { + nf_ct_unlink_expect(exp); + nf_conntrack_expect_put(exp); + } + } + write_unlock_bh(&nf_conntrack_lock); + } + + return 0; +} +static int +ctnetlink_change_expect(struct nf_conntrack_expect *x, struct nfattr *cda[]) +{ + return -EOPNOTSUPP; +} + +static int +ctnetlink_create_expect(struct nfattr *cda[], u_int8_t u3) +{ + struct nf_conntrack_tuple tuple, mask, master_tuple; + struct nf_conntrack_tuple_hash *h = NULL; + struct nf_conntrack_expect *exp; + struct nf_conn *ct; + int err = 0; + + DEBUGP("entered %s\n", __FUNCTION__); + + /* caller guarantees that those three CTA_EXPECT_* exist */ + err = ctnetlink_parse_tuple(cda, &tuple, CTA_EXPECT_TUPLE, u3); + if (err < 0) + return err; + err = ctnetlink_parse_tuple(cda, &mask, CTA_EXPECT_MASK, u3); + if (err < 0) + return err; + err = ctnetlink_parse_tuple(cda, &master_tuple, CTA_EXPECT_MASTER, u3); + if (err < 0) + return err; + + /* Look for master conntrack of this expectation */ + h = nf_conntrack_find_get(&master_tuple, NULL); + if (!h) + return -ENOENT; + ct = nf_ct_tuplehash_to_ctrack(h); + + if (!ct->helper) { + /* such conntrack hasn't got any helper, abort */ + err = -EINVAL; + goto out; + } + + exp = nf_conntrack_expect_alloc(ct); + if (!exp) { + err = -ENOMEM; + goto out; + } + + exp->expectfn = NULL; + exp->flags = 0; + exp->master = ct; + memcpy(&exp->tuple, &tuple, sizeof(struct nf_conntrack_tuple)); + memcpy(&exp->mask, &mask, sizeof(struct nf_conntrack_tuple)); + + err = nf_conntrack_expect_related(exp); + nf_conntrack_expect_put(exp); + +out: + nf_ct_put(nf_ct_tuplehash_to_ctrack(h)); + return err; +} + +static int +ctnetlink_new_expect(struct sock *ctnl, struct sk_buff *skb, + struct nlmsghdr *nlh, struct nfattr *cda[], int *errp) +{ + struct nf_conntrack_tuple tuple; + struct nf_conntrack_expect *exp; + struct nfgenmsg *nfmsg = NLMSG_DATA(nlh); + u_int8_t u3 = nfmsg->nfgen_family; + int err = 0; + + DEBUGP("entered %s\n", __FUNCTION__); + + if (nfattr_bad_size(cda, CTA_EXPECT_MAX, cta_min_exp)) + return -EINVAL; + + if (!cda[CTA_EXPECT_TUPLE-1] + || !cda[CTA_EXPECT_MASK-1] + || !cda[CTA_EXPECT_MASTER-1]) + return -EINVAL; + + err = ctnetlink_parse_tuple(cda, &tuple, CTA_EXPECT_TUPLE, u3); + if (err < 0) + return err; + + write_lock_bh(&nf_conntrack_lock); + exp = __nf_conntrack_expect_find(&tuple); + + if (!exp) { + write_unlock_bh(&nf_conntrack_lock); + err = -ENOENT; + if (nlh->nlmsg_flags & NLM_F_CREATE) + err = ctnetlink_create_expect(cda, u3); + return err; + } + + err = -EEXIST; + if (!(nlh->nlmsg_flags & NLM_F_EXCL)) + err = ctnetlink_change_expect(exp, cda); + write_unlock_bh(&nf_conntrack_lock); + + DEBUGP("leaving\n"); + + return err; +} + +#ifdef CONFIG_NF_CONNTRACK_EVENTS +static struct notifier_block ctnl_notifier = { + .notifier_call = ctnetlink_conntrack_event, +}; + +static struct notifier_block ctnl_notifier_exp = { + .notifier_call = ctnetlink_expect_event, +}; +#endif + +static struct nfnl_callback ctnl_cb[IPCTNL_MSG_MAX] = { + [IPCTNL_MSG_CT_NEW] = { .call = ctnetlink_new_conntrack, + .attr_count = CTA_MAX, }, + [IPCTNL_MSG_CT_GET] = { .call = ctnetlink_get_conntrack, + .attr_count = CTA_MAX, }, + [IPCTNL_MSG_CT_DELETE] = { .call = ctnetlink_del_conntrack, + .attr_count = CTA_MAX, }, + [IPCTNL_MSG_CT_GET_CTRZERO] = { .call = ctnetlink_get_conntrack, + .attr_count = CTA_MAX, }, +}; + +static struct nfnl_callback ctnl_exp_cb[IPCTNL_MSG_EXP_MAX] = { + [IPCTNL_MSG_EXP_GET] = { .call = ctnetlink_get_expect, + .attr_count = CTA_EXPECT_MAX, }, + [IPCTNL_MSG_EXP_NEW] = { .call = ctnetlink_new_expect, + .attr_count = CTA_EXPECT_MAX, }, + [IPCTNL_MSG_EXP_DELETE] = { .call = ctnetlink_del_expect, + .attr_count = CTA_EXPECT_MAX, }, +}; + +static struct nfnetlink_subsystem ctnl_subsys = { + .name = "conntrack", + .subsys_id = NFNL_SUBSYS_CTNETLINK, + .cb_count = IPCTNL_MSG_MAX, + .cb = ctnl_cb, +}; + +static struct nfnetlink_subsystem ctnl_exp_subsys = { + .name = "conntrack_expect", + .subsys_id = NFNL_SUBSYS_CTNETLINK_EXP, + .cb_count = IPCTNL_MSG_EXP_MAX, + .cb = ctnl_exp_cb, +}; + +MODULE_ALIAS_NFNL_SUBSYS(NFNL_SUBSYS_CTNETLINK); + +static int __init ctnetlink_init(void) +{ + int ret; + + printk("ctnetlink v%s: registering with nfnetlink.\n", version); + ret = nfnetlink_subsys_register(&ctnl_subsys); + if (ret < 0) { + printk("ctnetlink_init: cannot register with nfnetlink.\n"); + goto err_out; + } + + ret = nfnetlink_subsys_register(&ctnl_exp_subsys); + if (ret < 0) { + printk("ctnetlink_init: cannot register exp with nfnetlink.\n"); + goto err_unreg_subsys; + } + +#ifdef CONFIG_NF_CONNTRACK_EVENTS + ret = nf_conntrack_register_notifier(&ctnl_notifier); + if (ret < 0) { + printk("ctnetlink_init: cannot register notifier.\n"); + goto err_unreg_exp_subsys; + } + + ret = nf_conntrack_expect_register_notifier(&ctnl_notifier_exp); + if (ret < 0) { + printk("ctnetlink_init: cannot expect register notifier.\n"); + goto err_unreg_notifier; + } +#endif + + return 0; + +#ifdef CONFIG_NF_CONNTRACK_EVENTS +err_unreg_notifier: + nf_conntrack_unregister_notifier(&ctnl_notifier); +err_unreg_exp_subsys: + nfnetlink_subsys_unregister(&ctnl_exp_subsys); +#endif +err_unreg_subsys: + nfnetlink_subsys_unregister(&ctnl_subsys); +err_out: + return ret; +} + +static void __exit ctnetlink_exit(void) +{ + printk("ctnetlink: unregistering from nfnetlink.\n"); + +#ifdef CONFIG_NF_CONNTRACK_EVENTS + nf_conntrack_unregister_notifier(&ctnl_notifier_exp); + nf_conntrack_unregister_notifier(&ctnl_notifier); +#endif + + nfnetlink_subsys_unregister(&ctnl_exp_subsys); + nfnetlink_subsys_unregister(&ctnl_subsys); + return; +} + +module_init(ctnetlink_init); +module_exit(ctnetlink_exit); diff --git a/net/netfilter/nf_conntrack_proto_tcp.c b/net/netfilter/nf_conntrack_proto_tcp.c index 6035633d8225..6167137a5cb5 100644 --- a/net/netfilter/nf_conntrack_proto_tcp.c +++ b/net/netfilter/nf_conntrack_proto_tcp.c @@ -1147,6 +1147,63 @@ static int tcp_new(struct nf_conn *conntrack, receiver->td_scale); return 1; } + +#if defined(CONFIG_NF_CT_NETLINK) || \ + defined(CONFIG_NF_CT_NETLINK_MODULE) + +#include +#include + +static int tcp_to_nfattr(struct sk_buff *skb, struct nfattr *nfa, + const struct nf_conn *ct) +{ + struct nfattr *nest_parms; + + read_lock_bh(&tcp_lock); + nest_parms = NFA_NEST(skb, CTA_PROTOINFO_TCP); + NFA_PUT(skb, CTA_PROTOINFO_TCP_STATE, sizeof(u_int8_t), + &ct->proto.tcp.state); + read_unlock_bh(&tcp_lock); + + NFA_NEST_END(skb, nest_parms); + + return 0; + +nfattr_failure: + read_unlock_bh(&tcp_lock); + return -1; +} + +static const size_t cta_min_tcp[CTA_PROTOINFO_TCP_MAX] = { + [CTA_PROTOINFO_TCP_STATE-1] = sizeof(u_int8_t), +}; + +static int nfattr_to_tcp(struct nfattr *cda[], struct nf_conn *ct) +{ + struct nfattr *attr = cda[CTA_PROTOINFO_TCP-1]; + struct nfattr *tb[CTA_PROTOINFO_TCP_MAX]; + + /* updates could not contain anything about the private + * protocol info, in that case skip the parsing */ + if (!attr) + return 0; + + nfattr_parse_nested(tb, CTA_PROTOINFO_TCP_MAX, attr); + + if (nfattr_bad_size(tb, CTA_PROTOINFO_TCP_MAX, cta_min_tcp)) + return -EINVAL; + + if (!tb[CTA_PROTOINFO_TCP_STATE-1]) + return -EINVAL; + + write_lock_bh(&tcp_lock); + ct->proto.tcp.state = + *(u_int8_t *)NFA_DATA(tb[CTA_PROTOINFO_TCP_STATE-1]); + write_unlock_bh(&tcp_lock); + + return 0; +} +#endif struct nf_conntrack_protocol nf_conntrack_protocol_tcp4 = { @@ -1160,6 +1217,13 @@ struct nf_conntrack_protocol nf_conntrack_protocol_tcp4 = .packet = tcp_packet, .new = tcp_new, .error = tcp_error4, +#if defined(CONFIG_NF_CT_NETLINK) || \ + defined(CONFIG_NF_CT_NETLINK_MODULE) + .to_nfattr = tcp_to_nfattr, + .from_nfattr = nfattr_to_tcp, + .tuple_to_nfattr = nf_ct_port_tuple_to_nfattr, + .nfattr_to_tuple = nf_ct_port_nfattr_to_tuple, +#endif }; struct nf_conntrack_protocol nf_conntrack_protocol_tcp6 = @@ -1174,6 +1238,13 @@ struct nf_conntrack_protocol nf_conntrack_protocol_tcp6 = .packet = tcp_packet, .new = tcp_new, .error = tcp_error6, +#if defined(CONFIG_NF_CT_NETLINK) || \ + defined(CONFIG_NF_CT_NETLINK_MODULE) + .to_nfattr = tcp_to_nfattr, + .from_nfattr = nfattr_to_tcp, + .tuple_to_nfattr = nf_ct_port_tuple_to_nfattr, + .nfattr_to_tuple = nf_ct_port_nfattr_to_tuple, +#endif }; EXPORT_SYMBOL(nf_conntrack_protocol_tcp4); diff --git a/net/netfilter/nf_conntrack_proto_udp.c b/net/netfilter/nf_conntrack_proto_udp.c index 3cae7ce420dd..1a592a556182 100644 --- a/net/netfilter/nf_conntrack_proto_udp.c +++ b/net/netfilter/nf_conntrack_proto_udp.c @@ -196,6 +196,11 @@ struct nf_conntrack_protocol nf_conntrack_protocol_udp4 = .packet = udp_packet, .new = udp_new, .error = udp_error4, +#if defined(CONFIG_NF_CT_NETLINK) || \ + defined(CONFIG_NF_CT_NETLINK_MODULE) + .tuple_to_nfattr = nf_ct_port_tuple_to_nfattr, + .nfattr_to_tuple = nf_ct_port_nfattr_to_tuple, +#endif }; struct nf_conntrack_protocol nf_conntrack_protocol_udp6 = @@ -210,6 +215,11 @@ struct nf_conntrack_protocol nf_conntrack_protocol_udp6 = .packet = udp_packet, .new = udp_new, .error = udp_error6, +#if defined(CONFIG_NF_CT_NETLINK) || \ + defined(CONFIG_NF_CT_NETLINK_MODULE) + .tuple_to_nfattr = nf_ct_port_tuple_to_nfattr, + .nfattr_to_tuple = nf_ct_port_nfattr_to_tuple, +#endif }; EXPORT_SYMBOL(nf_conntrack_protocol_udp4); diff --git a/net/netfilter/nf_conntrack_standalone.c b/net/netfilter/nf_conntrack_standalone.c index 5af381f9fe3d..d17e42b28c79 100644 --- a/net/netfilter/nf_conntrack_standalone.c +++ b/net/netfilter/nf_conntrack_standalone.c @@ -161,14 +161,14 @@ static int ct_seq_show(struct seq_file *s, void *v) if (NF_CT_DIRECTION(hash)) return 0; - l3proto = nf_ct_find_l3proto(conntrack->tuplehash[IP_CT_DIR_ORIGINAL] - .tuple.src.l3num); + l3proto = __nf_ct_l3proto_find(conntrack->tuplehash[IP_CT_DIR_ORIGINAL] + .tuple.src.l3num); NF_CT_ASSERT(l3proto); - proto = nf_ct_find_proto(conntrack->tuplehash[IP_CT_DIR_ORIGINAL] - .tuple.src.l3num, - conntrack->tuplehash[IP_CT_DIR_ORIGINAL] - .tuple.dst.protonum); + proto = __nf_ct_proto_find(conntrack->tuplehash[IP_CT_DIR_ORIGINAL] + .tuple.src.l3num, + conntrack->tuplehash[IP_CT_DIR_ORIGINAL] + .tuple.dst.protonum); NF_CT_ASSERT(proto); if (seq_printf(s, "%-8s %u %-8s %u %ld ", @@ -307,9 +307,9 @@ static int exp_seq_show(struct seq_file *s, void *v) expect->tuple.src.l3num, expect->tuple.dst.protonum); print_tuple(s, &expect->tuple, - nf_ct_find_l3proto(expect->tuple.src.l3num), - nf_ct_find_proto(expect->tuple.src.l3num, - expect->tuple.dst.protonum)); + __nf_ct_l3proto_find(expect->tuple.src.l3num), + __nf_ct_proto_find(expect->tuple.src.l3num, + expect->tuple.dst.protonum)); return seq_putc(s, '\n'); } @@ -847,7 +847,11 @@ EXPORT_SYMBOL(nf_conntrack_helper_unregister); EXPORT_SYMBOL(nf_ct_iterate_cleanup); EXPORT_SYMBOL(__nf_ct_refresh_acct); EXPORT_SYMBOL(nf_ct_protos); -EXPORT_SYMBOL(nf_ct_find_proto); +EXPORT_SYMBOL(__nf_ct_proto_find); +EXPORT_SYMBOL(nf_ct_proto_find_get); +EXPORT_SYMBOL(nf_ct_proto_put); +EXPORT_SYMBOL(nf_ct_l3proto_find_get); +EXPORT_SYMBOL(nf_ct_l3proto_put); EXPORT_SYMBOL(nf_ct_l3protos); EXPORT_SYMBOL(nf_conntrack_expect_alloc); EXPORT_SYMBOL(nf_conntrack_expect_put); @@ -867,3 +871,21 @@ EXPORT_SYMBOL(nf_ct_get_tuple); EXPORT_SYMBOL(nf_ct_invert_tuple); EXPORT_SYMBOL(nf_conntrack_in); EXPORT_SYMBOL(__nf_conntrack_attach); +EXPORT_SYMBOL(nf_conntrack_alloc); +EXPORT_SYMBOL(nf_conntrack_free); +EXPORT_SYMBOL(nf_conntrack_flush); +EXPORT_SYMBOL(nf_ct_remove_expectations); +EXPORT_SYMBOL(nf_ct_helper_find_get); +EXPORT_SYMBOL(nf_ct_helper_put); +EXPORT_SYMBOL(__nf_conntrack_helper_find_byname); +EXPORT_SYMBOL(__nf_conntrack_find); +EXPORT_SYMBOL(nf_ct_unlink_expect); +EXPORT_SYMBOL(nf_conntrack_hash_insert); +EXPORT_SYMBOL(__nf_conntrack_expect_find); +EXPORT_SYMBOL(nf_conntrack_expect_find); +EXPORT_SYMBOL(nf_conntrack_expect_list); +#if defined(CONFIG_NF_CT_NETLINK) || \ + defined(CONFIG_NF_CT_NETLINK_MODULE) +EXPORT_SYMBOL(nf_ct_port_tuple_to_nfattr); +EXPORT_SYMBOL(nf_ct_port_nfattr_to_tuple); +#endif -- cgit v1.2.3 From 1bd9bef6f9fe06dd0c628ac877c85b6b36aca062 Mon Sep 17 00:00:00 2001 From: Patrick McHardy Date: Thu, 5 Jan 2006 12:20:59 -0800 Subject: [NETFILTER]: Call POST_ROUTING hook before fragmentation Call POST_ROUTING hook before fragmentation to get rid of the okfn use in ip_refrag and save the useless fragmentation/defragmentation step when NAT is used. The patch introduces one user-visible change, the POSTROUTING chain in the mangle table gets entire packets, not fragments, which should simplify use of the MARK and CLASSIFY targets for queueing as a nice side-effect. Signed-off-by: Patrick McHardy Signed-off-by: David S. Miller --- include/net/ip.h | 1 - net/ipv4/ip_output.c | 30 ++++++++++++-------------- net/ipv4/netfilter/ip_conntrack_standalone.c | 26 +--------------------- net/ipv4/netfilter/ip_nat_standalone.c | 17 --------------- net/ipv4/netfilter/nf_conntrack_l3proto_ipv4.c | 26 +--------------------- 5 files changed, 16 insertions(+), 84 deletions(-) (limited to 'include/net') diff --git a/include/net/ip.h b/include/net/ip.h index f7e7fd728b67..7bb5804847f2 100644 --- a/include/net/ip.h +++ b/include/net/ip.h @@ -317,7 +317,6 @@ enum ip_defrag_users IP_DEFRAG_CALL_RA_CHAIN, IP_DEFRAG_CONNTRACK_IN, IP_DEFRAG_CONNTRACK_OUT, - IP_DEFRAG_NAT_OUT, IP_DEFRAG_VS_IN, IP_DEFRAG_VS_OUT, IP_DEFRAG_VS_FWD diff --git a/net/ipv4/ip_output.c b/net/ipv4/ip_output.c index 2a830de3a699..71da31818cfc 100644 --- a/net/ipv4/ip_output.c +++ b/net/ipv4/ip_output.c @@ -202,13 +202,11 @@ static inline int ip_finish_output2(struct sk_buff *skb) static inline int ip_finish_output(struct sk_buff *skb) { - struct net_device *dev = skb->dst->dev; - - skb->dev = dev; - skb->protocol = htons(ETH_P_IP); - - return NF_HOOK(PF_INET, NF_IP_POST_ROUTING, skb, NULL, dev, - ip_finish_output2); + if (skb->len > dst_mtu(skb->dst) && + !(skb_shinfo(skb)->ufo_size || skb_shinfo(skb)->tso_size)) + return ip_fragment(skb, ip_finish_output2); + else + return ip_finish_output2(skb); } int ip_mc_output(struct sk_buff *skb) @@ -265,21 +263,21 @@ int ip_mc_output(struct sk_buff *skb) newskb->dev, ip_dev_loopback_xmit); } - if (skb->len > dst_mtu(&rt->u.dst)) - return ip_fragment(skb, ip_finish_output); - else - return ip_finish_output(skb); + return NF_HOOK(PF_INET, NF_IP_POST_ROUTING, skb, NULL, skb->dev, + ip_finish_output); } int ip_output(struct sk_buff *skb) { + struct net_device *dev = skb->dst->dev; + IP_INC_STATS(IPSTATS_MIB_OUTREQUESTS); - if (skb->len > dst_mtu(skb->dst) && - !(skb_shinfo(skb)->ufo_size || skb_shinfo(skb)->tso_size)) - return ip_fragment(skb, ip_finish_output); - else - return ip_finish_output(skb); + skb->dev = dev; + skb->protocol = htons(ETH_P_IP); + + return NF_HOOK(PF_INET, NF_IP_POST_ROUTING, skb, NULL, dev, + ip_finish_output); } int ip_queue_xmit(struct sk_buff *skb, int ipfragok) diff --git a/net/ipv4/netfilter/ip_conntrack_standalone.c b/net/ipv4/netfilter/ip_conntrack_standalone.c index a88bcc551244..7ba97783e741 100644 --- a/net/ipv4/netfilter/ip_conntrack_standalone.c +++ b/net/ipv4/netfilter/ip_conntrack_standalone.c @@ -451,30 +451,6 @@ static unsigned int ip_conntrack_defrag(unsigned int hooknum, return NF_ACCEPT; } -static unsigned int ip_refrag(unsigned int hooknum, - struct sk_buff **pskb, - const struct net_device *in, - const struct net_device *out, - int (*okfn)(struct sk_buff *)) -{ - struct rtable *rt = (struct rtable *)(*pskb)->dst; - - /* We've seen it coming out the other side: confirm */ - if (ip_confirm(hooknum, pskb, in, out, okfn) != NF_ACCEPT) - return NF_DROP; - - /* Local packets are never produced too large for their - interface. We degfragment them at LOCAL_OUT, however, - so we have to refragment them here. */ - if ((*pskb)->len > dst_mtu(&rt->u.dst) && - !skb_shinfo(*pskb)->tso_size) { - /* No hook can be after us, so this should be OK. */ - ip_fragment(*pskb, okfn); - return NF_STOLEN; - } - return NF_ACCEPT; -} - static unsigned int ip_conntrack_local(unsigned int hooknum, struct sk_buff **pskb, const struct net_device *in, @@ -544,7 +520,7 @@ static struct nf_hook_ops ip_conntrack_helper_in_ops = { /* Refragmenter; last chance. */ static struct nf_hook_ops ip_conntrack_out_ops = { - .hook = ip_refrag, + .hook = ip_confirm, .owner = THIS_MODULE, .pf = PF_INET, .hooknum = NF_IP_POST_ROUTING, diff --git a/net/ipv4/netfilter/ip_nat_standalone.c b/net/ipv4/netfilter/ip_nat_standalone.c index 30cd4e18c129..f04111f74e09 100644 --- a/net/ipv4/netfilter/ip_nat_standalone.c +++ b/net/ipv4/netfilter/ip_nat_standalone.c @@ -190,23 +190,6 @@ ip_nat_out(unsigned int hooknum, || (*pskb)->nh.iph->ihl * 4 < sizeof(struct iphdr)) return NF_ACCEPT; - /* We can hit fragment here; forwarded packets get - defragmented by connection tracking coming in, then - fragmented (grr) by the forward code. - - In future: If we have nfct != NULL, AND we have NAT - initialized, AND there is no helper, then we can do full - NAPT on the head, and IP-address-only NAT on the rest. - - I'm starting to have nightmares about fragments. */ - - if ((*pskb)->nh.iph->frag_off & htons(IP_MF|IP_OFFSET)) { - *pskb = ip_ct_gather_frags(*pskb, IP_DEFRAG_NAT_OUT); - - if (!*pskb) - return NF_STOLEN; - } - return ip_nat_fn(hooknum, pskb, in, out, okfn); } diff --git a/net/ipv4/netfilter/nf_conntrack_l3proto_ipv4.c b/net/ipv4/netfilter/nf_conntrack_l3proto_ipv4.c index 385867efd481..1d36e8effe4f 100644 --- a/net/ipv4/netfilter/nf_conntrack_l3proto_ipv4.c +++ b/net/ipv4/netfilter/nf_conntrack_l3proto_ipv4.c @@ -180,30 +180,6 @@ static unsigned int ipv4_conntrack_defrag(unsigned int hooknum, return NF_ACCEPT; } -static unsigned int ipv4_refrag(unsigned int hooknum, - struct sk_buff **pskb, - const struct net_device *in, - const struct net_device *out, - int (*okfn)(struct sk_buff *)) -{ - struct rtable *rt = (struct rtable *)(*pskb)->dst; - - /* We've seen it coming out the other side: confirm */ - if (ipv4_confirm(hooknum, pskb, in, out, okfn) != NF_ACCEPT) - return NF_DROP; - - /* Local packets are never produced too large for their - interface. We degfragment them at LOCAL_OUT, however, - so we have to refragment them here. */ - if ((*pskb)->len > dst_mtu(&rt->u.dst) && - !skb_shinfo(*pskb)->tso_size) { - /* No hook can be after us, so this should be OK. */ - ip_fragment(*pskb, okfn); - return NF_STOLEN; - } - return NF_ACCEPT; -} - static unsigned int ipv4_conntrack_in(unsigned int hooknum, struct sk_buff **pskb, const struct net_device *in, @@ -283,7 +259,7 @@ static struct nf_hook_ops ipv4_conntrack_helper_in_ops = { /* Refragmenter; last chance. */ static struct nf_hook_ops ipv4_conntrack_out_ops = { - .hook = ipv4_refrag, + .hook = ipv4_confirm, .owner = THIS_MODULE, .pf = PF_INET, .hooknum = NF_IP_POST_ROUTING, -- cgit v1.2.3 From 4bad4dc919573dbe9a5b41dd9edff279e99822d7 Mon Sep 17 00:00:00 2001 From: Kris Katterjohn Date: Fri, 6 Jan 2006 13:08:20 -0800 Subject: [NET]: Change sk_run_filter()'s return type in net/core/filter.c It should return an unsigned value, and fix sk_filter() as well. Signed-off-by: Kris Katterjohn Signed-off-by: David S. Miller --- include/linux/filter.h | 2 +- include/net/sock.h | 4 ++-- net/core/filter.c | 6 +++--- 3 files changed, 6 insertions(+), 6 deletions(-) (limited to 'include/net') diff --git a/include/linux/filter.h b/include/linux/filter.h index 3ba843c46382..c6cb8f095088 100644 --- a/include/linux/filter.h +++ b/include/linux/filter.h @@ -143,7 +143,7 @@ static inline unsigned int sk_filter_len(struct sk_filter *fp) struct sk_buff; struct sock; -extern int sk_run_filter(struct sk_buff *skb, struct sock_filter *filter, int flen); +extern unsigned int sk_run_filter(struct sk_buff *skb, struct sock_filter *filter, int flen); extern int sk_attach_filter(struct sock_fprog *fprog, struct sock *sk); extern int sk_chk_filter(struct sock_filter *filter, int flen); #endif /* __KERNEL__ */ diff --git a/include/net/sock.h b/include/net/sock.h index 6961700ff3a0..1806e5b61419 100644 --- a/include/net/sock.h +++ b/include/net/sock.h @@ -856,8 +856,8 @@ static inline int sk_filter(struct sock *sk, struct sk_buff *skb, int needlock) filter = sk->sk_filter; if (filter) { - int pkt_len = sk_run_filter(skb, filter->insns, - filter->len); + unsigned int pkt_len = sk_run_filter(skb, filter->insns, + filter->len); if (!pkt_len) err = -EPERM; else diff --git a/net/core/filter.c b/net/core/filter.c index 8964d3445588..9eb9d0017a01 100644 --- a/net/core/filter.c +++ b/net/core/filter.c @@ -75,7 +75,7 @@ static inline void *load_pointer(struct sk_buff *skb, int k, * len is the number of filter blocks in the array. */ -int sk_run_filter(struct sk_buff *skb, struct sock_filter *filter, int flen) +unsigned int sk_run_filter(struct sk_buff *skb, struct sock_filter *filter, int flen) { struct sock_filter *fentry; /* We walk down these */ void *ptr; @@ -241,9 +241,9 @@ load_b: A = X; continue; case BPF_RET|BPF_K: - return ((unsigned int)fentry->k); + return fentry->k; case BPF_RET|BPF_A: - return ((unsigned int)A); + return A; case BPF_ST: mem[fentry->k] = A; continue; -- cgit v1.2.3