From 81f875208e7f46d003bedb82d5cfe54458a3ab60 Mon Sep 17 00:00:00 2001 From: James Ketrenos Date: Mon, 24 Oct 2005 10:20:53 -0500 Subject: scripts/Lindent on ieee80211 subsystem. Signed-off-by: James Ketrenos --- net/ieee80211/ieee80211_wx.c | 8 +++++--- 1 file changed, 5 insertions(+), 3 deletions(-) (limited to 'net') diff --git a/net/ieee80211/ieee80211_wx.c b/net/ieee80211/ieee80211_wx.c index 1ce7af9bec35..f5b805355000 100644 --- a/net/ieee80211/ieee80211_wx.c +++ b/net/ieee80211/ieee80211_wx.c @@ -161,9 +161,11 @@ static inline char *ipw2100_translate_scan(struct ieee80211_device *ieee, (ieee->perfect_rssi - ieee->worst_rssi) - (ieee->perfect_rssi - network->stats.rssi) * (15 * (ieee->perfect_rssi - ieee->worst_rssi) + - 62 * (ieee->perfect_rssi - network->stats.rssi))) / - ((ieee->perfect_rssi - ieee->worst_rssi) * - (ieee->perfect_rssi - ieee->worst_rssi)); + 62 * (ieee->perfect_rssi - + network->stats.rssi))) / + ((ieee->perfect_rssi - + ieee->worst_rssi) * (ieee->perfect_rssi - + ieee->worst_rssi)); if (iwe.u.qual.qual > 100) iwe.u.qual.qual = 100; else if (iwe.u.qual.qual < 1) -- cgit v1.2.3 From e189277a3f1cbb0f1282e0f4b8fa8c91e004c286 Mon Sep 17 00:00:00 2001 From: Volker Braun Date: Mon, 24 Oct 2005 10:15:36 -0500 Subject: Fix problem with WEP unicast key > index 0 The functions ieee80211_wx_{get,set}_encodeext fail if one tries to set unicast (IW_ENCODE_EXT_GROUP_KEY not set) keys at key indices>0. But at least some Cisco APs dish out dynamic WEP unicast keys at index !=0. Signed-off-by: Volker Braun Signed-off-by: James Ketrenos --- net/ieee80211/ieee80211_wx.c | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) (limited to 'net') diff --git a/net/ieee80211/ieee80211_wx.c b/net/ieee80211/ieee80211_wx.c index f5b805355000..181755f2aa8b 100644 --- a/net/ieee80211/ieee80211_wx.c +++ b/net/ieee80211/ieee80211_wx.c @@ -522,7 +522,8 @@ int ieee80211_wx_set_encodeext(struct ieee80211_device *ieee, crypt = &ieee->crypt[idx]; group_key = 1; } else { - if (idx != 0) + /* some Cisco APs use idx>0 for unicast in dynamic WEP */ + if (idx != 0 && ext->alg != IW_ENCODE_ALG_WEP) return -EINVAL; if (ieee->iw_mode == IW_MODE_INFRA) crypt = &ieee->crypt[idx]; @@ -690,7 +691,8 @@ int ieee80211_wx_get_encodeext(struct ieee80211_device *ieee, } else idx = ieee->tx_keyidx; - if (!ext->ext_flags & IW_ENCODE_EXT_GROUP_KEY) + if (!ext->ext_flags & IW_ENCODE_EXT_GROUP_KEY && + ext->alg != IW_ENCODE_ALG_WEP) if (idx != 0 || ieee->iw_mode != IW_MODE_INFRA) return -EINVAL; -- cgit v1.2.3 From fd7a516efbcdabf5d7b9307ca9fe48b511b7d123 Mon Sep 17 00:00:00 2001 From: Adrian Bunk Date: Wed, 2 Nov 2005 01:53:16 +0100 Subject: [PATCH] fix NET_RADIO=n, IEEE80211=y compile This patch fixes the following compile error with CONFIG_NET_RADIO=n and CONFIG_IEEE80211=y: LD .tmp_vmlinux1 net/built-in.o: In function `ieee80211_rx': : undefined reference to `wireless_spy_update' make: *** [.tmp_vmlinux1] Error 1 Signed-off-by: Adrian Bunk Signed-off-by: John W. Linville --- net/ieee80211/ieee80211_rx.c | 2 ++ 1 file changed, 2 insertions(+) (limited to 'net') diff --git a/net/ieee80211/ieee80211_rx.c b/net/ieee80211/ieee80211_rx.c index ce694cf5c160..00eb780836d8 100644 --- a/net/ieee80211/ieee80211_rx.c +++ b/net/ieee80211/ieee80211_rx.c @@ -370,6 +370,7 @@ int ieee80211_rx(struct ieee80211_device *ieee, struct sk_buff *skb, /* Put this code here so that we avoid duplicating it in all * Rx paths. - Jean II */ #ifdef IW_WIRELESS_SPY /* defined in iw_handler.h */ +#ifdef CONFIG_NET_RADIO /* If spy monitoring on */ if (ieee->spy_data.spy_number > 0) { struct iw_quality wstats; @@ -396,6 +397,7 @@ int ieee80211_rx(struct ieee80211_device *ieee, struct sk_buff *skb, /* Update spy records */ wireless_spy_update(ieee->dev, hdr->addr2, &wstats); } +#endif /* CONFIG_NET_RADIO */ #endif /* IW_WIRELESS_SPY */ #ifdef NOT_YET -- cgit v1.2.3 From 971f359ddcb2e7a0d577479c7561bda407febe1b Mon Sep 17 00:00:00 2001 From: YOSHIFUJI Hideaki Date: Tue, 8 Nov 2005 09:37:56 -0800 Subject: [IPV6]: Put addr_diff() into common header for future use. Signed-off-by: YOSHIFUJI Hideaki Signed-off-by: David S. Miller --- include/net/ipv6.h | 48 ++++++++++++++++++++++++++++++++++++++++++++++++ net/ipv6/ip6_fib.c | 54 ++---------------------------------------------------- 2 files changed, 50 insertions(+), 52 deletions(-) (limited to 'net') diff --git a/include/net/ipv6.h b/include/net/ipv6.h index 65ec86678a08..98661fa4fc78 100644 --- a/include/net/ipv6.h +++ b/include/net/ipv6.h @@ -340,6 +340,54 @@ static inline int ipv6_addr_any(const struct in6_addr *a) a->s6_addr32[2] | a->s6_addr32[3] ) == 0); } +/* + * find the first different bit between two addresses + * length of address must be a multiple of 32bits + */ +static inline int __ipv6_addr_diff(const void *token1, const void *token2, int addrlen) +{ + const __u32 *a1 = token1, *a2 = token2; + int i; + + addrlen >>= 2; + + for (i = 0; i < addrlen; i++) { + __u32 xb = a1[i] ^ a2[i]; + if (xb) { + int j = 31; + + xb = ntohl(xb); + while ((xb & (1 << j)) == 0) + j--; + + return (i * 32 + 31 - j); + } + } + + /* + * we should *never* get to this point since that + * would mean the addrs are equal + * + * However, we do get to it 8) And exacly, when + * addresses are equal 8) + * + * ip route add 1111::/128 via ... + * ip route add 1111::/64 via ... + * and we are here. + * + * Ideally, this function should stop comparison + * at prefix length. It does not, but it is still OK, + * if returned value is greater than prefix length. + * --ANK (980803) + */ + return (addrlen << 5); +} + +static inline int ipv6_addr_diff(const struct in6_addr *a1, const struct in6_addr *a2) +{ + return __ipv6_addr_diff(a1, a2, sizeof(struct in6_addr)); +} + /* * Prototypes exported by ipv6 */ diff --git a/net/ipv6/ip6_fib.c b/net/ipv6/ip6_fib.c index 4fcc5a7acf6e..1bf6d9a769e6 100644 --- a/net/ipv6/ip6_fib.c +++ b/net/ipv6/ip6_fib.c @@ -127,56 +127,6 @@ static __inline__ int addr_bit_set(void *token, int fn_bit) return htonl(1 << ((~fn_bit)&0x1F)) & addr[fn_bit>>5]; } -/* - * find the first different bit between two addresses - * length of address must be a multiple of 32bits - */ - -static __inline__ int addr_diff(void *token1, void *token2, int addrlen) -{ - __u32 *a1 = token1; - __u32 *a2 = token2; - int i; - - addrlen >>= 2; - - for (i = 0; i < addrlen; i++) { - __u32 xb; - - xb = a1[i] ^ a2[i]; - - if (xb) { - int j = 31; - - xb = ntohl(xb); - - while ((xb & (1 << j)) == 0) - j--; - - return (i * 32 + 31 - j); - } - } - - /* - * we should *never* get to this point since that - * would mean the addrs are equal - * - * However, we do get to it 8) And exacly, when - * addresses are equal 8) - * - * ip route add 1111::/128 via ... - * ip route add 1111::/64 via ... - * and we are here. - * - * Ideally, this function should stop comparison - * at prefix length. It does not, but it is still OK, - * if returned value is greater than prefix length. - * --ANK (980803) - */ - - return addrlen<<5; -} - static __inline__ struct fib6_node * node_alloc(void) { struct fib6_node *fn; @@ -296,11 +246,11 @@ insert_above: /* find 1st bit in difference between the 2 addrs. - See comment in addr_diff: bit may be an invalid value, + See comment in __ipv6_addr_diff: bit may be an invalid value, but if it is >= plen, the value is ignored in any case. */ - bit = addr_diff(addr, &key->addr, addrlen); + bit = __ipv6_addr_diff(addr, &key->addr, addrlen); /* * (intermediate)[in] -- cgit v1.2.3 From b1cacb6820e0afc4aeeea67bcb5296a316862cad Mon Sep 17 00:00:00 2001 From: YOSHIFUJI Hideaki Date: Tue, 8 Nov 2005 09:38:12 -0800 Subject: [IPV6]: Make ipv6_addr_type() more generic so that we can use it for source address selection. Signed-off-by: YOSHIFUJI Hideaki Signed-off-by: David S. Miller --- include/net/ipv6.h | 19 +++++++++-- net/ipv6/addrconf.c | 90 +++++++++++++++++++++++++++------------------------- net/ipv6/ipv6_syms.c | 2 +- 3 files changed, 64 insertions(+), 47 deletions(-) (limited to 'net') diff --git a/include/net/ipv6.h b/include/net/ipv6.h index 98661fa4fc78..6addb4d464d6 100644 --- a/include/net/ipv6.h +++ b/include/net/ipv6.h @@ -252,12 +252,25 @@ typedef int (*inet_getfrag_t) (const void *data, char *, unsigned int, unsigned int); - -extern int ipv6_addr_type(const struct in6_addr *addr); +extern int __ipv6_addr_type(const struct in6_addr *addr); +static inline int ipv6_addr_type(const struct in6_addr *addr) +{ + return __ipv6_addr_type(addr) & 0xffff; +} static inline int ipv6_addr_scope(const struct in6_addr *addr) { - return ipv6_addr_type(addr) & IPV6_ADDR_SCOPE_MASK; + return __ipv6_addr_type(addr) & IPV6_ADDR_SCOPE_MASK; +} + +static inline int __ipv6_addr_src_scope(int type) +{ + return (type == IPV6_ADDR_ANY ? __IPV6_ADDR_SCOPE_INVALID : (type >> 16)); +} + +static inline int ipv6_addr_src_scope(const struct in6_addr *addr) +{ + return __ipv6_addr_src_scope(__ipv6_addr_type(addr)); } static inline int ipv6_addr_cmp(const struct in6_addr *a1, const struct in6_addr *a2) diff --git a/net/ipv6/addrconf.c b/net/ipv6/addrconf.c index 2c5f57299d63..ff895da6395b 100644 --- a/net/ipv6/addrconf.c +++ b/net/ipv6/addrconf.c @@ -35,6 +35,9 @@ * YOSHIFUJI Hideaki @USAGI : ARCnet support * YOSHIFUJI Hideaki @USAGI : convert /proc/net/if_inet6 to * seq_file. + * YOSHIFUJI Hideaki @USAGI : improved source address + * selection; consider scope, + * status etc. */ #include @@ -193,46 +196,51 @@ const struct in6_addr in6addr_any = IN6ADDR_ANY_INIT; #endif const struct in6_addr in6addr_loopback = IN6ADDR_LOOPBACK_INIT; -int ipv6_addr_type(const struct in6_addr *addr) +#define IPV6_ADDR_SCOPE_TYPE(scope) ((scope) << 16) + +static inline unsigned ipv6_addr_scope2type(unsigned scope) +{ + switch(scope) { + case IPV6_ADDR_SCOPE_NODELOCAL: + return (IPV6_ADDR_SCOPE_TYPE(IPV6_ADDR_SCOPE_NODELOCAL) | + IPV6_ADDR_LOOPBACK); + case IPV6_ADDR_SCOPE_LINKLOCAL: + return (IPV6_ADDR_SCOPE_TYPE(IPV6_ADDR_SCOPE_LINKLOCAL) | + IPV6_ADDR_LINKLOCAL); + case IPV6_ADDR_SCOPE_SITELOCAL: + return (IPV6_ADDR_SCOPE_TYPE(IPV6_ADDR_SCOPE_SITELOCAL) | + IPV6_ADDR_SITELOCAL); + } + return IPV6_ADDR_SCOPE_TYPE(scope); +} + +int __ipv6_addr_type(const struct in6_addr *addr) { - int type; u32 st; st = addr->s6_addr32[0]; - if ((st & htonl(0xFF000000)) == htonl(0xFF000000)) { - type = IPV6_ADDR_MULTICAST; - - switch((st & htonl(0x00FF0000))) { - case __constant_htonl(0x00010000): - type |= IPV6_ADDR_LOOPBACK; - break; - - case __constant_htonl(0x00020000): - type |= IPV6_ADDR_LINKLOCAL; - break; - - case __constant_htonl(0x00050000): - type |= IPV6_ADDR_SITELOCAL; - break; - }; - return type; - } - - type = IPV6_ADDR_UNICAST; - /* Consider all addresses with the first three bits different of - 000 and 111 as finished. + 000 and 111 as unicasts. */ if ((st & htonl(0xE0000000)) != htonl(0x00000000) && (st & htonl(0xE0000000)) != htonl(0xE0000000)) - return type; - - if ((st & htonl(0xFFC00000)) == htonl(0xFE800000)) - return (IPV6_ADDR_LINKLOCAL | type); + return (IPV6_ADDR_UNICAST | + IPV6_ADDR_SCOPE_TYPE(IPV6_ADDR_SCOPE_GLOBAL)); + if ((st & htonl(0xFF000000)) == htonl(0xFF000000)) { + /* multicast */ + /* addr-select 3.1 */ + return (IPV6_ADDR_MULTICAST | + ipv6_addr_scope2type(IPV6_ADDR_MC_SCOPE(addr))); + } + + if ((st & htonl(0xFFC00000)) == htonl(0xFE800000)) + return (IPV6_ADDR_LINKLOCAL | IPV6_ADDR_UNICAST | + IPV6_ADDR_SCOPE_TYPE(IPV6_ADDR_SCOPE_LINKLOCAL)); /* addr-select 3.1 */ if ((st & htonl(0xFFC00000)) == htonl(0xFEC00000)) - return (IPV6_ADDR_SITELOCAL | type); + return (IPV6_ADDR_SITELOCAL | IPV6_ADDR_UNICAST | + IPV6_ADDR_SCOPE_TYPE(IPV6_ADDR_SCOPE_SITELOCAL)); /* addr-select 3.1 */ if ((addr->s6_addr32[0] | addr->s6_addr32[1]) == 0) { if (addr->s6_addr32[2] == 0) { @@ -240,24 +248,20 @@ int ipv6_addr_type(const struct in6_addr *addr) return IPV6_ADDR_ANY; if (addr->s6_addr32[3] == htonl(0x00000001)) - return (IPV6_ADDR_LOOPBACK | type); + return (IPV6_ADDR_LOOPBACK | IPV6_ADDR_UNICAST | + IPV6_ADDR_SCOPE_TYPE(IPV6_ADDR_SCOPE_LINKLOCAL)); /* addr-select 3.4 */ - return (IPV6_ADDR_COMPATv4 | type); + return (IPV6_ADDR_COMPATv4 | IPV6_ADDR_UNICAST | + IPV6_ADDR_SCOPE_TYPE(IPV6_ADDR_SCOPE_GLOBAL)); /* addr-select 3.3 */ } if (addr->s6_addr32[2] == htonl(0x0000ffff)) - return IPV6_ADDR_MAPPED; - } - - st &= htonl(0xFF000000); - if (st == 0) - return IPV6_ADDR_RESERVED; - st &= htonl(0xFE000000); - if (st == htonl(0x02000000)) - return IPV6_ADDR_RESERVED; /* for NSAP */ - if (st == htonl(0x04000000)) - return IPV6_ADDR_RESERVED; /* for IPX */ - return type; + return (IPV6_ADDR_MAPPED | + IPV6_ADDR_SCOPE_TYPE(IPV6_ADDR_SCOPE_GLOBAL)); /* addr-select 3.3 */ + } + + return (IPV6_ADDR_RESERVED | + IPV6_ADDR_SCOPE_TYPE(IPV6_ADDR_SCOPE_GLOBAL)); /* addr-select 3.4 */ } static void addrconf_del_timer(struct inet6_ifaddr *ifp) diff --git a/net/ipv6/ipv6_syms.c b/net/ipv6/ipv6_syms.c index 37a4a99c9fe9..16482785bdfd 100644 --- a/net/ipv6/ipv6_syms.c +++ b/net/ipv6/ipv6_syms.c @@ -7,7 +7,7 @@ #include #include -EXPORT_SYMBOL(ipv6_addr_type); +EXPORT_SYMBOL(__ipv6_addr_type); EXPORT_SYMBOL(icmpv6_send); EXPORT_SYMBOL(icmpv6_statistics); EXPORT_SYMBOL(icmpv6_err_convert); -- cgit v1.2.3 From 072047e4de3800905e09d0f8ef0e1cc4e91a601e Mon Sep 17 00:00:00 2001 From: YOSHIFUJI Hideaki Date: Tue, 8 Nov 2005 09:38:30 -0800 Subject: [IPV6]: RFC3484 compliant source address selection Choose more appropriate source address; e.g. - outgoing interface - non-deprecated - scope - matching label Signed-off-by: YOSHIFUJI Hideaki Signed-off-by: David S. Miller --- net/ipv6/addrconf.c | 344 ++++++++++++++++++++++++++++++++++++---------------- 1 file changed, 240 insertions(+), 104 deletions(-) (limited to 'net') diff --git a/net/ipv6/addrconf.c b/net/ipv6/addrconf.c index ff895da6395b..a34d1504deb9 100644 --- a/net/ipv6/addrconf.c +++ b/net/ipv6/addrconf.c @@ -809,138 +809,274 @@ out: #endif /* - * Choose an appropriate source address - * should do: - * i) get an address with an appropriate scope - * ii) see if there is a specific route for the destination and use - * an address of the attached interface - * iii) don't use deprecated addresses + * Choose an appropriate source address (RFC3484) */ -static int inline ipv6_saddr_pref(const struct inet6_ifaddr *ifp, u8 invpref) +struct ipv6_saddr_score { + int addr_type; + unsigned int attrs; + int matchlen; + unsigned int scope; + unsigned int rule; +}; + +#define IPV6_SADDR_SCORE_LOCAL 0x0001 +#define IPV6_SADDR_SCORE_PREFERRED 0x0004 +#define IPV6_SADDR_SCORE_HOA 0x0008 +#define IPV6_SADDR_SCORE_OIF 0x0010 +#define IPV6_SADDR_SCORE_LABEL 0x0020 +#define IPV6_SADDR_SCORE_PRIVACY 0x0040 + +static int inline ipv6_saddr_preferred(int type) { - int pref; - pref = ifp->flags&IFA_F_DEPRECATED ? 0 : 2; -#ifdef CONFIG_IPV6_PRIVACY - pref |= (ifp->flags^invpref)&IFA_F_TEMPORARY ? 0 : 1; -#endif - return pref; + if (type & (IPV6_ADDR_MAPPED|IPV6_ADDR_COMPATv4| + IPV6_ADDR_LOOPBACK|IPV6_ADDR_RESERVED)) + return 1; + return 0; } -#ifdef CONFIG_IPV6_PRIVACY -#define IPV6_GET_SADDR_MAXSCORE(score) ((score) == 3) -#else -#define IPV6_GET_SADDR_MAXSCORE(score) (score) -#endif +/* static matching label */ +static int inline ipv6_saddr_label(const struct in6_addr *addr, int type) +{ + /* + * prefix (longest match) label + * ----------------------------- + * ::1/128 0 + * ::/0 1 + * 2002::/16 2 + * ::/96 3 + * ::ffff:0:0/96 4 + */ + if (type & IPV6_ADDR_LOOPBACK) + return 0; + else if (type & IPV6_ADDR_COMPATv4) + return 3; + else if (type & IPV6_ADDR_MAPPED) + return 4; + else if (addr->s6_addr16[0] == htons(0x2002)) + return 2; + return 1; +} -int ipv6_dev_get_saddr(struct net_device *dev, +int ipv6_dev_get_saddr(struct net_device *daddr_dev, struct in6_addr *daddr, struct in6_addr *saddr) { - struct inet6_ifaddr *ifp = NULL; - struct inet6_ifaddr *match = NULL; - struct inet6_dev *idev; - int scope; - int err; - int hiscore = -1, score; + struct ipv6_saddr_score hiscore; + struct inet6_ifaddr *ifa_result = NULL; + int daddr_type = __ipv6_addr_type(daddr); + int daddr_scope = __ipv6_addr_src_scope(daddr_type); + u32 daddr_label = ipv6_saddr_label(daddr, daddr_type); + struct net_device *dev; - scope = ipv6_addr_scope(daddr); + memset(&hiscore, 0, sizeof(hiscore)); - /* - * known dev - * search dev and walk through dev addresses - */ + read_lock(&dev_base_lock); + read_lock(&addrconf_lock); - if (dev) { - if (dev->flags & IFF_LOOPBACK) - scope = IFA_HOST; + for (dev = dev_base; dev; dev=dev->next) { + struct inet6_dev *idev; + struct inet6_ifaddr *ifa; + + /* Rule 0: Candidate Source Address (section 4) + * - multicast and link-local destination address, + * the set of candidate source address MUST only + * include addresses assigned to interfaces + * belonging to the same link as the outgoing + * interface. + * (- For site-local destination addresses, the + * set of candidate source addresses MUST only + * include addresses assigned to interfaces + * belonging to the same site as the outgoing + * interface.) + */ + if ((daddr_type & IPV6_ADDR_MULTICAST || + daddr_scope <= IPV6_ADDR_SCOPE_LINKLOCAL) && + daddr_dev && dev != daddr_dev) + continue; - read_lock(&addrconf_lock); idev = __in6_dev_get(dev); - if (idev) { - read_lock_bh(&idev->lock); - for (ifp=idev->addr_list; ifp; ifp=ifp->if_next) { - if (ifp->scope == scope) { - if (ifp->flags&IFA_F_TENTATIVE) - continue; -#ifdef CONFIG_IPV6_PRIVACY - score = ipv6_saddr_pref(ifp, idev->cnf.use_tempaddr > 1 ? IFA_F_TEMPORARY : 0); -#else - score = ipv6_saddr_pref(ifp, 0); -#endif - if (score <= hiscore) - continue; + if (!idev) + continue; - if (match) - in6_ifa_put(match); - match = ifp; - hiscore = score; - in6_ifa_hold(ifp); + read_lock_bh(&idev->lock); + for (ifa = idev->addr_list; ifa; ifa = ifa->if_next) { + struct ipv6_saddr_score score; - if (IPV6_GET_SADDR_MAXSCORE(score)) { - read_unlock_bh(&idev->lock); - read_unlock(&addrconf_lock); - goto out; - } + score.addr_type = __ipv6_addr_type(&ifa->addr); + + /* Rule 0: Candidate Source Address (section 4) + * - In any case, anycast addresses, multicast + * addresses, and the unspecified address MUST + * NOT be included in a candidate set. + */ + if (unlikely(score.addr_type == IPV6_ADDR_ANY || + score.addr_type & IPV6_ADDR_MULTICAST)) { + LIMIT_NETDEBUG(KERN_DEBUG + "ADDRCONF: unspecified / multicast address" + "assigned as unicast address on %s", + dev->name); + continue; + } + + score.attrs = 0; + score.matchlen = 0; + score.scope = 0; + score.rule = 0; + + if (ifa_result == NULL) { + /* record it if the first available entry */ + goto record_it; + } + + /* Rule 1: Prefer same address */ + if (hiscore.rule < 1) { + if (ipv6_addr_equal(&ifa_result->addr, daddr)) + hiscore.attrs |= IPV6_SADDR_SCORE_LOCAL; + hiscore.rule++; + } + if (ipv6_addr_equal(&ifa->addr, daddr)) { + score.attrs |= IPV6_SADDR_SCORE_LOCAL; + if (!(hiscore.attrs & IPV6_SADDR_SCORE_LOCAL)) { + score.rule = 1; + goto record_it; } + } else { + if (hiscore.attrs & IPV6_SADDR_SCORE_LOCAL) + continue; } - read_unlock_bh(&idev->lock); - } - read_unlock(&addrconf_lock); - } - if (scope == IFA_LINK) - goto out; + /* Rule 2: Prefer appropriate scope */ + if (hiscore.rule < 2) { + hiscore.scope = __ipv6_addr_src_scope(hiscore.addr_type); + hiscore.rule++; + } + score.scope = __ipv6_addr_src_scope(score.addr_type); + if (hiscore.scope < score.scope) { + if (hiscore.scope < daddr_scope) { + score.rule = 2; + goto record_it; + } else + continue; + } else if (score.scope < hiscore.scope) { + if (score.scope < daddr_scope) + continue; + else { + score.rule = 2; + goto record_it; + } + } - /* - * dev == NULL or search failed for specified dev - */ + /* Rule 3: Avoid deprecated address */ + if (hiscore.rule < 3) { + if (ipv6_saddr_preferred(hiscore.addr_type) || + !(ifa_result->flags & IFA_F_DEPRECATED)) + hiscore.attrs |= IPV6_SADDR_SCORE_PREFERRED; + hiscore.rule++; + } + if (ipv6_saddr_preferred(score.addr_type) || + !(ifa->flags & IFA_F_DEPRECATED)) { + score.attrs |= IPV6_SADDR_SCORE_PREFERRED; + if (!(hiscore.attrs & IPV6_SADDR_SCORE_PREFERRED)) { + score.rule = 3; + goto record_it; + } + } else { + if (hiscore.attrs & IPV6_SADDR_SCORE_PREFERRED) + continue; + } - read_lock(&dev_base_lock); - read_lock(&addrconf_lock); - for (dev = dev_base; dev; dev=dev->next) { - idev = __in6_dev_get(dev); - if (idev) { - read_lock_bh(&idev->lock); - for (ifp=idev->addr_list; ifp; ifp=ifp->if_next) { - if (ifp->scope == scope) { - if (ifp->flags&IFA_F_TENTATIVE) - continue; -#ifdef CONFIG_IPV6_PRIVACY - score = ipv6_saddr_pref(ifp, idev->cnf.use_tempaddr > 1 ? IFA_F_TEMPORARY : 0); -#else - score = ipv6_saddr_pref(ifp, 0); -#endif - if (score <= hiscore) - continue; + /* Rule 4: Prefer home address -- not implemented yet */ - if (match) - in6_ifa_put(match); - match = ifp; - hiscore = score; - in6_ifa_hold(ifp); + /* Rule 5: Prefer outgoing interface */ + if (hiscore.rule < 5) { + if (daddr_dev == NULL || + daddr_dev == ifa_result->idev->dev) + hiscore.attrs |= IPV6_SADDR_SCORE_OIF; + hiscore.rule++; + } + if (daddr_dev == NULL || + daddr_dev == ifa->idev->dev) { + score.attrs |= IPV6_SADDR_SCORE_OIF; + if (!(hiscore.attrs & IPV6_SADDR_SCORE_OIF)) { + score.rule = 5; + goto record_it; + } + } else { + if (hiscore.attrs & IPV6_SADDR_SCORE_OIF) + continue; + } - if (IPV6_GET_SADDR_MAXSCORE(score)) { - read_unlock_bh(&idev->lock); - goto out_unlock_base; - } + /* Rule 6: Prefer matching label */ + if (hiscore.rule < 6) { + if (ipv6_saddr_label(&ifa_result->addr, hiscore.addr_type) == daddr_label) + hiscore.attrs |= IPV6_SADDR_SCORE_LABEL; + hiscore.rule++; + } + if (ipv6_saddr_label(&ifa->addr, score.addr_type) == daddr_label) { + score.attrs |= IPV6_SADDR_SCORE_LABEL; + if (!(hiscore.attrs & IPV6_SADDR_SCORE_LABEL)) { + score.rule = 6; + goto record_it; } + } else { + if (hiscore.attrs & IPV6_SADDR_SCORE_LABEL) + continue; } - read_unlock_bh(&idev->lock); + + /* Rule 7: Prefer public address + * Note: prefer temprary address if use_tempaddr >= 2 + */ + if (hiscore.rule < 7) { + if ((!(ifa_result->flags & IFA_F_TEMPORARY)) ^ + (ifa_result->idev->cnf.use_tempaddr >= 2)) + hiscore.attrs |= IPV6_SADDR_SCORE_PRIVACY; + hiscore.rule++; + } + if ((!(ifa->flags & IFA_F_TEMPORARY)) ^ + (ifa->idev->cnf.use_tempaddr >= 2)) { + score.attrs |= IPV6_SADDR_SCORE_PRIVACY; + if (!(hiscore.attrs & IPV6_SADDR_SCORE_PRIVACY)) { + score.rule = 7; + goto record_it; + } + } else { + if (hiscore.attrs & IPV6_SADDR_SCORE_PRIVACY) + continue; + } + + /* Rule 8: Use longest matching prefix */ + if (hiscore.rule < 8) + hiscore.matchlen = ipv6_addr_diff(&ifa_result->addr, daddr); + score.rule++; + score.matchlen = ipv6_addr_diff(&ifa->addr, daddr); + if (score.matchlen > hiscore.matchlen) { + score.rule = 8; + goto record_it; + } +#if 0 + else if (score.matchlen < hiscore.matchlen) + continue; +#endif + + /* Final Rule: choose first available one */ + continue; +record_it: + if (ifa_result) + in6_ifa_put(ifa_result); + in6_ifa_hold(ifa); + ifa_result = ifa; + hiscore = score; } + read_unlock_bh(&idev->lock); } - -out_unlock_base: read_unlock(&addrconf_lock); read_unlock(&dev_base_lock); -out: - err = -EADDRNOTAVAIL; - if (match) { - ipv6_addr_copy(saddr, &match->addr); - err = 0; - in6_ifa_put(match); - } - - return err; + if (!ifa_result) + return -EADDRNOTAVAIL; + + ipv6_addr_copy(saddr, &ifa_result->addr); + in6_ifa_put(ifa_result); + return 0; } -- cgit v1.2.3 From b541ca2c5a3f3f399d6f2ec9da33c1be5a8d8c19 Mon Sep 17 00:00:00 2001 From: Thomas Graf Date: Tue, 8 Nov 2005 09:39:17 -0800 Subject: [PKT_SCHED]: Correctly handle empty ematch trees Fixes an invalid memory reference when the basic classifier is used without any ematches but just actions. Signed-off-by: Thomas Graf Signed-off-by: David S. Miller --- net/sched/ematch.c | 5 +++++ 1 file changed, 5 insertions(+) (limited to 'net') diff --git a/net/sched/ematch.c b/net/sched/ematch.c index ebfe2e7d21bd..64b047c65568 100644 --- a/net/sched/ematch.c +++ b/net/sched/ematch.c @@ -298,6 +298,11 @@ int tcf_em_tree_validate(struct tcf_proto *tp, struct rtattr *rta, struct tcf_ematch_tree_hdr *tree_hdr; struct tcf_ematch *em; + if (!rta) { + memset(tree, 0, sizeof(*tree)); + return 0; + } + if (rtattr_parse_nested(tb, TCA_EMATCH_TREE_MAX, rta) < 0) goto errout; -- cgit v1.2.3 From dc8103f25fd7cfac2c2b295f33edc10f255b4c80 Mon Sep 17 00:00:00 2001 From: Julian Anastasov Date: Tue, 8 Nov 2005 09:40:05 -0800 Subject: [IPVS]: fix connection leak if expire_nodest_conn=1 There was a fix in 2.6.13 that changed the behaviour of ip_vs_conn_expire_now function not to put reference to connection, its callers should hold write lock or connection refcnt. But we forgot to convert one caller, when the real server for connection is unavailable caller should put the connection reference. It happens only when sysctl var expire_nodest_conn is set to 1 and such connections never expire. Thanks to Roberto Nibali who found the problem and tested a 2.4.32-rc2 patch, which is equal to this 2.6 version. Patch for 2.4 is already sent to Marcelo. Signed-off-by: Julian Anastasov Signed-off-by: Roberto Nibali Signed-off-by: David S. Miller --- net/ipv4/ipvs/ip_vs_core.c | 7 +++---- 1 file changed, 3 insertions(+), 4 deletions(-) (limited to 'net') diff --git a/net/ipv4/ipvs/ip_vs_core.c b/net/ipv4/ipvs/ip_vs_core.c index 981cc3244ef2..1a0843cd58a9 100644 --- a/net/ipv4/ipvs/ip_vs_core.c +++ b/net/ipv4/ipvs/ip_vs_core.c @@ -1009,11 +1009,10 @@ ip_vs_in(unsigned int hooknum, struct sk_buff **pskb, if (sysctl_ip_vs_expire_nodest_conn) { /* try to expire the connection immediately */ ip_vs_conn_expire_now(cp); - } else { - /* don't restart its timer, and silently - drop the packet. */ - __ip_vs_conn_put(cp); } + /* don't restart its timer, and silently + drop the packet. */ + __ip_vs_conn_put(cp); return NF_DROP; } -- cgit v1.2.3 From a51482bde22f99c63fbbb57d5d46cc666384e379 Mon Sep 17 00:00:00 2001 From: Jesper Juhl Date: Tue, 8 Nov 2005 09:41:34 -0800 Subject: [NET]: kfree cleanup From: Jesper Juhl This is the net/ part of the big kfree cleanup patch. Remove pointless checks for NULL prior to calling kfree() in net/. Signed-off-by: Jesper Juhl Cc: "David S. Miller" Cc: Arnaldo Carvalho de Melo Acked-by: Marcel Holtmann Acked-by: YOSHIFUJI Hideaki Signed-off-by: Andrew Morton --- net/802/p8023.c | 3 +-- net/ax25/af_ax25.c | 6 ++---- net/ax25/ax25_in.c | 6 ++---- net/ax25/ax25_route.c | 19 ++++++------------- net/bluetooth/hidp/core.c | 4 +--- net/core/dev_mcast.c | 3 +-- net/core/sock.c | 3 +-- net/dccp/ipv4.c | 6 ++---- net/dccp/proto.c | 3 +-- net/decnet/dn_table.c | 14 ++++++-------- net/ethernet/pe2.c | 3 +-- net/ipv4/af_inet.c | 3 +-- net/ipv4/fib_frontend.c | 3 +-- net/ipv4/ip_options.c | 3 +-- net/ipv4/ip_output.c | 12 ++++-------- net/ipv4/ip_sockglue.c | 12 ++++-------- net/ipv4/ipvs/ip_vs_app.c | 6 ++---- net/ipv4/multipath_wrandom.c | 10 +++------- net/ipv4/netfilter/ip_nat_snmp_basic.c | 3 +-- net/ipv4/tcp_ipv4.c | 3 +-- net/ipv6/addrconf.c | 3 +-- net/ipv6/ip6_output.c | 15 +++++---------- net/ipv6/ip6_tunnel.c | 6 ++---- net/ipv6/ipcomp6.c | 3 +-- net/ipv6/ipv6_sockglue.c | 3 +-- net/irda/discovery.c | 3 +-- net/irda/irias_object.c | 16 ++++++---------- net/rose/rose_route.c | 6 ++---- net/sched/cls_fw.c | 3 +-- net/sched/cls_route.c | 3 +-- net/sched/cls_rsvp.h | 3 +-- net/sched/cls_tcindex.c | 9 +++------ net/sched/cls_u32.c | 4 ++-- net/sched/em_meta.c | 3 +-- net/sctp/associola.c | 4 +--- net/sctp/sm_make_chunk.c | 6 ++---- net/sunrpc/auth_gss/gss_krb5_seal.c | 2 +- net/sunrpc/auth_gss/gss_krb5_unseal.c | 2 +- net/sunrpc/auth_gss/gss_mech_switch.c | 3 +-- net/sunrpc/auth_gss/gss_spkm3_seal.c | 3 +-- net/sunrpc/auth_gss/gss_spkm3_token.c | 3 +-- net/sunrpc/auth_gss/gss_spkm3_unseal.c | 6 ++---- net/sunrpc/svc.c | 9 +++------ net/sunrpc/xdr.c | 3 +-- net/wanrouter/af_wanpipe.c | 20 +++++++------------- net/wanrouter/wanmain.c | 12 ++++-------- net/xfrm/xfrm_state.c | 12 ++++-------- 47 files changed, 99 insertions(+), 191 deletions(-) (limited to 'net') diff --git a/net/802/p8023.c b/net/802/p8023.c index 6368d3dce444..d23e906456eb 100644 --- a/net/802/p8023.c +++ b/net/802/p8023.c @@ -54,8 +54,7 @@ struct datalink_proto *make_8023_client(void) */ void destroy_8023_client(struct datalink_proto *dl) { - if (dl) - kfree(dl); + kfree(dl); } EXPORT_SYMBOL(destroy_8023_client); diff --git a/net/ax25/af_ax25.c b/net/ax25/af_ax25.c index 8e37e71e34ff..1b683f302657 100644 --- a/net/ax25/af_ax25.c +++ b/net/ax25/af_ax25.c @@ -1138,10 +1138,8 @@ static int ax25_connect(struct socket *sock, struct sockaddr *uaddr, sk->sk_state = TCP_CLOSE; sock->state = SS_UNCONNECTED; - if (ax25->digipeat != NULL) { - kfree(ax25->digipeat); - ax25->digipeat = NULL; - } + kfree(ax25->digipeat); + ax25->digipeat = NULL; /* * Handle digi-peaters to be used. diff --git a/net/ax25/ax25_in.c b/net/ax25/ax25_in.c index 73cfc3411c46..4cf87540fb3a 100644 --- a/net/ax25/ax25_in.c +++ b/net/ax25/ax25_in.c @@ -401,10 +401,8 @@ static int ax25_rcv(struct sk_buff *skb, struct net_device *dev, } if (dp.ndigi == 0) { - if (ax25->digipeat != NULL) { - kfree(ax25->digipeat); - ax25->digipeat = NULL; - } + kfree(ax25->digipeat); + ax25->digipeat = NULL; } else { /* Reverse the source SABM's path */ memcpy(ax25->digipeat, &reverse_dp, sizeof(ax25_digi)); diff --git a/net/ax25/ax25_route.c b/net/ax25/ax25_route.c index 26b77d972220..b1e945bd6ed3 100644 --- a/net/ax25/ax25_route.c +++ b/net/ax25/ax25_route.c @@ -54,15 +54,13 @@ void ax25_rt_device_down(struct net_device *dev) if (s->dev == dev) { if (ax25_route_list == s) { ax25_route_list = s->next; - if (s->digipeat != NULL) - kfree(s->digipeat); + kfree(s->digipeat); kfree(s); } else { for (t = ax25_route_list; t != NULL; t = t->next) { if (t->next == s) { t->next = s->next; - if (s->digipeat != NULL) - kfree(s->digipeat); + kfree(s->digipeat); kfree(s); break; } @@ -90,10 +88,8 @@ static int ax25_rt_add(struct ax25_routes_struct *route) while (ax25_rt != NULL) { if (ax25cmp(&ax25_rt->callsign, &route->dest_addr) == 0 && ax25_rt->dev == ax25_dev->dev) { - if (ax25_rt->digipeat != NULL) { - kfree(ax25_rt->digipeat); - ax25_rt->digipeat = NULL; - } + kfree(ax25_rt->digipeat); + ax25_rt->digipeat = NULL; if (route->digi_count != 0) { if ((ax25_rt->digipeat = kmalloc(sizeof(ax25_digi), GFP_ATOMIC)) == NULL) { write_unlock(&ax25_route_lock); @@ -145,8 +141,7 @@ static int ax25_rt_add(struct ax25_routes_struct *route) static void ax25_rt_destroy(ax25_route *ax25_rt) { if (atomic_read(&ax25_rt->ref) == 0) { - if (ax25_rt->digipeat != NULL) - kfree(ax25_rt->digipeat); + kfree(ax25_rt->digipeat); kfree(ax25_rt); return; } @@ -530,9 +525,7 @@ void __exit ax25_rt_free(void) s = ax25_rt; ax25_rt = ax25_rt->next; - if (s->digipeat != NULL) - kfree(s->digipeat); - + kfree(s->digipeat); kfree(s); } write_unlock(&ax25_route_lock); diff --git a/net/bluetooth/hidp/core.c b/net/bluetooth/hidp/core.c index 860444a7fc0f..cdb9cfafd960 100644 --- a/net/bluetooth/hidp/core.c +++ b/net/bluetooth/hidp/core.c @@ -660,9 +660,7 @@ unlink: failed: up_write(&hidp_session_sem); - if (session->input) - kfree(session->input); - + kfree(session->input); kfree(session); return err; } diff --git a/net/core/dev_mcast.c b/net/core/dev_mcast.c index db098ff3cd6a..cb530eef0e39 100644 --- a/net/core/dev_mcast.c +++ b/net/core/dev_mcast.c @@ -194,8 +194,7 @@ int dev_mc_add(struct net_device *dev, void *addr, int alen, int glbl) done: spin_unlock_bh(&dev->xmit_lock); - if (dmi1) - kfree(dmi1); + kfree(dmi1); return err; } diff --git a/net/core/sock.c b/net/core/sock.c index 9602ceb3bac9..13cc3be4f056 100644 --- a/net/core/sock.c +++ b/net/core/sock.c @@ -1242,8 +1242,7 @@ static void sock_def_write_space(struct sock *sk) static void sock_def_destruct(struct sock *sk) { - if (sk->sk_protinfo) - kfree(sk->sk_protinfo); + kfree(sk->sk_protinfo); } void sk_send_sigurg(struct sock *sk) diff --git a/net/dccp/ipv4.c b/net/dccp/ipv4.c index 4b9bc81ae1a3..ca03521112c5 100644 --- a/net/dccp/ipv4.c +++ b/net/dccp/ipv4.c @@ -1263,10 +1263,8 @@ static int dccp_v4_destroy_sock(struct sock *sk) if (inet_csk(sk)->icsk_bind_hash != NULL) inet_put_port(&dccp_hashinfo, sk); - if (dp->dccps_service_list != NULL) { - kfree(dp->dccps_service_list); - dp->dccps_service_list = NULL; - } + kfree(dp->dccps_service_list); + dp->dccps_service_list = NULL; ccid_hc_rx_exit(dp->dccps_hc_rx_ccid, sk); ccid_hc_tx_exit(dp->dccps_hc_tx_ccid, sk); diff --git a/net/dccp/proto.c b/net/dccp/proto.c index a021c3422f67..e0ace7cbb996 100644 --- a/net/dccp/proto.c +++ b/net/dccp/proto.c @@ -238,8 +238,7 @@ static int dccp_setsockopt_service(struct sock *sk, const u32 service, lock_sock(sk); dp->dccps_service = service; - if (dp->dccps_service_list != NULL) - kfree(dp->dccps_service_list); + kfree(dp->dccps_service_list); dp->dccps_service_list = sl; release_sock(sk); diff --git a/net/decnet/dn_table.c b/net/decnet/dn_table.c index eeba56f99323..6f8b5658cb4e 100644 --- a/net/decnet/dn_table.c +++ b/net/decnet/dn_table.c @@ -784,16 +784,14 @@ struct dn_fib_table *dn_fib_get_table(int n, int create) static void dn_fib_del_tree(int n) { - struct dn_fib_table *t; + struct dn_fib_table *t; - write_lock(&dn_fib_tables_lock); - t = dn_fib_tables[n]; - dn_fib_tables[n] = NULL; - write_unlock(&dn_fib_tables_lock); + write_lock(&dn_fib_tables_lock); + t = dn_fib_tables[n]; + dn_fib_tables[n] = NULL; + write_unlock(&dn_fib_tables_lock); - if (t) { - kfree(t); - } + kfree(t); } struct dn_fib_table *dn_fib_empty_table(void) diff --git a/net/ethernet/pe2.c b/net/ethernet/pe2.c index 98a494be6039..9d57b4fb6440 100644 --- a/net/ethernet/pe2.c +++ b/net/ethernet/pe2.c @@ -32,8 +32,7 @@ struct datalink_proto *make_EII_client(void) void destroy_EII_client(struct datalink_proto *dl) { - if (dl) - kfree(dl); + kfree(dl); } EXPORT_SYMBOL(destroy_EII_client); diff --git a/net/ipv4/af_inet.c b/net/ipv4/af_inet.c index a9d84f93442c..eaa150c33b04 100644 --- a/net/ipv4/af_inet.c +++ b/net/ipv4/af_inet.c @@ -147,8 +147,7 @@ void inet_sock_destruct(struct sock *sk) BUG_TRAP(!sk->sk_wmem_queued); BUG_TRAP(!sk->sk_forward_alloc); - if (inet->opt) - kfree(inet->opt); + kfree(inet->opt); dst_release(sk->sk_dst_cache); sk_refcnt_debug_dec(sk); } diff --git a/net/ipv4/fib_frontend.c b/net/ipv4/fib_frontend.c index 990633c09dfe..2267c1fad879 100644 --- a/net/ipv4/fib_frontend.c +++ b/net/ipv4/fib_frontend.c @@ -266,8 +266,7 @@ int ip_rt_ioctl(unsigned int cmd, void __user *arg) if (tb) err = tb->tb_insert(tb, &req.rtm, &rta, &req.nlh, NULL); } - if (rta.rta_mx) - kfree(rta.rta_mx); + kfree(rta.rta_mx); } rtnl_unlock(); return err; diff --git a/net/ipv4/ip_options.c b/net/ipv4/ip_options.c index bce4e875193b..dbe12da8d8b3 100644 --- a/net/ipv4/ip_options.c +++ b/net/ipv4/ip_options.c @@ -510,8 +510,7 @@ static int ip_options_get_finish(struct ip_options **optp, kfree(opt); return -EINVAL; } - if (*optp) - kfree(*optp); + kfree(*optp); *optp = opt; return 0; } diff --git a/net/ipv4/ip_output.c b/net/ipv4/ip_output.c index 17758234a3e3..df7f20da422d 100644 --- a/net/ipv4/ip_output.c +++ b/net/ipv4/ip_output.c @@ -1262,10 +1262,8 @@ int ip_push_pending_frames(struct sock *sk) out: inet->cork.flags &= ~IPCORK_OPT; - if (inet->cork.opt) { - kfree(inet->cork.opt); - inet->cork.opt = NULL; - } + kfree(inet->cork.opt); + inet->cork.opt = NULL; if (inet->cork.rt) { ip_rt_put(inet->cork.rt); inet->cork.rt = NULL; @@ -1289,10 +1287,8 @@ void ip_flush_pending_frames(struct sock *sk) kfree_skb(skb); inet->cork.flags &= ~IPCORK_OPT; - if (inet->cork.opt) { - kfree(inet->cork.opt); - inet->cork.opt = NULL; - } + kfree(inet->cork.opt); + inet->cork.opt = NULL; if (inet->cork.rt) { ip_rt_put(inet->cork.rt); inet->cork.rt = NULL; diff --git a/net/ipv4/ip_sockglue.c b/net/ipv4/ip_sockglue.c index 2f0b47da5b37..4f2d87257309 100644 --- a/net/ipv4/ip_sockglue.c +++ b/net/ipv4/ip_sockglue.c @@ -202,8 +202,7 @@ int ip_ra_control(struct sock *sk, unsigned char on, void (*destructor)(struct s if (ra->sk == sk) { if (on) { write_unlock_bh(&ip_ra_lock); - if (new_ra) - kfree(new_ra); + kfree(new_ra); return -EADDRINUSE; } *rap = ra->next; @@ -446,8 +445,7 @@ int ip_setsockopt(struct sock *sk, int level, int optname, char __user *optval, #endif } opt = xchg(&inet->opt, opt); - if (opt) - kfree(opt); + kfree(opt); break; } case IP_PKTINFO: @@ -828,10 +826,8 @@ int ip_setsockopt(struct sock *sk, int level, int optname, char __user *optval, err = ip_mc_msfilter(sk, msf, ifindex); mc_msf_out: - if (msf) - kfree(msf); - if (gsf) - kfree(gsf); + kfree(msf); + kfree(gsf); break; } case IP_ROUTER_ALERT: diff --git a/net/ipv4/ipvs/ip_vs_app.c b/net/ipv4/ipvs/ip_vs_app.c index fc6f95aaa969..d7eb680101c2 100644 --- a/net/ipv4/ipvs/ip_vs_app.c +++ b/net/ipv4/ipvs/ip_vs_app.c @@ -110,8 +110,7 @@ ip_vs_app_inc_new(struct ip_vs_app *app, __u16 proto, __u16 port) return 0; out: - if (inc->timeout_table) - kfree(inc->timeout_table); + kfree(inc->timeout_table); kfree(inc); return ret; } @@ -136,8 +135,7 @@ ip_vs_app_inc_release(struct ip_vs_app *inc) list_del(&inc->a_list); - if (inc->timeout_table != NULL) - kfree(inc->timeout_table); + kfree(inc->timeout_table); kfree(inc); } diff --git a/net/ipv4/multipath_wrandom.c b/net/ipv4/multipath_wrandom.c index bd7d75b6abe0..d34a9fa608e0 100644 --- a/net/ipv4/multipath_wrandom.c +++ b/net/ipv4/multipath_wrandom.c @@ -207,16 +207,12 @@ static void wrandom_select_route(const struct flowi *flp, decision = mpc->rt; last_power = mpc->power; - if (last_mpc) - kfree(last_mpc); - + kfree(last_mpc); last_mpc = mpc; } - if (last_mpc) { - /* concurrent __multipath_flush may lead to !last_mpc */ - kfree(last_mpc); - } + /* concurrent __multipath_flush may lead to !last_mpc */ + kfree(last_mpc); decision->u.dst.__use++; *rp = decision; diff --git a/net/ipv4/netfilter/ip_nat_snmp_basic.c b/net/ipv4/netfilter/ip_nat_snmp_basic.c index 93b2c5111bb2..8acb7ed40b47 100644 --- a/net/ipv4/netfilter/ip_nat_snmp_basic.c +++ b/net/ipv4/netfilter/ip_nat_snmp_basic.c @@ -1161,8 +1161,7 @@ static int snmp_parse_mangle(unsigned char *msg, if (!snmp_object_decode(&ctx, obj)) { if (*obj) { - if ((*obj)->id) - kfree((*obj)->id); + kfree((*obj)->id); kfree(*obj); } kfree(obj); diff --git a/net/ipv4/tcp_ipv4.c b/net/ipv4/tcp_ipv4.c index 49d67cd75edd..634dabb558fd 100644 --- a/net/ipv4/tcp_ipv4.c +++ b/net/ipv4/tcp_ipv4.c @@ -823,8 +823,7 @@ out: */ static void tcp_v4_reqsk_destructor(struct request_sock *req) { - if (inet_rsk(req)->opt) - kfree(inet_rsk(req)->opt); + kfree(inet_rsk(req)->opt); } static inline void syn_flood_warning(struct sk_buff *skb) diff --git a/net/ipv6/addrconf.c b/net/ipv6/addrconf.c index a34d1504deb9..b7a5f51238b3 100644 --- a/net/ipv6/addrconf.c +++ b/net/ipv6/addrconf.c @@ -3090,8 +3090,7 @@ static int inet6_fill_ifinfo(struct sk_buff *skb, struct inet6_dev *idev, nlmsg_failure: rtattr_failure: - if (array) - kfree(array); + kfree(array); skb_trim(skb, b - skb->data); return -1; } diff --git a/net/ipv6/ip6_output.c b/net/ipv6/ip6_output.c index 614296a920c6..dbd9767b32e4 100644 --- a/net/ipv6/ip6_output.c +++ b/net/ipv6/ip6_output.c @@ -587,8 +587,7 @@ static int ip6_fragment(struct sk_buff *skb, int (*output)(struct sk_buff *)) skb->next = NULL; } - if (tmp_hdr) - kfree(tmp_hdr); + kfree(tmp_hdr); if (err == 0) { IP6_INC_STATS(IPSTATS_MIB_FRAGOKS); @@ -1186,10 +1185,8 @@ int ip6_push_pending_frames(struct sock *sk) out: inet->cork.flags &= ~IPCORK_OPT; - if (np->cork.opt) { - kfree(np->cork.opt); - np->cork.opt = NULL; - } + kfree(np->cork.opt); + np->cork.opt = NULL; if (np->cork.rt) { dst_release(&np->cork.rt->u.dst); np->cork.rt = NULL; @@ -1214,10 +1211,8 @@ void ip6_flush_pending_frames(struct sock *sk) inet->cork.flags &= ~IPCORK_OPT; - if (np->cork.opt) { - kfree(np->cork.opt); - np->cork.opt = NULL; - } + kfree(np->cork.opt); + np->cork.opt = NULL; if (np->cork.rt) { dst_release(&np->cork.rt->u.dst); np->cork.rt = NULL; diff --git a/net/ipv6/ip6_tunnel.c b/net/ipv6/ip6_tunnel.c index cf94372d1af3..e6b0e3954c02 100644 --- a/net/ipv6/ip6_tunnel.c +++ b/net/ipv6/ip6_tunnel.c @@ -756,8 +756,7 @@ ip6ip6_tnl_xmit(struct sk_buff *skb, struct net_device *dev) } ip6_tnl_dst_store(t, dst); - if (opt) - kfree(opt); + kfree(opt); t->recursion--; return 0; @@ -766,8 +765,7 @@ tx_err_link_failure: dst_link_failure(skb); tx_err_dst_release: dst_release(dst); - if (opt) - kfree(opt); + kfree(opt); tx_err: stats->tx_errors++; stats->tx_dropped++; diff --git a/net/ipv6/ipcomp6.c b/net/ipv6/ipcomp6.c index 85bfbc69b2c3..55917fb17094 100644 --- a/net/ipv6/ipcomp6.c +++ b/net/ipv6/ipcomp6.c @@ -130,8 +130,7 @@ static int ipcomp6_input(struct xfrm_state *x, struct xfrm_decap_state *decap, s out_put_cpu: put_cpu(); out: - if (tmp_hdr) - kfree(tmp_hdr); + kfree(tmp_hdr); if (err) goto error_out; return nexthdr; diff --git a/net/ipv6/ipv6_sockglue.c b/net/ipv6/ipv6_sockglue.c index 8567873d0dd8..003fd99ff597 100644 --- a/net/ipv6/ipv6_sockglue.c +++ b/net/ipv6/ipv6_sockglue.c @@ -80,8 +80,7 @@ int ip6_ra_control(struct sock *sk, int sel, void (*destructor)(struct sock *)) if (ra->sk == sk) { if (sel>=0) { write_unlock_bh(&ip6_ra_lock); - if (new_ra) - kfree(new_ra); + kfree(new_ra); return -EADDRINUSE; } diff --git a/net/irda/discovery.c b/net/irda/discovery.c index c4ba5fa1446a..3fefc822c1c0 100644 --- a/net/irda/discovery.c +++ b/net/irda/discovery.c @@ -194,8 +194,7 @@ void irlmp_expire_discoveries(hashbin_t *log, __u32 saddr, int force) /* Remove it from the log */ curr = hashbin_remove_this(log, (irda_queue_t *) curr); - if (curr) - kfree(curr); + kfree(curr); } } diff --git a/net/irda/irias_object.c b/net/irda/irias_object.c index 6fec428b4512..75f2666e8630 100644 --- a/net/irda/irias_object.c +++ b/net/irda/irias_object.c @@ -122,8 +122,7 @@ static void __irias_delete_attrib(struct ias_attrib *attrib) IRDA_ASSERT(attrib != NULL, return;); IRDA_ASSERT(attrib->magic == IAS_ATTRIB_MAGIC, return;); - if (attrib->name) - kfree(attrib->name); + kfree(attrib->name); irias_delete_value(attrib->value); attrib->magic = ~IAS_ATTRIB_MAGIC; @@ -136,8 +135,7 @@ void __irias_delete_object(struct ias_object *obj) IRDA_ASSERT(obj != NULL, return;); IRDA_ASSERT(obj->magic == IAS_OBJECT_MAGIC, return;); - if (obj->name) - kfree(obj->name); + kfree(obj->name); hashbin_delete(obj->attribs, (FREE_FUNC) __irias_delete_attrib); @@ -562,14 +560,12 @@ void irias_delete_value(struct ias_value *value) /* No need to deallocate */ break; case IAS_STRING: - /* If string, deallocate string */ - if (value->t.string != NULL) - kfree(value->t.string); + /* Deallocate string */ + kfree(value->t.string); break; case IAS_OCT_SEQ: - /* If byte stream, deallocate byte stream */ - if (value->t.oct_seq != NULL) - kfree(value->t.oct_seq); + /* Deallocate byte stream */ + kfree(value->t.oct_seq); break; default: IRDA_DEBUG(0, "%s(), Unknown value type!\n", __FUNCTION__); diff --git a/net/rose/rose_route.c b/net/rose/rose_route.c index b18fe5043019..8631b65a7312 100644 --- a/net/rose/rose_route.c +++ b/net/rose/rose_route.c @@ -240,8 +240,7 @@ static void rose_remove_neigh(struct rose_neigh *rose_neigh) if ((s = rose_neigh_list) == rose_neigh) { rose_neigh_list = rose_neigh->next; spin_unlock_bh(&rose_neigh_list_lock); - if (rose_neigh->digipeat != NULL) - kfree(rose_neigh->digipeat); + kfree(rose_neigh->digipeat); kfree(rose_neigh); return; } @@ -250,8 +249,7 @@ static void rose_remove_neigh(struct rose_neigh *rose_neigh) if (s->next == rose_neigh) { s->next = rose_neigh->next; spin_unlock_bh(&rose_neigh_list_lock); - if (rose_neigh->digipeat != NULL) - kfree(rose_neigh->digipeat); + kfree(rose_neigh->digipeat); kfree(rose_neigh); return; } diff --git a/net/sched/cls_fw.c b/net/sched/cls_fw.c index 29d8b9a4d162..75470486e405 100644 --- a/net/sched/cls_fw.c +++ b/net/sched/cls_fw.c @@ -298,8 +298,7 @@ static int fw_change(struct tcf_proto *tp, unsigned long base, return 0; errout: - if (f) - kfree(f); + kfree(f); return err; } diff --git a/net/sched/cls_route.c b/net/sched/cls_route.c index 02996ac05c75..520ff716dab2 100644 --- a/net/sched/cls_route.c +++ b/net/sched/cls_route.c @@ -525,8 +525,7 @@ reinsert: return 0; errout: - if (f) - kfree(f); + kfree(f); return err; } diff --git a/net/sched/cls_rsvp.h b/net/sched/cls_rsvp.h index 006168d69376..572f06be3b02 100644 --- a/net/sched/cls_rsvp.h +++ b/net/sched/cls_rsvp.h @@ -555,8 +555,7 @@ insert: goto insert; errout: - if (f) - kfree(f); + kfree(f); errout2: tcf_exts_destroy(tp, &e); return err; diff --git a/net/sched/cls_tcindex.c b/net/sched/cls_tcindex.c index 404d9d83a7fa..9f921174c8ab 100644 --- a/net/sched/cls_tcindex.c +++ b/net/sched/cls_tcindex.c @@ -194,8 +194,7 @@ found: } tcf_unbind_filter(tp, &r->res); tcf_exts_destroy(tp, &r->exts); - if (f) - kfree(f); + kfree(f); return 0; } @@ -442,10 +441,8 @@ static void tcindex_destroy(struct tcf_proto *tp) walker.skip = 0; walker.fn = &tcindex_destroy_element; tcindex_walk(tp,&walker); - if (p->perfect) - kfree(p->perfect); - if (p->h) - kfree(p->h); + kfree(p->perfect); + kfree(p->h); kfree(p); tp->root = NULL; } diff --git a/net/sched/cls_u32.c b/net/sched/cls_u32.c index 364b87d86455..2b670479dde1 100644 --- a/net/sched/cls_u32.c +++ b/net/sched/cls_u32.c @@ -347,7 +347,7 @@ static int u32_destroy_key(struct tcf_proto *tp, struct tc_u_knode *n) if (n->ht_down) n->ht_down->refcnt--; #ifdef CONFIG_CLS_U32_PERF - if (n && (NULL != n->pf)) + if (n) kfree(n->pf); #endif kfree(n); @@ -680,7 +680,7 @@ static int u32_change(struct tcf_proto *tp, unsigned long base, u32 handle, return 0; } #ifdef CONFIG_CLS_U32_PERF - if (n && (NULL != n->pf)) + if (n) kfree(n->pf); #endif kfree(n); diff --git a/net/sched/em_meta.c b/net/sched/em_meta.c index cf68a59fdc5a..700844d49d79 100644 --- a/net/sched/em_meta.c +++ b/net/sched/em_meta.c @@ -561,8 +561,7 @@ static int meta_var_change(struct meta_value *dst, struct rtattr *rta) static void meta_var_destroy(struct meta_value *v) { - if (v->val) - kfree((void *) v->val); + kfree((void *) v->val); } static void meta_var_apply_extras(struct meta_value *v, diff --git a/net/sctp/associola.c b/net/sctp/associola.c index 12b0f582a66b..8c8ddf7f9b61 100644 --- a/net/sctp/associola.c +++ b/net/sctp/associola.c @@ -344,9 +344,7 @@ void sctp_association_free(struct sctp_association *asoc) } /* Free peer's cached cookie. */ - if (asoc->peer.cookie) { - kfree(asoc->peer.cookie); - } + kfree(asoc->peer.cookie); /* Release the transport structures. */ list_for_each_safe(pos, temp, &asoc->peer.transport_addr_list) { diff --git a/net/sctp/sm_make_chunk.c b/net/sctp/sm_make_chunk.c index 660c61bdf164..f9573eba5c7a 100644 --- a/net/sctp/sm_make_chunk.c +++ b/net/sctp/sm_make_chunk.c @@ -254,8 +254,7 @@ struct sctp_chunk *sctp_make_init(const struct sctp_association *asoc, aiparam.adaption_ind = htonl(sp->adaption_ind); sctp_addto_chunk(retval, sizeof(aiparam), &aiparam); nodata: - if (addrs.v) - kfree(addrs.v); + kfree(addrs.v); return retval; } @@ -347,8 +346,7 @@ struct sctp_chunk *sctp_make_init_ack(const struct sctp_association *asoc, nomem_chunk: kfree(cookie); nomem_cookie: - if (addrs.v) - kfree(addrs.v); + kfree(addrs.v); return retval; } diff --git a/net/sunrpc/auth_gss/gss_krb5_seal.c b/net/sunrpc/auth_gss/gss_krb5_seal.c index 13f8ae979454..d0dfdfd5e79e 100644 --- a/net/sunrpc/auth_gss/gss_krb5_seal.c +++ b/net/sunrpc/auth_gss/gss_krb5_seal.c @@ -143,6 +143,6 @@ gss_get_mic_kerberos(struct gss_ctx *gss_ctx, struct xdr_buf *text, return ((ctx->endtime < now) ? GSS_S_CONTEXT_EXPIRED : GSS_S_COMPLETE); out_err: - if (md5cksum.data) kfree(md5cksum.data); + kfree(md5cksum.data); return GSS_S_FAILURE; } diff --git a/net/sunrpc/auth_gss/gss_krb5_unseal.c b/net/sunrpc/auth_gss/gss_krb5_unseal.c index 2030475d98ed..db055fd7d778 100644 --- a/net/sunrpc/auth_gss/gss_krb5_unseal.c +++ b/net/sunrpc/auth_gss/gss_krb5_unseal.c @@ -176,6 +176,6 @@ gss_verify_mic_kerberos(struct gss_ctx *gss_ctx, ret = GSS_S_COMPLETE; out: - if (md5cksum.data) kfree(md5cksum.data); + kfree(md5cksum.data); return ret; } diff --git a/net/sunrpc/auth_gss/gss_mech_switch.c b/net/sunrpc/auth_gss/gss_mech_switch.c index b048bf672da2..f8bac6ccd524 100644 --- a/net/sunrpc/auth_gss/gss_mech_switch.c +++ b/net/sunrpc/auth_gss/gss_mech_switch.c @@ -60,8 +60,7 @@ gss_mech_free(struct gss_api_mech *gm) for (i = 0; i < gm->gm_pf_num; i++) { pf = &gm->gm_pfs[i]; - if (pf->auth_domain_name) - kfree(pf->auth_domain_name); + kfree(pf->auth_domain_name); pf->auth_domain_name = NULL; } } diff --git a/net/sunrpc/auth_gss/gss_spkm3_seal.c b/net/sunrpc/auth_gss/gss_spkm3_seal.c index 148201e929d0..d1e12b25d6e2 100644 --- a/net/sunrpc/auth_gss/gss_spkm3_seal.c +++ b/net/sunrpc/auth_gss/gss_spkm3_seal.c @@ -122,8 +122,7 @@ spkm3_make_token(struct spkm3_ctx *ctx, return GSS_S_COMPLETE; out_err: - if (md5cksum.data) - kfree(md5cksum.data); + kfree(md5cksum.data); token->data = NULL; token->len = 0; return GSS_S_FAILURE; diff --git a/net/sunrpc/auth_gss/gss_spkm3_token.c b/net/sunrpc/auth_gss/gss_spkm3_token.c index 46c08a0710f6..1f824578d773 100644 --- a/net/sunrpc/auth_gss/gss_spkm3_token.c +++ b/net/sunrpc/auth_gss/gss_spkm3_token.c @@ -259,8 +259,7 @@ spkm3_verify_mic_token(unsigned char **tokp, int *mic_hdrlen, unsigned char **ck ret = GSS_S_COMPLETE; out: - if (spkm3_ctx_id.data) - kfree(spkm3_ctx_id.data); + kfree(spkm3_ctx_id.data); return ret; } diff --git a/net/sunrpc/auth_gss/gss_spkm3_unseal.c b/net/sunrpc/auth_gss/gss_spkm3_unseal.c index c3c0d9586103..241d5b30dfcb 100644 --- a/net/sunrpc/auth_gss/gss_spkm3_unseal.c +++ b/net/sunrpc/auth_gss/gss_spkm3_unseal.c @@ -120,9 +120,7 @@ spkm3_read_token(struct spkm3_ctx *ctx, /* XXX: need to add expiration and sequencing */ ret = GSS_S_COMPLETE; out: - if (md5cksum.data) - kfree(md5cksum.data); - if (wire_cksum.data) - kfree(wire_cksum.data); + kfree(md5cksum.data); + kfree(wire_cksum.data); return ret; } diff --git a/net/sunrpc/svc.c b/net/sunrpc/svc.c index 5a220b2bb376..e4296c8b861e 100644 --- a/net/sunrpc/svc.c +++ b/net/sunrpc/svc.c @@ -196,12 +196,9 @@ svc_exit_thread(struct svc_rqst *rqstp) struct svc_serv *serv = rqstp->rq_server; svc_release_buffer(rqstp); - if (rqstp->rq_resp) - kfree(rqstp->rq_resp); - if (rqstp->rq_argp) - kfree(rqstp->rq_argp); - if (rqstp->rq_auth_data) - kfree(rqstp->rq_auth_data); + kfree(rqstp->rq_resp); + kfree(rqstp->rq_argp); + kfree(rqstp->rq_auth_data); kfree(rqstp); /* Release the server */ diff --git a/net/sunrpc/xdr.c b/net/sunrpc/xdr.c index 32df43372ee9..aaf08cdd19f0 100644 --- a/net/sunrpc/xdr.c +++ b/net/sunrpc/xdr.c @@ -992,8 +992,7 @@ xdr_xcode_array2(struct xdr_buf *buf, unsigned int base, err = 0; out: - if (elem) - kfree(elem); + kfree(elem); if (ppages) kunmap(*ppages); return err; diff --git a/net/wanrouter/af_wanpipe.c b/net/wanrouter/af_wanpipe.c index 596cb96e5f47..59fec59b2132 100644 --- a/net/wanrouter/af_wanpipe.c +++ b/net/wanrouter/af_wanpipe.c @@ -1099,7 +1099,7 @@ static void release_driver(struct sock *sk) sock_reset_flag(sk, SOCK_ZAPPED); wp = wp_sk(sk); - if (wp && wp->mbox) { + if (wp) { kfree(wp->mbox); wp->mbox = NULL; } @@ -1186,10 +1186,8 @@ static void wanpipe_kill_sock_timer (unsigned long data) return; } - if (wp_sk(sk)) { - kfree(wp_sk(sk)); - wp_sk(sk) = NULL; - } + kfree(wp_sk(sk)); + wp_sk(sk) = NULL; if (atomic_read(&sk->sk_refcnt) != 1) { atomic_set(&sk->sk_refcnt, 1); @@ -1219,10 +1217,8 @@ static void wanpipe_kill_sock_accept (struct sock *sk) sk->sk_socket = NULL; - if (wp_sk(sk)) { - kfree(wp_sk(sk)); - wp_sk(sk) = NULL; - } + kfree(wp_sk(sk)); + wp_sk(sk) = NULL; if (atomic_read(&sk->sk_refcnt) != 1) { atomic_set(&sk->sk_refcnt, 1); @@ -1243,10 +1239,8 @@ static void wanpipe_kill_sock_irq (struct sock *sk) sk->sk_socket = NULL; - if (wp_sk(sk)) { - kfree(wp_sk(sk)); - wp_sk(sk) = NULL; - } + kfree(wp_sk(sk)); + wp_sk(sk) = NULL; if (atomic_read(&sk->sk_refcnt) != 1) { atomic_set(&sk->sk_refcnt, 1); diff --git a/net/wanrouter/wanmain.c b/net/wanrouter/wanmain.c index 13b650ad22e2..bcf7b3faa76a 100644 --- a/net/wanrouter/wanmain.c +++ b/net/wanrouter/wanmain.c @@ -714,10 +714,8 @@ static int wanrouter_device_new_if(struct wan_device *wandev, } /* This code has moved from del_if() function */ - if (dev->priv) { - kfree(dev->priv); - dev->priv = NULL; - } + kfree(dev->priv); + dev->priv = NULL; #ifdef CONFIG_WANPIPE_MULTPPP if (cnf->config_id == WANCONFIG_MPPP) @@ -851,10 +849,8 @@ static int wanrouter_delete_interface(struct wan_device *wandev, char *name) /* Due to new interface linking method using dev->priv, * this code has moved from del_if() function.*/ - if (dev->priv){ - kfree(dev->priv); - dev->priv=NULL; - } + kfree(dev->priv); + dev->priv=NULL; unregister_netdev(dev); diff --git a/net/xfrm/xfrm_state.c b/net/xfrm/xfrm_state.c index 8b9a4747417d..7cf48aa6c95b 100644 --- a/net/xfrm/xfrm_state.c +++ b/net/xfrm/xfrm_state.c @@ -62,14 +62,10 @@ static void xfrm_state_gc_destroy(struct xfrm_state *x) { if (del_timer(&x->timer)) BUG(); - if (x->aalg) - kfree(x->aalg); - if (x->ealg) - kfree(x->ealg); - if (x->calg) - kfree(x->calg); - if (x->encap) - kfree(x->encap); + kfree(x->aalg); + kfree(x->ealg); + kfree(x->calg); + kfree(x->encap); if (x->type) { x->type->destructor(x); xfrm_put_type(x->type); -- cgit v1.2.3 From 89f5f0aeed14ac7245f760b0b96c9269c87bcbbe Mon Sep 17 00:00:00 2001 From: Herbert Xu Date: Tue, 8 Nov 2005 09:41:56 -0800 Subject: [IPV4]: Fix ip_queue_xmit identity increment for TSO packets When ip_queue_xmit calls ip_select_ident_more for IP identity selection it gives it the wrong packet count for TSO packets. The ip_select_* functions expect one less than the number of packets, so we need to subtract one for TSO packets. This bug was diagnosed and fixed by Tom Young. Signed-off-by: Herbert Xu Signed-off-by: David S. Miller --- net/ipv4/ip_output.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) (limited to 'net') diff --git a/net/ipv4/ip_output.c b/net/ipv4/ip_output.c index df7f20da422d..11c2f68254f0 100644 --- a/net/ipv4/ip_output.c +++ b/net/ipv4/ip_output.c @@ -353,7 +353,8 @@ packet_routed: ip_options_build(skb, opt, inet->daddr, rt, 0); } - ip_select_ident_more(iph, &rt->u.dst, sk, skb_shinfo(skb)->tso_segs); + ip_select_ident_more(iph, &rt->u.dst, sk, + (skb_shinfo(skb)->tso_segs ?: 1) - 1); /* Add an IP checksum. */ ip_send_check(iph); -- cgit v1.2.3 From 1ebb92521d0bc2d4ef772730d29333c06b807191 Mon Sep 17 00:00:00 2001 From: Marcel Holtmann Date: Tue, 8 Nov 2005 09:57:21 -0800 Subject: [Bluetooth]: Add endian annotations to the core This patch adds the endian annotations to the Bluetooth core. Signed-off-by: Marcel Holtmann Signed-off-by: David S. Miller --- drivers/bluetooth/bpa10x.c | 4 +- include/net/bluetooth/hci.h | 116 +++++++++++++++++++-------------------- include/net/bluetooth/hci_core.h | 2 +- net/bluetooth/hci_core.c | 2 +- net/bluetooth/hci_event.c | 6 +- net/bluetooth/hci_sock.c | 2 +- 6 files changed, 66 insertions(+), 66 deletions(-) (limited to 'net') diff --git a/drivers/bluetooth/bpa10x.c b/drivers/bluetooth/bpa10x.c index ecbeb7eaba8e..394796315adc 100644 --- a/drivers/bluetooth/bpa10x.c +++ b/drivers/bluetooth/bpa10x.c @@ -84,8 +84,8 @@ struct bpa10x_data { struct hci_vendor_hdr { __u8 type; - __u16 snum; - __u16 dlen; + __le16 snum; + __le16 dlen; } __attribute__ ((packed)); static void bpa10x_recv_bulk(struct bpa10x_data *data, unsigned char *buf, int count) diff --git a/include/net/bluetooth/hci.h b/include/net/bluetooth/hci.h index fa2d12b0579b..b06a2d2f63d2 100644 --- a/include/net/bluetooth/hci.h +++ b/include/net/bluetooth/hci.h @@ -184,10 +184,10 @@ enum { struct hci_rp_read_loc_version { __u8 status; __u8 hci_ver; - __u16 hci_rev; + __le16 hci_rev; __u8 lmp_ver; - __u16 manufacturer; - __u16 lmp_subver; + __le16 manufacturer; + __le16 lmp_subver; } __attribute__ ((packed)); #define OCF_READ_LOCAL_FEATURES 0x0003 @@ -199,10 +199,10 @@ struct hci_rp_read_loc_features { #define OCF_READ_BUFFER_SIZE 0x0005 struct hci_rp_read_buffer_size { __u8 status; - __u16 acl_mtu; + __le16 acl_mtu; __u8 sco_mtu; - __u16 acl_max_pkt; - __u16 sco_max_pkt; + __le16 acl_max_pkt; + __le16 sco_max_pkt; } __attribute__ ((packed)); #define OCF_READ_BD_ADDR 0x0009 @@ -267,21 +267,21 @@ struct hci_cp_write_dev_class { #define OCF_READ_VOICE_SETTING 0x0025 struct hci_rp_read_voice_setting { - __u8 status; - __u16 voice_setting; + __u8 status; + __le16 voice_setting; } __attribute__ ((packed)); #define OCF_WRITE_VOICE_SETTING 0x0026 struct hci_cp_write_voice_setting { - __u16 voice_setting; + __le16 voice_setting; } __attribute__ ((packed)); #define OCF_HOST_BUFFER_SIZE 0x0033 struct hci_cp_host_buffer_size { - __u16 acl_mtu; + __le16 acl_mtu; __u8 sco_mtu; - __u16 acl_max_pkt; - __u16 sco_max_pkt; + __le16 acl_max_pkt; + __le16 sco_max_pkt; } __attribute__ ((packed)); /* Link Control */ @@ -289,10 +289,10 @@ struct hci_cp_host_buffer_size { #define OCF_CREATE_CONN 0x0005 struct hci_cp_create_conn { bdaddr_t bdaddr; - __u16 pkt_type; + __le16 pkt_type; __u8 pscan_rep_mode; __u8 pscan_mode; - __u16 clock_offset; + __le16 clock_offset; __u8 role_switch; } __attribute__ ((packed)); @@ -310,14 +310,14 @@ struct hci_cp_reject_conn_req { #define OCF_DISCONNECT 0x0006 struct hci_cp_disconnect { - __u16 handle; + __le16 handle; __u8 reason; } __attribute__ ((packed)); #define OCF_ADD_SCO 0x0007 struct hci_cp_add_sco { - __u16 handle; - __u16 pkt_type; + __le16 handle; + __le16 pkt_type; } __attribute__ ((packed)); #define OCF_INQUIRY 0x0001 @@ -354,56 +354,56 @@ struct hci_cp_pin_code_neg_reply { #define OCF_CHANGE_CONN_PTYPE 0x000F struct hci_cp_change_conn_ptype { - __u16 handle; - __u16 pkt_type; + __le16 handle; + __le16 pkt_type; } __attribute__ ((packed)); #define OCF_AUTH_REQUESTED 0x0011 struct hci_cp_auth_requested { - __u16 handle; + __le16 handle; } __attribute__ ((packed)); #define OCF_SET_CONN_ENCRYPT 0x0013 struct hci_cp_set_conn_encrypt { - __u16 handle; + __le16 handle; __u8 encrypt; } __attribute__ ((packed)); #define OCF_CHANGE_CONN_LINK_KEY 0x0015 struct hci_cp_change_conn_link_key { - __u16 handle; + __le16 handle; } __attribute__ ((packed)); #define OCF_READ_REMOTE_FEATURES 0x001B struct hci_cp_read_rmt_features { - __u16 handle; + __le16 handle; } __attribute__ ((packed)); #define OCF_READ_REMOTE_VERSION 0x001D struct hci_cp_read_rmt_version { - __u16 handle; + __le16 handle; } __attribute__ ((packed)); /* Link Policy */ #define OGF_LINK_POLICY 0x02 #define OCF_ROLE_DISCOVERY 0x0009 struct hci_cp_role_discovery { - __u16 handle; + __le16 handle; } __attribute__ ((packed)); struct hci_rp_role_discovery { __u8 status; - __u16 handle; + __le16 handle; __u8 role; } __attribute__ ((packed)); #define OCF_READ_LINK_POLICY 0x000C struct hci_cp_read_link_policy { - __u16 handle; + __le16 handle; } __attribute__ ((packed)); struct hci_rp_read_link_policy { __u8 status; - __u16 handle; - __u16 policy; + __le16 handle; + __le16 policy; } __attribute__ ((packed)); #define OCF_SWITCH_ROLE 0x000B @@ -414,12 +414,12 @@ struct hci_cp_switch_role { #define OCF_WRITE_LINK_POLICY 0x000D struct hci_cp_write_link_policy { - __u16 handle; - __u16 policy; + __le16 handle; + __le16 policy; } __attribute__ ((packed)); struct hci_rp_write_link_policy { __u8 status; - __u16 handle; + __le16 handle; } __attribute__ ((packed)); /* Status params */ @@ -441,7 +441,7 @@ struct inquiry_info { __u8 pscan_period_mode; __u8 pscan_mode; __u8 dev_class[3]; - __u16 clock_offset; + __le16 clock_offset; } __attribute__ ((packed)); #define HCI_EV_INQUIRY_RESULT_WITH_RSSI 0x22 @@ -450,7 +450,7 @@ struct inquiry_info_with_rssi { __u8 pscan_rep_mode; __u8 pscan_period_mode; __u8 dev_class[3]; - __u16 clock_offset; + __le16 clock_offset; __s8 rssi; } __attribute__ ((packed)); struct inquiry_info_with_rssi_and_pscan_mode { @@ -459,7 +459,7 @@ struct inquiry_info_with_rssi_and_pscan_mode { __u8 pscan_period_mode; __u8 pscan_mode; __u8 dev_class[3]; - __u16 clock_offset; + __le16 clock_offset; __s8 rssi; } __attribute__ ((packed)); @@ -469,7 +469,7 @@ struct extended_inquiry_info { __u8 pscan_rep_mode; __u8 pscan_period_mode; __u8 dev_class[3]; - __u16 clock_offset; + __le16 clock_offset; __s8 rssi; __u8 data[240]; } __attribute__ ((packed)); @@ -477,7 +477,7 @@ struct extended_inquiry_info { #define HCI_EV_CONN_COMPLETE 0x03 struct hci_ev_conn_complete { __u8 status; - __u16 handle; + __le16 handle; bdaddr_t bdaddr; __u8 link_type; __u8 encr_mode; @@ -493,27 +493,27 @@ struct hci_ev_conn_request { #define HCI_EV_DISCONN_COMPLETE 0x05 struct hci_ev_disconn_complete { __u8 status; - __u16 handle; + __le16 handle; __u8 reason; } __attribute__ ((packed)); #define HCI_EV_AUTH_COMPLETE 0x06 struct hci_ev_auth_complete { __u8 status; - __u16 handle; + __le16 handle; } __attribute__ ((packed)); #define HCI_EV_ENCRYPT_CHANGE 0x08 struct hci_ev_encrypt_change { __u8 status; - __u16 handle; + __le16 handle; __u8 encrypt; } __attribute__ ((packed)); #define HCI_EV_CHANGE_CONN_LINK_KEY_COMPLETE 0x09 struct hci_ev_change_conn_link_key_complete { __u8 status; - __u16 handle; + __le16 handle; } __attribute__ ((packed)); #define HCI_EV_QOS_SETUP_COMPLETE 0x0D @@ -526,21 +526,21 @@ struct hci_qos { } __attribute__ ((packed)); struct hci_ev_qos_setup_complete { __u8 status; - __u16 handle; + __le16 handle; struct hci_qos qos; } __attribute__ ((packed)); #define HCI_EV_CMD_COMPLETE 0x0E struct hci_ev_cmd_complete { __u8 ncmd; - __u16 opcode; + __le16 opcode; } __attribute__ ((packed)); #define HCI_EV_CMD_STATUS 0x0F struct hci_ev_cmd_status { __u8 status; __u8 ncmd; - __u16 opcode; + __le16 opcode; } __attribute__ ((packed)); #define HCI_EV_NUM_COMP_PKTS 0x13 @@ -559,9 +559,9 @@ struct hci_ev_role_change { #define HCI_EV_MODE_CHANGE 0x14 struct hci_ev_mode_change { __u8 status; - __u16 handle; + __le16 handle; __u8 mode; - __u16 interval; + __le16 interval; } __attribute__ ((packed)); #define HCI_EV_PIN_CODE_REQ 0x16 @@ -584,24 +584,24 @@ struct hci_ev_link_key_notify { #define HCI_EV_RMT_FEATURES 0x0B struct hci_ev_rmt_features { __u8 status; - __u16 handle; + __le16 handle; __u8 features[8]; } __attribute__ ((packed)); #define HCI_EV_RMT_VERSION 0x0C struct hci_ev_rmt_version { __u8 status; - __u16 handle; + __le16 handle; __u8 lmp_ver; - __u16 manufacturer; - __u16 lmp_subver; + __le16 manufacturer; + __le16 lmp_subver; } __attribute__ ((packed)); #define HCI_EV_CLOCK_OFFSET 0x01C struct hci_ev_clock_offset { __u8 status; - __u16 handle; - __u16 clock_offset; + __le16 handle; + __le16 clock_offset; } __attribute__ ((packed)); #define HCI_EV_PSCAN_REP_MODE 0x20 @@ -638,7 +638,7 @@ struct hci_ev_si_security { #define HCI_SCO_HDR_SIZE 3 struct hci_command_hdr { - __u16 opcode; /* OCF & OGF */ + __le16 opcode; /* OCF & OGF */ __u8 plen; } __attribute__ ((packed)); @@ -648,22 +648,22 @@ struct hci_event_hdr { } __attribute__ ((packed)); struct hci_acl_hdr { - __u16 handle; /* Handle & Flags(PB, BC) */ - __u16 dlen; + __le16 handle; /* Handle & Flags(PB, BC) */ + __le16 dlen; } __attribute__ ((packed)); struct hci_sco_hdr { - __u16 handle; + __le16 handle; __u8 dlen; } __attribute__ ((packed)); /* Command opcode pack/unpack */ -#define hci_opcode_pack(ogf, ocf) (__u16)((ocf & 0x03ff)|(ogf << 10)) +#define hci_opcode_pack(ogf, ocf) (__u16) ((ocf & 0x03ff)|(ogf << 10)) #define hci_opcode_ogf(op) (op >> 10) #define hci_opcode_ocf(op) (op & 0x03ff) /* ACL handle and flags pack/unpack */ -#define hci_handle_pack(h, f) (__u16)((h & 0x0fff)|(f << 12)) +#define hci_handle_pack(h, f) (__u16) ((h & 0x0fff)|(f << 12)) #define hci_handle(h) (h & 0x0fff) #define hci_flags(h) (h >> 12) diff --git a/include/net/bluetooth/hci_core.h b/include/net/bluetooth/hci_core.h index 7f933f302078..adb94508259b 100644 --- a/include/net/bluetooth/hci_core.h +++ b/include/net/bluetooth/hci_core.h @@ -44,7 +44,7 @@ struct inquiry_data { __u8 pscan_period_mode; __u8 pscan_mode; __u8 dev_class[3]; - __u16 clock_offset; + __le16 clock_offset; __s8 rssi; }; diff --git a/net/bluetooth/hci_core.c b/net/bluetooth/hci_core.c index cf0df1c8c933..9106354c781e 100644 --- a/net/bluetooth/hci_core.c +++ b/net/bluetooth/hci_core.c @@ -183,7 +183,7 @@ static void hci_reset_req(struct hci_dev *hdev, unsigned long opt) static void hci_init_req(struct hci_dev *hdev, unsigned long opt) { struct sk_buff *skb; - __u16 param; + __le16 param; BT_DBG("%s %ld", hdev->name, opt); diff --git a/net/bluetooth/hci_event.c b/net/bluetooth/hci_event.c index b61b4e8e36fd..eb64555d1fb3 100644 --- a/net/bluetooth/hci_event.c +++ b/net/bluetooth/hci_event.c @@ -242,7 +242,7 @@ static void hci_cc_host_ctl(struct hci_dev *hdev, __u16 ocf, struct sk_buff *skb break; status = *((__u8 *) skb->data); - setting = __le16_to_cpu(get_unaligned((__u16 *) sent)); + setting = __le16_to_cpu(get_unaligned((__le16 *) sent)); if (!status && hdev->voice_setting != setting) { hdev->voice_setting = setting; @@ -728,7 +728,7 @@ static inline void hci_disconn_complete_evt(struct hci_dev *hdev, struct sk_buff static inline void hci_num_comp_pkts_evt(struct hci_dev *hdev, struct sk_buff *skb) { struct hci_ev_num_comp_pkts *ev = (struct hci_ev_num_comp_pkts *) skb->data; - __u16 *ptr; + __le16 *ptr; int i; skb_pull(skb, sizeof(*ev)); @@ -742,7 +742,7 @@ static inline void hci_num_comp_pkts_evt(struct hci_dev *hdev, struct sk_buff *s tasklet_disable(&hdev->tx_task); - for (i = 0, ptr = (__u16 *) skb->data; i < ev->num_hndl; i++) { + for (i = 0, ptr = (__le16 *) skb->data; i < ev->num_hndl; i++) { struct hci_conn *conn; __u16 handle, count; diff --git a/net/bluetooth/hci_sock.c b/net/bluetooth/hci_sock.c index 799e448750ad..1d6d0a15c099 100644 --- a/net/bluetooth/hci_sock.c +++ b/net/bluetooth/hci_sock.c @@ -416,7 +416,7 @@ static int hci_sock_sendmsg(struct kiocb *iocb, struct socket *sock, skb->dev = (void *) hdev; if (bt_cb(skb)->pkt_type == HCI_COMMAND_PKT) { - u16 opcode = __le16_to_cpu(get_unaligned((u16 *)skb->data)); + u16 opcode = __le16_to_cpu(get_unaligned((__le16 *) skb->data)); u16 ogf = hci_opcode_ogf(opcode); u16 ocf = hci_opcode_ocf(opcode); -- cgit v1.2.3 From be9d122730c878baafe11e70d1436faac229f2fc Mon Sep 17 00:00:00 2001 From: Marcel Holtmann Date: Tue, 8 Nov 2005 09:57:38 -0800 Subject: [Bluetooth]: Remove the usage of /proc completely This patch removes all relics of the /proc usage from the Bluetooth subsystem core and its upper layers. All the previous information are now available via /sys/class/bluetooth through appropriate functions. Signed-off-by: Marcel Holtmann Signed-off-by: David S. Miller --- include/net/bluetooth/bluetooth.h | 4 +- include/net/bluetooth/hci_core.h | 7 --- include/net/bluetooth/rfcomm.h | 2 - net/bluetooth/af_bluetooth.c | 12 +--- net/bluetooth/hci_sysfs.c | 4 +- net/bluetooth/l2cap.c | 98 +++++------------------------- net/bluetooth/rfcomm/core.c | 124 ++++++-------------------------------- net/bluetooth/rfcomm/sock.c | 90 ++++----------------------- net/bluetooth/sco.c | 92 ++++------------------------ 9 files changed, 65 insertions(+), 368 deletions(-) (limited to 'net') diff --git a/include/net/bluetooth/bluetooth.h b/include/net/bluetooth/bluetooth.h index e42d728b1620..911ceb5cd263 100644 --- a/include/net/bluetooth/bluetooth.h +++ b/include/net/bluetooth/bluetooth.h @@ -57,8 +57,6 @@ #define BT_DBG(fmt, arg...) printk(KERN_INFO "%s: " fmt "\n" , __FUNCTION__ , ## arg) #define BT_ERR(fmt, arg...) printk(KERN_ERR "%s: " fmt "\n" , __FUNCTION__ , ## arg) -extern struct proc_dir_entry *proc_bt; - /* Connection and socket states */ enum { BT_CONNECTED = 1, /* Equal to TCP_ESTABLISHED to make net code happy */ @@ -177,4 +175,6 @@ extern int hci_sock_cleanup(void); extern int bt_sysfs_init(void); extern void bt_sysfs_cleanup(void); +extern struct class bt_class; + #endif /* __BLUETOOTH_H */ diff --git a/include/net/bluetooth/hci_core.h b/include/net/bluetooth/hci_core.h index adb94508259b..bb9f81dc8723 100644 --- a/include/net/bluetooth/hci_core.h +++ b/include/net/bluetooth/hci_core.h @@ -25,7 +25,6 @@ #ifndef __HCI_CORE_H #define __HCI_CORE_H -#include #include /* HCI upper protocols */ @@ -34,8 +33,6 @@ #define HCI_INIT_TIMEOUT (HZ * 10) -extern struct proc_dir_entry *proc_bt_hci; - /* HCI Core structures */ struct inquiry_data { @@ -126,10 +123,6 @@ struct hci_dev { atomic_t promisc; -#ifdef CONFIG_PROC_FS - struct proc_dir_entry *proc; -#endif - struct class_device class_dev; struct module *owner; diff --git a/include/net/bluetooth/rfcomm.h b/include/net/bluetooth/rfcomm.h index e656be7c001a..bbfac86734ec 100644 --- a/include/net/bluetooth/rfcomm.h +++ b/include/net/bluetooth/rfcomm.h @@ -351,6 +351,4 @@ int rfcomm_dev_ioctl(struct sock *sk, unsigned int cmd, void __user *arg); int rfcomm_init_ttys(void); void rfcomm_cleanup_ttys(void); -extern struct proc_dir_entry *proc_bt_rfcomm; - #endif /* __RFCOMM_H */ diff --git a/net/bluetooth/af_bluetooth.c b/net/bluetooth/af_bluetooth.c index 03532062a46a..ea616e3fc98e 100644 --- a/net/bluetooth/af_bluetooth.c +++ b/net/bluetooth/af_bluetooth.c @@ -36,7 +36,6 @@ #include #include #include -#include #include #if defined(CONFIG_KMOD) @@ -50,10 +49,7 @@ #define BT_DBG(D...) #endif -#define VERSION "2.7" - -struct proc_dir_entry *proc_bt; -EXPORT_SYMBOL(proc_bt); +#define VERSION "2.8" /* Bluetooth sockets */ #define BT_MAX_PROTO 8 @@ -312,10 +308,6 @@ static int __init bt_init(void) { BT_INFO("Core ver %s", VERSION); - proc_bt = proc_mkdir("bluetooth", NULL); - if (proc_bt) - proc_bt->owner = THIS_MODULE; - sock_register(&bt_sock_family_ops); BT_INFO("HCI device and connection manager initialized"); @@ -334,8 +326,6 @@ static void __exit bt_exit(void) bt_sysfs_cleanup(); sock_unregister(PF_BLUETOOTH); - - remove_proc_entry("bluetooth", NULL); } subsys_initcall(bt_init); diff --git a/net/bluetooth/hci_sysfs.c b/net/bluetooth/hci_sysfs.c index 7856bc26accb..bd7568ac87fc 100644 --- a/net/bluetooth/hci_sysfs.c +++ b/net/bluetooth/hci_sysfs.c @@ -103,7 +103,7 @@ static void bt_release(struct class_device *cdev) kfree(hdev); } -static struct class bt_class = { +struct class bt_class = { .name = "bluetooth", .release = bt_release, #ifdef CONFIG_HOTPLUG @@ -111,6 +111,8 @@ static struct class bt_class = { #endif }; +EXPORT_SYMBOL_GPL(bt_class); + int hci_register_sysfs(struct hci_dev *hdev) { struct class_device *cdev = &hdev->class_dev; diff --git a/net/bluetooth/l2cap.c b/net/bluetooth/l2cap.c index 59b2dd36baa7..e3bb11ca4235 100644 --- a/net/bluetooth/l2cap.c +++ b/net/bluetooth/l2cap.c @@ -38,9 +38,8 @@ #include #include #include -#include -#include #include +#include #include #include @@ -56,7 +55,7 @@ #define BT_DBG(D...) #endif -#define VERSION "2.7" +#define VERSION "2.8" static struct proto_ops l2cap_sock_ops; @@ -2137,94 +2136,29 @@ drop: return 0; } -/* ---- Proc fs support ---- */ -#ifdef CONFIG_PROC_FS -static void *l2cap_seq_start(struct seq_file *seq, loff_t *pos) +static ssize_t l2cap_sysfs_show(struct class *dev, char *buf) { struct sock *sk; struct hlist_node *node; - loff_t l = *pos; + char *str = buf; read_lock_bh(&l2cap_sk_list.lock); - sk_for_each(sk, node, &l2cap_sk_list.head) - if (!l--) - goto found; - sk = NULL; -found: - return sk; -} + sk_for_each(sk, node, &l2cap_sk_list.head) { + struct l2cap_pinfo *pi = l2cap_pi(sk); -static void *l2cap_seq_next(struct seq_file *seq, void *e, loff_t *pos) -{ - (*pos)++; - return sk_next(e); -} + str += sprintf(str, "%s %s %d %d 0x%4.4x 0x%4.4x %d %d 0x%x\n", + batostr(&bt_sk(sk)->src), batostr(&bt_sk(sk)->dst), + sk->sk_state, pi->psm, pi->scid, pi->dcid, pi->imtu, + pi->omtu, pi->link_mode); + } -static void l2cap_seq_stop(struct seq_file *seq, void *e) -{ read_unlock_bh(&l2cap_sk_list.lock); -} -static int l2cap_seq_show(struct seq_file *seq, void *e) -{ - struct sock *sk = e; - struct l2cap_pinfo *pi = l2cap_pi(sk); - - seq_printf(seq, "%s %s %d %d 0x%4.4x 0x%4.4x %d %d 0x%x\n", - batostr(&bt_sk(sk)->src), batostr(&bt_sk(sk)->dst), - sk->sk_state, pi->psm, pi->scid, pi->dcid, pi->imtu, - pi->omtu, pi->link_mode); - return 0; + return (str - buf); } -static struct seq_operations l2cap_seq_ops = { - .start = l2cap_seq_start, - .next = l2cap_seq_next, - .stop = l2cap_seq_stop, - .show = l2cap_seq_show -}; - -static int l2cap_seq_open(struct inode *inode, struct file *file) -{ - return seq_open(file, &l2cap_seq_ops); -} - -static struct file_operations l2cap_seq_fops = { - .owner = THIS_MODULE, - .open = l2cap_seq_open, - .read = seq_read, - .llseek = seq_lseek, - .release = seq_release, -}; - -static int __init l2cap_proc_init(void) -{ - struct proc_dir_entry *p = create_proc_entry("l2cap", S_IRUGO, proc_bt); - if (!p) - return -ENOMEM; - p->owner = THIS_MODULE; - p->proc_fops = &l2cap_seq_fops; - return 0; -} - -static void __exit l2cap_proc_cleanup(void) -{ - remove_proc_entry("l2cap", proc_bt); -} - -#else /* CONFIG_PROC_FS */ - -static int __init l2cap_proc_init(void) -{ - return 0; -} - -static void __exit l2cap_proc_cleanup(void) -{ - return; -} -#endif /* CONFIG_PROC_FS */ +static CLASS_ATTR(l2cap, S_IRUGO, l2cap_sysfs_show, NULL); static struct proto_ops l2cap_sock_ops = { .family = PF_BLUETOOTH, @@ -2266,7 +2200,7 @@ static struct hci_proto l2cap_hci_proto = { static int __init l2cap_init(void) { int err; - + err = proto_register(&l2cap_proto, 0); if (err < 0) return err; @@ -2284,7 +2218,7 @@ static int __init l2cap_init(void) goto error; } - l2cap_proc_init(); + class_create_file(&bt_class, &class_attr_l2cap); BT_INFO("L2CAP ver %s", VERSION); BT_INFO("L2CAP socket layer initialized"); @@ -2298,7 +2232,7 @@ error: static void __exit l2cap_exit(void) { - l2cap_proc_cleanup(); + class_remove_file(&bt_class, &class_attr_l2cap); if (bt_sock_unregister(BTPROTO_L2CAP) < 0) BT_ERR("L2CAP socket unregistration failed"); diff --git a/net/bluetooth/rfcomm/core.c b/net/bluetooth/rfcomm/core.c index c3d56ead840c..0d89d6434136 100644 --- a/net/bluetooth/rfcomm/core.c +++ b/net/bluetooth/rfcomm/core.c @@ -35,9 +35,8 @@ #include #include #include +#include #include -#include -#include #include #include #include @@ -47,17 +46,13 @@ #include #include -#define VERSION "1.5" +#define VERSION "1.6" #ifndef CONFIG_BT_RFCOMM_DEBUG #undef BT_DBG #define BT_DBG(D...) #endif -#ifdef CONFIG_PROC_FS -struct proc_dir_entry *proc_bt_rfcomm; -#endif - static struct task_struct *rfcomm_thread; static DECLARE_MUTEX(rfcomm_sem); @@ -2001,117 +1996,32 @@ static struct hci_cb rfcomm_cb = { .encrypt_cfm = rfcomm_encrypt_cfm }; -/* ---- Proc fs support ---- */ -#ifdef CONFIG_PROC_FS -static void *rfcomm_seq_start(struct seq_file *seq, loff_t *pos) +static ssize_t rfcomm_dlc_sysfs_show(struct class *dev, char *buf) { struct rfcomm_session *s; struct list_head *pp, *p; - loff_t l = *pos; + char *str = buf; rfcomm_lock(); list_for_each(p, &session_list) { s = list_entry(p, struct rfcomm_session, list); - list_for_each(pp, &s->dlcs) - if (!l--) { - seq->private = s; - return pp; - } - } - return NULL; -} + list_for_each(pp, &s->dlcs) { + struct sock *sk = s->sock->sk; + struct rfcomm_dlc *d = list_entry(pp, struct rfcomm_dlc, list); -static void *rfcomm_seq_next(struct seq_file *seq, void *e, loff_t *pos) -{ - struct rfcomm_session *s = seq->private; - struct list_head *pp, *p = e; - (*pos)++; - - if (p->next != &s->dlcs) - return p->next; - - list_for_each(p, &session_list) { - s = list_entry(p, struct rfcomm_session, list); - __list_for_each(pp, &s->dlcs) { - seq->private = s; - return pp; + str += sprintf(str, "%s %s %ld %d %d %d %d\n", + batostr(&bt_sk(sk)->src), batostr(&bt_sk(sk)->dst), + d->state, d->dlci, d->mtu, d->rx_credits, d->tx_credits); } } - return NULL; -} -static void rfcomm_seq_stop(struct seq_file *seq, void *e) -{ rfcomm_unlock(); -} - -static int rfcomm_seq_show(struct seq_file *seq, void *e) -{ - struct rfcomm_session *s = seq->private; - struct sock *sk = s->sock->sk; - struct rfcomm_dlc *d = list_entry(e, struct rfcomm_dlc, list); - - seq_printf(seq, "%s %s %ld %d %d %d %d\n", - batostr(&bt_sk(sk)->src), batostr(&bt_sk(sk)->dst), - d->state, d->dlci, d->mtu, d->rx_credits, d->tx_credits); - return 0; -} - -static struct seq_operations rfcomm_seq_ops = { - .start = rfcomm_seq_start, - .next = rfcomm_seq_next, - .stop = rfcomm_seq_stop, - .show = rfcomm_seq_show -}; - -static int rfcomm_seq_open(struct inode *inode, struct file *file) -{ - return seq_open(file, &rfcomm_seq_ops); -} - -static struct file_operations rfcomm_seq_fops = { - .owner = THIS_MODULE, - .open = rfcomm_seq_open, - .read = seq_read, - .llseek = seq_lseek, - .release = seq_release, -}; - -static int __init rfcomm_proc_init(void) -{ - struct proc_dir_entry *p; - - proc_bt_rfcomm = proc_mkdir("rfcomm", proc_bt); - if (proc_bt_rfcomm) { - proc_bt_rfcomm->owner = THIS_MODULE; - - p = create_proc_entry("dlc", S_IRUGO, proc_bt_rfcomm); - if (p) - p->proc_fops = &rfcomm_seq_fops; - } - return 0; -} - -static void __exit rfcomm_proc_cleanup(void) -{ - remove_proc_entry("dlc", proc_bt_rfcomm); - remove_proc_entry("rfcomm", proc_bt); + return (str - buf); } -#else /* CONFIG_PROC_FS */ - -static int __init rfcomm_proc_init(void) -{ - return 0; -} - -static void __exit rfcomm_proc_cleanup(void) -{ - return; -} -#endif /* CONFIG_PROC_FS */ +static CLASS_ATTR(rfcomm_dlc, S_IRUGO, rfcomm_dlc_sysfs_show, NULL); /* ---- Initialization ---- */ static int __init rfcomm_init(void) @@ -2122,9 +2032,7 @@ static int __init rfcomm_init(void) kernel_thread(rfcomm_run, NULL, CLONE_KERNEL); - BT_INFO("RFCOMM ver %s", VERSION); - - rfcomm_proc_init(); + class_create_file(&bt_class, &class_attr_rfcomm_dlc); rfcomm_init_sockets(); @@ -2132,11 +2040,15 @@ static int __init rfcomm_init(void) rfcomm_init_ttys(); #endif + BT_INFO("RFCOMM ver %s", VERSION); + return 0; } static void __exit rfcomm_exit(void) { + class_remove_file(&bt_class, &class_attr_rfcomm_dlc); + hci_unregister_cb(&rfcomm_cb); /* Terminate working thread. @@ -2153,8 +2065,6 @@ static void __exit rfcomm_exit(void) #endif rfcomm_cleanup_sockets(); - - rfcomm_proc_cleanup(); } module_init(rfcomm_init); diff --git a/net/bluetooth/rfcomm/sock.c b/net/bluetooth/rfcomm/sock.c index a2b30f0aedb7..6c34261b232e 100644 --- a/net/bluetooth/rfcomm/sock.c +++ b/net/bluetooth/rfcomm/sock.c @@ -42,8 +42,7 @@ #include #include #include -#include -#include +#include #include #include @@ -887,89 +886,26 @@ done: return result; } -/* ---- Proc fs support ---- */ -#ifdef CONFIG_PROC_FS -static void *rfcomm_seq_start(struct seq_file *seq, loff_t *pos) +static ssize_t rfcomm_sock_sysfs_show(struct class *dev, char *buf) { struct sock *sk; struct hlist_node *node; - loff_t l = *pos; + char *str = buf; read_lock_bh(&rfcomm_sk_list.lock); - sk_for_each(sk, node, &rfcomm_sk_list.head) - if (!l--) - return sk; - return NULL; -} - -static void *rfcomm_seq_next(struct seq_file *seq, void *e, loff_t *pos) -{ - struct sock *sk = e; - (*pos)++; - return sk_next(sk); -} + sk_for_each(sk, node, &rfcomm_sk_list.head) { + str += sprintf(str, "%s %s %d %d\n", + batostr(&bt_sk(sk)->src), batostr(&bt_sk(sk)->dst), + sk->sk_state, rfcomm_pi(sk)->channel); + } -static void rfcomm_seq_stop(struct seq_file *seq, void *e) -{ read_unlock_bh(&rfcomm_sk_list.lock); -} -static int rfcomm_seq_show(struct seq_file *seq, void *e) -{ - struct sock *sk = e; - seq_printf(seq, "%s %s %d %d\n", - batostr(&bt_sk(sk)->src), batostr(&bt_sk(sk)->dst), - sk->sk_state, rfcomm_pi(sk)->channel); - return 0; -} - -static struct seq_operations rfcomm_seq_ops = { - .start = rfcomm_seq_start, - .next = rfcomm_seq_next, - .stop = rfcomm_seq_stop, - .show = rfcomm_seq_show -}; - -static int rfcomm_seq_open(struct inode *inode, struct file *file) -{ - return seq_open(file, &rfcomm_seq_ops); + return (str - buf); } -static struct file_operations rfcomm_seq_fops = { - .owner = THIS_MODULE, - .open = rfcomm_seq_open, - .read = seq_read, - .llseek = seq_lseek, - .release = seq_release, -}; - -static int __init rfcomm_sock_proc_init(void) -{ - struct proc_dir_entry *p = create_proc_entry("sock", S_IRUGO, proc_bt_rfcomm); - if (!p) - return -ENOMEM; - p->proc_fops = &rfcomm_seq_fops; - return 0; -} - -static void __exit rfcomm_sock_proc_cleanup(void) -{ - remove_proc_entry("sock", proc_bt_rfcomm); -} - -#else /* CONFIG_PROC_FS */ - -static int __init rfcomm_sock_proc_init(void) -{ - return 0; -} - -static void __exit rfcomm_sock_proc_cleanup(void) -{ - return; -} -#endif /* CONFIG_PROC_FS */ +static CLASS_ATTR(rfcomm, S_IRUGO, rfcomm_sock_sysfs_show, NULL); static struct proto_ops rfcomm_sock_ops = { .family = PF_BLUETOOTH, @@ -997,7 +933,7 @@ static struct net_proto_family rfcomm_sock_family_ops = { .create = rfcomm_sock_create }; -int __init rfcomm_init_sockets(void) +int __init rfcomm_init_sockets(void) { int err; @@ -1009,7 +945,7 @@ int __init rfcomm_init_sockets(void) if (err < 0) goto error; - rfcomm_sock_proc_init(); + class_create_file(&bt_class, &class_attr_rfcomm); BT_INFO("RFCOMM socket layer initialized"); @@ -1023,7 +959,7 @@ error: void __exit rfcomm_cleanup_sockets(void) { - rfcomm_sock_proc_cleanup(); + class_remove_file(&bt_class, &class_attr_rfcomm); if (bt_sock_unregister(BTPROTO_RFCOMM) < 0) BT_ERR("RFCOMM socket layer unregistration failed"); diff --git a/net/bluetooth/sco.c b/net/bluetooth/sco.c index 997e42df115c..9cb00dc6c08c 100644 --- a/net/bluetooth/sco.c +++ b/net/bluetooth/sco.c @@ -38,8 +38,7 @@ #include #include #include -#include -#include +#include #include #include @@ -55,7 +54,7 @@ #define BT_DBG(D...) #endif -#define VERSION "0.4" +#define VERSION "0.5" static struct proto_ops sco_sock_ops; @@ -893,91 +892,26 @@ drop: return 0; } -/* ---- Proc fs support ---- */ -#ifdef CONFIG_PROC_FS -static void *sco_seq_start(struct seq_file *seq, loff_t *pos) +static ssize_t sco_sysfs_show(struct class *dev, char *buf) { struct sock *sk; struct hlist_node *node; - loff_t l = *pos; + char *str = buf; read_lock_bh(&sco_sk_list.lock); - sk_for_each(sk, node, &sco_sk_list.head) - if (!l--) - goto found; - sk = NULL; -found: - return sk; -} - -static void *sco_seq_next(struct seq_file *seq, void *e, loff_t *pos) -{ - struct sock *sk = e; - (*pos)++; - return sk_next(sk); -} + sk_for_each(sk, node, &sco_sk_list.head) { + str += sprintf(str, "%s %s %d\n", + batostr(&bt_sk(sk)->src), batostr(&bt_sk(sk)->dst), + sk->sk_state); + } -static void sco_seq_stop(struct seq_file *seq, void *e) -{ read_unlock_bh(&sco_sk_list.lock); -} - -static int sco_seq_show(struct seq_file *seq, void *e) -{ - struct sock *sk = e; - seq_printf(seq, "%s %s %d\n", - batostr(&bt_sk(sk)->src), batostr(&bt_sk(sk)->dst), sk->sk_state); - return 0; -} -static struct seq_operations sco_seq_ops = { - .start = sco_seq_start, - .next = sco_seq_next, - .stop = sco_seq_stop, - .show = sco_seq_show -}; - -static int sco_seq_open(struct inode *inode, struct file *file) -{ - return seq_open(file, &sco_seq_ops); + return (str - buf); } -static struct file_operations sco_seq_fops = { - .owner = THIS_MODULE, - .open = sco_seq_open, - .read = seq_read, - .llseek = seq_lseek, - .release = seq_release, -}; - -static int __init sco_proc_init(void) -{ - struct proc_dir_entry *p = create_proc_entry("sco", S_IRUGO, proc_bt); - if (!p) - return -ENOMEM; - p->owner = THIS_MODULE; - p->proc_fops = &sco_seq_fops; - return 0; -} - -static void __exit sco_proc_cleanup(void) -{ - remove_proc_entry("sco", proc_bt); -} - -#else /* CONFIG_PROC_FS */ - -static int __init sco_proc_init(void) -{ - return 0; -} - -static void __exit sco_proc_cleanup(void) -{ - return; -} -#endif /* CONFIG_PROC_FS */ +static CLASS_ATTR(sco, S_IRUGO, sco_sysfs_show, NULL); static struct proto_ops sco_sock_ops = { .family = PF_BLUETOOTH, @@ -1035,7 +969,7 @@ static int __init sco_init(void) goto error; } - sco_proc_init(); + class_create_file(&bt_class, &class_attr_sco); BT_INFO("SCO (Voice Link) ver %s", VERSION); BT_INFO("SCO socket layer initialized"); @@ -1049,7 +983,7 @@ error: static void __exit sco_exit(void) { - sco_proc_cleanup(); + class_remove_file(&bt_class, &class_attr_sco); if (bt_sock_unregister(BTPROTO_SCO) < 0) BT_ERR("SCO socket unregistration failed"); -- cgit v1.2.3 From e3305626e0985faa8796f1f4e5a99c1f40bfa70e Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Wed, 9 Nov 2005 01:01:04 -0500 Subject: ieee80211: cleanup crypto list handling, other minor cleanups. --- include/net/ieee80211_crypt.h | 1 + net/ieee80211/ieee80211_crypt.c | 152 +++++++++++----------------------------- 2 files changed, 42 insertions(+), 111 deletions(-) (limited to 'net') diff --git a/include/net/ieee80211_crypt.h b/include/net/ieee80211_crypt.h index 0a1c2d82ca4b..225fc751d464 100644 --- a/include/net/ieee80211_crypt.h +++ b/include/net/ieee80211_crypt.h @@ -31,6 +31,7 @@ enum { struct ieee80211_crypto_ops { const char *name; + struct list_head list; /* init new crypto context (e.g., allocate private data space, * select IV, etc.); returns NULL on failure or pointer to allocated diff --git a/net/ieee80211/ieee80211_crypt.c b/net/ieee80211/ieee80211_crypt.c index 20cc580a07e0..ecc9bb196abc 100644 --- a/net/ieee80211/ieee80211_crypt.c +++ b/net/ieee80211/ieee80211_crypt.c @@ -11,15 +11,14 @@ * */ -#include +#include #include #include #include -#include -#include - +#include #include + MODULE_AUTHOR("Jouni Malinen"); MODULE_DESCRIPTION("HostAP crypto"); MODULE_LICENSE("GPL"); @@ -29,32 +28,20 @@ struct ieee80211_crypto_alg { struct ieee80211_crypto_ops *ops; }; -struct ieee80211_crypto { - struct list_head algs; - spinlock_t lock; -}; - -static struct ieee80211_crypto *hcrypt; +static LIST_HEAD(ieee80211_crypto_algs); +static DEFINE_SPINLOCK(ieee80211_crypto_lock); void ieee80211_crypt_deinit_entries(struct ieee80211_device *ieee, int force) { - struct list_head *ptr, *n; - struct ieee80211_crypt_data *entry; + struct ieee80211_crypt_data *entry, *next; unsigned long flags; spin_lock_irqsave(&ieee->lock, flags); - - if (list_empty(&ieee->crypt_deinit_list)) - goto unlock; - - for (ptr = ieee->crypt_deinit_list.next, n = ptr->next; - ptr != &ieee->crypt_deinit_list; ptr = n, n = ptr->next) { - entry = list_entry(ptr, struct ieee80211_crypt_data, list); - + list_for_each_entry_safe(entry, next, &ieee->crypt_deinit_list, list) { if (atomic_read(&entry->refcnt) != 0 && !force) continue; - list_del(ptr); + list_del(&entry->list); if (entry->ops) { entry->ops->deinit(entry->priv); @@ -62,7 +49,6 @@ void ieee80211_crypt_deinit_entries(struct ieee80211_device *ieee, int force) } kfree(entry); } - unlock: spin_unlock_irqrestore(&ieee->lock, flags); } @@ -125,9 +111,6 @@ int ieee80211_register_crypto_ops(struct ieee80211_crypto_ops *ops) unsigned long flags; struct ieee80211_crypto_alg *alg; - if (hcrypt == NULL) - return -1; - alg = kmalloc(sizeof(*alg), GFP_KERNEL); if (alg == NULL) return -ENOMEM; @@ -135,9 +118,9 @@ int ieee80211_register_crypto_ops(struct ieee80211_crypto_ops *ops) memset(alg, 0, sizeof(*alg)); alg->ops = ops; - spin_lock_irqsave(&hcrypt->lock, flags); - list_add(&alg->list, &hcrypt->algs); - spin_unlock_irqrestore(&hcrypt->lock, flags); + spin_lock_irqsave(&ieee80211_crypto_lock, flags); + list_add(&alg->list, &ieee80211_crypto_algs); + spin_unlock_irqrestore(&ieee80211_crypto_lock, flags); printk(KERN_DEBUG "ieee80211_crypt: registered algorithm '%s'\n", ops->name); @@ -147,64 +130,49 @@ int ieee80211_register_crypto_ops(struct ieee80211_crypto_ops *ops) int ieee80211_unregister_crypto_ops(struct ieee80211_crypto_ops *ops) { + struct ieee80211_crypto_alg *alg; unsigned long flags; - struct list_head *ptr; - struct ieee80211_crypto_alg *del_alg = NULL; - - if (hcrypt == NULL) - return -1; - - spin_lock_irqsave(&hcrypt->lock, flags); - for (ptr = hcrypt->algs.next; ptr != &hcrypt->algs; ptr = ptr->next) { - struct ieee80211_crypto_alg *alg = - (struct ieee80211_crypto_alg *)ptr; - if (alg->ops == ops) { - list_del(&alg->list); - del_alg = alg; - break; - } - } - spin_unlock_irqrestore(&hcrypt->lock, flags); - if (del_alg) { - printk(KERN_DEBUG "ieee80211_crypt: unregistered algorithm " - "'%s'\n", ops->name); - kfree(del_alg); + spin_lock_irqsave(&ieee80211_crypto_lock, flags); + list_for_each_entry(alg, &ieee80211_crypto_algs, list) { + if (alg->ops == ops) + goto found; } - - return del_alg ? 0 : -1; + spin_unlock_irqrestore(&ieee80211_crypto_lock, flags); + return -EINVAL; + + found: + printk(KERN_DEBUG "ieee80211_crypt: unregistered algorithm " + "'%s'\n", ops->name); + list_del(&alg->list); + spin_unlock_irqrestore(&ieee80211_crypto_lock, flags); + kfree(alg); + return 0; } struct ieee80211_crypto_ops *ieee80211_get_crypto_ops(const char *name) { + struct ieee80211_crypto_alg *alg; unsigned long flags; - struct list_head *ptr; - struct ieee80211_crypto_alg *found_alg = NULL; - - if (hcrypt == NULL) - return NULL; - - spin_lock_irqsave(&hcrypt->lock, flags); - for (ptr = hcrypt->algs.next; ptr != &hcrypt->algs; ptr = ptr->next) { - struct ieee80211_crypto_alg *alg = - (struct ieee80211_crypto_alg *)ptr; - if (strcmp(alg->ops->name, name) == 0) { - found_alg = alg; - break; - } + + spin_lock_irqsave(&ieee80211_crypto_lock, flags); + list_for_each_entry(alg, &ieee80211_crypto_algs, list) { + if (strcmp(alg->ops->name, name) == 0) + goto found; } - spin_unlock_irqrestore(&hcrypt->lock, flags); + spin_unlock_irqrestore(&ieee80211_crypto_lock, flags); + return NULL; - if (found_alg) - return found_alg->ops; - else - return NULL; + found: + spin_unlock_irqrestore(&ieee80211_crypto_lock, flags); + return alg->ops; } static void *ieee80211_crypt_null_init(int keyidx) { return (void *)1; } + static void ieee80211_crypt_null_deinit(void *priv) { } @@ -213,56 +181,18 @@ static struct ieee80211_crypto_ops ieee80211_crypt_null = { .name = "NULL", .init = ieee80211_crypt_null_init, .deinit = ieee80211_crypt_null_deinit, - .encrypt_mpdu = NULL, - .decrypt_mpdu = NULL, - .encrypt_msdu = NULL, - .decrypt_msdu = NULL, - .set_key = NULL, - .get_key = NULL, - .extra_mpdu_prefix_len = 0, - .extra_mpdu_postfix_len = 0, .owner = THIS_MODULE, }; static int __init ieee80211_crypto_init(void) { - int ret = -ENOMEM; - - hcrypt = kmalloc(sizeof(*hcrypt), GFP_KERNEL); - if (!hcrypt) - goto out; - - memset(hcrypt, 0, sizeof(*hcrypt)); - INIT_LIST_HEAD(&hcrypt->algs); - spin_lock_init(&hcrypt->lock); - - ret = ieee80211_register_crypto_ops(&ieee80211_crypt_null); - if (ret < 0) { - kfree(hcrypt); - hcrypt = NULL; - } - out: - return ret; + return ieee80211_register_crypto_ops(&ieee80211_crypt_null); } static void __exit ieee80211_crypto_deinit(void) { - struct list_head *ptr, *n; - - if (hcrypt == NULL) - return; - - for (ptr = hcrypt->algs.next, n = ptr->next; ptr != &hcrypt->algs; - ptr = n, n = ptr->next) { - struct ieee80211_crypto_alg *alg = - (struct ieee80211_crypto_alg *)ptr; - list_del(ptr); - printk(KERN_DEBUG "ieee80211_crypt: unregistered algorithm " - "'%s' (deinit)\n", alg->ops->name); - kfree(alg); - } - - kfree(hcrypt); + ieee80211_unregister_crypto_ops(&ieee80211_crypt_null); + BUG_ON(!list_empty(&ieee80211_crypto_algs)); } EXPORT_SYMBOL(ieee80211_crypt_deinit_entries); -- cgit v1.2.3 From e4543eddfd3bf3e0d625841377fa695a519edfd4 Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Tue, 8 Nov 2005 21:35:04 -0800 Subject: [PATCH] add a vfs_permission helper Most permission() calls have a struct nameidata * available. This helper takes that as an argument and thus makes sure we pass it down for lookup intents and prepares for per-mount read-only support where we need a struct vfsmount for checking whether a file is writeable. Signed-off-by: Christoph Hellwig Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- fs/exec.c | 4 ++-- fs/inotify.c | 2 +- fs/namei.c | 23 +++++++++++++++++++---- fs/namespace.c | 2 +- fs/open.c | 12 ++++++------ include/linux/fs.h | 1 + net/unix/af_unix.c | 2 +- 7 files changed, 31 insertions(+), 15 deletions(-) (limited to 'net') diff --git a/fs/exec.c b/fs/exec.c index 5a4e3acc2e9f..7bbb781b9ac6 100644 --- a/fs/exec.c +++ b/fs/exec.c @@ -135,7 +135,7 @@ asmlinkage long sys_uselib(const char __user * library) if (!S_ISREG(nd.dentry->d_inode->i_mode)) goto exit; - error = permission(nd.dentry->d_inode, MAY_READ | MAY_EXEC, &nd); + error = vfs_permission(&nd, MAY_READ | MAY_EXEC); if (error) goto exit; @@ -495,7 +495,7 @@ struct file *open_exec(const char *name) file = ERR_PTR(-EACCES); if (!(nd.mnt->mnt_flags & MNT_NOEXEC) && S_ISREG(inode->i_mode)) { - int err = permission(inode, MAY_EXEC, &nd); + int err = vfs_permission(&nd, MAY_EXEC); if (!err && !(inode->i_mode & 0111)) err = -EACCES; file = ERR_PTR(err); diff --git a/fs/inotify.c b/fs/inotify.c index 9fbaebfdf40b..bf7ce1d2412b 100644 --- a/fs/inotify.c +++ b/fs/inotify.c @@ -372,7 +372,7 @@ static int find_inode(const char __user *dirname, struct nameidata *nd) if (error) return error; /* you can only watch an inode if you have read permissions on it */ - error = permission(nd->dentry->d_inode, MAY_READ, NULL); + error = vfs_permission(nd, MAY_READ); if (error) path_release(nd); return error; diff --git a/fs/namei.c b/fs/namei.c index b3f8a1966c9c..25e4ab4ce8b7 100644 --- a/fs/namei.c +++ b/fs/namei.c @@ -256,6 +256,21 @@ int permission(struct inode *inode, int mask, struct nameidata *nd) return security_inode_permission(inode, mask, nd); } +/** + * vfs_permission - check for access rights to a given path + * @nd: lookup result that describes the path + * @mask: right to check for (%MAY_READ, %MAY_WRITE, %MAY_EXEC) + * + * Used to check for read/write/execute permissions on a path. + * We use "fsuid" for this, letting us set arbitrary permissions + * for filesystem access without changing the "normal" uids which + * are used for other things. + */ +int vfs_permission(struct nameidata *nd, int mask) +{ + return permission(nd->dentry->d_inode, mask, nd); +} + /* * get_write_access() gets write permission for a file. * put_write_access() releases this write permission. @@ -765,9 +780,8 @@ static fastcall int __link_path_walk(const char * name, struct nameidata *nd) nd->flags |= LOOKUP_CONTINUE; err = exec_permission_lite(inode, nd); - if (err == -EAGAIN) { - err = permission(inode, MAY_EXEC, nd); - } + if (err == -EAGAIN) + err = vfs_permission(nd, MAY_EXEC); if (err) break; @@ -1407,7 +1421,7 @@ int may_open(struct nameidata *nd, int acc_mode, int flag) if (S_ISDIR(inode->i_mode) && (flag & FMODE_WRITE)) return -EISDIR; - error = permission(inode, acc_mode, nd); + error = vfs_permission(nd, acc_mode); if (error) return error; @@ -2536,6 +2550,7 @@ EXPORT_SYMBOL(path_lookup); EXPORT_SYMBOL(path_release); EXPORT_SYMBOL(path_walk); EXPORT_SYMBOL(permission); +EXPORT_SYMBOL(vfs_permission); EXPORT_SYMBOL(unlock_rename); EXPORT_SYMBOL(vfs_create); EXPORT_SYMBOL(vfs_follow_link); diff --git a/fs/namespace.c b/fs/namespace.c index caa9187f67e5..2019899f2ab8 100644 --- a/fs/namespace.c +++ b/fs/namespace.c @@ -637,7 +637,7 @@ static int mount_is_safe(struct nameidata *nd) if (current->uid != nd->dentry->d_inode->i_uid) return -EPERM; } - if (permission(nd->dentry->d_inode, MAY_WRITE, nd)) + if (vfs_permission(nd, MAY_WRITE)) return -EPERM; return 0; #endif diff --git a/fs/open.c b/fs/open.c index 6e8136751e9a..baffc084580d 100644 --- a/fs/open.c +++ b/fs/open.c @@ -240,7 +240,7 @@ static inline long do_sys_truncate(const char __user * path, loff_t length) if (!S_ISREG(inode->i_mode)) goto dput_and_out; - error = permission(inode,MAY_WRITE,&nd); + error = vfs_permission(&nd, MAY_WRITE); if (error) goto dput_and_out; @@ -394,7 +394,7 @@ asmlinkage long sys_utime(char __user * filename, struct utimbuf __user * times) goto dput_and_out; if (current->fsuid != inode->i_uid && - (error = permission(inode,MAY_WRITE,&nd)) != 0) + (error = vfs_permission(&nd, MAY_WRITE)) != 0) goto dput_and_out; } down(&inode->i_sem); @@ -447,7 +447,7 @@ long do_utimes(char __user * filename, struct timeval * times) goto dput_and_out; if (current->fsuid != inode->i_uid && - (error = permission(inode,MAY_WRITE,&nd)) != 0) + (error = vfs_permission(&nd, MAY_WRITE)) != 0) goto dput_and_out; } down(&inode->i_sem); @@ -506,7 +506,7 @@ asmlinkage long sys_access(const char __user * filename, int mode) res = __user_walk(filename, LOOKUP_FOLLOW|LOOKUP_ACCESS, &nd); if (!res) { - res = permission(nd.dentry->d_inode, mode, &nd); + res = vfs_permission(&nd, mode); /* SuS v2 requires we report a read only fs too */ if(!res && (mode & S_IWOTH) && IS_RDONLY(nd.dentry->d_inode) && !special_file(nd.dentry->d_inode->i_mode)) @@ -530,7 +530,7 @@ asmlinkage long sys_chdir(const char __user * filename) if (error) goto out; - error = permission(nd.dentry->d_inode,MAY_EXEC,&nd); + error = vfs_permission(&nd, MAY_EXEC); if (error) goto dput_and_out; @@ -581,7 +581,7 @@ asmlinkage long sys_chroot(const char __user * filename) if (error) goto out; - error = permission(nd.dentry->d_inode,MAY_EXEC,&nd); + error = vfs_permission(&nd, MAY_EXEC); if (error) goto dput_and_out; diff --git a/include/linux/fs.h b/include/linux/fs.h index 1b5f502a4b8f..c3b8c1dc7cdf 100644 --- a/include/linux/fs.h +++ b/include/linux/fs.h @@ -874,6 +874,7 @@ static inline void unlock_super(struct super_block * sb) /* * VFS helper functions.. */ +extern int vfs_permission(struct nameidata *, int); extern int vfs_create(struct inode *, struct dentry *, int, struct nameidata *); extern int vfs_mkdir(struct inode *, struct dentry *, int); extern int vfs_mknod(struct inode *, struct dentry *, int, dev_t); diff --git a/net/unix/af_unix.c b/net/unix/af_unix.c index 41feca3bef86..acc73ba8bade 100644 --- a/net/unix/af_unix.c +++ b/net/unix/af_unix.c @@ -676,7 +676,7 @@ static struct sock *unix_find_other(struct sockaddr_un *sunname, int len, err = path_lookup(sunname->sun_path, LOOKUP_FOLLOW, &nd); if (err) goto fail; - err = permission(nd.dentry->d_inode,MAY_WRITE, &nd); + err = vfs_permission(&nd, MAY_WRITE); if (err) goto put_fail; -- cgit v1.2.3 From 49705b7743fd8f5632a95ec4c6547d169d27ac1f Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Tue, 8 Nov 2005 21:35:06 -0800 Subject: [PATCH] sanitize lookup_hash prototype ->permission and ->lookup have a struct nameidata * argument these days to pass down lookup intents. Unfortunately some callers of lookup_hash don't actually pass this one down. For lookup_one_len() we don't have a struct nameidata to pass down, but as this function is a library function only used by filesystem code this is an acceptable limitation. All other callers should pass down the nameidata, so this patch changes the lookup_hash interface to only take a struct nameidata argument and derives the other two arguments to __lookup_hash from it. All callers already have the nameidata argument available so this is not a problem. At the same time I'd like to deprecate the lookup_hash interface as there are better exported interfaces for filesystem usage. Before it can actually be removed I need to fix up rpc_pipefs. Signed-off-by: Christoph Hellwig Cc: Ram Pai Cc: Jeff Mahoney Cc: Al Viro Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- Documentation/feature-removal-schedule.txt | 7 +++++++ fs/namei.c | 20 ++++++++++---------- include/linux/namei.h | 2 +- net/sunrpc/rpc_pipe.c | 6 +++--- 4 files changed, 21 insertions(+), 14 deletions(-) (limited to 'net') diff --git a/Documentation/feature-removal-schedule.txt b/Documentation/feature-removal-schedule.txt index 910cc9998731..66e4ca28fc0a 100644 --- a/Documentation/feature-removal-schedule.txt +++ b/Documentation/feature-removal-schedule.txt @@ -118,3 +118,10 @@ Why: This interface has been obsoleted by the new layer3-independent to link against API-compatible library on top of libnfnetlink_queue instead of the current 'libipq'. Who: Harald Welte + +--------------------------- + +What: EXPORT_SYMBOL(lookup_hash) +When: January 2006 +Why: Too low-level interface. Use lookup_one_len or lookup_create instead. +Who: Christoph Hellwig diff --git a/fs/namei.c b/fs/namei.c index b69f6ebadb95..f02ec0e50fca 100644 --- a/fs/namei.c +++ b/fs/namei.c @@ -1204,9 +1204,9 @@ out: return dentry; } -struct dentry * lookup_hash(struct qstr *name, struct dentry * base) +struct dentry * lookup_hash(struct nameidata *nd) { - return __lookup_hash(name, base, NULL); + return __lookup_hash(&nd->last, nd->dentry, nd); } /* SMP-safe */ @@ -1230,7 +1230,7 @@ struct dentry * lookup_one_len(const char * name, struct dentry * base, int len) } this.hash = end_name_hash(hash); - return lookup_hash(&this, base); + return __lookup_hash(&this, base, NULL); access: return ERR_PTR(-EACCES); } @@ -1563,7 +1563,7 @@ int open_namei(const char * pathname, int flag, int mode, struct nameidata *nd) dir = nd->dentry; nd->flags &= ~LOOKUP_PARENT; down(&dir->d_inode->i_sem); - path.dentry = __lookup_hash(&nd->last, nd->dentry, nd); + path.dentry = lookup_hash(nd); path.mnt = nd->mnt; do_last: @@ -1665,7 +1665,7 @@ do_link: } dir = nd->dentry; down(&dir->d_inode->i_sem); - path.dentry = __lookup_hash(&nd->last, nd->dentry, nd); + path.dentry = lookup_hash(nd); path.mnt = nd->mnt; __putname(nd->last.name); goto do_last; @@ -1697,7 +1697,7 @@ struct dentry *lookup_create(struct nameidata *nd, int is_dir) /* * Do the final lookup. */ - dentry = lookup_hash(&nd->last, nd->dentry); + dentry = lookup_hash(nd); if (IS_ERR(dentry)) goto fail; @@ -1932,7 +1932,7 @@ asmlinkage long sys_rmdir(const char __user * pathname) goto exit1; } down(&nd.dentry->d_inode->i_sem); - dentry = lookup_hash(&nd.last, nd.dentry); + dentry = lookup_hash(&nd); error = PTR_ERR(dentry); if (!IS_ERR(dentry)) { error = vfs_rmdir(nd.dentry->d_inode, dentry); @@ -2001,7 +2001,7 @@ asmlinkage long sys_unlink(const char __user * pathname) if (nd.last_type != LAST_NORM) goto exit1; down(&nd.dentry->d_inode->i_sem); - dentry = lookup_hash(&nd.last, nd.dentry); + dentry = lookup_hash(&nd); error = PTR_ERR(dentry); if (!IS_ERR(dentry)) { /* Why not before? Because we want correct error value */ @@ -2344,7 +2344,7 @@ static inline int do_rename(const char * oldname, const char * newname) trap = lock_rename(new_dir, old_dir); - old_dentry = lookup_hash(&oldnd.last, old_dir); + old_dentry = lookup_hash(&oldnd); error = PTR_ERR(old_dentry); if (IS_ERR(old_dentry)) goto exit3; @@ -2364,7 +2364,7 @@ static inline int do_rename(const char * oldname, const char * newname) error = -EINVAL; if (old_dentry == trap) goto exit4; - new_dentry = lookup_hash(&newnd.last, new_dir); + new_dentry = lookup_hash(&newnd); error = PTR_ERR(new_dentry); if (IS_ERR(new_dentry)) goto exit4; diff --git a/include/linux/namei.h b/include/linux/namei.h index 1c975d0d9e94..455660eafba9 100644 --- a/include/linux/namei.h +++ b/include/linux/namei.h @@ -74,7 +74,7 @@ extern struct file *nameidata_to_filp(struct nameidata *nd, int flags); extern void release_open_intent(struct nameidata *); extern struct dentry * lookup_one_len(const char *, struct dentry *, int); -extern struct dentry * lookup_hash(struct qstr *, struct dentry *); +extern struct dentry * lookup_hash(struct nameidata *); extern int follow_down(struct vfsmount **, struct dentry **); extern int follow_up(struct vfsmount **, struct dentry **); diff --git a/net/sunrpc/rpc_pipe.c b/net/sunrpc/rpc_pipe.c index 4f188d0a5d11..81e00a6c19de 100644 --- a/net/sunrpc/rpc_pipe.c +++ b/net/sunrpc/rpc_pipe.c @@ -603,7 +603,7 @@ rpc_lookup_negative(char *path, struct nameidata *nd) return ERR_PTR(error); dir = nd->dentry->d_inode; down(&dir->i_sem); - dentry = lookup_hash(&nd->last, nd->dentry); + dentry = lookup_hash(nd); if (IS_ERR(dentry)) goto out_err; if (dentry->d_inode) { @@ -665,7 +665,7 @@ rpc_rmdir(char *path) return error; dir = nd.dentry->d_inode; down(&dir->i_sem); - dentry = lookup_hash(&nd.last, nd.dentry); + dentry = lookup_hash(&nd); if (IS_ERR(dentry)) { error = PTR_ERR(dentry); goto out_release; @@ -726,7 +726,7 @@ rpc_unlink(char *path) return error; dir = nd.dentry->d_inode; down(&dir->i_sem); - dentry = lookup_hash(&nd.last, nd.dentry); + dentry = lookup_hash(&nd); if (IS_ERR(dentry)) { error = PTR_ERR(dentry); goto out_release; -- cgit v1.2.3 From 46998f59c03ecbd7c2250810f35af6fe24868845 Mon Sep 17 00:00:00 2001 From: Yasuyuki Kozakai Date: Wed, 9 Nov 2005 12:58:05 -0800 Subject: [NETFILTER]: packet counter of conntrack is 32bits The packet counter variable of conntrack was changed to 32bits from 64bits. This follows that change. Signed-off-by: Yasuyuki Kozakai Signed-off-by: Harald Welte Signed-off-by: David S. Miller --- net/ipv4/netfilter/ip_conntrack_netlink.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'net') diff --git a/net/ipv4/netfilter/ip_conntrack_netlink.c b/net/ipv4/netfilter/ip_conntrack_netlink.c index 82a65043a8ef..431a446994f6 100644 --- a/net/ipv4/netfilter/ip_conntrack_netlink.c +++ b/net/ipv4/netfilter/ip_conntrack_netlink.c @@ -175,7 +175,7 @@ ctnetlink_dump_counters(struct sk_buff *skb, const struct ip_conntrack *ct, { enum ctattr_type type = dir ? CTA_COUNTERS_REPLY: CTA_COUNTERS_ORIG; struct nfattr *nest_count = NFA_NEST(skb, type); - u_int64_t tmp; + u_int32_t tmp; tmp = htonl(ct->counters[dir].packets); NFA_PUT(skb, CTA_COUNTERS32_PACKETS, sizeof(u_int32_t), &tmp); -- cgit v1.2.3 From eaae4fa45e0f4cd1da0f00ae93551edb1002b2b9 Mon Sep 17 00:00:00 2001 From: Yasuyuki Kozakai Date: Wed, 9 Nov 2005 12:58:46 -0800 Subject: [NETFILTER]: refcount leak of proto when ctnetlink dumping tuple Signed-off-by: Yasuyuki Kozakai Signed-off-by: Harald Welte Signed-off-by: David S. Miller --- net/ipv4/netfilter/ip_conntrack_netlink.c | 9 ++++++--- 1 file changed, 6 insertions(+), 3 deletions(-) (limited to 'net') diff --git a/net/ipv4/netfilter/ip_conntrack_netlink.c b/net/ipv4/netfilter/ip_conntrack_netlink.c index 431a446994f6..02f303cf201e 100644 --- a/net/ipv4/netfilter/ip_conntrack_netlink.c +++ b/net/ipv4/netfilter/ip_conntrack_netlink.c @@ -58,14 +58,17 @@ ctnetlink_dump_tuples_proto(struct sk_buff *skb, const struct ip_conntrack_tuple *tuple) { struct ip_conntrack_protocol *proto; + int ret = 0; NFA_PUT(skb, CTA_PROTO_NUM, sizeof(u_int8_t), &tuple->dst.protonum); proto = ip_conntrack_proto_find_get(tuple->dst.protonum); - if (proto && proto->tuple_to_nfattr) - return proto->tuple_to_nfattr(skb, tuple); + if (likely(proto && proto->tuple_to_nfattr)) { + ret = proto->tuple_to_nfattr(skb, tuple); + ip_conntrack_proto_put(proto); + } - return 0; + return ret; nfattr_failure: return -1; -- cgit v1.2.3 From a2506c04322ca266fe2f9bd7d02a67b1972da611 Mon Sep 17 00:00:00 2001 From: Harald Welte Date: Wed, 9 Nov 2005 12:59:13 -0800 Subject: [NETFILTER] nfnetlink: nfattr_parse() can never fail, make it void nfattr_parse (and thus nfattr_parse_nested) always returns success. So we can make them 'void' and remove all the checking at the caller side. Based on original patch by Pablo Neira Ayuso Signed-off-by: Harald Welte Signed-off-by: David S. Miller --- include/linux/netfilter/nfnetlink.h | 2 +- net/ipv4/netfilter/ip_conntrack_netlink.c | 45 +++++------------------------ net/ipv4/netfilter/ip_conntrack_proto_tcp.c | 6 +--- net/netfilter/nfnetlink.c | 4 +-- 4 files changed, 11 insertions(+), 46 deletions(-) (limited to 'net') diff --git a/include/linux/netfilter/nfnetlink.h b/include/linux/netfilter/nfnetlink.h index f08e870100f4..72975fa8795d 100644 --- a/include/linux/netfilter/nfnetlink.h +++ b/include/linux/netfilter/nfnetlink.h @@ -146,7 +146,7 @@ extern void nfnl_unlock(void); extern int nfnetlink_subsys_register(struct nfnetlink_subsystem *n); extern int nfnetlink_subsys_unregister(struct nfnetlink_subsystem *n); -extern int nfattr_parse(struct nfattr *tb[], int maxattr, +extern void nfattr_parse(struct nfattr *tb[], int maxattr, struct nfattr *nfa, int len); #define nfattr_parse_nested(tb, max, nfa) \ diff --git a/net/ipv4/netfilter/ip_conntrack_netlink.c b/net/ipv4/netfilter/ip_conntrack_netlink.c index 02f303cf201e..838262e17376 100644 --- a/net/ipv4/netfilter/ip_conntrack_netlink.c +++ b/net/ipv4/netfilter/ip_conntrack_netlink.c @@ -482,9 +482,7 @@ ctnetlink_parse_tuple_ip(struct nfattr *attr, struct ip_conntrack_tuple *tuple) DEBUGP("entered %s\n", __FUNCTION__); - - if (nfattr_parse_nested(tb, CTA_IP_MAX, attr) < 0) - goto nfattr_failure; + nfattr_parse_nested(tb, CTA_IP_MAX, attr); if (nfattr_bad_size(tb, CTA_IP_MAX, cta_min_ip)) return -EINVAL; @@ -500,9 +498,6 @@ ctnetlink_parse_tuple_ip(struct nfattr *attr, struct ip_conntrack_tuple *tuple) DEBUGP("leaving\n"); return 0; - -nfattr_failure: - return -1; } static const int cta_min_proto[CTA_PROTO_MAX] = { @@ -524,8 +519,7 @@ ctnetlink_parse_tuple_proto(struct nfattr *attr, DEBUGP("entered %s\n", __FUNCTION__); - if (nfattr_parse_nested(tb, CTA_PROTO_MAX, attr) < 0) - goto nfattr_failure; + nfattr_parse_nested(tb, CTA_PROTO_MAX, attr); if (nfattr_bad_size(tb, CTA_PROTO_MAX, cta_min_proto)) return -EINVAL; @@ -542,9 +536,6 @@ ctnetlink_parse_tuple_proto(struct nfattr *attr, } return ret; - -nfattr_failure: - return -1; } static inline int @@ -558,8 +549,7 @@ ctnetlink_parse_tuple(struct nfattr *cda[], struct ip_conntrack_tuple *tuple, memset(tuple, 0, sizeof(*tuple)); - if (nfattr_parse_nested(tb, CTA_TUPLE_MAX, cda[type-1]) < 0) - goto nfattr_failure; + nfattr_parse_nested(tb, CTA_TUPLE_MAX, cda[type-1]); if (!tb[CTA_TUPLE_IP-1]) return -EINVAL; @@ -586,9 +576,6 @@ ctnetlink_parse_tuple(struct nfattr *cda[], struct ip_conntrack_tuple *tuple, DEBUGP("leaving\n"); return 0; - -nfattr_failure: - return -1; } #ifdef CONFIG_IP_NF_NAT_NEEDED @@ -606,11 +593,10 @@ static int ctnetlink_parse_nat_proto(struct nfattr *attr, DEBUGP("entered %s\n", __FUNCTION__); - if (nfattr_parse_nested(tb, CTA_PROTONAT_MAX, attr) < 0) - goto nfattr_failure; + nfattr_parse_nested(tb, CTA_PROTONAT_MAX, attr); if (nfattr_bad_size(tb, CTA_PROTONAT_MAX, cta_min_protonat)) - goto nfattr_failure; + return -1; npt = ip_nat_proto_find_get(ct->tuplehash[IP_CT_DIR_ORIGINAL].tuple.dst.protonum); if (!npt) @@ -629,9 +615,6 @@ static int ctnetlink_parse_nat_proto(struct nfattr *attr, DEBUGP("leaving\n"); return 0; - -nfattr_failure: - return -1; } static inline int @@ -645,8 +628,7 @@ ctnetlink_parse_nat(struct nfattr *cda[], memset(range, 0, sizeof(*range)); - if (nfattr_parse_nested(tb, CTA_NAT_MAX, cda[CTA_NAT-1]) < 0) - goto nfattr_failure; + nfattr_parse_nested(tb, CTA_NAT_MAX, cda[CTA_NAT-1]); if (tb[CTA_NAT_MINIP-1]) range->min_ip = *(u_int32_t *)NFA_DATA(tb[CTA_NAT_MINIP-1]); @@ -668,9 +650,6 @@ ctnetlink_parse_nat(struct nfattr *cda[], DEBUGP("leaving\n"); return 0; - -nfattr_failure: - return -1; } #endif @@ -681,8 +660,7 @@ ctnetlink_parse_help(struct nfattr *attr, char **helper_name) DEBUGP("entered %s\n", __FUNCTION__); - if (nfattr_parse_nested(tb, CTA_HELP_MAX, attr) < 0) - goto nfattr_failure; + nfattr_parse_nested(tb, CTA_HELP_MAX, attr); if (!tb[CTA_HELP_NAME-1]) return -EINVAL; @@ -690,9 +668,6 @@ ctnetlink_parse_help(struct nfattr *attr, char **helper_name) *helper_name = NFA_DATA(tb[CTA_HELP_NAME-1]); return 0; - -nfattr_failure: - return -1; } static int @@ -960,8 +935,7 @@ ctnetlink_change_protoinfo(struct ip_conntrack *ct, struct nfattr *cda[]) u_int16_t npt = ct->tuplehash[IP_CT_DIR_ORIGINAL].tuple.dst.protonum; int err = 0; - if (nfattr_parse_nested(tb, CTA_PROTOINFO_MAX, attr) < 0) - goto nfattr_failure; + nfattr_parse_nested(tb, CTA_PROTOINFO_MAX, attr); proto = ip_conntrack_proto_find_get(npt); if (!proto) @@ -972,9 +946,6 @@ ctnetlink_change_protoinfo(struct ip_conntrack *ct, struct nfattr *cda[]) ip_conntrack_proto_put(proto); return err; - -nfattr_failure: - return -ENOMEM; } static int diff --git a/net/ipv4/netfilter/ip_conntrack_proto_tcp.c b/net/ipv4/netfilter/ip_conntrack_proto_tcp.c index d6701cafbcc2..6ea4b22ff28d 100644 --- a/net/ipv4/netfilter/ip_conntrack_proto_tcp.c +++ b/net/ipv4/netfilter/ip_conntrack_proto_tcp.c @@ -362,8 +362,7 @@ static int nfattr_to_tcp(struct nfattr *cda[], struct ip_conntrack *ct) struct nfattr *attr = cda[CTA_PROTOINFO_TCP-1]; struct nfattr *tb[CTA_PROTOINFO_TCP_MAX]; - if (nfattr_parse_nested(tb, CTA_PROTOINFO_TCP_MAX, attr) < 0) - goto nfattr_failure; + nfattr_parse_nested(tb, CTA_PROTOINFO_TCP_MAX, attr); if (!tb[CTA_PROTOINFO_TCP_STATE-1]) return -EINVAL; @@ -374,9 +373,6 @@ static int nfattr_to_tcp(struct nfattr *cda[], struct ip_conntrack *ct) write_unlock_bh(&tcp_lock); return 0; - -nfattr_failure: - return -1; } #endif diff --git a/net/netfilter/nfnetlink.c b/net/netfilter/nfnetlink.c index 4bc27a6334c1..f8bd7c7e7921 100644 --- a/net/netfilter/nfnetlink.c +++ b/net/netfilter/nfnetlink.c @@ -128,7 +128,7 @@ void __nfa_fill(struct sk_buff *skb, int attrtype, int attrlen, memset(NFA_DATA(nfa) + attrlen, 0, NFA_ALIGN(size) - size); } -int nfattr_parse(struct nfattr *tb[], int maxattr, struct nfattr *nfa, int len) +void nfattr_parse(struct nfattr *tb[], int maxattr, struct nfattr *nfa, int len) { memset(tb, 0, sizeof(struct nfattr *) * maxattr); @@ -138,8 +138,6 @@ int nfattr_parse(struct nfattr *tb[], int maxattr, struct nfattr *nfa, int len) tb[flavor-1] = nfa; nfa = NFA_NEXT(nfa, len); } - - return 0; } /** -- cgit v1.2.3 From 51df784ed739246a3774b300e5f536e17bec36ed Mon Sep 17 00:00:00 2001 From: Pablo Neira Ayuso Date: Wed, 9 Nov 2005 12:59:41 -0800 Subject: [NETFILTER] ctnetlink: check if protoinfo is present This fixes an oops triggered from userspace. If we don't pass information about the private protocol info, the reference to attr will be NULL. This is likely to happen in update messages. Signed-off-by: Pablo Neira Ayuso Signed-off-by: Harald Welte Signed-off-by: David S. Miller --- net/ipv4/netfilter/ip_conntrack_proto_tcp.c | 5 +++++ 1 file changed, 5 insertions(+) (limited to 'net') diff --git a/net/ipv4/netfilter/ip_conntrack_proto_tcp.c b/net/ipv4/netfilter/ip_conntrack_proto_tcp.c index 6ea4b22ff28d..468c6003b4c7 100644 --- a/net/ipv4/netfilter/ip_conntrack_proto_tcp.c +++ b/net/ipv4/netfilter/ip_conntrack_proto_tcp.c @@ -362,6 +362,11 @@ static int nfattr_to_tcp(struct nfattr *cda[], struct ip_conntrack *ct) struct nfattr *attr = cda[CTA_PROTOINFO_TCP-1]; struct nfattr *tb[CTA_PROTOINFO_TCP_MAX]; + /* updates could not contain anything about the private + * protocol info, in that case skip the parsing */ + if (!attr) + return 0; + nfattr_parse_nested(tb, CTA_PROTOINFO_TCP_MAX, attr); if (!tb[CTA_PROTOINFO_TCP_STATE-1]) -- cgit v1.2.3 From 02a78cdf425156b86abdb6883f837a70fb7106da Mon Sep 17 00:00:00 2001 From: Pablo Neira Ayuso Date: Wed, 9 Nov 2005 13:00:04 -0800 Subject: [NETFILTER] ctnetlink: add marking support from userspace This patch adds support for conntrack marking from user space. Signed-off-by: Pablo Neira Ayuso Signed-off-by: Harald Welte Signed-off-by: David S. Miller --- net/ipv4/netfilter/ip_conntrack_netlink.c | 10 ++++++++++ 1 file changed, 10 insertions(+) (limited to 'net') diff --git a/net/ipv4/netfilter/ip_conntrack_netlink.c b/net/ipv4/netfilter/ip_conntrack_netlink.c index 838262e17376..09957f9be97d 100644 --- a/net/ipv4/netfilter/ip_conntrack_netlink.c +++ b/net/ipv4/netfilter/ip_conntrack_netlink.c @@ -979,6 +979,11 @@ ctnetlink_change_conntrack(struct ip_conntrack *ct, struct nfattr *cda[]) return err; } +#if defined(CONFIG_IP_NF_CONNTRACK_MARK) + if (cda[CTA_MARK-1]) + ct->mark = ntohl(*(u_int32_t *)NFA_DATA(cda[CTA_MARK-1])); +#endif + DEBUGP("all done\n"); return 0; } @@ -1022,6 +1027,11 @@ ctnetlink_create_conntrack(struct nfattr *cda[], if (ct->helper) ip_conntrack_helper_put(ct->helper); +#if defined(CONFIG_IP_NF_CONNTRACK_MARK) + if (cda[CTA_MARK-1]) + ct->mark = ntohl(*(u_int32_t *)NFA_DATA(cda[CTA_MARK-1])); +#endif + DEBUGP("conntrack with id %u inserted\n", ct->id); return 0; -- cgit v1.2.3 From 119a31849442215fa66e4d18a33443a55c45e631 Mon Sep 17 00:00:00 2001 From: Pablo Neira Ayuso Date: Wed, 9 Nov 2005 13:00:29 -0800 Subject: [NETFILTER] ctnetlink: add module alias to fix autoloading Add missing module alias. This is a must to load ctnetlink on demand. For example, the conntrack tool will fail if the module isn't loaded. Signed-off-by: Pablo Neira Ayuso Signed-off-by: Harald Welte Signed-off-by: David S. Miller --- net/ipv4/netfilter/ip_conntrack_netlink.c | 2 ++ 1 file changed, 2 insertions(+) (limited to 'net') diff --git a/net/ipv4/netfilter/ip_conntrack_netlink.c b/net/ipv4/netfilter/ip_conntrack_netlink.c index 09957f9be97d..842fe2c081c9 100644 --- a/net/ipv4/netfilter/ip_conntrack_netlink.c +++ b/net/ipv4/netfilter/ip_conntrack_netlink.c @@ -1538,6 +1538,8 @@ static struct nfnetlink_subsystem ctnl_exp_subsys = { .cb = ctnl_exp_cb, }; +MODULE_ALIAS_NFNL_SUBSYS(NFNL_SUBSYS_CTNETLINK); + static int __init ctnetlink_init(void) { int ret; -- cgit v1.2.3 From 7a4fe3664b3cfecd2a40a46f54c71333639e28b7 Mon Sep 17 00:00:00 2001 From: Pablo Neira Ayuso Date: Wed, 9 Nov 2005 13:00:47 -0800 Subject: [NETFILTER] ctnetlink: kill unused includes Kill some useless headers included in ctnetlink. They aren't used in any way. Signed-off-by: Pablo Neira Ayuso Signed-off-by: Harald Welte Signed-off-by: David S. Miller --- net/ipv4/netfilter/ip_conntrack_netlink.c | 3 --- 1 file changed, 3 deletions(-) (limited to 'net') diff --git a/net/ipv4/netfilter/ip_conntrack_netlink.c b/net/ipv4/netfilter/ip_conntrack_netlink.c index 842fe2c081c9..bf584249571a 100644 --- a/net/ipv4/netfilter/ip_conntrack_netlink.c +++ b/net/ipv4/netfilter/ip_conntrack_netlink.c @@ -28,11 +28,8 @@ #include #include #include -#include #include -#include -#include #include #include #include -- cgit v1.2.3 From 81e5c27d08bb39e646fe822ea80ab8feba62b94d Mon Sep 17 00:00:00 2001 From: Pablo Neira Ayuso Date: Wed, 9 Nov 2005 13:01:19 -0800 Subject: [NETFILTER] ctnetlink: get_conntrack can use GFP_KERNEL ctnetlink_get_conntrack is always called from user context, so GFP_KERNEL is enough. Signed-off-by: Pablo Neira Ayuso Signed-off-by: Harald Welte Signed-off-by: David S. Miller --- net/ipv4/netfilter/ip_conntrack_netlink.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'net') diff --git a/net/ipv4/netfilter/ip_conntrack_netlink.c b/net/ipv4/netfilter/ip_conntrack_netlink.c index bf584249571a..d1dde1417ef7 100644 --- a/net/ipv4/netfilter/ip_conntrack_netlink.c +++ b/net/ipv4/netfilter/ip_conntrack_netlink.c @@ -779,7 +779,7 @@ ctnetlink_get_conntrack(struct sock *ctnl, struct sk_buff *skb, ct = tuplehash_to_ctrack(h); err = -ENOMEM; - skb2 = alloc_skb(NLMSG_GOODSIZE, GFP_ATOMIC); + skb2 = alloc_skb(NLMSG_GOODSIZE, GFP_KERNEL); if (!skb2) { ip_conntrack_put(ct); return -ENOMEM; -- cgit v1.2.3 From 5978a9b82c55b82a1087bd86e0ae8b00f94d0d0b Mon Sep 17 00:00:00 2001 From: Philip Craig Date: Wed, 9 Nov 2005 13:01:53 -0800 Subject: [NETFILTER] PPTP helper: fix PNS-PAC expectation call id The reply tuple of the PNS->PAC expectation was using the wrong call id. So we had the following situation: - PNS behind NAT firewall - PNS call id requires NATing - PNS->PAC gre packet arrives first then the PNS->PAC expectation is matched, and the other expectation is deleted, but the PAC->PNS gre packets do not match the gre conntrack because the call id is wrong. We also cannot use ip_nat_follow_master(). Signed-off-by: Philip Craig Signed-off-by: Harald Welte Signed-off-by: David S. Miller --- net/ipv4/netfilter/ip_nat_helper_pptp.c | 28 ++++++++++++++++++++++++++-- 1 file changed, 26 insertions(+), 2 deletions(-) (limited to 'net') diff --git a/net/ipv4/netfilter/ip_nat_helper_pptp.c b/net/ipv4/netfilter/ip_nat_helper_pptp.c index ee6ab74ad3a9..e546203f5662 100644 --- a/net/ipv4/netfilter/ip_nat_helper_pptp.c +++ b/net/ipv4/netfilter/ip_nat_helper_pptp.c @@ -73,6 +73,7 @@ static void pptp_nat_expected(struct ip_conntrack *ct, struct ip_conntrack_tuple t; struct ip_ct_pptp_master *ct_pptp_info; struct ip_nat_pptp *nat_pptp_info; + struct ip_nat_range range; ct_pptp_info = &master->help.ct_pptp_info; nat_pptp_info = &master->nat.help.nat_pptp_info; @@ -110,7 +111,30 @@ static void pptp_nat_expected(struct ip_conntrack *ct, DEBUGP("not found!\n"); } - ip_nat_follow_master(ct, exp); + /* This must be a fresh one. */ + BUG_ON(ct->status & IPS_NAT_DONE_MASK); + + /* Change src to where master sends to */ + range.flags = IP_NAT_RANGE_MAP_IPS; + range.min_ip = range.max_ip + = ct->master->tuplehash[!exp->dir].tuple.dst.ip; + if (exp->dir == IP_CT_DIR_ORIGINAL) { + range.flags |= IP_NAT_RANGE_PROTO_SPECIFIED; + range.min = range.max = exp->saved_proto; + } + /* hook doesn't matter, but it has to do source manip */ + ip_nat_setup_info(ct, &range, NF_IP_POST_ROUTING); + + /* For DST manip, map port here to where it's expected. */ + range.flags = IP_NAT_RANGE_MAP_IPS; + range.min_ip = range.max_ip + = ct->master->tuplehash[!exp->dir].tuple.src.ip; + if (exp->dir == IP_CT_DIR_REPLY) { + range.flags |= IP_NAT_RANGE_PROTO_SPECIFIED; + range.min = range.max = exp->saved_proto; + } + /* hook doesn't matter, but it has to do destination manip */ + ip_nat_setup_info(ct, &range, NF_IP_PRE_ROUTING); } /* outbound packets == from PNS to PAC */ @@ -213,7 +237,7 @@ pptp_exp_gre(struct ip_conntrack_expect *expect_orig, /* alter expectation for PNS->PAC direction */ invert_tuplepr(&inv_t, &expect_orig->tuple); - expect_orig->saved_proto.gre.key = htons(nat_pptp_info->pac_call_id); + expect_orig->saved_proto.gre.key = htons(ct_pptp_info->pns_call_id); expect_orig->tuple.src.u.gre.key = htons(nat_pptp_info->pns_call_id); expect_orig->tuple.dst.u.gre.key = htons(ct_pptp_info->pac_call_id); expect_orig->dir = IP_CT_DIR_ORIGINAL; -- cgit v1.2.3 From ed77de9fc69076e6e7c85edf7c1b70650f53121a Mon Sep 17 00:00:00 2001 From: Harald Welte Date: Wed, 9 Nov 2005 13:02:16 -0800 Subject: [NETFILTER] nfnetlink: only load subsystems if CAP_NET_ADMIN is set Without this patch, any user can cause nfnetlink subsystems to be autoloaded. Those subsystems however could add significant processing overhead to packet processing, and would refuse any configuration messages from non-CAP_NET_ADMIN processes anyway. This patch follows a suggestion from Patrick McHardy. Signed-off-by: Harald Welte Signed-off-by: David S. Miller --- net/netfilter/nfnetlink.c | 17 ++++++++++------- 1 file changed, 10 insertions(+), 7 deletions(-) (limited to 'net') diff --git a/net/netfilter/nfnetlink.c b/net/netfilter/nfnetlink.c index f8bd7c7e7921..83f4c53030fc 100644 --- a/net/netfilter/nfnetlink.c +++ b/net/netfilter/nfnetlink.c @@ -240,15 +240,18 @@ static inline int nfnetlink_rcv_msg(struct sk_buff *skb, ss = nfnetlink_get_subsys(type); if (!ss) { #ifdef CONFIG_KMOD - /* don't call nfnl_shunlock, since it would reenter - * with further packet processing */ - up(&nfnl_sem); - request_module("nfnetlink-subsys-%d", NFNL_SUBSYS_ID(type)); - nfnl_shlock(); - ss = nfnetlink_get_subsys(type); + if (cap_raised(NETLINK_CB(skb).eff_cap, CAP_NET_ADMIN)) { + /* don't call nfnl_shunlock, since it would reenter + * with further packet processing */ + up(&nfnl_sem); + request_module("nfnetlink-subsys-%d", + NFNL_SUBSYS_ID(type)); + nfnl_shlock(); + ss = nfnetlink_get_subsys(type); + } if (!ss) #endif - goto err_inval; + goto err_inval; } nc = nfnetlink_find_client(type, ss); -- cgit v1.2.3 From d63a92810807e8da298895236f2b99697e884014 Mon Sep 17 00:00:00 2001 From: Yasuyuki Kozakai Date: Wed, 9 Nov 2005 13:02:45 -0800 Subject: [NETFILTER]: stop tracking ICMP error at early point Currently connection tracking handles ICMP error like normal packets if it failed to get related connection. But it fails that after all. This makes connection tracking stop tracking ICMP error at early point. Signed-off-by: Yasuyuki Kozakai Signed-off-by: Harald Welte Signed-off-by: David S. Miller --- net/ipv4/netfilter/ip_conntrack_proto_icmp.c | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) (limited to 'net') diff --git a/net/ipv4/netfilter/ip_conntrack_proto_icmp.c b/net/ipv4/netfilter/ip_conntrack_proto_icmp.c index 98f0015dd255..9481d159acb6 100644 --- a/net/ipv4/netfilter/ip_conntrack_proto_icmp.c +++ b/net/ipv4/netfilter/ip_conntrack_proto_icmp.c @@ -151,13 +151,13 @@ icmp_error_message(struct sk_buff *skb, /* Not enough header? */ inside = skb_header_pointer(skb, skb->nh.iph->ihl*4, sizeof(_in), &_in); if (inside == NULL) - return NF_ACCEPT; + return -NF_ACCEPT; /* Ignore ICMP's containing fragments (shouldn't happen) */ if (inside->ip.frag_off & htons(IP_OFFSET)) { DEBUGP("icmp_error_track: fragment of proto %u\n", inside->ip.protocol); - return NF_ACCEPT; + return -NF_ACCEPT; } innerproto = ip_conntrack_proto_find_get(inside->ip.protocol); @@ -166,7 +166,7 @@ icmp_error_message(struct sk_buff *skb, if (!ip_ct_get_tuple(&inside->ip, skb, dataoff, &origtuple, innerproto)) { DEBUGP("icmp_error: ! get_tuple p=%u", inside->ip.protocol); ip_conntrack_proto_put(innerproto); - return NF_ACCEPT; + return -NF_ACCEPT; } /* Ordinarily, we'd expect the inverted tupleproto, but it's @@ -174,7 +174,7 @@ icmp_error_message(struct sk_buff *skb, if (!ip_ct_invert_tuple(&innertuple, &origtuple, innerproto)) { DEBUGP("icmp_error_track: Can't invert tuple\n"); ip_conntrack_proto_put(innerproto); - return NF_ACCEPT; + return -NF_ACCEPT; } ip_conntrack_proto_put(innerproto); @@ -190,7 +190,7 @@ icmp_error_message(struct sk_buff *skb, if (!h) { DEBUGP("icmp_error_track: no match\n"); - return NF_ACCEPT; + return -NF_ACCEPT; } /* Reverse direction from that found */ if (DIRECTION(h) != IP_CT_DIR_REPLY) -- cgit v1.2.3 From fe902a91ff427af7dbf20e7c196623b2a4eade13 Mon Sep 17 00:00:00 2001 From: Pablo Neira Ayuso Date: Wed, 9 Nov 2005 13:03:09 -0800 Subject: [NETFILTER] ctnetlink: return -EINVAL if size is wrong Return -EINVAL if the size isn't OK instead of -EPERM. Signed-off-by: Pablo Neira Ayuso Signed-off-by: Harald Welte Signed-off-by: David S. Miller --- net/ipv4/netfilter/ip_conntrack_netlink.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'net') diff --git a/net/ipv4/netfilter/ip_conntrack_netlink.c b/net/ipv4/netfilter/ip_conntrack_netlink.c index d1dde1417ef7..cfc5487e627a 100644 --- a/net/ipv4/netfilter/ip_conntrack_netlink.c +++ b/net/ipv4/netfilter/ip_conntrack_netlink.c @@ -593,7 +593,7 @@ static int ctnetlink_parse_nat_proto(struct nfattr *attr, nfattr_parse_nested(tb, CTA_PROTONAT_MAX, attr); if (nfattr_bad_size(tb, CTA_PROTONAT_MAX, cta_min_protonat)) - return -1; + return -EINVAL; npt = ip_nat_proto_find_get(ct->tuplehash[IP_CT_DIR_ORIGINAL].tuple.dst.protonum); if (!npt) -- cgit v1.2.3 From fcda46128d5cb50075339b79ce585ab767337e9e Mon Sep 17 00:00:00 2001 From: Pablo Neira Ayuso Date: Wed, 9 Nov 2005 13:03:26 -0800 Subject: [NETFILTER] ctnetlink: propagate error instaed of returning -EPERM Propagate the error to userspace instead of returning -EPERM if the get conntrack operation fails. Signed-off-by: Pablo Neira Ayuso Signed-off-by: Harald Welte Signed-off-by: David S. Miller --- net/ipv4/netfilter/ip_conntrack_netlink.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'net') diff --git a/net/ipv4/netfilter/ip_conntrack_netlink.c b/net/ipv4/netfilter/ip_conntrack_netlink.c index cfc5487e627a..7fe745659642 100644 --- a/net/ipv4/netfilter/ip_conntrack_netlink.c +++ b/net/ipv4/netfilter/ip_conntrack_netlink.c @@ -802,7 +802,7 @@ ctnetlink_get_conntrack(struct sock *ctnl, struct sk_buff *skb, free: kfree_skb(skb2); out: - return -1; + return err; } static inline int -- cgit v1.2.3 From a856a19a9f3ee14fc0d555470f3af138aeb0245c Mon Sep 17 00:00:00 2001 From: Pablo Neira Ayuso Date: Wed, 9 Nov 2005 13:03:42 -0800 Subject: [NETFILTER] ctnetlink: Add support to identify expectations by ID's Signed-off-by: Pablo Neira Ayuso Signed-off-by: Harald Welte Signed-off-by: David S. Miller --- net/ipv4/netfilter/ip_conntrack_netlink.c | 8 ++++++++ 1 file changed, 8 insertions(+) (limited to 'net') diff --git a/net/ipv4/netfilter/ip_conntrack_netlink.c b/net/ipv4/netfilter/ip_conntrack_netlink.c index 7fe745659642..5c1c0a3d1c4b 100644 --- a/net/ipv4/netfilter/ip_conntrack_netlink.c +++ b/net/ipv4/netfilter/ip_conntrack_netlink.c @@ -1293,6 +1293,14 @@ ctnetlink_get_expect(struct sock *ctnl, struct sk_buff *skb, if (!exp) return -ENOENT; + if (cda[CTA_EXPECT_ID-1]) { + u_int32_t id = *(u_int32_t *)NFA_DATA(cda[CTA_EXPECT_ID-1]); + if (exp->id != ntohl(id)) { + ip_conntrack_expect_put(exp); + return -ENOENT; + } + } + err = -ENOMEM; skb2 = alloc_skb(NLMSG_GOODSIZE, GFP_KERNEL); if (!skb2) -- cgit v1.2.3 From 439a9994bb6ae3c7cab1f0b776bca6bc7aa58a11 Mon Sep 17 00:00:00 2001 From: Krzysztof Piotr Oledzki Date: Wed, 9 Nov 2005 13:04:08 -0800 Subject: [NETFILTER] ctnetlink: Fix oops when no ICMP ID info in message This patch fixes an userspace triggered oops. If there is no ICMP_ID info the reference to attr will be NULL. Signed-off-by: Krzysztof Piotr Oledzki Signed-off-by: Pablo Neira Ayuso Signed-off-by: Harald Welte Signed-off-by: David S. Miller --- net/ipv4/netfilter/ip_conntrack_proto_icmp.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) (limited to 'net') diff --git a/net/ipv4/netfilter/ip_conntrack_proto_icmp.c b/net/ipv4/netfilter/ip_conntrack_proto_icmp.c index 9481d159acb6..083951e20690 100644 --- a/net/ipv4/netfilter/ip_conntrack_proto_icmp.c +++ b/net/ipv4/netfilter/ip_conntrack_proto_icmp.c @@ -296,7 +296,8 @@ static int icmp_nfattr_to_tuple(struct nfattr *tb[], struct ip_conntrack_tuple *tuple) { if (!tb[CTA_PROTO_ICMP_TYPE-1] - || !tb[CTA_PROTO_ICMP_CODE-1]) + || !tb[CTA_PROTO_ICMP_CODE-1] + || !tb[CTA_PROTO_ICMP_ID-1]) return -1; tuple->dst.u.icmp.type = -- cgit v1.2.3 From 5fd52fe0989f8c84abd8d4a40ded79d4da911744 Mon Sep 17 00:00:00 2001 From: Krzysztof Piotr Oledzki Date: Wed, 9 Nov 2005 13:04:32 -0800 Subject: [NETFILTER] ctnetlink: ICMP_ID is u_int16_t not u_int8_t. Signed-off-by: Krzysztof Piotr Oledzki Signed-off-by: Pablo Neira Ayuso Signed-off-by: Harald Welte Signed-off-by: David S. Miller --- net/ipv4/netfilter/ip_conntrack_proto_icmp.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'net') diff --git a/net/ipv4/netfilter/ip_conntrack_proto_icmp.c b/net/ipv4/netfilter/ip_conntrack_proto_icmp.c index 083951e20690..5198f3a1e2cd 100644 --- a/net/ipv4/netfilter/ip_conntrack_proto_icmp.c +++ b/net/ipv4/netfilter/ip_conntrack_proto_icmp.c @@ -305,7 +305,7 @@ static int icmp_nfattr_to_tuple(struct nfattr *tb[], tuple->dst.u.icmp.code = *(u_int8_t *)NFA_DATA(tb[CTA_PROTO_ICMP_CODE-1]); tuple->src.u.icmp.id = - *(u_int8_t *)NFA_DATA(tb[CTA_PROTO_ICMP_ID-1]); + *(u_int16_t *)NFA_DATA(tb[CTA_PROTO_ICMP_ID-1]); return 0; } -- cgit v1.2.3 From 44fd0261d3509b0b4303fd9ba792058d230186ab Mon Sep 17 00:00:00 2001 From: Peter Chubb Date: Wed, 9 Nov 2005 13:05:47 -0800 Subject: [IPV6]: Fix fallout from CONFIG_IPV6_PRIVACY Trying to build today's 2.6.14+git snapshot gives undefined references to use_tempaddr Looks like an ifdef got left out. Signed-off-by: Peter Chubb Signed-off-by: David S. Miller --- net/ipv6/addrconf.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) (limited to 'net') diff --git a/net/ipv6/addrconf.c b/net/ipv6/addrconf.c index b7a5f51238b3..ddcf7754eec2 100644 --- a/net/ipv6/addrconf.c +++ b/net/ipv6/addrconf.c @@ -1022,6 +1022,7 @@ int ipv6_dev_get_saddr(struct net_device *daddr_dev, continue; } +#ifdef CONFIG_IPV6_PRIVACY /* Rule 7: Prefer public address * Note: prefer temprary address if use_tempaddr >= 2 */ @@ -1042,7 +1043,7 @@ int ipv6_dev_get_saddr(struct net_device *daddr_dev, if (hiscore.attrs & IPV6_SADDR_SCORE_PRIVACY) continue; } - +#endif /* Rule 8: Use longest matching prefix */ if (hiscore.rule < 8) hiscore.matchlen = ipv6_addr_diff(&ifa_result->addr, daddr); -- cgit v1.2.3 From 9f0ede52a0ebfe1fe99ee5bfd99d17e6ac0c503d Mon Sep 17 00:00:00 2001 From: Ken-ichirou MATSUZAWA Date: Wed, 9 Nov 2005 13:08:29 -0800 Subject: [IPV6]: ip6ip6_lock is not unlocked in error path. From: Ken-ichirou MATSUZAWA Signed-off-by: David S. Miller --- net/ipv6/ip6_tunnel.c | 1 + 1 file changed, 1 insertion(+) (limited to 'net') diff --git a/net/ipv6/ip6_tunnel.c b/net/ipv6/ip6_tunnel.c index e6b0e3954c02..e315d0f80af1 100644 --- a/net/ipv6/ip6_tunnel.c +++ b/net/ipv6/ip6_tunnel.c @@ -525,6 +525,7 @@ ip6ip6_rcv(struct sk_buff **pskb, unsigned int *nhoffp) if ((t = ip6ip6_tnl_lookup(&ipv6h->saddr, &ipv6h->daddr)) != NULL) { if (!xfrm6_policy_check(NULL, XFRM_POLICY_IN, skb)) { + read_unlock(&ip6ip6_lock); kfree_skb(skb); return 0; } -- cgit v1.2.3 From 9fb9cbb1082d6b31fb45aa1a14432449a0df6cf1 Mon Sep 17 00:00:00 2001 From: Yasuyuki Kozakai Date: Wed, 9 Nov 2005 16:38:16 -0800 Subject: [NETFILTER]: Add nf_conntrack subsystem. The existing connection tracking subsystem in netfilter can only handle ipv4. There were basically two choices present to add connection tracking support for ipv6. We could either duplicate all of the ipv4 connection tracking code into an ipv6 counterpart, or (the choice taken by these patches) we could design a generic layer that could handle both ipv4 and ipv6 and thus requiring only one sub-protocol (TCP, UDP, etc.) connection tracking helper module to be written. In fact nf_conntrack is capable of working with any layer 3 protocol. The existing ipv4 specific conntrack code could also not deal with the pecularities of doing connection tracking on ipv6, which is also cured here. For example, these issues include: 1) ICMPv6 handling, which is used for neighbour discovery in ipv6 thus some messages such as these should not participate in connection tracking since effectively they are like ARP messages 2) fragmentation must be handled differently in ipv6, because the simplistic "defrag, connection track and NAT, refrag" (which the existing ipv4 connection tracking does) approach simply isn't feasible in ipv6 3) ipv6 extension header parsing must occur at the correct spots before and after connection tracking decisions, and there were no provisions for this in the existing connection tracking design 4) ipv6 has no need for stateful NAT The ipv4 specific conntrack layer is kept around, until all of the ipv4 specific conntrack helpers are ported over to nf_conntrack and it is feature complete. Once that occurs, the old conntrack stuff will get placed into the feature-removal-schedule and we will fully kill it off 6 months later. Signed-off-by: Yasuyuki Kozakai Signed-off-by: Harald Welte Signed-off-by: Arnaldo Carvalho de Melo --- include/linux/netfilter/nf_conntrack_common.h | 159 ++ include/linux/netfilter/nf_conntrack_ftp.h | 44 + include/linux/netfilter/nf_conntrack_sctp.h | 27 + include/linux/netfilter/nf_conntrack_tcp.h | 56 + .../linux/netfilter/nf_conntrack_tuple_common.h | 13 + include/linux/netfilter_ipv4/ip_conntrack.h | 152 +- include/linux/netfilter_ipv4/ip_conntrack_ftp.h | 39 +- include/linux/netfilter_ipv4/ip_conntrack_icmp.h | 9 +- include/linux/netfilter_ipv4/ip_conntrack_sctp.h | 21 +- include/linux/netfilter_ipv4/ip_conntrack_tcp.h | 47 +- include/linux/netfilter_ipv4/ip_conntrack_tuple.h | 10 +- include/linux/netfilter_ipv6.h | 1 + include/linux/skbuff.h | 19 + include/linux/sysctl.h | 37 + include/net/netfilter/ipv4/nf_conntrack_icmp.h | 11 + include/net/netfilter/ipv4/nf_conntrack_ipv4.h | 43 + include/net/netfilter/ipv6/nf_conntrack_icmpv6.h | 27 + include/net/netfilter/nf_conntrack.h | 354 +++++ include/net/netfilter/nf_conntrack_compat.h | 108 ++ include/net/netfilter/nf_conntrack_core.h | 76 + include/net/netfilter/nf_conntrack_helper.h | 51 + include/net/netfilter/nf_conntrack_l3proto.h | 93 ++ include/net/netfilter/nf_conntrack_protocol.h | 105 ++ include/net/netfilter/nf_conntrack_tuple.h | 190 +++ net/core/skbuff.c | 15 + net/ipv4/netfilter/Kconfig | 41 +- net/ipv4/netfilter/Makefile | 6 + net/ipv4/netfilter/ip_conntrack_netlink.c | 2 +- net/ipv4/netfilter/ipt_CLUSTERIP.c | 12 +- net/ipv4/netfilter/ipt_CONNMARK.c | 22 +- net/ipv4/netfilter/ipt_NOTRACK.c | 4 +- net/ipv4/netfilter/ipt_connbytes.c | 39 +- net/ipv4/netfilter/ipt_connmark.c | 10 +- net/ipv4/netfilter/ipt_conntrack.c | 96 ++ net/ipv4/netfilter/ipt_helper.c | 54 + net/ipv4/netfilter/ipt_state.c | 6 +- net/ipv4/netfilter/nf_conntrack_l3proto_ipv4.c | 571 ++++++++ net/ipv4/netfilter/nf_conntrack_proto_icmp.c | 301 ++++ net/ipv6/ip6_input.c | 5 + net/ipv6/ip6_output.c | 6 + net/ipv6/netfilter/Kconfig | 14 + net/ipv6/netfilter/Makefile | 6 + net/ipv6/netfilter/ip6t_MARK.c | 6 +- net/ipv6/netfilter/nf_conntrack_l3proto_ipv6.c | 556 +++++++ net/ipv6/netfilter/nf_conntrack_proto_icmpv6.c | 272 ++++ net/ipv6/netfilter/nf_conntrack_reasm.c | 885 +++++++++++ net/ipv6/raw.c | 4 +- net/netfilter/Kconfig | 74 + net/netfilter/Makefile | 8 + net/netfilter/nf_conntrack_core.c | 1538 ++++++++++++++++++++ net/netfilter/nf_conntrack_ftp.c | 698 +++++++++ net/netfilter/nf_conntrack_l3proto_generic.c | 98 ++ net/netfilter/nf_conntrack_proto_generic.c | 85 ++ net/netfilter/nf_conntrack_proto_sctp.c | 670 +++++++++ net/netfilter/nf_conntrack_proto_tcp.c | 1162 +++++++++++++++ net/netfilter/nf_conntrack_proto_udp.c | 216 +++ net/netfilter/nf_conntrack_standalone.c | 869 +++++++++++ 57 files changed, 9710 insertions(+), 333 deletions(-) create mode 100644 include/linux/netfilter/nf_conntrack_common.h create mode 100644 include/linux/netfilter/nf_conntrack_ftp.h create mode 100644 include/linux/netfilter/nf_conntrack_sctp.h create mode 100644 include/linux/netfilter/nf_conntrack_tcp.h create mode 100644 include/linux/netfilter/nf_conntrack_tuple_common.h create mode 100644 include/net/netfilter/ipv4/nf_conntrack_icmp.h create mode 100644 include/net/netfilter/ipv4/nf_conntrack_ipv4.h create mode 100644 include/net/netfilter/ipv6/nf_conntrack_icmpv6.h create mode 100644 include/net/netfilter/nf_conntrack.h create mode 100644 include/net/netfilter/nf_conntrack_compat.h create mode 100644 include/net/netfilter/nf_conntrack_core.h create mode 100644 include/net/netfilter/nf_conntrack_helper.h create mode 100644 include/net/netfilter/nf_conntrack_l3proto.h create mode 100644 include/net/netfilter/nf_conntrack_protocol.h create mode 100644 include/net/netfilter/nf_conntrack_tuple.h create mode 100644 net/ipv4/netfilter/nf_conntrack_l3proto_ipv4.c create mode 100644 net/ipv4/netfilter/nf_conntrack_proto_icmp.c create mode 100644 net/ipv6/netfilter/nf_conntrack_l3proto_ipv6.c create mode 100644 net/ipv6/netfilter/nf_conntrack_proto_icmpv6.c create mode 100644 net/ipv6/netfilter/nf_conntrack_reasm.c create mode 100644 net/netfilter/nf_conntrack_core.c create mode 100644 net/netfilter/nf_conntrack_ftp.c create mode 100644 net/netfilter/nf_conntrack_l3proto_generic.c create mode 100644 net/netfilter/nf_conntrack_proto_generic.c create mode 100644 net/netfilter/nf_conntrack_proto_sctp.c create mode 100644 net/netfilter/nf_conntrack_proto_tcp.c create mode 100644 net/netfilter/nf_conntrack_proto_udp.c create mode 100644 net/netfilter/nf_conntrack_standalone.c (limited to 'net') diff --git a/include/linux/netfilter/nf_conntrack_common.h b/include/linux/netfilter/nf_conntrack_common.h new file mode 100644 index 000000000000..6d39b518486b --- /dev/null +++ b/include/linux/netfilter/nf_conntrack_common.h @@ -0,0 +1,159 @@ +#ifndef _NF_CONNTRACK_COMMON_H +#define _NF_CONNTRACK_COMMON_H +/* Connection state tracking for netfilter. This is separated from, + but required by, the NAT layer; it can also be used by an iptables + extension. */ +enum ip_conntrack_info +{ + /* Part of an established connection (either direction). */ + IP_CT_ESTABLISHED, + + /* Like NEW, but related to an existing connection, or ICMP error + (in either direction). */ + IP_CT_RELATED, + + /* Started a new connection to track (only + IP_CT_DIR_ORIGINAL); may be a retransmission. */ + IP_CT_NEW, + + /* >= this indicates reply direction */ + IP_CT_IS_REPLY, + + /* Number of distinct IP_CT types (no NEW in reply dirn). */ + IP_CT_NUMBER = IP_CT_IS_REPLY * 2 - 1 +}; + +/* Bitset representing status of connection. */ +enum ip_conntrack_status { + /* It's an expected connection: bit 0 set. This bit never changed */ + IPS_EXPECTED_BIT = 0, + IPS_EXPECTED = (1 << IPS_EXPECTED_BIT), + + /* We've seen packets both ways: bit 1 set. Can be set, not unset. */ + IPS_SEEN_REPLY_BIT = 1, + IPS_SEEN_REPLY = (1 << IPS_SEEN_REPLY_BIT), + + /* Conntrack should never be early-expired. */ + IPS_ASSURED_BIT = 2, + IPS_ASSURED = (1 << IPS_ASSURED_BIT), + + /* Connection is confirmed: originating packet has left box */ + IPS_CONFIRMED_BIT = 3, + IPS_CONFIRMED = (1 << IPS_CONFIRMED_BIT), + + /* Connection needs src nat in orig dir. This bit never changed. */ + IPS_SRC_NAT_BIT = 4, + IPS_SRC_NAT = (1 << IPS_SRC_NAT_BIT), + + /* Connection needs dst nat in orig dir. This bit never changed. */ + IPS_DST_NAT_BIT = 5, + IPS_DST_NAT = (1 << IPS_DST_NAT_BIT), + + /* Both together. */ + IPS_NAT_MASK = (IPS_DST_NAT | IPS_SRC_NAT), + + /* Connection needs TCP sequence adjusted. */ + IPS_SEQ_ADJUST_BIT = 6, + IPS_SEQ_ADJUST = (1 << IPS_SEQ_ADJUST_BIT), + + /* NAT initialization bits. */ + IPS_SRC_NAT_DONE_BIT = 7, + IPS_SRC_NAT_DONE = (1 << IPS_SRC_NAT_DONE_BIT), + + IPS_DST_NAT_DONE_BIT = 8, + IPS_DST_NAT_DONE = (1 << IPS_DST_NAT_DONE_BIT), + + /* Both together */ + IPS_NAT_DONE_MASK = (IPS_DST_NAT_DONE | IPS_SRC_NAT_DONE), + + /* Connection is dying (removed from lists), can not be unset. */ + IPS_DYING_BIT = 9, + IPS_DYING = (1 << IPS_DYING_BIT), +}; + +/* Connection tracking event bits */ +enum ip_conntrack_events +{ + /* New conntrack */ + IPCT_NEW_BIT = 0, + IPCT_NEW = (1 << IPCT_NEW_BIT), + + /* Expected connection */ + IPCT_RELATED_BIT = 1, + IPCT_RELATED = (1 << IPCT_RELATED_BIT), + + /* Destroyed conntrack */ + IPCT_DESTROY_BIT = 2, + IPCT_DESTROY = (1 << IPCT_DESTROY_BIT), + + /* Timer has been refreshed */ + IPCT_REFRESH_BIT = 3, + IPCT_REFRESH = (1 << IPCT_REFRESH_BIT), + + /* Status has changed */ + IPCT_STATUS_BIT = 4, + IPCT_STATUS = (1 << IPCT_STATUS_BIT), + + /* Update of protocol info */ + IPCT_PROTOINFO_BIT = 5, + IPCT_PROTOINFO = (1 << IPCT_PROTOINFO_BIT), + + /* Volatile protocol info */ + IPCT_PROTOINFO_VOLATILE_BIT = 6, + IPCT_PROTOINFO_VOLATILE = (1 << IPCT_PROTOINFO_VOLATILE_BIT), + + /* New helper for conntrack */ + IPCT_HELPER_BIT = 7, + IPCT_HELPER = (1 << IPCT_HELPER_BIT), + + /* Update of helper info */ + IPCT_HELPINFO_BIT = 8, + IPCT_HELPINFO = (1 << IPCT_HELPINFO_BIT), + + /* Volatile helper info */ + IPCT_HELPINFO_VOLATILE_BIT = 9, + IPCT_HELPINFO_VOLATILE = (1 << IPCT_HELPINFO_VOLATILE_BIT), + + /* NAT info */ + IPCT_NATINFO_BIT = 10, + IPCT_NATINFO = (1 << IPCT_NATINFO_BIT), + + /* Counter highest bit has been set */ + IPCT_COUNTER_FILLING_BIT = 11, + IPCT_COUNTER_FILLING = (1 << IPCT_COUNTER_FILLING_BIT), +}; + +enum ip_conntrack_expect_events { + IPEXP_NEW_BIT = 0, + IPEXP_NEW = (1 << IPEXP_NEW_BIT), +}; + +#ifdef __KERNEL__ +struct ip_conntrack_counter +{ + u_int32_t packets; + u_int32_t bytes; +}; + +struct ip_conntrack_stat +{ + unsigned int searched; + unsigned int found; + unsigned int new; + unsigned int invalid; + unsigned int ignore; + unsigned int delete; + unsigned int delete_list; + unsigned int insert; + unsigned int insert_failed; + unsigned int drop; + unsigned int early_drop; + unsigned int error; + unsigned int expect_new; + unsigned int expect_create; + unsigned int expect_delete; +}; + +#endif /* __KERNEL__ */ + +#endif /* _NF_CONNTRACK_COMMON_H */ diff --git a/include/linux/netfilter/nf_conntrack_ftp.h b/include/linux/netfilter/nf_conntrack_ftp.h new file mode 100644 index 000000000000..ad4a41c9ce93 --- /dev/null +++ b/include/linux/netfilter/nf_conntrack_ftp.h @@ -0,0 +1,44 @@ +#ifndef _NF_CONNTRACK_FTP_H +#define _NF_CONNTRACK_FTP_H +/* FTP tracking. */ + +/* This enum is exposed to userspace */ +enum ip_ct_ftp_type +{ + /* PORT command from client */ + IP_CT_FTP_PORT, + /* PASV response from server */ + IP_CT_FTP_PASV, + /* EPRT command from client */ + IP_CT_FTP_EPRT, + /* EPSV response from server */ + IP_CT_FTP_EPSV, +}; + +#ifdef __KERNEL__ + +#define FTP_PORT 21 + +#define NUM_SEQ_TO_REMEMBER 2 +/* This structure exists only once per master */ +struct ip_ct_ftp_master { + /* Valid seq positions for cmd matching after newline */ + u_int32_t seq_aft_nl[IP_CT_DIR_MAX][NUM_SEQ_TO_REMEMBER]; + /* 0 means seq_match_aft_nl not set */ + int seq_aft_nl_num[IP_CT_DIR_MAX]; +}; + +struct ip_conntrack_expect; + +/* For NAT to hook in when we find a packet which describes what other + * connection we should expect. */ +extern unsigned int (*ip_nat_ftp_hook)(struct sk_buff **pskb, + enum ip_conntrack_info ctinfo, + enum ip_ct_ftp_type type, + unsigned int matchoff, + unsigned int matchlen, + struct ip_conntrack_expect *exp, + u32 *seq); +#endif /* __KERNEL__ */ + +#endif /* _NF_CONNTRACK_FTP_H */ diff --git a/include/linux/netfilter/nf_conntrack_sctp.h b/include/linux/netfilter/nf_conntrack_sctp.h new file mode 100644 index 000000000000..b8994d9fd1a9 --- /dev/null +++ b/include/linux/netfilter/nf_conntrack_sctp.h @@ -0,0 +1,27 @@ +#ifndef _NF_CONNTRACK_SCTP_H +#define _NF_CONNTRACK_SCTP_H +/* SCTP tracking. */ + +#include + +enum sctp_conntrack { + SCTP_CONNTRACK_NONE, + SCTP_CONNTRACK_CLOSED, + SCTP_CONNTRACK_COOKIE_WAIT, + SCTP_CONNTRACK_COOKIE_ECHOED, + SCTP_CONNTRACK_ESTABLISHED, + SCTP_CONNTRACK_SHUTDOWN_SENT, + SCTP_CONNTRACK_SHUTDOWN_RECD, + SCTP_CONNTRACK_SHUTDOWN_ACK_SENT, + SCTP_CONNTRACK_MAX +}; + +struct ip_ct_sctp +{ + enum sctp_conntrack state; + + u_int32_t vtag[IP_CT_DIR_MAX]; + u_int32_t ttag[IP_CT_DIR_MAX]; +}; + +#endif /* _NF_CONNTRACK_SCTP_H */ diff --git a/include/linux/netfilter/nf_conntrack_tcp.h b/include/linux/netfilter/nf_conntrack_tcp.h new file mode 100644 index 000000000000..b2feeffde384 --- /dev/null +++ b/include/linux/netfilter/nf_conntrack_tcp.h @@ -0,0 +1,56 @@ +#ifndef _NF_CONNTRACK_TCP_H +#define _NF_CONNTRACK_TCP_H +/* TCP tracking. */ + +/* This is exposed to userspace (ctnetlink) */ +enum tcp_conntrack { + TCP_CONNTRACK_NONE, + TCP_CONNTRACK_SYN_SENT, + TCP_CONNTRACK_SYN_RECV, + TCP_CONNTRACK_ESTABLISHED, + TCP_CONNTRACK_FIN_WAIT, + TCP_CONNTRACK_CLOSE_WAIT, + TCP_CONNTRACK_LAST_ACK, + TCP_CONNTRACK_TIME_WAIT, + TCP_CONNTRACK_CLOSE, + TCP_CONNTRACK_LISTEN, + TCP_CONNTRACK_MAX, + TCP_CONNTRACK_IGNORE +}; + +/* Window scaling is advertised by the sender */ +#define IP_CT_TCP_FLAG_WINDOW_SCALE 0x01 + +/* SACK is permitted by the sender */ +#define IP_CT_TCP_FLAG_SACK_PERM 0x02 + +/* This sender sent FIN first */ +#define IP_CT_TCP_FLAG_CLOSE_INIT 0x03 + +#ifdef __KERNEL__ + +struct ip_ct_tcp_state { + u_int32_t td_end; /* max of seq + len */ + u_int32_t td_maxend; /* max of ack + max(win, 1) */ + u_int32_t td_maxwin; /* max(win) */ + u_int8_t td_scale; /* window scale factor */ + u_int8_t loose; /* used when connection picked up from the middle */ + u_int8_t flags; /* per direction options */ +}; + +struct ip_ct_tcp +{ + struct ip_ct_tcp_state seen[2]; /* connection parameters per direction */ + u_int8_t state; /* state of the connection (enum tcp_conntrack) */ + /* For detecting stale connections */ + u_int8_t last_dir; /* Direction of the last packet (enum ip_conntrack_dir) */ + u_int8_t retrans; /* Number of retransmitted packets */ + u_int8_t last_index; /* Index of the last packet */ + u_int32_t last_seq; /* Last sequence number seen in dir */ + u_int32_t last_ack; /* Last sequence number seen in opposite dir */ + u_int32_t last_end; /* Last seq + len */ +}; + +#endif /* __KERNEL__ */ + +#endif /* _NF_CONNTRACK_TCP_H */ diff --git a/include/linux/netfilter/nf_conntrack_tuple_common.h b/include/linux/netfilter/nf_conntrack_tuple_common.h new file mode 100644 index 000000000000..8e145f0d61cb --- /dev/null +++ b/include/linux/netfilter/nf_conntrack_tuple_common.h @@ -0,0 +1,13 @@ +#ifndef _NF_CONNTRACK_TUPLE_COMMON_H +#define _NF_CONNTRACK_TUPLE_COMMON_H + +enum ip_conntrack_dir +{ + IP_CT_DIR_ORIGINAL, + IP_CT_DIR_REPLY, + IP_CT_DIR_MAX +}; + +#define CTINFO2DIR(ctinfo) ((ctinfo) >= IP_CT_IS_REPLY ? IP_CT_DIR_REPLY : IP_CT_DIR_ORIGINAL) + +#endif /* _NF_CONNTRACK_TUPLE_COMMON_H */ diff --git a/include/linux/netfilter_ipv4/ip_conntrack.h b/include/linux/netfilter_ipv4/ip_conntrack.h index d078bb91d9e5..b3432ab59a17 100644 --- a/include/linux/netfilter_ipv4/ip_conntrack.h +++ b/include/linux/netfilter_ipv4/ip_conntrack.h @@ -1,132 +1,7 @@ #ifndef _IP_CONNTRACK_H #define _IP_CONNTRACK_H -/* Connection state tracking for netfilter. This is separated from, - but required by, the NAT layer; it can also be used by an iptables - extension. */ -enum ip_conntrack_info -{ - /* Part of an established connection (either direction). */ - IP_CT_ESTABLISHED, - - /* Like NEW, but related to an existing connection, or ICMP error - (in either direction). */ - IP_CT_RELATED, - - /* Started a new connection to track (only - IP_CT_DIR_ORIGINAL); may be a retransmission. */ - IP_CT_NEW, - - /* >= this indicates reply direction */ - IP_CT_IS_REPLY, - - /* Number of distinct IP_CT types (no NEW in reply dirn). */ - IP_CT_NUMBER = IP_CT_IS_REPLY * 2 - 1 -}; - -/* Bitset representing status of connection. */ -enum ip_conntrack_status { - /* It's an expected connection: bit 0 set. This bit never changed */ - IPS_EXPECTED_BIT = 0, - IPS_EXPECTED = (1 << IPS_EXPECTED_BIT), - - /* We've seen packets both ways: bit 1 set. Can be set, not unset. */ - IPS_SEEN_REPLY_BIT = 1, - IPS_SEEN_REPLY = (1 << IPS_SEEN_REPLY_BIT), - - /* Conntrack should never be early-expired. */ - IPS_ASSURED_BIT = 2, - IPS_ASSURED = (1 << IPS_ASSURED_BIT), - - /* Connection is confirmed: originating packet has left box */ - IPS_CONFIRMED_BIT = 3, - IPS_CONFIRMED = (1 << IPS_CONFIRMED_BIT), - - /* Connection needs src nat in orig dir. This bit never changed. */ - IPS_SRC_NAT_BIT = 4, - IPS_SRC_NAT = (1 << IPS_SRC_NAT_BIT), - - /* Connection needs dst nat in orig dir. This bit never changed. */ - IPS_DST_NAT_BIT = 5, - IPS_DST_NAT = (1 << IPS_DST_NAT_BIT), - - /* Both together. */ - IPS_NAT_MASK = (IPS_DST_NAT | IPS_SRC_NAT), - - /* Connection needs TCP sequence adjusted. */ - IPS_SEQ_ADJUST_BIT = 6, - IPS_SEQ_ADJUST = (1 << IPS_SEQ_ADJUST_BIT), - - /* NAT initialization bits. */ - IPS_SRC_NAT_DONE_BIT = 7, - IPS_SRC_NAT_DONE = (1 << IPS_SRC_NAT_DONE_BIT), - - IPS_DST_NAT_DONE_BIT = 8, - IPS_DST_NAT_DONE = (1 << IPS_DST_NAT_DONE_BIT), - - /* Both together */ - IPS_NAT_DONE_MASK = (IPS_DST_NAT_DONE | IPS_SRC_NAT_DONE), - - /* Connection is dying (removed from lists), can not be unset. */ - IPS_DYING_BIT = 9, - IPS_DYING = (1 << IPS_DYING_BIT), -}; - -/* Connection tracking event bits */ -enum ip_conntrack_events -{ - /* New conntrack */ - IPCT_NEW_BIT = 0, - IPCT_NEW = (1 << IPCT_NEW_BIT), - - /* Expected connection */ - IPCT_RELATED_BIT = 1, - IPCT_RELATED = (1 << IPCT_RELATED_BIT), - - /* Destroyed conntrack */ - IPCT_DESTROY_BIT = 2, - IPCT_DESTROY = (1 << IPCT_DESTROY_BIT), - - /* Timer has been refreshed */ - IPCT_REFRESH_BIT = 3, - IPCT_REFRESH = (1 << IPCT_REFRESH_BIT), - - /* Status has changed */ - IPCT_STATUS_BIT = 4, - IPCT_STATUS = (1 << IPCT_STATUS_BIT), - - /* Update of protocol info */ - IPCT_PROTOINFO_BIT = 5, - IPCT_PROTOINFO = (1 << IPCT_PROTOINFO_BIT), - - /* Volatile protocol info */ - IPCT_PROTOINFO_VOLATILE_BIT = 6, - IPCT_PROTOINFO_VOLATILE = (1 << IPCT_PROTOINFO_VOLATILE_BIT), - - /* New helper for conntrack */ - IPCT_HELPER_BIT = 7, - IPCT_HELPER = (1 << IPCT_HELPER_BIT), - - /* Update of helper info */ - IPCT_HELPINFO_BIT = 8, - IPCT_HELPINFO = (1 << IPCT_HELPINFO_BIT), - - /* Volatile helper info */ - IPCT_HELPINFO_VOLATILE_BIT = 9, - IPCT_HELPINFO_VOLATILE = (1 << IPCT_HELPINFO_VOLATILE_BIT), - /* NAT info */ - IPCT_NATINFO_BIT = 10, - IPCT_NATINFO = (1 << IPCT_NATINFO_BIT), - - /* Counter highest bit has been set */ - IPCT_COUNTER_FILLING_BIT = 11, - IPCT_COUNTER_FILLING = (1 << IPCT_COUNTER_FILLING_BIT), -}; - -enum ip_conntrack_expect_events { - IPEXP_NEW_BIT = 0, - IPEXP_NEW = (1 << IPEXP_NEW_BIT), -}; +#include #ifdef __KERNEL__ #include @@ -194,12 +69,6 @@ do { \ #define IP_NF_ASSERT(x) #endif -struct ip_conntrack_counter -{ - u_int32_t packets; - u_int32_t bytes; -}; - struct ip_conntrack_helper; struct ip_conntrack @@ -426,25 +295,6 @@ static inline int is_dying(struct ip_conntrack *ct) extern unsigned int ip_conntrack_htable_size; -struct ip_conntrack_stat -{ - unsigned int searched; - unsigned int found; - unsigned int new; - unsigned int invalid; - unsigned int ignore; - unsigned int delete; - unsigned int delete_list; - unsigned int insert; - unsigned int insert_failed; - unsigned int drop; - unsigned int early_drop; - unsigned int error; - unsigned int expect_new; - unsigned int expect_create; - unsigned int expect_delete; -}; - #define CONNTRACK_STAT_INC(count) (__get_cpu_var(ip_conntrack_stat).count++) #ifdef CONFIG_IP_NF_CONNTRACK_EVENTS diff --git a/include/linux/netfilter_ipv4/ip_conntrack_ftp.h b/include/linux/netfilter_ipv4/ip_conntrack_ftp.h index 5f06429b9047..63811934de4d 100644 --- a/include/linux/netfilter_ipv4/ip_conntrack_ftp.h +++ b/include/linux/netfilter_ipv4/ip_conntrack_ftp.h @@ -1,43 +1,6 @@ #ifndef _IP_CONNTRACK_FTP_H #define _IP_CONNTRACK_FTP_H -/* FTP tracking. */ -#ifdef __KERNEL__ +#include -#define FTP_PORT 21 - -#endif /* __KERNEL__ */ - -enum ip_ct_ftp_type -{ - /* PORT command from client */ - IP_CT_FTP_PORT, - /* PASV response from server */ - IP_CT_FTP_PASV, - /* EPRT command from client */ - IP_CT_FTP_EPRT, - /* EPSV response from server */ - IP_CT_FTP_EPSV, -}; - -#define NUM_SEQ_TO_REMEMBER 2 -/* This structure exists only once per master */ -struct ip_ct_ftp_master { - /* Valid seq positions for cmd matching after newline */ - u_int32_t seq_aft_nl[IP_CT_DIR_MAX][NUM_SEQ_TO_REMEMBER]; - /* 0 means seq_match_aft_nl not set */ - int seq_aft_nl_num[IP_CT_DIR_MAX]; -}; - -struct ip_conntrack_expect; - -/* For NAT to hook in when we find a packet which describes what other - * connection we should expect. */ -extern unsigned int (*ip_nat_ftp_hook)(struct sk_buff **pskb, - enum ip_conntrack_info ctinfo, - enum ip_ct_ftp_type type, - unsigned int matchoff, - unsigned int matchlen, - struct ip_conntrack_expect *exp, - u32 *seq); #endif /* _IP_CONNTRACK_FTP_H */ diff --git a/include/linux/netfilter_ipv4/ip_conntrack_icmp.h b/include/linux/netfilter_ipv4/ip_conntrack_icmp.h index f1664abbe392..eed5ee3e4744 100644 --- a/include/linux/netfilter_ipv4/ip_conntrack_icmp.h +++ b/include/linux/netfilter_ipv4/ip_conntrack_icmp.h @@ -1,11 +1,6 @@ #ifndef _IP_CONNTRACK_ICMP_H #define _IP_CONNTRACK_ICMP_H -/* ICMP tracking. */ -#include -struct ip_ct_icmp -{ - /* Optimization: when number in == number out, forget immediately. */ - atomic_t count; -}; +#include + #endif /* _IP_CONNTRACK_ICMP_H */ diff --git a/include/linux/netfilter_ipv4/ip_conntrack_sctp.h b/include/linux/netfilter_ipv4/ip_conntrack_sctp.h index 7a8d869321f7..4099a041a32a 100644 --- a/include/linux/netfilter_ipv4/ip_conntrack_sctp.h +++ b/include/linux/netfilter_ipv4/ip_conntrack_sctp.h @@ -1,25 +1,6 @@ #ifndef _IP_CONNTRACK_SCTP_H #define _IP_CONNTRACK_SCTP_H -/* SCTP tracking. */ -enum sctp_conntrack { - SCTP_CONNTRACK_NONE, - SCTP_CONNTRACK_CLOSED, - SCTP_CONNTRACK_COOKIE_WAIT, - SCTP_CONNTRACK_COOKIE_ECHOED, - SCTP_CONNTRACK_ESTABLISHED, - SCTP_CONNTRACK_SHUTDOWN_SENT, - SCTP_CONNTRACK_SHUTDOWN_RECD, - SCTP_CONNTRACK_SHUTDOWN_ACK_SENT, - SCTP_CONNTRACK_MAX -}; - -struct ip_ct_sctp -{ - enum sctp_conntrack state; - - u_int32_t vtag[IP_CT_DIR_MAX]; - u_int32_t ttag[IP_CT_DIR_MAX]; -}; +#include #endif /* _IP_CONNTRACK_SCTP_H */ diff --git a/include/linux/netfilter_ipv4/ip_conntrack_tcp.h b/include/linux/netfilter_ipv4/ip_conntrack_tcp.h index 16da044d97a7..876b8fb17e68 100644 --- a/include/linux/netfilter_ipv4/ip_conntrack_tcp.h +++ b/include/linux/netfilter_ipv4/ip_conntrack_tcp.h @@ -1,51 +1,6 @@ #ifndef _IP_CONNTRACK_TCP_H #define _IP_CONNTRACK_TCP_H -/* TCP tracking. */ -enum tcp_conntrack { - TCP_CONNTRACK_NONE, - TCP_CONNTRACK_SYN_SENT, - TCP_CONNTRACK_SYN_RECV, - TCP_CONNTRACK_ESTABLISHED, - TCP_CONNTRACK_FIN_WAIT, - TCP_CONNTRACK_CLOSE_WAIT, - TCP_CONNTRACK_LAST_ACK, - TCP_CONNTRACK_TIME_WAIT, - TCP_CONNTRACK_CLOSE, - TCP_CONNTRACK_LISTEN, - TCP_CONNTRACK_MAX, - TCP_CONNTRACK_IGNORE -}; - -/* Window scaling is advertised by the sender */ -#define IP_CT_TCP_FLAG_WINDOW_SCALE 0x01 - -/* SACK is permitted by the sender */ -#define IP_CT_TCP_FLAG_SACK_PERM 0x02 - -/* This sender sent FIN first */ -#define IP_CT_TCP_FLAG_CLOSE_INIT 0x03 - -struct ip_ct_tcp_state { - u_int32_t td_end; /* max of seq + len */ - u_int32_t td_maxend; /* max of ack + max(win, 1) */ - u_int32_t td_maxwin; /* max(win) */ - u_int8_t td_scale; /* window scale factor */ - u_int8_t loose; /* used when connection picked up from the middle */ - u_int8_t flags; /* per direction options */ -}; - -struct ip_ct_tcp -{ - struct ip_ct_tcp_state seen[2]; /* connection parameters per direction */ - u_int8_t state; /* state of the connection (enum tcp_conntrack) */ - /* For detecting stale connections */ - u_int8_t last_dir; /* Direction of the last packet (enum ip_conntrack_dir) */ - u_int8_t retrans; /* Number of retransmitted packets */ - u_int8_t last_index; /* Index of the last packet */ - u_int32_t last_seq; /* Last sequence number seen in dir */ - u_int32_t last_ack; /* Last sequence number seen in opposite dir */ - u_int32_t last_end; /* Last seq + len */ -}; +#include #endif /* _IP_CONNTRACK_TCP_H */ diff --git a/include/linux/netfilter_ipv4/ip_conntrack_tuple.h b/include/linux/netfilter_ipv4/ip_conntrack_tuple.h index 3232db11a4e5..2fdabdb4c0ef 100644 --- a/include/linux/netfilter_ipv4/ip_conntrack_tuple.h +++ b/include/linux/netfilter_ipv4/ip_conntrack_tuple.h @@ -2,6 +2,7 @@ #define _IP_CONNTRACK_TUPLE_H #include +#include /* A `tuple' is a structure containing the information to uniquely identify a connection. ie. if two packets have the same tuple, they @@ -88,13 +89,6 @@ struct ip_conntrack_tuple (tuple)->dst.u.all = 0; \ } while (0) -enum ip_conntrack_dir -{ - IP_CT_DIR_ORIGINAL, - IP_CT_DIR_REPLY, - IP_CT_DIR_MAX -}; - #ifdef __KERNEL__ #define DUMP_TUPLE(tp) \ @@ -103,8 +97,6 @@ DEBUGP("tuple %p: %u %u.%u.%u.%u:%hu -> %u.%u.%u.%u:%hu\n", \ NIPQUAD((tp)->src.ip), ntohs((tp)->src.u.all), \ NIPQUAD((tp)->dst.ip), ntohs((tp)->dst.u.all)) -#define CTINFO2DIR(ctinfo) ((ctinfo) >= IP_CT_IS_REPLY ? IP_CT_DIR_REPLY : IP_CT_DIR_ORIGINAL) - /* If we're the first tuple, it's the original dir. */ #define DIRECTION(h) ((enum ip_conntrack_dir)(h)->tuple.dst.dir) diff --git a/include/linux/netfilter_ipv6.h b/include/linux/netfilter_ipv6.h index edcc2c6eb5c7..53b2983f6278 100644 --- a/include/linux/netfilter_ipv6.h +++ b/include/linux/netfilter_ipv6.h @@ -59,6 +59,7 @@ enum nf_ip6_hook_priorities { NF_IP6_PRI_FIRST = INT_MIN, + NF_IP6_PRI_CONNTRACK_DEFRAG = -400, NF_IP6_PRI_SELINUX_FIRST = -225, NF_IP6_PRI_CONNTRACK = -200, NF_IP6_PRI_BRIDGE_SABOTAGE_FORWARD = -175, diff --git a/include/linux/skbuff.h b/include/linux/skbuff.h index fdfb8fe8c38c..83010231db99 100644 --- a/include/linux/skbuff.h +++ b/include/linux/skbuff.h @@ -274,6 +274,9 @@ struct sk_buff { #if defined(CONFIG_IP_VS) || defined(CONFIG_IP_VS_MODULE) __u8 ipvs_property:1; #endif +#if defined(CONFIG_NF_CONNTRACK) || defined(CONFIG_NF_CONNTRACK_MODULE) + struct sk_buff *nfct_reasm; +#endif #ifdef CONFIG_BRIDGE_NETFILTER struct nf_bridge_info *nf_bridge; #endif @@ -1313,10 +1316,26 @@ static inline void nf_conntrack_get(struct nf_conntrack *nfct) if (nfct) atomic_inc(&nfct->use); } +#if defined(CONFIG_NF_CONNTRACK) || defined(CONFIG_NF_CONNTRACK_MODULE) +static inline void nf_conntrack_get_reasm(struct sk_buff *skb) +{ + if (skb) + atomic_inc(&skb->users); +} +static inline void nf_conntrack_put_reasm(struct sk_buff *skb) +{ + if (skb) + kfree_skb(skb); +} +#endif static inline void nf_reset(struct sk_buff *skb) { nf_conntrack_put(skb->nfct); skb->nfct = NULL; +#if defined(CONFIG_NF_CONNTRACK) || defined(CONFIG_NF_CONNTRACK_MODULE) + nf_conntrack_put_reasm(skb->nfct_reasm); + skb->nfct_reasm = NULL; +#endif } #ifdef CONFIG_BRIDGE_NETFILTER diff --git a/include/linux/sysctl.h b/include/linux/sysctl.h index fc131d6602b9..22cf5e1ac987 100644 --- a/include/linux/sysctl.h +++ b/include/linux/sysctl.h @@ -205,6 +205,7 @@ enum NET_ECONET=16, NET_SCTP=17, NET_LLC=18, + NET_NETFILTER=19, }; /* /proc/sys/kernel/random */ @@ -270,6 +271,42 @@ enum NET_UNIX_MAX_DGRAM_QLEN=3, }; +/* /proc/sys/net/netfilter */ +enum +{ + NET_NF_CONNTRACK_MAX=1, + NET_NF_CONNTRACK_TCP_TIMEOUT_SYN_SENT=2, + NET_NF_CONNTRACK_TCP_TIMEOUT_SYN_RECV=3, + NET_NF_CONNTRACK_TCP_TIMEOUT_ESTABLISHED=4, + NET_NF_CONNTRACK_TCP_TIMEOUT_FIN_WAIT=5, + NET_NF_CONNTRACK_TCP_TIMEOUT_CLOSE_WAIT=6, + NET_NF_CONNTRACK_TCP_TIMEOUT_LAST_ACK=7, + NET_NF_CONNTRACK_TCP_TIMEOUT_TIME_WAIT=8, + NET_NF_CONNTRACK_TCP_TIMEOUT_CLOSE=9, + NET_NF_CONNTRACK_UDP_TIMEOUT=10, + NET_NF_CONNTRACK_UDP_TIMEOUT_STREAM=11, + NET_NF_CONNTRACK_ICMP_TIMEOUT=12, + NET_NF_CONNTRACK_GENERIC_TIMEOUT=13, + NET_NF_CONNTRACK_BUCKETS=14, + NET_NF_CONNTRACK_LOG_INVALID=15, + NET_NF_CONNTRACK_TCP_TIMEOUT_MAX_RETRANS=16, + NET_NF_CONNTRACK_TCP_LOOSE=17, + NET_NF_CONNTRACK_TCP_BE_LIBERAL=18, + NET_NF_CONNTRACK_TCP_MAX_RETRANS=19, + NET_NF_CONNTRACK_SCTP_TIMEOUT_CLOSED=20, + NET_NF_CONNTRACK_SCTP_TIMEOUT_COOKIE_WAIT=21, + NET_NF_CONNTRACK_SCTP_TIMEOUT_COOKIE_ECHOED=22, + NET_NF_CONNTRACK_SCTP_TIMEOUT_ESTABLISHED=23, + NET_NF_CONNTRACK_SCTP_TIMEOUT_SHUTDOWN_SENT=24, + NET_NF_CONNTRACK_SCTP_TIMEOUT_SHUTDOWN_RECD=25, + NET_NF_CONNTRACK_SCTP_TIMEOUT_SHUTDOWN_ACK_SENT=26, + NET_NF_CONNTRACK_COUNT=27, + NET_NF_CONNTRACK_ICMPV6_TIMEOUT=28, + NET_NF_CONNTRACK_FRAG6_TIMEOUT=29, + NET_NF_CONNTRACK_FRAG6_LOW_THRESH=30, + NET_NF_CONNTRACK_FRAG6_HIGH_THRESH=31, +}; + /* /proc/sys/net/ipv4 */ enum { diff --git a/include/net/netfilter/ipv4/nf_conntrack_icmp.h b/include/net/netfilter/ipv4/nf_conntrack_icmp.h new file mode 100644 index 000000000000..3dd22cff23ec --- /dev/null +++ b/include/net/netfilter/ipv4/nf_conntrack_icmp.h @@ -0,0 +1,11 @@ +#ifndef _NF_CONNTRACK_ICMP_H +#define _NF_CONNTRACK_ICMP_H +/* ICMP tracking. */ +#include + +struct ip_ct_icmp +{ + /* Optimization: when number in == number out, forget immediately. */ + atomic_t count; +}; +#endif /* _NF_CONNTRACK_ICMP_H */ diff --git a/include/net/netfilter/ipv4/nf_conntrack_ipv4.h b/include/net/netfilter/ipv4/nf_conntrack_ipv4.h new file mode 100644 index 000000000000..25b081a730e6 --- /dev/null +++ b/include/net/netfilter/ipv4/nf_conntrack_ipv4.h @@ -0,0 +1,43 @@ +/* + * IPv4 support for nf_conntrack. + * + * 23 Mar 2004: Yasuyuki Kozakai @ USAGI + * - move L3 protocol dependent part from include/linux/netfilter_ipv4/ + * ip_conntarck.h + */ + +#ifndef _NF_CONNTRACK_IPV4_H +#define _NF_CONNTRACK_IPV4_H + +#ifdef CONFIG_IP_NF_NAT_NEEDED +#include + +/* per conntrack: nat application helper private data */ +union ip_conntrack_nat_help { + /* insert nat helper private data here */ +}; + +struct nf_conntrack_ipv4_nat { + struct ip_nat_info info; + union ip_conntrack_nat_help help; +#if defined(CONFIG_IP_NF_TARGET_MASQUERADE) || \ + defined(CONFIG_IP_NF_TARGET_MASQUERADE_MODULE) + int masq_index; +#endif +}; +#endif /* CONFIG_IP_NF_NAT_NEEDED */ + +struct nf_conntrack_ipv4 { +#ifdef CONFIG_IP_NF_NAT_NEEDED + struct nf_conntrack_ipv4_nat *nat; +#endif +}; + +/* Returns new sk_buff, or NULL */ +struct sk_buff * +nf_ct_ipv4_ct_gather_frags(struct sk_buff *skb); + +/* call to create an explicit dependency on nf_conntrack_l3proto_ipv4. */ +extern void need_ip_conntrack(void); + +#endif /*_NF_CONNTRACK_IPV4_H*/ diff --git a/include/net/netfilter/ipv6/nf_conntrack_icmpv6.h b/include/net/netfilter/ipv6/nf_conntrack_icmpv6.h new file mode 100644 index 000000000000..86591afda29c --- /dev/null +++ b/include/net/netfilter/ipv6/nf_conntrack_icmpv6.h @@ -0,0 +1,27 @@ +/* + * ICMPv6 tracking. + * + * 21 Apl 2004: Yasuyuki Kozakai @USAGI + * - separated from nf_conntrack_icmp.h + * + * Derived from include/linux/netfiter_ipv4/ip_conntrack_icmp.h + */ + +#ifndef _NF_CONNTRACK_ICMPV6_H +#define _NF_CONNTRACK_ICMPV6_H +#include + +#ifndef ICMPV6_NI_QUERY +#define ICMPV6_NI_QUERY 139 +#endif +#ifndef ICMPV6_NI_REPLY +#define ICMPV6_NI_REPLY 140 +#endif + +struct nf_ct_icmpv6 +{ + /* Optimization: when number in == number out, forget immediately. */ + atomic_t count; +}; + +#endif /* _NF_CONNTRACK_ICMPV6_H */ diff --git a/include/net/netfilter/nf_conntrack.h b/include/net/netfilter/nf_conntrack.h new file mode 100644 index 000000000000..cc4825610795 --- /dev/null +++ b/include/net/netfilter/nf_conntrack.h @@ -0,0 +1,354 @@ +/* + * Connection state tracking for netfilter. This is separated from, + * but required by, the (future) NAT layer; it can also be used by an iptables + * extension. + * + * 16 Dec 2003: Yasuyuki Kozakai @USAGI + * - generalize L3 protocol dependent part. + * + * Derived from include/linux/netfiter_ipv4/ip_conntrack.h + */ + +#ifndef _NF_CONNTRACK_H +#define _NF_CONNTRACK_H + +#include + +#ifdef __KERNEL__ +#include +#include +#include +#include + +#include +#include +#include +#include + +#include + +/* per conntrack: protocol private data */ +union nf_conntrack_proto { + /* insert conntrack proto private data here */ + struct ip_ct_sctp sctp; + struct ip_ct_tcp tcp; + struct ip_ct_icmp icmp; + struct nf_ct_icmpv6 icmpv6; +}; + +union nf_conntrack_expect_proto { + /* insert expect proto private data here */ +}; + +/* Add protocol helper include file here */ +#include + +/* per conntrack: application helper private data */ +union nf_conntrack_help { + /* insert conntrack helper private data (master) here */ + struct ip_ct_ftp_master ct_ftp_info; +}; + +#include +#include + +#ifdef CONFIG_NETFILTER_DEBUG +#define NF_CT_ASSERT(x) \ +do { \ + if (!(x)) \ + /* Wooah! I'm tripping my conntrack in a frenzy of \ + netplay... */ \ + printk("NF_CT_ASSERT: %s:%i(%s)\n", \ + __FILE__, __LINE__, __FUNCTION__); \ +} while(0) +#else +#define NF_CT_ASSERT(x) +#endif + +struct nf_conntrack_helper; + +#include +struct nf_conn +{ + /* Usage count in here is 1 for hash table/destruct timer, 1 per skb, + plus 1 for any connection(s) we are `master' for */ + struct nf_conntrack ct_general; + + /* XXX should I move this to the tail ? - Y.K */ + /* These are my tuples; original and reply */ + struct nf_conntrack_tuple_hash tuplehash[IP_CT_DIR_MAX]; + + /* Have we seen traffic both ways yet? (bitset) */ + unsigned long status; + + /* Timer function; drops refcnt when it goes off. */ + struct timer_list timeout; + +#ifdef CONFIG_NF_CT_ACCT + /* Accounting Information (same cache line as other written members) */ + struct ip_conntrack_counter counters[IP_CT_DIR_MAX]; +#endif + /* If we were expected by an expectation, this will be it */ + struct nf_conn *master; + + /* Current number of expected connections */ + unsigned int expecting; + + /* Helper. if any */ + struct nf_conntrack_helper *helper; + + /* features - nat, helper, ... used by allocating system */ + u_int32_t features; + + /* Storage reserved for other modules: */ + + union nf_conntrack_proto proto; + +#if defined(CONFIG_NF_CONNTRACK_MARK) + u_int32_t mark; +#endif + + /* These members are dynamically allocated. */ + + union nf_conntrack_help *help; + + /* Layer 3 dependent members. (ex: NAT) */ + union { + struct nf_conntrack_ipv4 *ipv4; + } l3proto; + void *data[0]; +}; + +struct nf_conntrack_expect +{ + /* Internal linked list (global expectation list) */ + struct list_head list; + + /* We expect this tuple, with the following mask */ + struct nf_conntrack_tuple tuple, mask; + + /* Function to call after setup and insertion */ + void (*expectfn)(struct nf_conn *new, + struct nf_conntrack_expect *this); + + /* The conntrack of the master connection */ + struct nf_conn *master; + + /* Timer function; deletes the expectation. */ + struct timer_list timeout; + + /* Usage count. */ + atomic_t use; + + /* Flags */ + unsigned int flags; + +#ifdef CONFIG_NF_NAT_NEEDED + /* This is the original per-proto part, used to map the + * expected connection the way the recipient expects. */ + union nf_conntrack_manip_proto saved_proto; + /* Direction relative to the master connection. */ + enum ip_conntrack_dir dir; +#endif +}; + +#define NF_CT_EXPECT_PERMANENT 0x1 + +static inline struct nf_conn * +nf_ct_tuplehash_to_ctrack(const struct nf_conntrack_tuple_hash *hash) +{ + return container_of(hash, struct nf_conn, + tuplehash[hash->tuple.dst.dir]); +} + +/* get master conntrack via master expectation */ +#define master_ct(conntr) (conntr->master) + +/* Alter reply tuple (maybe alter helper). */ +extern void +nf_conntrack_alter_reply(struct nf_conn *conntrack, + const struct nf_conntrack_tuple *newreply); + +/* Is this tuple taken? (ignoring any belonging to the given + conntrack). */ +extern int +nf_conntrack_tuple_taken(const struct nf_conntrack_tuple *tuple, + const struct nf_conn *ignored_conntrack); + +/* Return conntrack_info and tuple hash for given skb. */ +static inline struct nf_conn * +nf_ct_get(const struct sk_buff *skb, enum ip_conntrack_info *ctinfo) +{ + *ctinfo = skb->nfctinfo; + return (struct nf_conn *)skb->nfct; +} + +/* decrement reference count on a conntrack */ +static inline void nf_ct_put(struct nf_conn *ct) +{ + NF_CT_ASSERT(ct); + nf_conntrack_put(&ct->ct_general); +} + +/* call to create an explicit dependency on nf_conntrack. */ +extern void need_nf_conntrack(void); + +extern int nf_ct_invert_tuplepr(struct nf_conntrack_tuple *inverse, + const struct nf_conntrack_tuple *orig); + +extern void __nf_ct_refresh_acct(struct nf_conn *ct, + enum ip_conntrack_info ctinfo, + const struct sk_buff *skb, + unsigned long extra_jiffies, + int do_acct); + +/* Refresh conntrack for this many jiffies and do accounting */ +static inline void nf_ct_refresh_acct(struct nf_conn *ct, + enum ip_conntrack_info ctinfo, + const struct sk_buff *skb, + unsigned long extra_jiffies) +{ + __nf_ct_refresh_acct(ct, ctinfo, skb, extra_jiffies, 1); +} + +/* Refresh conntrack for this many jiffies */ +static inline void nf_ct_refresh(struct nf_conn *ct, + const struct sk_buff *skb, + unsigned long extra_jiffies) +{ + __nf_ct_refresh_acct(ct, 0, skb, extra_jiffies, 0); +} + +/* These are for NAT. Icky. */ +/* Update TCP window tracking data when NAT mangles the packet */ +extern void nf_conntrack_tcp_update(struct sk_buff *skb, + unsigned int dataoff, + struct nf_conn *conntrack, + int dir); + +/* Call me when a conntrack is destroyed. */ +extern void (*nf_conntrack_destroyed)(struct nf_conn *conntrack); + +/* Fake conntrack entry for untracked connections */ +extern struct nf_conn nf_conntrack_untracked; + +extern int nf_ct_no_defrag; + +/* Iterate over all conntracks: if iter returns true, it's deleted. */ +extern void +nf_ct_iterate_cleanup(int (*iter)(struct nf_conn *i, void *data), void *data); +extern void nf_conntrack_free(struct nf_conn *ct); +extern struct nf_conn * +nf_conntrack_alloc(const struct nf_conntrack_tuple *orig, + const struct nf_conntrack_tuple *repl); + +/* It's confirmed if it is, or has been in the hash table. */ +static inline int nf_ct_is_confirmed(struct nf_conn *ct) +{ + return test_bit(IPS_CONFIRMED_BIT, &ct->status); +} + +static inline int nf_ct_is_dying(struct nf_conn *ct) +{ + return test_bit(IPS_DYING_BIT, &ct->status); +} + +extern unsigned int nf_conntrack_htable_size; + +#define NF_CT_STAT_INC(count) (__get_cpu_var(nf_conntrack_stat).count++) + +#ifdef CONFIG_NF_CONNTRACK_EVENTS +#include +#include + +struct nf_conntrack_ecache { + struct nf_conn *ct; + unsigned int events; +}; +DECLARE_PER_CPU(struct nf_conntrack_ecache, nf_conntrack_ecache); + +#define CONNTRACK_ECACHE(x) (__get_cpu_var(nf_conntrack_ecache).x) + +extern struct notifier_block *nf_conntrack_chain; +extern struct notifier_block *nf_conntrack_expect_chain; + +static inline int nf_conntrack_register_notifier(struct notifier_block *nb) +{ + return notifier_chain_register(&nf_conntrack_chain, nb); +} + +static inline int nf_conntrack_unregister_notifier(struct notifier_block *nb) +{ + return notifier_chain_unregister(&nf_conntrack_chain, nb); +} + +static inline int +nf_conntrack_expect_register_notifier(struct notifier_block *nb) +{ + return notifier_chain_register(&nf_conntrack_expect_chain, nb); +} + +static inline int +nf_conntrack_expect_unregister_notifier(struct notifier_block *nb) +{ + return notifier_chain_unregister(&nf_conntrack_expect_chain, nb); +} + +extern void nf_ct_deliver_cached_events(const struct nf_conn *ct); +extern void __nf_ct_event_cache_init(struct nf_conn *ct); + +static inline void +nf_conntrack_event_cache(enum ip_conntrack_events event, + const struct sk_buff *skb) +{ + struct nf_conn *ct = (struct nf_conn *)skb->nfct; + struct nf_conntrack_ecache *ecache; + + local_bh_disable(); + ecache = &__get_cpu_var(nf_conntrack_ecache); + if (ct != ecache->ct) + __nf_ct_event_cache_init(ct); + ecache->events |= event; + local_bh_enable(); +} + +static inline void nf_conntrack_event(enum ip_conntrack_events event, + struct nf_conn *ct) +{ + if (nf_ct_is_confirmed(ct) && !nf_ct_is_dying(ct)) + notifier_call_chain(&nf_conntrack_chain, event, ct); +} + +static inline void +nf_conntrack_expect_event(enum ip_conntrack_expect_events event, + struct nf_conntrack_expect *exp) +{ + notifier_call_chain(&nf_conntrack_expect_chain, event, exp); +} +#else /* CONFIG_NF_CONNTRACK_EVENTS */ +static inline void nf_conntrack_event_cache(enum ip_conntrack_events event, + const struct sk_buff *skb) {} +static inline void nf_conntrack_event(enum ip_conntrack_events event, + struct nf_conn *ct) {} +static inline void nf_ct_deliver_cached_events(const struct nf_conn *ct) {} +static inline void +nf_conntrack_expect_event(enum ip_conntrack_expect_events event, + struct nf_conntrack_expect *exp) {} +#endif /* CONFIG_NF_CONNTRACK_EVENTS */ + +/* no helper, no nat */ +#define NF_CT_F_BASIC 0 +/* for helper */ +#define NF_CT_F_HELP 1 +/* for nat. */ +#define NF_CT_F_NAT 2 +#define NF_CT_F_NUM 4 + +extern int +nf_conntrack_register_cache(u_int32_t features, const char *name, size_t size, + int (*init_conntrack)(struct nf_conn *, u_int32_t)); +extern void +nf_conntrack_unregister_cache(u_int32_t features); + +#endif /* __KERNEL__ */ +#endif /* _NF_CONNTRACK_H */ diff --git a/include/net/netfilter/nf_conntrack_compat.h b/include/net/netfilter/nf_conntrack_compat.h new file mode 100644 index 000000000000..3cac19fb3648 --- /dev/null +++ b/include/net/netfilter/nf_conntrack_compat.h @@ -0,0 +1,108 @@ +#ifndef _NF_CONNTRACK_COMPAT_H +#define _NF_CONNTRACK_COMPAT_H + +#ifdef __KERNEL__ + +#if defined(CONFIG_IP_NF_CONNTRACK) || defined(CONFIG_IP_NF_CONNTRACK_MODULE) + +#include + +#ifdef CONFIG_IP_NF_CONNTRACK_MARK +static inline u_int32_t *nf_ct_get_mark(const struct sk_buff *skb, + u_int32_t *ctinfo) +{ + struct ip_conntrack *ct = ip_conntrack_get(skb, ctinfo); + + if (ct) + return &ct->mark; + else + return NULL; +} +#endif /* CONFIG_IP_NF_CONNTRACK_MARK */ + +#ifdef CONFIG_IP_NF_CT_ACCT +static inline struct ip_conntrack_counter * +nf_ct_get_counters(const struct sk_buff *skb) +{ + enum ip_conntrack_info ctinfo; + struct ip_conntrack *ct = ip_conntrack_get(skb, &ctinfo); + + if (ct) + return ct->counters; + else + return NULL; +} +#endif /* CONFIG_IP_NF_CT_ACCT */ + +static inline int nf_ct_is_untracked(const struct sk_buff *skb) +{ + return (skb->nfct == &ip_conntrack_untracked.ct_general); +} + +static inline void nf_ct_untrack(struct sk_buff *skb) +{ + skb->nfct = &ip_conntrack_untracked.ct_general; +} + +static inline int nf_ct_get_ctinfo(const struct sk_buff *skb, + enum ip_conntrack_info *ctinfo) +{ + struct ip_conntrack *ct = ip_conntrack_get(skb, ctinfo); + return (ct != NULL); +} + +#else /* CONFIG_IP_NF_CONNTRACK */ + +#include +#include + +#ifdef CONFIG_NF_CONNTRACK_MARK + +static inline u_int32_t *nf_ct_get_mark(const struct sk_buff *skb, + u_int32_t *ctinfo) +{ + struct nf_conn *ct = nf_ct_get(skb, ctinfo); + + if (ct) + return &ct->mark; + else + return NULL; +} +#endif /* CONFIG_NF_CONNTRACK_MARK */ + +#ifdef CONFIG_NF_CT_ACCT +static inline struct ip_conntrack_counter * +nf_ct_get_counters(const struct sk_buff *skb) +{ + enum ip_conntrack_info ctinfo; + struct nf_conn *ct = nf_ct_get(skb, &ctinfo); + + if (ct) + return ct->counters; + else + return NULL; +} +#endif /* CONFIG_NF_CT_ACCT */ + +static inline int nf_ct_is_untracked(const struct sk_buff *skb) +{ + return (skb->nfct == &nf_conntrack_untracked.ct_general); +} + +static inline void nf_ct_untrack(struct sk_buff *skb) +{ + skb->nfct = &nf_conntrack_untracked.ct_general; +} + +static inline int nf_ct_get_ctinfo(const struct sk_buff *skb, + enum ip_conntrack_info *ctinfo) +{ + struct nf_conn *ct = nf_ct_get(skb, ctinfo); + return (ct != NULL); +} + +#endif /* CONFIG_IP_NF_CONNTRACK */ + +#endif /* __KERNEL__ */ + +#endif /* _NF_CONNTRACK_COMPAT_H */ diff --git a/include/net/netfilter/nf_conntrack_core.h b/include/net/netfilter/nf_conntrack_core.h new file mode 100644 index 000000000000..da254525a4ce --- /dev/null +++ b/include/net/netfilter/nf_conntrack_core.h @@ -0,0 +1,76 @@ +/* + * This header is used to share core functionality between the + * standalone connection tracking module, and the compatibility layer's use + * of connection tracking. + * + * 16 Dec 2003: Yasuyuki Kozakai @USAGI + * - generalize L3 protocol dependent part. + * + * Derived from include/linux/netfiter_ipv4/ip_conntrack_core.h + */ + +#ifndef _NF_CONNTRACK_CORE_H +#define _NF_CONNTRACK_CORE_H + +#include + +/* This header is used to share core functionality between the + standalone connection tracking module, and the compatibility layer's use + of connection tracking. */ +extern unsigned int nf_conntrack_in(int pf, + unsigned int hooknum, + struct sk_buff **pskb); + +extern int nf_conntrack_init(void); +extern void nf_conntrack_cleanup(void); + +struct nf_conntrack_l3proto; +extern struct nf_conntrack_l3proto *nf_ct_find_l3proto(u_int16_t pf); +/* Like above, but you already have conntrack read lock. */ +extern struct nf_conntrack_l3proto *__nf_ct_find_l3proto(u_int16_t l3proto); + +struct nf_conntrack_protocol; + +extern int +nf_ct_get_tuple(const struct sk_buff *skb, + unsigned int nhoff, + unsigned int dataoff, + u_int16_t l3num, + u_int8_t protonum, + struct nf_conntrack_tuple *tuple, + const struct nf_conntrack_l3proto *l3proto, + const struct nf_conntrack_protocol *protocol); + +extern int +nf_ct_invert_tuple(struct nf_conntrack_tuple *inverse, + const struct nf_conntrack_tuple *orig, + const struct nf_conntrack_l3proto *l3proto, + const struct nf_conntrack_protocol *protocol); + +/* Find a connection corresponding to a tuple. */ +extern struct nf_conntrack_tuple_hash * +nf_conntrack_find_get(const struct nf_conntrack_tuple *tuple, + const struct nf_conn *ignored_conntrack); + +extern int __nf_conntrack_confirm(struct sk_buff **pskb); + +/* Confirm a connection: returns NF_DROP if packet must be dropped. */ +static inline int nf_conntrack_confirm(struct sk_buff **pskb) +{ + struct nf_conn *ct = (struct nf_conn *)(*pskb)->nfct; + int ret = NF_ACCEPT; + + if (ct) { + if (!nf_ct_is_confirmed(ct)) + ret = __nf_conntrack_confirm(pskb); + nf_ct_deliver_cached_events(ct); + } + return ret; +} + +extern void __nf_conntrack_attach(struct sk_buff *nskb, struct sk_buff *skb); + +extern struct list_head *nf_conntrack_hash; +extern struct list_head nf_conntrack_expect_list; +extern rwlock_t nf_conntrack_lock ; +#endif /* _NF_CONNTRACK_CORE_H */ diff --git a/include/net/netfilter/nf_conntrack_helper.h b/include/net/netfilter/nf_conntrack_helper.h new file mode 100644 index 000000000000..5a66b2a3a623 --- /dev/null +++ b/include/net/netfilter/nf_conntrack_helper.h @@ -0,0 +1,51 @@ +/* + * connection tracking helpers. + * + * 16 Dec 2003: Yasuyuki Kozakai @USAGI + * - generalize L3 protocol dependent part. + * + * Derived from include/linux/netfiter_ipv4/ip_conntrack_helper.h + */ + +#ifndef _NF_CONNTRACK_HELPER_H +#define _NF_CONNTRACK_HELPER_H +#include + +struct module; + +struct nf_conntrack_helper +{ + struct list_head list; /* Internal use. */ + + const char *name; /* name of the module */ + struct module *me; /* pointer to self */ + unsigned int max_expected; /* Maximum number of concurrent + * expected connections */ + unsigned int timeout; /* timeout for expecteds */ + + /* Mask of things we will help (compared against server response) */ + struct nf_conntrack_tuple tuple; + struct nf_conntrack_tuple mask; + + /* Function to call when data passes; return verdict, or -1 to + invalidate. */ + int (*help)(struct sk_buff **pskb, + unsigned int protoff, + struct nf_conn *ct, + enum ip_conntrack_info conntrackinfo); +}; + +extern int nf_conntrack_helper_register(struct nf_conntrack_helper *); +extern void nf_conntrack_helper_unregister(struct nf_conntrack_helper *); + +/* Allocate space for an expectation: this is mandatory before calling + nf_conntrack_expect_related. You will have to call put afterwards. */ +extern struct nf_conntrack_expect * +nf_conntrack_expect_alloc(struct nf_conn *master); +extern void nf_conntrack_expect_put(struct nf_conntrack_expect *exp); + +/* Add an expected connection: can have more than one per connection */ +extern int nf_conntrack_expect_related(struct nf_conntrack_expect *exp); +extern void nf_conntrack_unexpect_related(struct nf_conntrack_expect *exp); + +#endif /*_NF_CONNTRACK_HELPER_H*/ diff --git a/include/net/netfilter/nf_conntrack_l3proto.h b/include/net/netfilter/nf_conntrack_l3proto.h new file mode 100644 index 000000000000..01663e5b33df --- /dev/null +++ b/include/net/netfilter/nf_conntrack_l3proto.h @@ -0,0 +1,93 @@ +/* + * Copyright (C)2003,2004 USAGI/WIDE Project + * + * Header for use in defining a given L3 protocol for connection tracking. + * + * Author: + * Yasuyuki Kozakai @USAGI + * + * Derived from include/netfilter_ipv4/ip_conntrack_protocol.h + */ + +#ifndef _NF_CONNTRACK_L3PROTO_H +#define _NF_CONNTRACK_L3PROTO_H +#include +#include + +struct nf_conntrack_l3proto +{ + /* Next pointer. */ + struct list_head list; + + /* L3 Protocol Family number. ex) PF_INET */ + u_int16_t l3proto; + + /* Protocol name */ + const char *name; + + /* + * Try to fill in the third arg: nhoff is offset of l3 proto + * hdr. Return true if possible. + */ + int (*pkt_to_tuple)(const struct sk_buff *skb, unsigned int nhoff, + struct nf_conntrack_tuple *tuple); + + /* + * Invert the per-proto part of the tuple: ie. turn xmit into reply. + * Some packets can't be inverted: return 0 in that case. + */ + int (*invert_tuple)(struct nf_conntrack_tuple *inverse, + const struct nf_conntrack_tuple *orig); + + /* Print out the per-protocol part of the tuple. */ + int (*print_tuple)(struct seq_file *s, + const struct nf_conntrack_tuple *); + + /* Print out the private part of the conntrack. */ + int (*print_conntrack)(struct seq_file *s, const struct nf_conn *); + + /* Returns verdict for packet, or -1 for invalid. */ + int (*packet)(struct nf_conn *conntrack, + const struct sk_buff *skb, + enum ip_conntrack_info ctinfo); + + /* + * Called when a new connection for this protocol found; + * returns TRUE if it's OK. If so, packet() called next. + */ + int (*new)(struct nf_conn *conntrack, const struct sk_buff *skb); + + /* Called when a conntrack entry is destroyed */ + void (*destroy)(struct nf_conn *conntrack); + + /* + * Called before tracking. + * *dataoff: offset of protocol header (TCP, UDP,...) in *pskb + * *protonum: protocol number + */ + int (*prepare)(struct sk_buff **pskb, unsigned int hooknum, + unsigned int *dataoff, u_int8_t *protonum); + + u_int32_t (*get_features)(const struct nf_conntrack_tuple *tuple); + + /* Module (if any) which this is connected to. */ + struct module *me; +}; + +extern struct nf_conntrack_l3proto *nf_ct_l3protos[AF_MAX]; + +/* Protocol registration. */ +extern int nf_conntrack_l3proto_register(struct nf_conntrack_l3proto *proto); +extern void nf_conntrack_l3proto_unregister(struct nf_conntrack_l3proto *proto); + +static inline struct nf_conntrack_l3proto * +nf_ct_find_l3proto(u_int16_t l3proto) +{ + return nf_ct_l3protos[l3proto]; +} + +/* Existing built-in protocols */ +extern struct nf_conntrack_l3proto nf_conntrack_l3proto_ipv4; +extern struct nf_conntrack_l3proto nf_conntrack_l3proto_ipv6; +extern struct nf_conntrack_l3proto nf_conntrack_generic_l3proto; +#endif /*_NF_CONNTRACK_L3PROTO_H*/ diff --git a/include/net/netfilter/nf_conntrack_protocol.h b/include/net/netfilter/nf_conntrack_protocol.h new file mode 100644 index 000000000000..b3afda35397a --- /dev/null +++ b/include/net/netfilter/nf_conntrack_protocol.h @@ -0,0 +1,105 @@ +/* + * Header for use in defining a given protocol for connection tracking. + * + * 16 Dec 2003: Yasuyuki Kozakai @USAGI + * - generalized L3 protocol dependent part. + * + * Derived from include/linux/netfiter_ipv4/ip_conntrack_protcol.h + */ + +#ifndef _NF_CONNTRACK_PROTOCOL_H +#define _NF_CONNTRACK_PROTOCOL_H +#include + +struct seq_file; + +struct nf_conntrack_protocol +{ + /* Next pointer. */ + struct list_head list; + + /* L3 Protocol number. */ + u_int16_t l3proto; + + /* Protocol number. */ + u_int8_t proto; + + /* Protocol name */ + const char *name; + + /* Try to fill in the third arg: dataoff is offset past network protocol + hdr. Return true if possible. */ + int (*pkt_to_tuple)(const struct sk_buff *skb, + unsigned int dataoff, + struct nf_conntrack_tuple *tuple); + + /* Invert the per-proto part of the tuple: ie. turn xmit into reply. + * Some packets can't be inverted: return 0 in that case. + */ + int (*invert_tuple)(struct nf_conntrack_tuple *inverse, + const struct nf_conntrack_tuple *orig); + + /* Print out the per-protocol part of the tuple. Return like seq_* */ + int (*print_tuple)(struct seq_file *s, + const struct nf_conntrack_tuple *); + + /* Print out the private part of the conntrack. */ + int (*print_conntrack)(struct seq_file *s, const struct nf_conn *); + + /* Returns verdict for packet, or -1 for invalid. */ + int (*packet)(struct nf_conn *conntrack, + const struct sk_buff *skb, + unsigned int dataoff, + enum ip_conntrack_info ctinfo, + int pf, + unsigned int hooknum); + + /* Called when a new connection for this protocol found; + * returns TRUE if it's OK. If so, packet() called next. */ + int (*new)(struct nf_conn *conntrack, const struct sk_buff *skb, + unsigned int dataoff); + + /* Called when a conntrack entry is destroyed */ + void (*destroy)(struct nf_conn *conntrack); + + int (*error)(struct sk_buff *skb, unsigned int dataoff, + enum ip_conntrack_info *ctinfo, + int pf, unsigned int hooknum); + + /* Module (if any) which this is connected to. */ + struct module *me; +}; + +/* Existing built-in protocols */ +extern struct nf_conntrack_protocol nf_conntrack_protocol_tcp6; +extern struct nf_conntrack_protocol nf_conntrack_protocol_udp4; +extern struct nf_conntrack_protocol nf_conntrack_protocol_udp6; +extern struct nf_conntrack_protocol nf_conntrack_generic_protocol; + +#define MAX_NF_CT_PROTO 256 +extern struct nf_conntrack_protocol **nf_ct_protos[PF_MAX]; + +extern struct nf_conntrack_protocol * +nf_ct_find_proto(u_int16_t l3proto, u_int8_t protocol); + +/* Protocol registration. */ +extern int nf_conntrack_protocol_register(struct nf_conntrack_protocol *proto); +extern void nf_conntrack_protocol_unregister(struct nf_conntrack_protocol *proto); + +/* Log invalid packets */ +extern unsigned int nf_ct_log_invalid; + +#ifdef CONFIG_SYSCTL +#ifdef DEBUG_INVALID_PACKETS +#define LOG_INVALID(proto) \ + (nf_ct_log_invalid == (proto) || nf_ct_log_invalid == IPPROTO_RAW) +#else +#define LOG_INVALID(proto) \ + ((nf_ct_log_invalid == (proto) || nf_ct_log_invalid == IPPROTO_RAW) \ + && net_ratelimit()) +#endif +#else +#define LOG_INVALID(proto) 0 +#endif /* CONFIG_SYSCTL */ + +#endif /*_NF_CONNTRACK_PROTOCOL_H*/ diff --git a/include/net/netfilter/nf_conntrack_tuple.h b/include/net/netfilter/nf_conntrack_tuple.h new file mode 100644 index 000000000000..14ce790e5c65 --- /dev/null +++ b/include/net/netfilter/nf_conntrack_tuple.h @@ -0,0 +1,190 @@ +/* + * Definitions and Declarations for tuple. + * + * 16 Dec 2003: Yasuyuki Kozakai @USAGI + * - generalize L3 protocol dependent part. + * + * Derived from include/linux/netfiter_ipv4/ip_conntrack_tuple.h + */ + +#ifndef _NF_CONNTRACK_TUPLE_H +#define _NF_CONNTRACK_TUPLE_H + +#include + +/* A `tuple' is a structure containing the information to uniquely + identify a connection. ie. if two packets have the same tuple, they + are in the same connection; if not, they are not. + + We divide the structure along "manipulatable" and + "non-manipulatable" lines, for the benefit of the NAT code. +*/ + +#define NF_CT_TUPLE_L3SIZE 4 + +/* The l3 protocol-specific manipulable parts of the tuple: always in + network order! */ +union nf_conntrack_man_l3proto { + u_int32_t all[NF_CT_TUPLE_L3SIZE]; + u_int32_t ip; + u_int32_t ip6[4]; +}; + +/* The protocol-specific manipulable parts of the tuple: always in + network order! */ +union nf_conntrack_man_proto +{ + /* Add other protocols here. */ + u_int16_t all; + + struct { + u_int16_t port; + } tcp; + struct { + u_int16_t port; + } udp; + struct { + u_int16_t id; + } icmp; + struct { + u_int16_t port; + } sctp; +}; + +/* The manipulable part of the tuple. */ +struct nf_conntrack_man +{ + union nf_conntrack_man_l3proto u3; + union nf_conntrack_man_proto u; + /* Layer 3 protocol */ + u_int16_t l3num; +}; + +/* This contains the information to distinguish a connection. */ +struct nf_conntrack_tuple +{ + struct nf_conntrack_man src; + + /* These are the parts of the tuple which are fixed. */ + struct { + union { + u_int32_t all[NF_CT_TUPLE_L3SIZE]; + u_int32_t ip; + u_int32_t ip6[4]; + } u3; + union { + /* Add other protocols here. */ + u_int16_t all; + + struct { + u_int16_t port; + } tcp; + struct { + u_int16_t port; + } udp; + struct { + u_int8_t type, code; + } icmp; + struct { + u_int16_t port; + } sctp; + } u; + + /* The protocol. */ + u_int8_t protonum; + + /* The direction (for tuplehash) */ + u_int8_t dir; + } dst; +}; + +/* This is optimized opposed to a memset of the whole structure. Everything we + * really care about is the source/destination unions */ +#define NF_CT_TUPLE_U_BLANK(tuple) \ + do { \ + (tuple)->src.u.all = 0; \ + (tuple)->dst.u.all = 0; \ + memset(&(tuple)->src.u3, 0, sizeof((tuple)->src.u3)); \ + memset(&(tuple)->dst.u3, 0, sizeof((tuple)->dst.u3)); \ + } while (0) + +#ifdef __KERNEL__ + +#define NF_CT_DUMP_TUPLE(tp) \ +DEBUGP("tuple %p: %u %u %04x:%04x:%04x:%04x:%04x:%04x:%04x:%04x %hu -> %04x:%04x:%04x:%04x:%04x:%04x:%04x:%04x %hu\n", \ + (tp), (tp)->src.l3num, (tp)->dst.protonum, \ + NIP6(*(struct in6_addr *)(tp)->src.u3.all), ntohs((tp)->src.u.all), \ + NIP6(*(struct in6_addr *)(tp)->dst.u3.all), ntohs((tp)->dst.u.all)) + +/* If we're the first tuple, it's the original dir. */ +#define NF_CT_DIRECTION(h) \ + ((enum ip_conntrack_dir)(h)->tuple.dst.dir) + +/* Connections have two entries in the hash table: one for each way */ +struct nf_conntrack_tuple_hash +{ + struct list_head list; + + struct nf_conntrack_tuple tuple; +}; + +#endif /* __KERNEL__ */ + +static inline int nf_ct_tuple_src_equal(const struct nf_conntrack_tuple *t1, + const struct nf_conntrack_tuple *t2) +{ + return (t1->src.u3.all[0] == t2->src.u3.all[0] && + t1->src.u3.all[1] == t2->src.u3.all[1] && + t1->src.u3.all[2] == t2->src.u3.all[2] && + t1->src.u3.all[3] == t2->src.u3.all[3] && + t1->src.u.all == t2->src.u.all && + t1->src.l3num == t2->src.l3num && + t1->dst.protonum == t2->dst.protonum); +} + +static inline int nf_ct_tuple_dst_equal(const struct nf_conntrack_tuple *t1, + const struct nf_conntrack_tuple *t2) +{ + return (t1->dst.u3.all[0] == t2->dst.u3.all[0] && + t1->dst.u3.all[1] == t2->dst.u3.all[1] && + t1->dst.u3.all[2] == t2->dst.u3.all[2] && + t1->dst.u3.all[3] == t2->dst.u3.all[3] && + t1->dst.u.all == t2->dst.u.all && + t1->src.l3num == t2->src.l3num && + t1->dst.protonum == t2->dst.protonum); +} + +static inline int nf_ct_tuple_equal(const struct nf_conntrack_tuple *t1, + const struct nf_conntrack_tuple *t2) +{ + return nf_ct_tuple_src_equal(t1, t2) && nf_ct_tuple_dst_equal(t1, t2); +} + +static inline int nf_ct_tuple_mask_cmp(const struct nf_conntrack_tuple *t, + const struct nf_conntrack_tuple *tuple, + const struct nf_conntrack_tuple *mask) +{ + int count = 0; + + for (count = 0; count < NF_CT_TUPLE_L3SIZE; count++){ + if ((t->src.u3.all[count] ^ tuple->src.u3.all[count]) & + mask->src.u3.all[count]) + return 0; + } + + for (count = 0; count < NF_CT_TUPLE_L3SIZE; count++){ + if ((t->dst.u3.all[count] ^ tuple->dst.u3.all[count]) & + mask->dst.u3.all[count]) + return 0; + } + + if ((t->src.u.all ^ tuple->src.u.all) & mask->src.u.all || + (t->dst.u.all ^ tuple->dst.u.all) & mask->dst.u.all || + (t->src.l3num ^ tuple->src.l3num) & mask->src.l3num || + (t->dst.protonum ^ tuple->dst.protonum) & mask->dst.protonum) + return 0; + + return 1; +} + +#endif /* _NF_CONNTRACK_TUPLE_H */ diff --git a/net/core/skbuff.c b/net/core/skbuff.c index 95501e40100e..b7d13a4fff48 100644 --- a/net/core/skbuff.c +++ b/net/core/skbuff.c @@ -336,6 +336,9 @@ void __kfree_skb(struct sk_buff *skb) } #ifdef CONFIG_NETFILTER nf_conntrack_put(skb->nfct); +#if defined(CONFIG_NF_CONNTRACK) || defined(CONFIG_NF_CONNTRACK_MODULE) + nf_conntrack_put_reasm(skb->nfct_reasm); +#endif #ifdef CONFIG_BRIDGE_NETFILTER nf_bridge_put(skb->nf_bridge); #endif @@ -414,9 +417,17 @@ struct sk_buff *skb_clone(struct sk_buff *skb, gfp_t gfp_mask) C(nfct); nf_conntrack_get(skb->nfct); C(nfctinfo); +#if defined(CONFIG_NF_CONNTRACK) || defined(CONFIG_NF_CONNTRACK_MODULE) + C(nfct_reasm); + nf_conntrack_get_reasm(skb->nfct_reasm); +#endif #if defined(CONFIG_IP_VS) || defined(CONFIG_IP_VS_MODULE) C(ipvs_property); #endif +#if defined(CONFIG_NF_CONNTRACK) || defined(CONFIG_NF_CONNTRACK_MODULE) + C(nfct_reasm); + nf_conntrack_get_reasm(skb->nfct_reasm); +#endif #ifdef CONFIG_BRIDGE_NETFILTER C(nf_bridge); nf_bridge_get(skb->nf_bridge); @@ -474,6 +485,10 @@ static void copy_skb_header(struct sk_buff *new, const struct sk_buff *old) new->nfct = old->nfct; nf_conntrack_get(old->nfct); new->nfctinfo = old->nfctinfo; +#if defined(CONFIG_NF_CONNTRACK) || defined(CONFIG_NF_CONNTRACK_MODULE) + new->nfct_reasm = old->nfct_reasm; + nf_conntrack_get_reasm(old->nfct_reasm); +#endif #if defined(CONFIG_IP_VS) || defined(CONFIG_IP_VS_MODULE) new->ipvs_property = old->ipvs_property; #endif diff --git a/net/ipv4/netfilter/Kconfig b/net/ipv4/netfilter/Kconfig index 7d917e4ce1d9..9d3c8b5f327e 100644 --- a/net/ipv4/netfilter/Kconfig +++ b/net/ipv4/netfilter/Kconfig @@ -5,6 +5,20 @@ menu "IP: Netfilter Configuration" depends on INET && NETFILTER +config NF_CONNTRACK_IPV4 + tristate "IPv4 support for new connection tracking (EXPERIMENTAL)" + depends on EXPERIMENTAL && NF_CONNTRACK + ---help--- + Connection tracking keeps a record of what packets have passed + through your machine, in order to figure out how they are related + into connections. + + This is IPv4 support on Layer 3 independent connection tracking. + Layer 3 independent connection tracking is experimental scheme + which generalize ip_conntrack to support other layer 3 protocols. + + To compile it as a module, choose M here. If unsure, say N. + # connection tracking, helpers and protocols config IP_NF_CONNTRACK tristate "Connection tracking (required for masq/NAT)" @@ -209,8 +223,8 @@ config IP_NF_MATCH_PKTTYPE tristate "Packet type match support" depends on IP_NF_IPTABLES help - Packet type matching allows you to match a packet by - its "class", eg. BROADCAST, MULTICAST, ... + Packet type matching allows you to match a packet by + its "class", eg. BROADCAST, MULTICAST, ... Typical usage: iptables -A INPUT -m pkttype --pkt-type broadcast -j LOG @@ -317,7 +331,8 @@ config IP_NF_MATCH_TCPMSS config IP_NF_MATCH_HELPER tristate "Helper match support" - depends on IP_NF_CONNTRACK && IP_NF_IPTABLES + depends on IP_NF_IPTABLES + depends on IP_NF_CONNTRACK || NF_CONNTRACK_IPV4 help Helper matching allows you to match packets in dynamic connections tracked by a conntrack-helper, ie. ip_conntrack_ftp @@ -326,7 +341,8 @@ config IP_NF_MATCH_HELPER config IP_NF_MATCH_STATE tristate "Connection state match support" - depends on IP_NF_CONNTRACK && IP_NF_IPTABLES + depends on IP_NF_IPTABLES + depends on IP_NF_CONNTRACK || NF_CONNTRACK_IPV4 help Connection state matching allows you to match packets based on their relationship to a tracked connection (ie. previous packets). This @@ -336,7 +352,8 @@ config IP_NF_MATCH_STATE config IP_NF_MATCH_CONNTRACK tristate "Connection tracking match support" - depends on IP_NF_CONNTRACK && IP_NF_IPTABLES + depends on IP_NF_IPTABLES + depends on IP_NF_CONNTRACK || NF_CONNTRACK_IPV4 help This is a general conntrack match module, a superset of the state match. @@ -422,7 +439,8 @@ config IP_NF_MATCH_COMMENT config IP_NF_MATCH_CONNMARK tristate 'Connection mark match support' - depends on IP_NF_CONNTRACK_MARK && IP_NF_IPTABLES + depends on IP_NF_IPTABLES + depends on IP_NF_CONNTRACK_MARK || (NF_CONNTRACK_MARK && NF_CONNTRACK_IPV4) help This option adds a `connmark' match, which allows you to match the connection mark value previously set for the session by `CONNMARK'. @@ -433,7 +451,8 @@ config IP_NF_MATCH_CONNMARK config IP_NF_MATCH_CONNBYTES tristate 'Connection byte/packet counter match support' - depends on IP_NF_CT_ACCT && IP_NF_IPTABLES + depends on IP_NF_IPTABLES + depends on IP_NF_CT_ACCT || (NF_CT_ACCT && NF_CONNTRACK_IPV4) help This option adds a `connbytes' match, which allows you to match the number of bytes and/or packets for each direction within a connection. @@ -747,7 +766,8 @@ config IP_NF_TARGET_TTL config IP_NF_TARGET_CONNMARK tristate 'CONNMARK target support' - depends on IP_NF_CONNTRACK_MARK && IP_NF_MANGLE + depends on IP_NF_MANGLE + depends on IP_NF_CONNTRACK_MARK || (NF_CONNTRACK_MARK && NF_CONNTRACK_IPV4) help This option adds a `CONNMARK' target, which allows one to manipulate the connection mark value. Similar to the MARK target, but @@ -759,7 +779,8 @@ config IP_NF_TARGET_CONNMARK config IP_NF_TARGET_CLUSTERIP tristate "CLUSTERIP target support (EXPERIMENTAL)" - depends on IP_NF_CONNTRACK_MARK && IP_NF_IPTABLES && EXPERIMENTAL + depends on IP_NF_IPTABLES && EXPERIMENTAL + depends on IP_NF_CONNTRACK_MARK || (NF_CONNTRACK_MARK && NF_CONNTRACK_IPV4) help The CLUSTERIP target allows you to build load-balancing clusters of network servers without having a dedicated load-balancing @@ -782,7 +803,7 @@ config IP_NF_RAW config IP_NF_TARGET_NOTRACK tristate 'NOTRACK target support' depends on IP_NF_RAW - depends on IP_NF_CONNTRACK + depends on IP_NF_CONNTRACK || NF_CONNTRACK_IPV4 help The NOTRACK target allows a select rule to specify which packets *not* to enter the conntrack/NAT diff --git a/net/ipv4/netfilter/Makefile b/net/ipv4/netfilter/Makefile index dab4b58dd31e..058c48e258fc 100644 --- a/net/ipv4/netfilter/Makefile +++ b/net/ipv4/netfilter/Makefile @@ -103,3 +103,9 @@ obj-$(CONFIG_IP_NF_ARP_MANGLE) += arpt_mangle.o obj-$(CONFIG_IP_NF_ARPFILTER) += arptable_filter.o obj-$(CONFIG_IP_NF_QUEUE) += ip_queue.o + +# objects for l3 independent conntrack +nf_conntrack_ipv4-objs := nf_conntrack_l3proto_ipv4.o nf_conntrack_proto_icmp.o + +# l3 independent conntrack +obj-$(CONFIG_NF_CONNTRACK_IPV4) += nf_conntrack_ipv4.o diff --git a/net/ipv4/netfilter/ip_conntrack_netlink.c b/net/ipv4/netfilter/ip_conntrack_netlink.c index 5c1c0a3d1c4b..d2a4fec22862 100644 --- a/net/ipv4/netfilter/ip_conntrack_netlink.c +++ b/net/ipv4/netfilter/ip_conntrack_netlink.c @@ -1376,7 +1376,7 @@ ctnetlink_del_expect(struct sock *ctnl, struct sk_buff *skb, ip_conntrack_expect_put(exp); } } - write_unlock(&ip_conntrack_lock); + write_unlock_bh(&ip_conntrack_lock); } else { /* This basically means we have to flush everything*/ write_lock_bh(&ip_conntrack_lock); diff --git a/net/ipv4/netfilter/ipt_CLUSTERIP.c b/net/ipv4/netfilter/ipt_CLUSTERIP.c index 9bcb398fbc1f..45c52d8f4d99 100644 --- a/net/ipv4/netfilter/ipt_CLUSTERIP.c +++ b/net/ipv4/netfilter/ipt_CLUSTERIP.c @@ -29,7 +29,7 @@ #include #include -#include +#include #define CLUSTERIP_VERSION "0.8" @@ -316,14 +316,14 @@ target(struct sk_buff **pskb, { const struct ipt_clusterip_tgt_info *cipinfo = targinfo; enum ip_conntrack_info ctinfo; - struct ip_conntrack *ct = ip_conntrack_get((*pskb), &ctinfo); - u_int32_t hash; + u_int32_t *mark, hash; /* don't need to clusterip_config_get() here, since refcount * is only decremented by destroy() - and ip_tables guarantees * that the ->target() function isn't called after ->destroy() */ - if (!ct) { + mark = nf_ct_get_mark((*pskb), &ctinfo); + if (mark == NULL) { printk(KERN_ERR "CLUSTERIP: no conntrack!\n"); /* FIXME: need to drop invalid ones, since replies * to outgoing connections of other nodes will be @@ -346,7 +346,7 @@ target(struct sk_buff **pskb, switch (ctinfo) { case IP_CT_NEW: - ct->mark = hash; + *mark = hash; break; case IP_CT_RELATED: case IP_CT_RELATED+IP_CT_IS_REPLY: @@ -363,7 +363,7 @@ target(struct sk_buff **pskb, #ifdef DEBUG_CLUSTERP DUMP_TUPLE(&ct->tuplehash[IP_CT_DIR_ORIGINAL].tuple); #endif - DEBUGP("hash=%u ct_hash=%u ", hash, ct->mark); + DEBUGP("hash=%u ct_hash=%u ", hash, *mark); if (!clusterip_responsible(cipinfo->config, hash)) { DEBUGP("not responsible\n"); return NF_DROP; diff --git a/net/ipv4/netfilter/ipt_CONNMARK.c b/net/ipv4/netfilter/ipt_CONNMARK.c index 05d66ab59424..8acac5a40a92 100644 --- a/net/ipv4/netfilter/ipt_CONNMARK.c +++ b/net/ipv4/netfilter/ipt_CONNMARK.c @@ -29,7 +29,7 @@ MODULE_LICENSE("GPL"); #include #include -#include +#include static unsigned int target(struct sk_buff **pskb, @@ -43,24 +43,24 @@ target(struct sk_buff **pskb, u_int32_t diff; u_int32_t nfmark; u_int32_t newmark; + u_int32_t ctinfo; + u_int32_t *ctmark = nf_ct_get_mark(*pskb, &ctinfo); - enum ip_conntrack_info ctinfo; - struct ip_conntrack *ct = ip_conntrack_get((*pskb), &ctinfo); - if (ct) { + if (ctmark) { switch(markinfo->mode) { case IPT_CONNMARK_SET: - newmark = (ct->mark & ~markinfo->mask) | markinfo->mark; - if (newmark != ct->mark) - ct->mark = newmark; + newmark = (*ctmark & ~markinfo->mask) | markinfo->mark; + if (newmark != *ctmark) + *ctmark = newmark; break; case IPT_CONNMARK_SAVE: - newmark = (ct->mark & ~markinfo->mask) | ((*pskb)->nfmark & markinfo->mask); - if (ct->mark != newmark) - ct->mark = newmark; + newmark = (*ctmark & ~markinfo->mask) | ((*pskb)->nfmark & markinfo->mask); + if (*ctmark != newmark) + *ctmark = newmark; break; case IPT_CONNMARK_RESTORE: nfmark = (*pskb)->nfmark; - diff = (ct->mark ^ nfmark) & markinfo->mask; + diff = (*ctmark ^ nfmark) & markinfo->mask; if (diff != 0) (*pskb)->nfmark = nfmark ^ diff; break; diff --git a/net/ipv4/netfilter/ipt_NOTRACK.c b/net/ipv4/netfilter/ipt_NOTRACK.c index a4bb9b3bc292..e3c69d072c6e 100644 --- a/net/ipv4/netfilter/ipt_NOTRACK.c +++ b/net/ipv4/netfilter/ipt_NOTRACK.c @@ -5,7 +5,7 @@ #include #include -#include +#include static unsigned int target(struct sk_buff **pskb, @@ -23,7 +23,7 @@ target(struct sk_buff **pskb, If there is a real ct entry correspondig to this packet, it'll hang aroun till timing out. We don't deal with it for performance reasons. JK */ - (*pskb)->nfct = &ip_conntrack_untracked.ct_general; + nf_ct_untrack(*pskb); (*pskb)->nfctinfo = IP_CT_NEW; nf_conntrack_get((*pskb)->nfct); diff --git a/net/ipv4/netfilter/ipt_connbytes.c b/net/ipv4/netfilter/ipt_connbytes.c index df4a42c6da22..d68a048b7176 100644 --- a/net/ipv4/netfilter/ipt_connbytes.c +++ b/net/ipv4/netfilter/ipt_connbytes.c @@ -10,7 +10,7 @@ */ #include #include -#include +#include #include #include @@ -46,60 +46,59 @@ match(const struct sk_buff *skb, int *hotdrop) { const struct ipt_connbytes_info *sinfo = matchinfo; - enum ip_conntrack_info ctinfo; - struct ip_conntrack *ct; u_int64_t what = 0; /* initialize to make gcc happy */ + const struct ip_conntrack_counter *counters; - if (!(ct = ip_conntrack_get((struct sk_buff *)skb, &ctinfo))) + if (!(counters = nf_ct_get_counters(skb))) return 0; /* no match */ switch (sinfo->what) { case IPT_CONNBYTES_PKTS: switch (sinfo->direction) { case IPT_CONNBYTES_DIR_ORIGINAL: - what = ct->counters[IP_CT_DIR_ORIGINAL].packets; + what = counters[IP_CT_DIR_ORIGINAL].packets; break; case IPT_CONNBYTES_DIR_REPLY: - what = ct->counters[IP_CT_DIR_REPLY].packets; + what = counters[IP_CT_DIR_REPLY].packets; break; case IPT_CONNBYTES_DIR_BOTH: - what = ct->counters[IP_CT_DIR_ORIGINAL].packets; - what += ct->counters[IP_CT_DIR_REPLY].packets; + what = counters[IP_CT_DIR_ORIGINAL].packets; + what += counters[IP_CT_DIR_REPLY].packets; break; } break; case IPT_CONNBYTES_BYTES: switch (sinfo->direction) { case IPT_CONNBYTES_DIR_ORIGINAL: - what = ct->counters[IP_CT_DIR_ORIGINAL].bytes; + what = counters[IP_CT_DIR_ORIGINAL].bytes; break; case IPT_CONNBYTES_DIR_REPLY: - what = ct->counters[IP_CT_DIR_REPLY].bytes; + what = counters[IP_CT_DIR_REPLY].bytes; break; case IPT_CONNBYTES_DIR_BOTH: - what = ct->counters[IP_CT_DIR_ORIGINAL].bytes; - what += ct->counters[IP_CT_DIR_REPLY].bytes; + what = counters[IP_CT_DIR_ORIGINAL].bytes; + what += counters[IP_CT_DIR_REPLY].bytes; break; } break; case IPT_CONNBYTES_AVGPKT: switch (sinfo->direction) { case IPT_CONNBYTES_DIR_ORIGINAL: - what = div64_64(ct->counters[IP_CT_DIR_ORIGINAL].bytes, - ct->counters[IP_CT_DIR_ORIGINAL].packets); + what = div64_64(counters[IP_CT_DIR_ORIGINAL].bytes, + counters[IP_CT_DIR_ORIGINAL].packets); break; case IPT_CONNBYTES_DIR_REPLY: - what = div64_64(ct->counters[IP_CT_DIR_REPLY].bytes, - ct->counters[IP_CT_DIR_REPLY].packets); + what = div64_64(counters[IP_CT_DIR_REPLY].bytes, + counters[IP_CT_DIR_REPLY].packets); break; case IPT_CONNBYTES_DIR_BOTH: { u_int64_t bytes; u_int64_t pkts; - bytes = ct->counters[IP_CT_DIR_ORIGINAL].bytes + - ct->counters[IP_CT_DIR_REPLY].bytes; - pkts = ct->counters[IP_CT_DIR_ORIGINAL].packets+ - ct->counters[IP_CT_DIR_REPLY].packets; + bytes = counters[IP_CT_DIR_ORIGINAL].bytes + + counters[IP_CT_DIR_REPLY].bytes; + pkts = counters[IP_CT_DIR_ORIGINAL].packets+ + counters[IP_CT_DIR_REPLY].packets; /* FIXME_THEORETICAL: what to do if sum * overflows ? */ diff --git a/net/ipv4/netfilter/ipt_connmark.c b/net/ipv4/netfilter/ipt_connmark.c index bf8de47ce004..5306ef293b92 100644 --- a/net/ipv4/netfilter/ipt_connmark.c +++ b/net/ipv4/netfilter/ipt_connmark.c @@ -28,7 +28,7 @@ MODULE_LICENSE("GPL"); #include #include -#include +#include static int match(const struct sk_buff *skb, @@ -39,12 +39,12 @@ match(const struct sk_buff *skb, int *hotdrop) { const struct ipt_connmark_info *info = matchinfo; - enum ip_conntrack_info ctinfo; - struct ip_conntrack *ct = ip_conntrack_get((struct sk_buff *)skb, &ctinfo); - if (!ct) + u_int32_t ctinfo; + const u_int32_t *ctmark = nf_ct_get_mark(skb, &ctinfo); + if (!ctmark) return 0; - return ((ct->mark & info->mask) == info->mark) ^ info->invert; + return (((*ctmark) & info->mask) == info->mark) ^ info->invert; } static int diff --git a/net/ipv4/netfilter/ipt_conntrack.c b/net/ipv4/netfilter/ipt_conntrack.c index c1d22801b7cf..c8d18705469b 100644 --- a/net/ipv4/netfilter/ipt_conntrack.c +++ b/net/ipv4/netfilter/ipt_conntrack.c @@ -10,7 +10,14 @@ #include #include + +#if defined(CONFIG_IP_NF_CONNTRACK) || defined(CONFIG_IP_NF_CONNTRACK_MODULE) #include +#include +#else +#include +#endif + #include #include @@ -18,6 +25,8 @@ MODULE_LICENSE("GPL"); MODULE_AUTHOR("Marc Boucher "); MODULE_DESCRIPTION("iptables connection tracking match module"); +#if defined(CONFIG_IP_NF_CONNTRACK) || defined(CONFIG_IP_NF_CONNTRACK_MODULE) + static int match(const struct sk_buff *skb, const struct net_device *in, @@ -102,6 +111,93 @@ match(const struct sk_buff *skb, return 1; } +#else /* CONFIG_IP_NF_CONNTRACK */ +static int +match(const struct sk_buff *skb, + const struct net_device *in, + const struct net_device *out, + const void *matchinfo, + int offset, + int *hotdrop) +{ + const struct ipt_conntrack_info *sinfo = matchinfo; + struct nf_conn *ct; + enum ip_conntrack_info ctinfo; + unsigned int statebit; + + ct = nf_ct_get((struct sk_buff *)skb, &ctinfo); + +#define FWINV(bool,invflg) ((bool) ^ !!(sinfo->invflags & invflg)) + + if (ct == &nf_conntrack_untracked) + statebit = IPT_CONNTRACK_STATE_UNTRACKED; + else if (ct) + statebit = IPT_CONNTRACK_STATE_BIT(ctinfo); + else + statebit = IPT_CONNTRACK_STATE_INVALID; + + if(sinfo->flags & IPT_CONNTRACK_STATE) { + if (ct) { + if(ct->tuplehash[IP_CT_DIR_ORIGINAL].tuple.src.u3.ip != + ct->tuplehash[IP_CT_DIR_REPLY].tuple.dst.u3.ip) + statebit |= IPT_CONNTRACK_STATE_SNAT; + + if(ct->tuplehash[IP_CT_DIR_ORIGINAL].tuple.dst.u3.ip != + ct->tuplehash[IP_CT_DIR_REPLY].tuple.src.u3.ip) + statebit |= IPT_CONNTRACK_STATE_DNAT; + } + + if (FWINV((statebit & sinfo->statemask) == 0, IPT_CONNTRACK_STATE)) + return 0; + } + + if(sinfo->flags & IPT_CONNTRACK_PROTO) { + if (!ct || FWINV(ct->tuplehash[IP_CT_DIR_ORIGINAL].tuple.dst.protonum != sinfo->tuple[IP_CT_DIR_ORIGINAL].dst.protonum, IPT_CONNTRACK_PROTO)) + return 0; + } + + if(sinfo->flags & IPT_CONNTRACK_ORIGSRC) { + if (!ct || FWINV((ct->tuplehash[IP_CT_DIR_ORIGINAL].tuple.src.u3.ip&sinfo->sipmsk[IP_CT_DIR_ORIGINAL].s_addr) != sinfo->tuple[IP_CT_DIR_ORIGINAL].src.ip, IPT_CONNTRACK_ORIGSRC)) + return 0; + } + + if(sinfo->flags & IPT_CONNTRACK_ORIGDST) { + if (!ct || FWINV((ct->tuplehash[IP_CT_DIR_ORIGINAL].tuple.dst.u3.ip&sinfo->dipmsk[IP_CT_DIR_ORIGINAL].s_addr) != sinfo->tuple[IP_CT_DIR_ORIGINAL].dst.ip, IPT_CONNTRACK_ORIGDST)) + return 0; + } + + if(sinfo->flags & IPT_CONNTRACK_REPLSRC) { + if (!ct || FWINV((ct->tuplehash[IP_CT_DIR_REPLY].tuple.src.u3.ip&sinfo->sipmsk[IP_CT_DIR_REPLY].s_addr) != sinfo->tuple[IP_CT_DIR_REPLY].src.ip, IPT_CONNTRACK_REPLSRC)) + return 0; + } + + if(sinfo->flags & IPT_CONNTRACK_REPLDST) { + if (!ct || FWINV((ct->tuplehash[IP_CT_DIR_REPLY].tuple.dst.u3.ip&sinfo->dipmsk[IP_CT_DIR_REPLY].s_addr) != sinfo->tuple[IP_CT_DIR_REPLY].dst.ip, IPT_CONNTRACK_REPLDST)) + return 0; + } + + if(sinfo->flags & IPT_CONNTRACK_STATUS) { + if (!ct || FWINV((ct->status & sinfo->statusmask) == 0, IPT_CONNTRACK_STATUS)) + return 0; + } + + if(sinfo->flags & IPT_CONNTRACK_EXPIRES) { + unsigned long expires; + + if(!ct) + return 0; + + expires = timer_pending(&ct->timeout) ? (ct->timeout.expires - jiffies)/HZ : 0; + + if (FWINV(!(expires >= sinfo->expires_min && expires <= sinfo->expires_max), IPT_CONNTRACK_EXPIRES)) + return 0; + } + + return 1; +} + +#endif /* CONFIG_NF_IP_CONNTRACK */ + static int check(const char *tablename, const struct ipt_ip *ip, void *matchinfo, diff --git a/net/ipv4/netfilter/ipt_helper.c b/net/ipv4/netfilter/ipt_helper.c index 3e7dd014de43..bf14e1c7798a 100644 --- a/net/ipv4/netfilter/ipt_helper.c +++ b/net/ipv4/netfilter/ipt_helper.c @@ -13,9 +13,15 @@ #include #include #include +#if defined(CONFIG_IP_NF_CONNTRACK) || defined(CONFIG_IP_NF_CONNTRACK_MODULE) #include #include #include +#else +#include +#include +#include +#endif #include #include @@ -29,6 +35,7 @@ MODULE_DESCRIPTION("iptables helper match module"); #define DEBUGP(format, args...) #endif +#if defined(CONFIG_IP_NF_CONNTRACK) || defined(CONFIG_IP_NF_CONNTRACK_MODULE) static int match(const struct sk_buff *skb, const struct net_device *in, @@ -73,6 +80,53 @@ out_unlock: return ret; } +#else /* CONFIG_IP_NF_CONNTRACK */ + +static int +match(const struct sk_buff *skb, + const struct net_device *in, + const struct net_device *out, + const void *matchinfo, + int offset, + int *hotdrop) +{ + const struct ipt_helper_info *info = matchinfo; + struct nf_conn *ct; + enum ip_conntrack_info ctinfo; + int ret = info->invert; + + ct = nf_ct_get((struct sk_buff *)skb, &ctinfo); + if (!ct) { + DEBUGP("ipt_helper: Eek! invalid conntrack?\n"); + return ret; + } + + if (!ct->master) { + DEBUGP("ipt_helper: conntrack %p has no master\n", ct); + return ret; + } + + read_lock_bh(&nf_conntrack_lock); + if (!ct->master->helper) { + DEBUGP("ipt_helper: master ct %p has no helper\n", + exp->expectant); + goto out_unlock; + } + + DEBUGP("master's name = %s , info->name = %s\n", + ct->master->helper->name, info->name); + + if (info->name[0] == '\0') + ret ^= 1; + else + ret ^= !strncmp(ct->master->helper->name, info->name, + strlen(ct->master->helper->name)); +out_unlock: + read_unlock_bh(&nf_conntrack_lock); + return ret; +} +#endif + static int check(const char *tablename, const struct ipt_ip *ip, void *matchinfo, diff --git a/net/ipv4/netfilter/ipt_state.c b/net/ipv4/netfilter/ipt_state.c index b1511b97ea5f..4d7f16b70cec 100644 --- a/net/ipv4/netfilter/ipt_state.c +++ b/net/ipv4/netfilter/ipt_state.c @@ -10,7 +10,7 @@ #include #include -#include +#include #include #include @@ -30,9 +30,9 @@ match(const struct sk_buff *skb, enum ip_conntrack_info ctinfo; unsigned int statebit; - if (skb->nfct == &ip_conntrack_untracked.ct_general) + if (nf_ct_is_untracked(skb)) statebit = IPT_STATE_UNTRACKED; - else if (!ip_conntrack_get(skb, &ctinfo)) + else if (!nf_ct_get_ctinfo(skb, &ctinfo)) statebit = IPT_STATE_INVALID; else statebit = IPT_STATE_BIT(ctinfo); diff --git a/net/ipv4/netfilter/nf_conntrack_l3proto_ipv4.c b/net/ipv4/netfilter/nf_conntrack_l3proto_ipv4.c new file mode 100644 index 000000000000..8202c1c0afad --- /dev/null +++ b/net/ipv4/netfilter/nf_conntrack_l3proto_ipv4.c @@ -0,0 +1,571 @@ +/* (C) 1999-2001 Paul `Rusty' Russell + * (C) 2002-2004 Netfilter Core Team + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 as + * published by the Free Software Foundation. + * + * 16 Dec 2003: Yasuyuki Kozakai @USAGI + * - move L3 protocol dependent part to this file. + * 23 Mar 2004: Yasuyuki Kozakai @USAGI + * - add get_features() to support various size of conntrack + * structures. + * + * Derived from net/ipv4/netfilter/ip_conntrack_standalone.c + */ + +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include +#include +#include +#include +#include +#include +#include + +#if 0 +#define DEBUGP printk +#else +#define DEBUGP(format, args...) +#endif + +DECLARE_PER_CPU(struct nf_conntrack_stat, nf_conntrack_stat); + +static int ipv4_pkt_to_tuple(const struct sk_buff *skb, unsigned int nhoff, + struct nf_conntrack_tuple *tuple) +{ + u_int32_t _addrs[2], *ap; + ap = skb_header_pointer(skb, nhoff + offsetof(struct iphdr, saddr), + sizeof(u_int32_t) * 2, _addrs); + if (ap == NULL) + return 0; + + tuple->src.u3.ip = ap[0]; + tuple->dst.u3.ip = ap[1]; + + return 1; +} + +static int ipv4_invert_tuple(struct nf_conntrack_tuple *tuple, + const struct nf_conntrack_tuple *orig) +{ + tuple->src.u3.ip = orig->dst.u3.ip; + tuple->dst.u3.ip = orig->src.u3.ip; + + return 1; +} + +static int ipv4_print_tuple(struct seq_file *s, + const struct nf_conntrack_tuple *tuple) +{ + return seq_printf(s, "src=%u.%u.%u.%u dst=%u.%u.%u.%u ", + NIPQUAD(tuple->src.u3.ip), + NIPQUAD(tuple->dst.u3.ip)); +} + +static int ipv4_print_conntrack(struct seq_file *s, + const struct nf_conn *conntrack) +{ + return 0; +} + +/* Returns new sk_buff, or NULL */ +static struct sk_buff * +nf_ct_ipv4_gather_frags(struct sk_buff *skb, u_int32_t user) +{ + skb_orphan(skb); + + local_bh_disable(); + skb = ip_defrag(skb, user); + local_bh_enable(); + + if (skb) + ip_send_check(skb->nh.iph); + + return skb; +} + +static int +ipv4_prepare(struct sk_buff **pskb, unsigned int hooknum, unsigned int *dataoff, + u_int8_t *protonum) +{ + /* Never happen */ + if ((*pskb)->nh.iph->frag_off & htons(IP_OFFSET)) { + if (net_ratelimit()) { + printk(KERN_ERR "ipv4_prepare: Frag of proto %u (hook=%u)\n", + (*pskb)->nh.iph->protocol, hooknum); + } + return -NF_DROP; + } + + *dataoff = (*pskb)->nh.raw - (*pskb)->data + (*pskb)->nh.iph->ihl*4; + *protonum = (*pskb)->nh.iph->protocol; + + return NF_ACCEPT; +} + +int nat_module_is_loaded = 0; +static u_int32_t ipv4_get_features(const struct nf_conntrack_tuple *tuple) +{ + if (nat_module_is_loaded) + return NF_CT_F_NAT; + + return NF_CT_F_BASIC; +} + +static unsigned int ipv4_confirm(unsigned int hooknum, + struct sk_buff **pskb, + const struct net_device *in, + const struct net_device *out, + int (*okfn)(struct sk_buff *)) +{ + /* We've seen it coming out the other side: confirm it */ + return nf_conntrack_confirm(pskb); +} + +static unsigned int ipv4_conntrack_help(unsigned int hooknum, + struct sk_buff **pskb, + const struct net_device *in, + const struct net_device *out, + int (*okfn)(struct sk_buff *)) +{ + struct nf_conn *ct; + enum ip_conntrack_info ctinfo; + + /* This is where we call the helper: as the packet goes out. */ + ct = nf_ct_get(*pskb, &ctinfo); + if (ct && ct->helper) { + unsigned int ret; + ret = ct->helper->help(pskb, + (*pskb)->nh.raw - (*pskb)->data + + (*pskb)->nh.iph->ihl*4, + ct, ctinfo); + if (ret != NF_ACCEPT) + return ret; + } + return NF_ACCEPT; +} + +static unsigned int ipv4_conntrack_defrag(unsigned int hooknum, + struct sk_buff **pskb, + const struct net_device *in, + const struct net_device *out, + int (*okfn)(struct sk_buff *)) +{ +#if !defined(CONFIG_IP_NF_NAT) && !defined(CONFIG_IP_NF_NAT_MODULE) + /* Previously seen (loopback)? Ignore. Do this before + fragment check. */ + if ((*pskb)->nfct) + return NF_ACCEPT; +#endif + + /* Gather fragments. */ + if ((*pskb)->nh.iph->frag_off & htons(IP_MF|IP_OFFSET)) { + *pskb = nf_ct_ipv4_gather_frags(*pskb, + hooknum == NF_IP_PRE_ROUTING ? + IP_DEFRAG_CONNTRACK_IN : + IP_DEFRAG_CONNTRACK_OUT); + if (!*pskb) + return NF_STOLEN; + } + return NF_ACCEPT; +} + +static unsigned int ipv4_refrag(unsigned int hooknum, + struct sk_buff **pskb, + const struct net_device *in, + const struct net_device *out, + int (*okfn)(struct sk_buff *)) +{ + struct rtable *rt = (struct rtable *)(*pskb)->dst; + + /* We've seen it coming out the other side: confirm */ + if (ipv4_confirm(hooknum, pskb, in, out, okfn) != NF_ACCEPT) + return NF_DROP; + + /* Local packets are never produced too large for their + interface. We degfragment them at LOCAL_OUT, however, + so we have to refragment them here. */ + if ((*pskb)->len > dst_mtu(&rt->u.dst) && + !skb_shinfo(*pskb)->tso_size) { + /* No hook can be after us, so this should be OK. */ + ip_fragment(*pskb, okfn); + return NF_STOLEN; + } + return NF_ACCEPT; +} + +static unsigned int ipv4_conntrack_in(unsigned int hooknum, + struct sk_buff **pskb, + const struct net_device *in, + const struct net_device *out, + int (*okfn)(struct sk_buff *)) +{ + return nf_conntrack_in(PF_INET, hooknum, pskb); +} + +static unsigned int ipv4_conntrack_local(unsigned int hooknum, + struct sk_buff **pskb, + const struct net_device *in, + const struct net_device *out, + int (*okfn)(struct sk_buff *)) +{ + /* root is playing with raw sockets. */ + if ((*pskb)->len < sizeof(struct iphdr) + || (*pskb)->nh.iph->ihl * 4 < sizeof(struct iphdr)) { + if (net_ratelimit()) + printk("ipt_hook: happy cracking.\n"); + return NF_ACCEPT; + } + return nf_conntrack_in(PF_INET, hooknum, pskb); +} + +/* Connection tracking may drop packets, but never alters them, so + make it the first hook. */ +static struct nf_hook_ops ipv4_conntrack_defrag_ops = { + .hook = ipv4_conntrack_defrag, + .owner = THIS_MODULE, + .pf = PF_INET, + .hooknum = NF_IP_PRE_ROUTING, + .priority = NF_IP_PRI_CONNTRACK_DEFRAG, +}; + +static struct nf_hook_ops ipv4_conntrack_in_ops = { + .hook = ipv4_conntrack_in, + .owner = THIS_MODULE, + .pf = PF_INET, + .hooknum = NF_IP_PRE_ROUTING, + .priority = NF_IP_PRI_CONNTRACK, +}; + +static struct nf_hook_ops ipv4_conntrack_defrag_local_out_ops = { + .hook = ipv4_conntrack_defrag, + .owner = THIS_MODULE, + .pf = PF_INET, + .hooknum = NF_IP_LOCAL_OUT, + .priority = NF_IP_PRI_CONNTRACK_DEFRAG, +}; + +static struct nf_hook_ops ipv4_conntrack_local_out_ops = { + .hook = ipv4_conntrack_local, + .owner = THIS_MODULE, + .pf = PF_INET, + .hooknum = NF_IP_LOCAL_OUT, + .priority = NF_IP_PRI_CONNTRACK, +}; + +/* helpers */ +static struct nf_hook_ops ipv4_conntrack_helper_out_ops = { + .hook = ipv4_conntrack_help, + .owner = THIS_MODULE, + .pf = PF_INET, + .hooknum = NF_IP_POST_ROUTING, + .priority = NF_IP_PRI_CONNTRACK_HELPER, +}; + +static struct nf_hook_ops ipv4_conntrack_helper_in_ops = { + .hook = ipv4_conntrack_help, + .owner = THIS_MODULE, + .pf = PF_INET, + .hooknum = NF_IP_LOCAL_IN, + .priority = NF_IP_PRI_CONNTRACK_HELPER, +}; + + +/* Refragmenter; last chance. */ +static struct nf_hook_ops ipv4_conntrack_out_ops = { + .hook = ipv4_refrag, + .owner = THIS_MODULE, + .pf = PF_INET, + .hooknum = NF_IP_POST_ROUTING, + .priority = NF_IP_PRI_CONNTRACK_CONFIRM, +}; + +static struct nf_hook_ops ipv4_conntrack_local_in_ops = { + .hook = ipv4_confirm, + .owner = THIS_MODULE, + .pf = PF_INET, + .hooknum = NF_IP_LOCAL_IN, + .priority = NF_IP_PRI_CONNTRACK_CONFIRM, +}; + +#ifdef CONFIG_SYSCTL +/* From nf_conntrack_proto_icmp.c */ +extern unsigned long nf_ct_icmp_timeout; +static struct ctl_table_header *nf_ct_ipv4_sysctl_header; + +static ctl_table nf_ct_sysctl_table[] = { + { + .ctl_name = NET_NF_CONNTRACK_ICMP_TIMEOUT, + .procname = "nf_conntrack_icmp_timeout", + .data = &nf_ct_icmp_timeout, + .maxlen = sizeof(unsigned int), + .mode = 0644, + .proc_handler = &proc_dointvec_jiffies, + }, + { .ctl_name = 0 } +}; + +static ctl_table nf_ct_netfilter_table[] = { + { + .ctl_name = NET_NETFILTER, + .procname = "netfilter", + .mode = 0555, + .child = nf_ct_sysctl_table, + }, + { .ctl_name = 0 } +}; + +static ctl_table nf_ct_net_table[] = { + { + .ctl_name = CTL_NET, + .procname = "net", + .mode = 0555, + .child = nf_ct_netfilter_table, + }, + { .ctl_name = 0 } +}; +#endif + +/* Fast function for those who don't want to parse /proc (and I don't + blame them). */ +/* Reversing the socket's dst/src point of view gives us the reply + mapping. */ +static int +getorigdst(struct sock *sk, int optval, void __user *user, int *len) +{ + struct inet_sock *inet = inet_sk(sk); + struct nf_conntrack_tuple_hash *h; + struct nf_conntrack_tuple tuple; + + NF_CT_TUPLE_U_BLANK(&tuple); + tuple.src.u3.ip = inet->rcv_saddr; + tuple.src.u.tcp.port = inet->sport; + tuple.dst.u3.ip = inet->daddr; + tuple.dst.u.tcp.port = inet->dport; + tuple.src.l3num = PF_INET; + tuple.dst.protonum = IPPROTO_TCP; + + /* We only do TCP at the moment: is there a better way? */ + if (strcmp(sk->sk_prot->name, "TCP")) { + DEBUGP("SO_ORIGINAL_DST: Not a TCP socket\n"); + return -ENOPROTOOPT; + } + + if ((unsigned int) *len < sizeof(struct sockaddr_in)) { + DEBUGP("SO_ORIGINAL_DST: len %u not %u\n", + *len, sizeof(struct sockaddr_in)); + return -EINVAL; + } + + h = nf_conntrack_find_get(&tuple, NULL); + if (h) { + struct sockaddr_in sin; + struct nf_conn *ct = nf_ct_tuplehash_to_ctrack(h); + + sin.sin_family = AF_INET; + sin.sin_port = ct->tuplehash[IP_CT_DIR_ORIGINAL] + .tuple.dst.u.tcp.port; + sin.sin_addr.s_addr = ct->tuplehash[IP_CT_DIR_ORIGINAL] + .tuple.dst.u3.ip; + + DEBUGP("SO_ORIGINAL_DST: %u.%u.%u.%u %u\n", + NIPQUAD(sin.sin_addr.s_addr), ntohs(sin.sin_port)); + nf_ct_put(ct); + if (copy_to_user(user, &sin, sizeof(sin)) != 0) + return -EFAULT; + else + return 0; + } + DEBUGP("SO_ORIGINAL_DST: Can't find %u.%u.%u.%u/%u-%u.%u.%u.%u/%u.\n", + NIPQUAD(tuple.src.u3.ip), ntohs(tuple.src.u.tcp.port), + NIPQUAD(tuple.dst.u3.ip), ntohs(tuple.dst.u.tcp.port)); + return -ENOENT; +} + +static struct nf_sockopt_ops so_getorigdst = { + .pf = PF_INET, + .get_optmin = SO_ORIGINAL_DST, + .get_optmax = SO_ORIGINAL_DST+1, + .get = &getorigdst, +}; + +struct nf_conntrack_l3proto nf_conntrack_l3proto_ipv4 = { + .l3proto = PF_INET, + .name = "ipv4", + .pkt_to_tuple = ipv4_pkt_to_tuple, + .invert_tuple = ipv4_invert_tuple, + .print_tuple = ipv4_print_tuple, + .print_conntrack = ipv4_print_conntrack, + .prepare = ipv4_prepare, + .get_features = ipv4_get_features, + .me = THIS_MODULE, +}; + +extern struct nf_conntrack_protocol nf_conntrack_protocol_tcp4; +extern struct nf_conntrack_protocol nf_conntrack_protocol_udp4; +extern struct nf_conntrack_protocol nf_conntrack_protocol_icmp; +static int init_or_cleanup(int init) +{ + int ret = 0; + + if (!init) goto cleanup; + + ret = nf_register_sockopt(&so_getorigdst); + if (ret < 0) { + printk(KERN_ERR "Unable to register netfilter socket option\n"); + goto cleanup_nothing; + } + + ret = nf_conntrack_protocol_register(&nf_conntrack_protocol_tcp4); + if (ret < 0) { + printk("nf_conntrack_ipv4: can't register tcp.\n"); + goto cleanup_sockopt; + } + + ret = nf_conntrack_protocol_register(&nf_conntrack_protocol_udp4); + if (ret < 0) { + printk("nf_conntrack_ipv4: can't register udp.\n"); + goto cleanup_tcp; + } + + ret = nf_conntrack_protocol_register(&nf_conntrack_protocol_icmp); + if (ret < 0) { + printk("nf_conntrack_ipv4: can't register icmp.\n"); + goto cleanup_udp; + } + + ret = nf_conntrack_l3proto_register(&nf_conntrack_l3proto_ipv4); + if (ret < 0) { + printk("nf_conntrack_ipv4: can't register ipv4\n"); + goto cleanup_icmp; + } + + ret = nf_register_hook(&ipv4_conntrack_defrag_ops); + if (ret < 0) { + printk("nf_conntrack_ipv4: can't register pre-routing defrag hook.\n"); + goto cleanup_ipv4; + } + ret = nf_register_hook(&ipv4_conntrack_defrag_local_out_ops); + if (ret < 0) { + printk("nf_conntrack_ipv4: can't register local_out defrag hook.\n"); + goto cleanup_defragops; + } + + ret = nf_register_hook(&ipv4_conntrack_in_ops); + if (ret < 0) { + printk("nf_conntrack_ipv4: can't register pre-routing hook.\n"); + goto cleanup_defraglocalops; + } + + ret = nf_register_hook(&ipv4_conntrack_local_out_ops); + if (ret < 0) { + printk("nf_conntrack_ipv4: can't register local out hook.\n"); + goto cleanup_inops; + } + + ret = nf_register_hook(&ipv4_conntrack_helper_in_ops); + if (ret < 0) { + printk("nf_conntrack_ipv4: can't register local helper hook.\n"); + goto cleanup_inandlocalops; + } + + ret = nf_register_hook(&ipv4_conntrack_helper_out_ops); + if (ret < 0) { + printk("nf_conntrack_ipv4: can't register postrouting helper hook.\n"); + goto cleanup_helperinops; + } + + ret = nf_register_hook(&ipv4_conntrack_out_ops); + if (ret < 0) { + printk("nf_conntrack_ipv4: can't register post-routing hook.\n"); + goto cleanup_helperoutops; + } + + ret = nf_register_hook(&ipv4_conntrack_local_in_ops); + if (ret < 0) { + printk("nf_conntrack_ipv4: can't register local in hook.\n"); + goto cleanup_inoutandlocalops; + } + +#ifdef CONFIG_SYSCTL + nf_ct_ipv4_sysctl_header = register_sysctl_table(nf_ct_net_table, 0); + if (nf_ct_ipv4_sysctl_header == NULL) { + printk("nf_conntrack: can't register to sysctl.\n"); + ret = -ENOMEM; + goto cleanup_localinops; + } +#endif + + /* For use by REJECT target */ + ip_ct_attach = __nf_conntrack_attach; + + return ret; + + cleanup: + synchronize_net(); + ip_ct_attach = NULL; +#ifdef CONFIG_SYSCTL + unregister_sysctl_table(nf_ct_ipv4_sysctl_header); + cleanup_localinops: +#endif + nf_unregister_hook(&ipv4_conntrack_local_in_ops); + cleanup_inoutandlocalops: + nf_unregister_hook(&ipv4_conntrack_out_ops); + cleanup_helperoutops: + nf_unregister_hook(&ipv4_conntrack_helper_out_ops); + cleanup_helperinops: + nf_unregister_hook(&ipv4_conntrack_helper_in_ops); + cleanup_inandlocalops: + nf_unregister_hook(&ipv4_conntrack_local_out_ops); + cleanup_inops: + nf_unregister_hook(&ipv4_conntrack_in_ops); + cleanup_defraglocalops: + nf_unregister_hook(&ipv4_conntrack_defrag_local_out_ops); + cleanup_defragops: + nf_unregister_hook(&ipv4_conntrack_defrag_ops); + cleanup_ipv4: + nf_conntrack_l3proto_unregister(&nf_conntrack_l3proto_ipv4); + cleanup_icmp: + nf_conntrack_protocol_unregister(&nf_conntrack_protocol_icmp); + cleanup_udp: + nf_conntrack_protocol_unregister(&nf_conntrack_protocol_udp4); + cleanup_tcp: + nf_conntrack_protocol_unregister(&nf_conntrack_protocol_tcp4); + cleanup_sockopt: + nf_unregister_sockopt(&so_getorigdst); + cleanup_nothing: + return ret; +} + +MODULE_LICENSE("GPL"); + +static int __init init(void) +{ + need_nf_conntrack(); + return init_or_cleanup(1); +} + +static void __exit fini(void) +{ + init_or_cleanup(0); +} + +module_init(init); +module_exit(fini); + +void need_ip_conntrack(void) +{ +} + +EXPORT_SYMBOL(need_ip_conntrack); +EXPORT_SYMBOL(nf_ct_ipv4_gather_frags); diff --git a/net/ipv4/netfilter/nf_conntrack_proto_icmp.c b/net/ipv4/netfilter/nf_conntrack_proto_icmp.c new file mode 100644 index 000000000000..7ddb5c08f7b8 --- /dev/null +++ b/net/ipv4/netfilter/nf_conntrack_proto_icmp.c @@ -0,0 +1,301 @@ +/* (C) 1999-2001 Paul `Rusty' Russell + * (C) 2002-2004 Netfilter Core Team + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 as + * published by the Free Software Foundation. + * + * 16 Dec 2003: Yasuyuki Kozakai @USAGI + * - enable working with Layer 3 protocol independent connection tracking. + * + * Derived from net/ipv4/netfilter/ip_conntrack_proto_icmp.c + */ + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +unsigned long nf_ct_icmp_timeout = 30*HZ; + +#if 0 +#define DEBUGP printk +#else +#define DEBUGP(format, args...) +#endif + +static int icmp_pkt_to_tuple(const struct sk_buff *skb, + unsigned int dataoff, + struct nf_conntrack_tuple *tuple) +{ + struct icmphdr _hdr, *hp; + + hp = skb_header_pointer(skb, dataoff, sizeof(_hdr), &_hdr); + if (hp == NULL) + return 0; + + tuple->dst.u.icmp.type = hp->type; + tuple->src.u.icmp.id = hp->un.echo.id; + tuple->dst.u.icmp.code = hp->code; + + return 1; +} + +static int icmp_invert_tuple(struct nf_conntrack_tuple *tuple, + const struct nf_conntrack_tuple *orig) +{ + /* Add 1; spaces filled with 0. */ + static u_int8_t invmap[] + = { [ICMP_ECHO] = ICMP_ECHOREPLY + 1, + [ICMP_ECHOREPLY] = ICMP_ECHO + 1, + [ICMP_TIMESTAMP] = ICMP_TIMESTAMPREPLY + 1, + [ICMP_TIMESTAMPREPLY] = ICMP_TIMESTAMP + 1, + [ICMP_INFO_REQUEST] = ICMP_INFO_REPLY + 1, + [ICMP_INFO_REPLY] = ICMP_INFO_REQUEST + 1, + [ICMP_ADDRESS] = ICMP_ADDRESSREPLY + 1, + [ICMP_ADDRESSREPLY] = ICMP_ADDRESS + 1}; + + if (orig->dst.u.icmp.type >= sizeof(invmap) + || !invmap[orig->dst.u.icmp.type]) + return 0; + + tuple->src.u.icmp.id = orig->src.u.icmp.id; + tuple->dst.u.icmp.type = invmap[orig->dst.u.icmp.type] - 1; + tuple->dst.u.icmp.code = orig->dst.u.icmp.code; + return 1; +} + +/* Print out the per-protocol part of the tuple. */ +static int icmp_print_tuple(struct seq_file *s, + const struct nf_conntrack_tuple *tuple) +{ + return seq_printf(s, "type=%u code=%u id=%u ", + tuple->dst.u.icmp.type, + tuple->dst.u.icmp.code, + ntohs(tuple->src.u.icmp.id)); +} + +/* Print out the private part of the conntrack. */ +static int icmp_print_conntrack(struct seq_file *s, + const struct nf_conn *conntrack) +{ + return 0; +} + +/* Returns verdict for packet, or -1 for invalid. */ +static int icmp_packet(struct nf_conn *ct, + const struct sk_buff *skb, + unsigned int dataoff, + enum ip_conntrack_info ctinfo, + int pf, + unsigned int hooknum) +{ + /* Try to delete connection immediately after all replies: + won't actually vanish as we still have skb, and del_timer + means this will only run once even if count hits zero twice + (theoretically possible with SMP) */ + if (CTINFO2DIR(ctinfo) == IP_CT_DIR_REPLY) { + if (atomic_dec_and_test(&ct->proto.icmp.count) + && del_timer(&ct->timeout)) + ct->timeout.function((unsigned long)ct); + } else { + atomic_inc(&ct->proto.icmp.count); + nf_conntrack_event_cache(IPCT_PROTOINFO_VOLATILE, skb); + nf_ct_refresh_acct(ct, ctinfo, skb, nf_ct_icmp_timeout); + } + + return NF_ACCEPT; +} + +/* Called when a new connection for this protocol found. */ +static int icmp_new(struct nf_conn *conntrack, + const struct sk_buff *skb, unsigned int dataoff) +{ + static u_int8_t valid_new[] + = { [ICMP_ECHO] = 1, + [ICMP_TIMESTAMP] = 1, + [ICMP_INFO_REQUEST] = 1, + [ICMP_ADDRESS] = 1 }; + + if (conntrack->tuplehash[0].tuple.dst.u.icmp.type >= sizeof(valid_new) + || !valid_new[conntrack->tuplehash[0].tuple.dst.u.icmp.type]) { + /* Can't create a new ICMP `conn' with this. */ + DEBUGP("icmp: can't create new conn with type %u\n", + conntrack->tuplehash[0].tuple.dst.u.icmp.type); + NF_CT_DUMP_TUPLE(&conntrack->tuplehash[0].tuple); + return 0; + } + atomic_set(&conntrack->proto.icmp.count, 0); + return 1; +} + +extern struct nf_conntrack_l3proto nf_conntrack_l3proto_ipv4; +/* Returns conntrack if it dealt with ICMP, and filled in skb fields */ +static int +icmp_error_message(struct sk_buff *skb, + enum ip_conntrack_info *ctinfo, + unsigned int hooknum) +{ + struct nf_conntrack_tuple innertuple, origtuple; + struct { + struct icmphdr icmp; + struct iphdr ip; + } _in, *inside; + struct nf_conntrack_protocol *innerproto; + struct nf_conntrack_tuple_hash *h; + int dataoff; + + NF_CT_ASSERT(skb->nfct == NULL); + + /* Not enough header? */ + inside = skb_header_pointer(skb, skb->nh.iph->ihl*4, sizeof(_in), &_in); + if (inside == NULL) + return -NF_ACCEPT; + + /* Ignore ICMP's containing fragments (shouldn't happen) */ + if (inside->ip.frag_off & htons(IP_OFFSET)) { + DEBUGP("icmp_error_message: fragment of proto %u\n", + inside->ip.protocol); + return -NF_ACCEPT; + } + + innerproto = nf_ct_find_proto(PF_INET, inside->ip.protocol); + dataoff = skb->nh.iph->ihl*4 + sizeof(inside->icmp); + /* Are they talking about one of our connections? */ + if (!nf_ct_get_tuple(skb, dataoff, dataoff + inside->ip.ihl*4, PF_INET, + inside->ip.protocol, &origtuple, + &nf_conntrack_l3proto_ipv4, innerproto)) { + DEBUGP("icmp_error_message: ! get_tuple p=%u", + inside->ip.protocol); + return -NF_ACCEPT; + } + + /* Ordinarily, we'd expect the inverted tupleproto, but it's + been preserved inside the ICMP. */ + if (!nf_ct_invert_tuple(&innertuple, &origtuple, + &nf_conntrack_l3proto_ipv4, innerproto)) { + DEBUGP("icmp_error_message: no match\n"); + return -NF_ACCEPT; + } + + *ctinfo = IP_CT_RELATED; + + h = nf_conntrack_find_get(&innertuple, NULL); + if (!h) { + /* Locally generated ICMPs will match inverted if they + haven't been SNAT'ed yet */ + /* FIXME: NAT code has to handle half-done double NAT --RR */ + if (hooknum == NF_IP_LOCAL_OUT) + h = nf_conntrack_find_get(&origtuple, NULL); + + if (!h) { + DEBUGP("icmp_error_message: no match\n"); + return -NF_ACCEPT; + } + + /* Reverse direction from that found */ + if (NF_CT_DIRECTION(h) == IP_CT_DIR_REPLY) + *ctinfo += IP_CT_IS_REPLY; + } else { + if (NF_CT_DIRECTION(h) == IP_CT_DIR_REPLY) + *ctinfo += IP_CT_IS_REPLY; + } + + /* Update skb to refer to this connection */ + skb->nfct = &nf_ct_tuplehash_to_ctrack(h)->ct_general; + skb->nfctinfo = *ctinfo; + return -NF_ACCEPT; +} + +/* Small and modified version of icmp_rcv */ +static int +icmp_error(struct sk_buff *skb, unsigned int dataoff, + enum ip_conntrack_info *ctinfo, int pf, unsigned int hooknum) +{ + struct icmphdr _ih, *icmph; + + /* Not enough header? */ + icmph = skb_header_pointer(skb, skb->nh.iph->ihl*4, sizeof(_ih), &_ih); + if (icmph == NULL) { + if (LOG_INVALID(IPPROTO_ICMP)) + nf_log_packet(PF_INET, 0, skb, NULL, NULL, NULL, + "nf_ct_icmp: short packet "); + return -NF_ACCEPT; + } + + /* See ip_conntrack_proto_tcp.c */ + if (hooknum != NF_IP_PRE_ROUTING) + goto checksum_skipped; + + switch (skb->ip_summed) { + case CHECKSUM_HW: + if (!(u16)csum_fold(skb->csum)) + break; + if (LOG_INVALID(IPPROTO_ICMP)) + nf_log_packet(PF_INET, 0, skb, NULL, NULL, NULL, + "nf_ct_icmp: bad HW ICMP checksum "); + return -NF_ACCEPT; + case CHECKSUM_NONE: + if ((u16)csum_fold(skb_checksum(skb, 0, skb->len, 0))) { + if (LOG_INVALID(IPPROTO_ICMP)) + nf_log_packet(PF_INET, 0, skb, NULL, NULL, + NULL, + "nf_ct_icmp: bad ICMP checksum "); + return -NF_ACCEPT; + } + default: + break; + } + +checksum_skipped: + /* + * 18 is the highest 'known' ICMP type. Anything else is a mystery + * + * RFC 1122: 3.2.2 Unknown ICMP messages types MUST be silently + * discarded. + */ + if (icmph->type > NR_ICMP_TYPES) { + if (LOG_INVALID(IPPROTO_ICMP)) + nf_log_packet(PF_INET, 0, skb, NULL, NULL, NULL, + "nf_ct_icmp: invalid ICMP type "); + return -NF_ACCEPT; + } + + /* Need to track icmp error message? */ + if (icmph->type != ICMP_DEST_UNREACH + && icmph->type != ICMP_SOURCE_QUENCH + && icmph->type != ICMP_TIME_EXCEEDED + && icmph->type != ICMP_PARAMETERPROB + && icmph->type != ICMP_REDIRECT) + return NF_ACCEPT; + + return icmp_error_message(skb, ctinfo, hooknum); +} + +struct nf_conntrack_protocol nf_conntrack_protocol_icmp = +{ + .list = { NULL, NULL }, + .l3proto = PF_INET, + .proto = IPPROTO_ICMP, + .name = "icmp", + .pkt_to_tuple = icmp_pkt_to_tuple, + .invert_tuple = icmp_invert_tuple, + .print_tuple = icmp_print_tuple, + .print_conntrack = icmp_print_conntrack, + .packet = icmp_packet, + .new = icmp_new, + .error = icmp_error, + .destroy = NULL, + .me = NULL +}; + +EXPORT_SYMBOL(nf_conntrack_protocol_icmp); diff --git a/net/ipv6/ip6_input.c b/net/ipv6/ip6_input.c index 6e3480426939..a6026d2787d2 100644 --- a/net/ipv6/ip6_input.c +++ b/net/ipv6/ip6_input.c @@ -176,6 +176,11 @@ resubmit: if (ipprot->flags & INET6_PROTO_FINAL) { struct ipv6hdr *hdr; + /* Free reference early: we don't need it any more, + and it may hold ip_conntrack module loaded + indefinitely. */ + nf_reset(skb); + skb_postpull_rcsum(skb, skb->nh.raw, skb->h.raw - skb->nh.raw); hdr = skb->nh.ipv6h; diff --git a/net/ipv6/ip6_output.c b/net/ipv6/ip6_output.c index dbd9767b32e4..c1fa693511a1 100644 --- a/net/ipv6/ip6_output.c +++ b/net/ipv6/ip6_output.c @@ -441,9 +441,15 @@ static void ip6_copy_metadata(struct sk_buff *to, struct sk_buff *from) #ifdef CONFIG_NETFILTER to->nfmark = from->nfmark; /* Connection association is same as pre-frag packet */ + nf_conntrack_put(to->nfct); to->nfct = from->nfct; nf_conntrack_get(to->nfct); to->nfctinfo = from->nfctinfo; +#if defined(CONFIG_NF_CONNTRACK) || defined(CONFIG_NF_CONNTRACK_MODULE) + nf_conntrack_put_reasm(to->nfct_reasm); + to->nfct_reasm = from->nfct_reasm; + nf_conntrack_get_reasm(to->nfct_reasm); +#endif #ifdef CONFIG_BRIDGE_NETFILTER nf_bridge_put(to->nf_bridge); to->nf_bridge = from->nf_bridge; diff --git a/net/ipv6/netfilter/Kconfig b/net/ipv6/netfilter/Kconfig index bb7ccfe33f23..971ba60bf6e9 100644 --- a/net/ipv6/netfilter/Kconfig +++ b/net/ipv6/netfilter/Kconfig @@ -278,5 +278,19 @@ config IP6_NF_RAW If you want to compile it as a module, say M here and read . If unsure, say `N'. +config NF_CONNTRACK_IPV6 + tristate "IPv6 support for new connection tracking (EXPERIMENTAL)" + depends on EXPERIMENTAL && NF_CONNTRACK + ---help--- + Connection tracking keeps a record of what packets have passed + through your machine, in order to figure out how they are related + into connections. + + This is IPv6 support on Layer 3 independent connection tracking. + Layer 3 independent connection tracking is experimental scheme + which generalize ip_conntrack to support other layer 3 protocols. + + To compile it as a module, choose M here. If unsure, say N. + endmenu diff --git a/net/ipv6/netfilter/Makefile b/net/ipv6/netfilter/Makefile index 2b2c370e8b1c..9ab5b2ca1f59 100644 --- a/net/ipv6/netfilter/Makefile +++ b/net/ipv6/netfilter/Makefile @@ -27,3 +27,9 @@ obj-$(CONFIG_IP6_NF_TARGET_LOG) += ip6t_LOG.o obj-$(CONFIG_IP6_NF_RAW) += ip6table_raw.o obj-$(CONFIG_IP6_NF_MATCH_HL) += ip6t_hl.o obj-$(CONFIG_IP6_NF_TARGET_REJECT) += ip6t_REJECT.o + +# objects for l3 independent conntrack +nf_conntrack_ipv6-objs := nf_conntrack_l3proto_ipv6.o nf_conntrack_proto_icmpv6.o nf_conntrack_reasm.o + +# l3 independent conntrack +obj-$(CONFIG_NF_CONNTRACK_IPV6) += nf_conntrack_ipv6.o diff --git a/net/ipv6/netfilter/ip6t_MARK.c b/net/ipv6/netfilter/ip6t_MARK.c index 0c7584f92172..eab8fb864ee0 100644 --- a/net/ipv6/netfilter/ip6t_MARK.c +++ b/net/ipv6/netfilter/ip6t_MARK.c @@ -56,9 +56,9 @@ checkentry(const char *tablename, return 1; } -static struct ip6t_target ip6t_mark_reg = { - .name = "MARK", - .target = target, +static struct ip6t_target ip6t_mark_reg = { + .name = "MARK", + .target = target, .checkentry = checkentry, .me = THIS_MODULE }; diff --git a/net/ipv6/netfilter/nf_conntrack_l3proto_ipv6.c b/net/ipv6/netfilter/nf_conntrack_l3proto_ipv6.c new file mode 100644 index 000000000000..e2c90b3a8074 --- /dev/null +++ b/net/ipv6/netfilter/nf_conntrack_l3proto_ipv6.c @@ -0,0 +1,556 @@ +/* + * Copyright (C)2004 USAGI/WIDE Project + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 as + * published by the Free Software Foundation. + * + * Author: + * Yasuyuki Kozakai @USAGI + * + * 16 Dec 2003: Yasuyuki Kozakai @USAGI + * - support Layer 3 protocol independent connection tracking. + * Based on the original ip_conntrack code which had the following + * copyright information: + * (C) 1999-2001 Paul `Rusty' Russell + * (C) 2002-2004 Netfilter Core Team + * + * 23 Mar 2004: Yasuyuki Kozakai @USAGI + * - add get_features() to support various size of conntrack + * structures. + */ + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include +#include +#include +#include +#include +#include + +#if 0 +#define DEBUGP printk +#else +#define DEBUGP(format, args...) +#endif + +DECLARE_PER_CPU(struct ip_conntrack_stat, nf_conntrack_stat); + +static int ipv6_pkt_to_tuple(const struct sk_buff *skb, unsigned int nhoff, + struct nf_conntrack_tuple *tuple) +{ + u_int32_t _addrs[8], *ap; + + ap = skb_header_pointer(skb, nhoff + offsetof(struct ipv6hdr, saddr), + sizeof(_addrs), _addrs); + if (ap == NULL) + return 0; + + memcpy(tuple->src.u3.ip6, ap, sizeof(tuple->src.u3.ip6)); + memcpy(tuple->dst.u3.ip6, ap + 4, sizeof(tuple->dst.u3.ip6)); + + return 1; +} + +static int ipv6_invert_tuple(struct nf_conntrack_tuple *tuple, + const struct nf_conntrack_tuple *orig) +{ + memcpy(tuple->src.u3.ip6, orig->dst.u3.ip6, sizeof(tuple->src.u3.ip6)); + memcpy(tuple->dst.u3.ip6, orig->src.u3.ip6, sizeof(tuple->dst.u3.ip6)); + + return 1; +} + +static int ipv6_print_tuple(struct seq_file *s, + const struct nf_conntrack_tuple *tuple) +{ + return seq_printf(s, "src=%x:%x:%x:%x:%x:%x:%x:%x dst=%x:%x:%x:%x:%x:%x:%x:%x ", + NIP6(*((struct in6_addr *)tuple->src.u3.ip6)), + NIP6(*((struct in6_addr *)tuple->dst.u3.ip6))); +} + +static int ipv6_print_conntrack(struct seq_file *s, + const struct nf_conn *conntrack) +{ + return 0; +} + +/* + * Based on ipv6_skip_exthdr() in net/ipv6/exthdr.c + * + * This function parses (probably truncated) exthdr set "hdr" + * of length "len". "nexthdrp" initially points to some place, + * where type of the first header can be found. + * + * It skips all well-known exthdrs, and returns pointer to the start + * of unparsable area i.e. the first header with unknown type. + * if success, *nexthdr is updated by type/protocol of this header. + * + * NOTES: - it may return pointer pointing beyond end of packet, + * if the last recognized header is truncated in the middle. + * - if packet is truncated, so that all parsed headers are skipped, + * it returns -1. + * - if packet is fragmented, return pointer of the fragment header. + * - ESP is unparsable for now and considered like + * normal payload protocol. + * - Note also special handling of AUTH header. Thanks to IPsec wizards. + */ + +int nf_ct_ipv6_skip_exthdr(struct sk_buff *skb, int start, u8 *nexthdrp, + int len) +{ + u8 nexthdr = *nexthdrp; + + while (ipv6_ext_hdr(nexthdr)) { + struct ipv6_opt_hdr hdr; + int hdrlen; + + if (len < (int)sizeof(struct ipv6_opt_hdr)) + return -1; + if (nexthdr == NEXTHDR_NONE) + break; + if (nexthdr == NEXTHDR_FRAGMENT) + break; + if (skb_copy_bits(skb, start, &hdr, sizeof(hdr))) + BUG(); + if (nexthdr == NEXTHDR_AUTH) + hdrlen = (hdr.hdrlen+2)<<2; + else + hdrlen = ipv6_optlen(&hdr); + + nexthdr = hdr.nexthdr; + len -= hdrlen; + start += hdrlen; + } + + *nexthdrp = nexthdr; + return start; +} + +static int +ipv6_prepare(struct sk_buff **pskb, unsigned int hooknum, unsigned int *dataoff, + u_int8_t *protonum) +{ + unsigned int extoff; + unsigned char pnum; + int protoff; + + extoff = (u8*)((*pskb)->nh.ipv6h + 1) - (*pskb)->data; + pnum = (*pskb)->nh.ipv6h->nexthdr; + + protoff = nf_ct_ipv6_skip_exthdr(*pskb, extoff, &pnum, + (*pskb)->len - extoff); + + /* + * (protoff == (*pskb)->len) mean that the packet doesn't have no data + * except of IPv6 & ext headers. but it's tracked anyway. - YK + */ + if ((protoff < 0) || (protoff > (*pskb)->len)) { + DEBUGP("ip6_conntrack_core: can't find proto in pkt\n"); + NF_CT_STAT_INC(error); + NF_CT_STAT_INC(invalid); + return -NF_ACCEPT; + } + + *dataoff = protoff; + *protonum = pnum; + return NF_ACCEPT; +} + +static u_int32_t ipv6_get_features(const struct nf_conntrack_tuple *tuple) +{ + return NF_CT_F_BASIC; +} + +static unsigned int ipv6_confirm(unsigned int hooknum, + struct sk_buff **pskb, + const struct net_device *in, + const struct net_device *out, + int (*okfn)(struct sk_buff *)) +{ + struct nf_conn *ct; + enum ip_conntrack_info ctinfo; + + /* This is where we call the helper: as the packet goes out. */ + ct = nf_ct_get(*pskb, &ctinfo); + if (ct && ct->helper) { + unsigned int ret, protoff; + unsigned int extoff = (u8*)((*pskb)->nh.ipv6h + 1) + - (*pskb)->data; + unsigned char pnum = (*pskb)->nh.ipv6h->nexthdr; + + protoff = nf_ct_ipv6_skip_exthdr(*pskb, extoff, &pnum, + (*pskb)->len - extoff); + if (protoff < 0 || protoff > (*pskb)->len || + pnum == NEXTHDR_FRAGMENT) { + DEBUGP("proto header not found\n"); + return NF_ACCEPT; + } + + ret = ct->helper->help(pskb, protoff, ct, ctinfo); + if (ret != NF_ACCEPT) + return ret; + } + + /* We've seen it coming out the other side: confirm it */ + + return nf_conntrack_confirm(pskb); +} + +extern struct sk_buff *nf_ct_frag6_gather(struct sk_buff *skb); +extern void nf_ct_frag6_output(unsigned int hooknum, struct sk_buff *skb, + struct net_device *in, + struct net_device *out, + int (*okfn)(struct sk_buff *)); +static unsigned int ipv6_defrag(unsigned int hooknum, + struct sk_buff **pskb, + const struct net_device *in, + const struct net_device *out, + int (*okfn)(struct sk_buff *)) +{ + struct sk_buff *reasm; + + /* Previously seen (loopback)? */ + if ((*pskb)->nfct) + return NF_ACCEPT; + + reasm = nf_ct_frag6_gather(*pskb); + + /* queued */ + if (reasm == NULL) + return NF_STOLEN; + + /* error occured or not fragmented */ + if (reasm == *pskb) + return NF_ACCEPT; + + nf_ct_frag6_output(hooknum, reasm, (struct net_device *)in, + (struct net_device *)out, okfn); + + return NF_STOLEN; +} + +static unsigned int ipv6_conntrack_in(unsigned int hooknum, + struct sk_buff **pskb, + const struct net_device *in, + const struct net_device *out, + int (*okfn)(struct sk_buff *)) +{ + struct sk_buff *reasm = (*pskb)->nfct_reasm; + + /* This packet is fragmented and has reassembled packet. */ + if (reasm) { + /* Reassembled packet isn't parsed yet ? */ + if (!reasm->nfct) { + unsigned int ret; + + ret = nf_conntrack_in(PF_INET6, hooknum, &reasm); + if (ret != NF_ACCEPT) + return ret; + } + nf_conntrack_get(reasm->nfct); + (*pskb)->nfct = reasm->nfct; + return NF_ACCEPT; + } + + return nf_conntrack_in(PF_INET6, hooknum, pskb); +} + +static unsigned int ipv6_conntrack_local(unsigned int hooknum, + struct sk_buff **pskb, + const struct net_device *in, + const struct net_device *out, + int (*okfn)(struct sk_buff *)) +{ + /* root is playing with raw sockets. */ + if ((*pskb)->len < sizeof(struct ipv6hdr)) { + if (net_ratelimit()) + printk("ipv6_conntrack_local: packet too short\n"); + return NF_ACCEPT; + } + return ipv6_conntrack_in(hooknum, pskb, in, out, okfn); +} + +/* Connection tracking may drop packets, but never alters them, so + make it the first hook. */ +static struct nf_hook_ops ipv6_conntrack_defrag_ops = { + .hook = ipv6_defrag, + .owner = THIS_MODULE, + .pf = PF_INET6, + .hooknum = NF_IP6_PRE_ROUTING, + .priority = NF_IP6_PRI_CONNTRACK_DEFRAG, +}; + +static struct nf_hook_ops ipv6_conntrack_in_ops = { + .hook = ipv6_conntrack_in, + .owner = THIS_MODULE, + .pf = PF_INET6, + .hooknum = NF_IP6_PRE_ROUTING, + .priority = NF_IP6_PRI_CONNTRACK, +}; + +static struct nf_hook_ops ipv6_conntrack_local_out_ops = { + .hook = ipv6_conntrack_local, + .owner = THIS_MODULE, + .pf = PF_INET6, + .hooknum = NF_IP6_LOCAL_OUT, + .priority = NF_IP6_PRI_CONNTRACK, +}; + +static struct nf_hook_ops ipv6_conntrack_defrag_local_out_ops = { + .hook = ipv6_defrag, + .owner = THIS_MODULE, + .pf = PF_INET6, + .hooknum = NF_IP6_LOCAL_OUT, + .priority = NF_IP6_PRI_CONNTRACK_DEFRAG, +}; + +/* Refragmenter; last chance. */ +static struct nf_hook_ops ipv6_conntrack_out_ops = { + .hook = ipv6_confirm, + .owner = THIS_MODULE, + .pf = PF_INET6, + .hooknum = NF_IP6_POST_ROUTING, + .priority = NF_IP6_PRI_LAST, +}; + +static struct nf_hook_ops ipv6_conntrack_local_in_ops = { + .hook = ipv6_confirm, + .owner = THIS_MODULE, + .pf = PF_INET6, + .hooknum = NF_IP6_LOCAL_IN, + .priority = NF_IP6_PRI_LAST-1, +}; + +#ifdef CONFIG_SYSCTL + +/* From nf_conntrack_proto_icmpv6.c */ +extern unsigned long nf_ct_icmpv6_timeout; + +/* From nf_conntrack_frag6.c */ +extern unsigned long nf_ct_frag6_timeout; +extern unsigned long nf_ct_frag6_low_thresh; +extern unsigned long nf_ct_frag6_high_thresh; + +static struct ctl_table_header *nf_ct_ipv6_sysctl_header; + +static ctl_table nf_ct_sysctl_table[] = { + { + .ctl_name = NET_NF_CONNTRACK_ICMPV6_TIMEOUT, + .procname = "nf_conntrack_icmpv6_timeout", + .data = &nf_ct_icmpv6_timeout, + .maxlen = sizeof(unsigned int), + .mode = 0644, + .proc_handler = &proc_dointvec_jiffies, + }, + { + .ctl_name = NET_NF_CONNTRACK_FRAG6_TIMEOUT, + .procname = "nf_conntrack_frag6_timeout", + .data = &nf_ct_frag6_timeout, + .maxlen = sizeof(unsigned int), + .mode = 0644, + .proc_handler = &proc_dointvec_jiffies, + }, + { + .ctl_name = NET_NF_CONNTRACK_FRAG6_LOW_THRESH, + .procname = "nf_conntrack_frag6_low_thresh", + .data = &nf_ct_frag6_low_thresh, + .maxlen = sizeof(unsigned int), + .mode = 0644, + .proc_handler = &proc_dointvec_jiffies, + }, + { + .ctl_name = NET_NF_CONNTRACK_FRAG6_HIGH_THRESH, + .procname = "nf_conntrack_frag6_high_thresh", + .data = &nf_ct_frag6_high_thresh, + .maxlen = sizeof(unsigned int), + .mode = 0644, + .proc_handler = &proc_dointvec_jiffies, + }, + { .ctl_name = 0 } +}; + +static ctl_table nf_ct_netfilter_table[] = { + { + .ctl_name = NET_NETFILTER, + .procname = "netfilter", + .mode = 0555, + .child = nf_ct_sysctl_table, + }, + { .ctl_name = 0 } +}; + +static ctl_table nf_ct_net_table[] = { + { + .ctl_name = CTL_NET, + .procname = "net", + .mode = 0555, + .child = nf_ct_netfilter_table, + }, + { .ctl_name = 0 } +}; +#endif + +struct nf_conntrack_l3proto nf_conntrack_l3proto_ipv6 = { + .l3proto = PF_INET6, + .name = "ipv6", + .pkt_to_tuple = ipv6_pkt_to_tuple, + .invert_tuple = ipv6_invert_tuple, + .print_tuple = ipv6_print_tuple, + .print_conntrack = ipv6_print_conntrack, + .prepare = ipv6_prepare, + .get_features = ipv6_get_features, + .me = THIS_MODULE, +}; + +extern struct nf_conntrack_protocol nf_conntrack_protocol_tcp6; +extern struct nf_conntrack_protocol nf_conntrack_protocol_udp6; +extern struct nf_conntrack_protocol nf_conntrack_protocol_icmpv6; +extern int nf_ct_frag6_init(void); +extern void nf_ct_frag6_cleanup(void); +static int init_or_cleanup(int init) +{ + int ret = 0; + + if (!init) goto cleanup; + + ret = nf_ct_frag6_init(); + if (ret < 0) { + printk("nf_conntrack_ipv6: can't initialize frag6.\n"); + goto cleanup_nothing; + } + ret = nf_conntrack_protocol_register(&nf_conntrack_protocol_tcp6); + if (ret < 0) { + printk("nf_conntrack_ipv6: can't register tcp.\n"); + goto cleanup_frag6; + } + + ret = nf_conntrack_protocol_register(&nf_conntrack_protocol_udp6); + if (ret < 0) { + printk("nf_conntrack_ipv6: can't register udp.\n"); + goto cleanup_tcp; + } + + ret = nf_conntrack_protocol_register(&nf_conntrack_protocol_icmpv6); + if (ret < 0) { + printk("nf_conntrack_ipv6: can't register icmpv6.\n"); + goto cleanup_udp; + } + + ret = nf_conntrack_l3proto_register(&nf_conntrack_l3proto_ipv6); + if (ret < 0) { + printk("nf_conntrack_ipv6: can't register ipv6\n"); + goto cleanup_icmpv6; + } + + ret = nf_register_hook(&ipv6_conntrack_defrag_ops); + if (ret < 0) { + printk("nf_conntrack_ipv6: can't register pre-routing defrag " + "hook.\n"); + goto cleanup_ipv6; + } + + ret = nf_register_hook(&ipv6_conntrack_defrag_local_out_ops); + if (ret < 0) { + printk("nf_conntrack_ipv6: can't register local_out defrag " + "hook.\n"); + goto cleanup_defragops; + } + + ret = nf_register_hook(&ipv6_conntrack_in_ops); + if (ret < 0) { + printk("nf_conntrack_ipv6: can't register pre-routing hook.\n"); + goto cleanup_defraglocalops; + } + + ret = nf_register_hook(&ipv6_conntrack_local_out_ops); + if (ret < 0) { + printk("nf_conntrack_ipv6: can't register local out hook.\n"); + goto cleanup_inops; + } + + ret = nf_register_hook(&ipv6_conntrack_out_ops); + if (ret < 0) { + printk("nf_conntrack_ipv6: can't register post-routing hook.\n"); + goto cleanup_inandlocalops; + } + + ret = nf_register_hook(&ipv6_conntrack_local_in_ops); + if (ret < 0) { + printk("nf_conntrack_ipv6: can't register local in hook.\n"); + goto cleanup_inoutandlocalops; + } + +#ifdef CONFIG_SYSCTL + nf_ct_ipv6_sysctl_header = register_sysctl_table(nf_ct_net_table, 0); + if (nf_ct_ipv6_sysctl_header == NULL) { + printk("nf_conntrack: can't register to sysctl.\n"); + ret = -ENOMEM; + goto cleanup_localinops; + } +#endif + return ret; + + cleanup: + synchronize_net(); +#ifdef CONFIG_SYSCTL + unregister_sysctl_table(nf_ct_ipv6_sysctl_header); + cleanup_localinops: +#endif + nf_unregister_hook(&ipv6_conntrack_local_in_ops); + cleanup_inoutandlocalops: + nf_unregister_hook(&ipv6_conntrack_out_ops); + cleanup_inandlocalops: + nf_unregister_hook(&ipv6_conntrack_local_out_ops); + cleanup_inops: + nf_unregister_hook(&ipv6_conntrack_in_ops); + cleanup_defraglocalops: + nf_unregister_hook(&ipv6_conntrack_defrag_local_out_ops); + cleanup_defragops: + nf_unregister_hook(&ipv6_conntrack_defrag_ops); + cleanup_ipv6: + nf_conntrack_l3proto_unregister(&nf_conntrack_l3proto_ipv6); + cleanup_icmpv6: + nf_conntrack_protocol_unregister(&nf_conntrack_protocol_icmpv6); + cleanup_udp: + nf_conntrack_protocol_unregister(&nf_conntrack_protocol_udp6); + cleanup_tcp: + nf_conntrack_protocol_unregister(&nf_conntrack_protocol_tcp6); + cleanup_frag6: + nf_ct_frag6_cleanup(); + cleanup_nothing: + return ret; +} + +MODULE_LICENSE("GPL"); +MODULE_AUTHOR("Yasuyuki KOZAKAI @USAGI "); + +static int __init init(void) +{ + need_nf_conntrack(); + return init_or_cleanup(1); +} + +static void __exit fini(void) +{ + init_or_cleanup(0); +} + +module_init(init); +module_exit(fini); + +void need_ip6_conntrack(void) +{ +} + +EXPORT_SYMBOL(need_ip6_conntrack); diff --git a/net/ipv6/netfilter/nf_conntrack_proto_icmpv6.c b/net/ipv6/netfilter/nf_conntrack_proto_icmpv6.c new file mode 100644 index 000000000000..c0f1da5497a9 --- /dev/null +++ b/net/ipv6/netfilter/nf_conntrack_proto_icmpv6.c @@ -0,0 +1,272 @@ +/* + * Copyright (C)2003,2004 USAGI/WIDE Project + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 as + * published by the Free Software Foundation. + * + * Author: + * Yasuyuki Kozakai @USAGI + * + * 16 Dec 2003: Yasuyuki Kozakai @USAGI + * - ICMPv6 tracking support. Derived from the original ip_conntrack code + * net/ipv4/netfilter/ip_conntrack_proto_icmp.c which had the following + * copyright information: + * (C) 1999-2001 Paul `Rusty' Russell + * (C) 2002-2004 Netfilter Core Team + */ + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +unsigned long nf_ct_icmpv6_timeout = 30*HZ; + +#if 0 +#define DEBUGP printk +#else +#define DEBUGP(format, args...) +#endif + +static int icmpv6_pkt_to_tuple(const struct sk_buff *skb, + unsigned int dataoff, + struct nf_conntrack_tuple *tuple) +{ + struct icmp6hdr _hdr, *hp; + + hp = skb_header_pointer(skb, dataoff, sizeof(_hdr), &_hdr); + if (hp == NULL) + return 0; + tuple->dst.u.icmp.type = hp->icmp6_type; + tuple->src.u.icmp.id = hp->icmp6_identifier; + tuple->dst.u.icmp.code = hp->icmp6_code; + + return 1; +} + +static int icmpv6_invert_tuple(struct nf_conntrack_tuple *tuple, + const struct nf_conntrack_tuple *orig) +{ + /* Add 1; spaces filled with 0. */ + static u_int8_t invmap[] = { + [ICMPV6_ECHO_REQUEST - 128] = ICMPV6_ECHO_REPLY + 1, + [ICMPV6_ECHO_REPLY - 128] = ICMPV6_ECHO_REQUEST + 1, + [ICMPV6_NI_QUERY - 128] = ICMPV6_NI_QUERY + 1, + [ICMPV6_NI_REPLY - 128] = ICMPV6_NI_REPLY +1 + }; + + __u8 type = orig->dst.u.icmp.type - 128; + if (type >= sizeof(invmap) || !invmap[type]) + return 0; + + tuple->src.u.icmp.id = orig->src.u.icmp.id; + tuple->dst.u.icmp.type = invmap[type] - 1; + tuple->dst.u.icmp.code = orig->dst.u.icmp.code; + return 1; +} + +/* Print out the per-protocol part of the tuple. */ +static int icmpv6_print_tuple(struct seq_file *s, + const struct nf_conntrack_tuple *tuple) +{ + return seq_printf(s, "type=%u code=%u id=%u ", + tuple->dst.u.icmp.type, + tuple->dst.u.icmp.code, + ntohs(tuple->src.u.icmp.id)); +} + +/* Print out the private part of the conntrack. */ +static int icmpv6_print_conntrack(struct seq_file *s, + const struct nf_conn *conntrack) +{ + return 0; +} + +/* Returns verdict for packet, or -1 for invalid. */ +static int icmpv6_packet(struct nf_conn *ct, + const struct sk_buff *skb, + unsigned int dataoff, + enum ip_conntrack_info ctinfo, + int pf, + unsigned int hooknum) +{ + /* Try to delete connection immediately after all replies: + won't actually vanish as we still have skb, and del_timer + means this will only run once even if count hits zero twice + (theoretically possible with SMP) */ + if (CTINFO2DIR(ctinfo) == IP_CT_DIR_REPLY) { + if (atomic_dec_and_test(&ct->proto.icmp.count) + && del_timer(&ct->timeout)) + ct->timeout.function((unsigned long)ct); + } else { + atomic_inc(&ct->proto.icmp.count); + nf_conntrack_event_cache(IPCT_PROTOINFO_VOLATILE, skb); + nf_ct_refresh_acct(ct, ctinfo, skb, nf_ct_icmpv6_timeout); + } + + return NF_ACCEPT; +} + +/* Called when a new connection for this protocol found. */ +static int icmpv6_new(struct nf_conn *conntrack, + const struct sk_buff *skb, + unsigned int dataoff) +{ + static u_int8_t valid_new[] = { + [ICMPV6_ECHO_REQUEST - 128] = 1, + [ICMPV6_NI_QUERY - 128] = 1 + }; + + if (conntrack->tuplehash[0].tuple.dst.u.icmp.type - 128 >= sizeof(valid_new) + || !valid_new[conntrack->tuplehash[0].tuple.dst.u.icmp.type - 128]) { + /* Can't create a new ICMPv6 `conn' with this. */ + DEBUGP("icmp: can't create new conn with type %u\n", + conntrack->tuplehash[0].tuple.dst.u.icmp.type); + NF_CT_DUMP_TUPLE(&conntrack->tuplehash[0].tuple); + return 0; + } + atomic_set(&conntrack->proto.icmp.count, 0); + return 1; +} + +extern int +nf_ct_ipv6_skip_exthdr(struct sk_buff *skb, int start, u8 *nexthdrp, int len); +extern struct nf_conntrack_l3proto nf_conntrack_l3proto_ipv6; +static int +icmpv6_error_message(struct sk_buff *skb, + unsigned int icmp6off, + enum ip_conntrack_info *ctinfo, + unsigned int hooknum) +{ + struct nf_conntrack_tuple intuple, origtuple; + struct nf_conntrack_tuple_hash *h; + struct icmp6hdr _hdr, *hp; + unsigned int inip6off; + struct nf_conntrack_protocol *inproto; + u_int8_t inprotonum; + unsigned int inprotoff; + + NF_CT_ASSERT(skb->nfct == NULL); + + hp = skb_header_pointer(skb, icmp6off, sizeof(_hdr), &_hdr); + if (hp == NULL) { + DEBUGP("icmpv6_error: Can't get ICMPv6 hdr.\n"); + return -NF_ACCEPT; + } + + inip6off = icmp6off + sizeof(_hdr); + if (skb_copy_bits(skb, inip6off+offsetof(struct ipv6hdr, nexthdr), + &inprotonum, sizeof(inprotonum)) != 0) { + DEBUGP("icmpv6_error: Can't get nexthdr in inner IPv6 header.\n"); + return -NF_ACCEPT; + } + inprotoff = nf_ct_ipv6_skip_exthdr(skb, + inip6off + sizeof(struct ipv6hdr), + &inprotonum, + skb->len - inip6off + - sizeof(struct ipv6hdr)); + + if ((inprotoff < 0) || (inprotoff > skb->len) || + (inprotonum == NEXTHDR_FRAGMENT)) { + DEBUGP("icmpv6_error: Can't get protocol header in ICMPv6 payload.\n"); + return -NF_ACCEPT; + } + + inproto = nf_ct_find_proto(PF_INET6, inprotonum); + + /* Are they talking about one of our connections? */ + if (!nf_ct_get_tuple(skb, inip6off, inprotoff, PF_INET6, inprotonum, + &origtuple, &nf_conntrack_l3proto_ipv6, inproto)) { + DEBUGP("icmpv6_error: Can't get tuple\n"); + return -NF_ACCEPT; + } + + /* Ordinarily, we'd expect the inverted tupleproto, but it's + been preserved inside the ICMP. */ + if (!nf_ct_invert_tuple(&intuple, &origtuple, + &nf_conntrack_l3proto_ipv6, inproto)) { + DEBUGP("icmpv6_error: Can't invert tuple\n"); + return -NF_ACCEPT; + } + + *ctinfo = IP_CT_RELATED; + + h = nf_conntrack_find_get(&intuple, NULL); + if (!h) { + DEBUGP("icmpv6_error: no match\n"); + return -NF_ACCEPT; + } else { + if (NF_CT_DIRECTION(h) == IP_CT_DIR_REPLY) + *ctinfo += IP_CT_IS_REPLY; + } + + /* Update skb to refer to this connection */ + skb->nfct = &nf_ct_tuplehash_to_ctrack(h)->ct_general; + skb->nfctinfo = *ctinfo; + return -NF_ACCEPT; +} + +static int +icmpv6_error(struct sk_buff *skb, unsigned int dataoff, + enum ip_conntrack_info *ctinfo, int pf, unsigned int hooknum) +{ + struct icmp6hdr _ih, *icmp6h; + + icmp6h = skb_header_pointer(skb, dataoff, sizeof(_ih), &_ih); + if (icmp6h == NULL) { + if (LOG_INVALID(IPPROTO_ICMPV6)) + nf_log_packet(PF_INET6, 0, skb, NULL, NULL, NULL, + "nf_ct_icmpv6: short packet "); + return -NF_ACCEPT; + } + + if (hooknum != NF_IP6_PRE_ROUTING) + goto skipped; + + /* Ignore it if the checksum's bogus. */ + if (csum_ipv6_magic(&skb->nh.ipv6h->saddr, &skb->nh.ipv6h->daddr, + skb->len - dataoff, IPPROTO_ICMPV6, + skb_checksum(skb, dataoff, + skb->len - dataoff, 0))) { + nf_log_packet(PF_INET6, 0, skb, NULL, NULL, NULL, + "nf_ct_icmpv6: ICMPv6 checksum failed\n"); + return -NF_ACCEPT; + } + +skipped: + + /* is not error message ? */ + if (icmp6h->icmp6_type >= 128) + return NF_ACCEPT; + + return icmpv6_error_message(skb, dataoff, ctinfo, hooknum); +} + +struct nf_conntrack_protocol nf_conntrack_protocol_icmpv6 = +{ + .l3proto = PF_INET6, + .proto = IPPROTO_ICMPV6, + .name = "icmpv6", + .pkt_to_tuple = icmpv6_pkt_to_tuple, + .invert_tuple = icmpv6_invert_tuple, + .print_tuple = icmpv6_print_tuple, + .print_conntrack = icmpv6_print_conntrack, + .packet = icmpv6_packet, + .new = icmpv6_new, + .error = icmpv6_error, +}; + +EXPORT_SYMBOL(nf_conntrack_protocol_icmpv6); diff --git a/net/ipv6/netfilter/nf_conntrack_reasm.c b/net/ipv6/netfilter/nf_conntrack_reasm.c new file mode 100644 index 000000000000..7640b9bb7694 --- /dev/null +++ b/net/ipv6/netfilter/nf_conntrack_reasm.c @@ -0,0 +1,885 @@ +/* + * IPv6 fragment reassembly for connection tracking + * + * Copyright (C)2004 USAGI/WIDE Project + * + * Author: + * Yasuyuki Kozakai @USAGI + * + * Based on: net/ipv6/reassembly.c + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * as published by the Free Software Foundation; either version + * 2 of the License, or (at your option) any later version. + */ + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include +#include + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#if 0 +#define DEBUGP printk +#else +#define DEBUGP(format, args...) +#endif + +#define NF_CT_FRAG6_HIGH_THRESH 262144 /* == 256*1024 */ +#define NF_CT_FRAG6_LOW_THRESH 196608 /* == 192*1024 */ +#define NF_CT_FRAG6_TIMEOUT IPV6_FRAG_TIMEOUT + +int nf_ct_frag6_high_thresh = 256*1024; +int nf_ct_frag6_low_thresh = 192*1024; +int nf_ct_frag6_timeout = IPV6_FRAG_TIMEOUT; + +struct nf_ct_frag6_skb_cb +{ + struct inet6_skb_parm h; + int offset; + struct sk_buff *orig; +}; + +#define NFCT_FRAG6_CB(skb) ((struct nf_ct_frag6_skb_cb*)((skb)->cb)) + +struct nf_ct_frag6_queue +{ + struct nf_ct_frag6_queue *next; + struct list_head lru_list; /* lru list member */ + + __u32 id; /* fragment id */ + struct in6_addr saddr; + struct in6_addr daddr; + + spinlock_t lock; + atomic_t refcnt; + struct timer_list timer; /* expire timer */ + struct sk_buff *fragments; + int len; + int meat; + struct timeval stamp; + unsigned int csum; + __u8 last_in; /* has first/last segment arrived? */ +#define COMPLETE 4 +#define FIRST_IN 2 +#define LAST_IN 1 + __u16 nhoffset; + struct nf_ct_frag6_queue **pprev; +}; + +/* Hash table. */ + +#define FRAG6Q_HASHSZ 64 + +static struct nf_ct_frag6_queue *nf_ct_frag6_hash[FRAG6Q_HASHSZ]; +static rwlock_t nf_ct_frag6_lock = RW_LOCK_UNLOCKED; +static u32 nf_ct_frag6_hash_rnd; +static LIST_HEAD(nf_ct_frag6_lru_list); +int nf_ct_frag6_nqueues = 0; + +static __inline__ void __fq_unlink(struct nf_ct_frag6_queue *fq) +{ + if (fq->next) + fq->next->pprev = fq->pprev; + *fq->pprev = fq->next; + list_del(&fq->lru_list); + nf_ct_frag6_nqueues--; +} + +static __inline__ void fq_unlink(struct nf_ct_frag6_queue *fq) +{ + write_lock(&nf_ct_frag6_lock); + __fq_unlink(fq); + write_unlock(&nf_ct_frag6_lock); +} + +static unsigned int ip6qhashfn(u32 id, struct in6_addr *saddr, + struct in6_addr *daddr) +{ + u32 a, b, c; + + a = saddr->s6_addr32[0]; + b = saddr->s6_addr32[1]; + c = saddr->s6_addr32[2]; + + a += JHASH_GOLDEN_RATIO; + b += JHASH_GOLDEN_RATIO; + c += nf_ct_frag6_hash_rnd; + __jhash_mix(a, b, c); + + a += saddr->s6_addr32[3]; + b += daddr->s6_addr32[0]; + c += daddr->s6_addr32[1]; + __jhash_mix(a, b, c); + + a += daddr->s6_addr32[2]; + b += daddr->s6_addr32[3]; + c += id; + __jhash_mix(a, b, c); + + return c & (FRAG6Q_HASHSZ - 1); +} + +static struct timer_list nf_ct_frag6_secret_timer; +int nf_ct_frag6_secret_interval = 10 * 60 * HZ; + +static void nf_ct_frag6_secret_rebuild(unsigned long dummy) +{ + unsigned long now = jiffies; + int i; + + write_lock(&nf_ct_frag6_lock); + get_random_bytes(&nf_ct_frag6_hash_rnd, sizeof(u32)); + for (i = 0; i < FRAG6Q_HASHSZ; i++) { + struct nf_ct_frag6_queue *q; + + q = nf_ct_frag6_hash[i]; + while (q) { + struct nf_ct_frag6_queue *next = q->next; + unsigned int hval = ip6qhashfn(q->id, + &q->saddr, + &q->daddr); + + if (hval != i) { + /* Unlink. */ + if (q->next) + q->next->pprev = q->pprev; + *q->pprev = q->next; + + /* Relink to new hash chain. */ + if ((q->next = nf_ct_frag6_hash[hval]) != NULL) + q->next->pprev = &q->next; + nf_ct_frag6_hash[hval] = q; + q->pprev = &nf_ct_frag6_hash[hval]; + } + + q = next; + } + } + write_unlock(&nf_ct_frag6_lock); + + mod_timer(&nf_ct_frag6_secret_timer, now + nf_ct_frag6_secret_interval); +} + +atomic_t nf_ct_frag6_mem = ATOMIC_INIT(0); + +/* Memory Tracking Functions. */ +static inline void frag_kfree_skb(struct sk_buff *skb) +{ + atomic_sub(skb->truesize, &nf_ct_frag6_mem); + if (NFCT_FRAG6_CB(skb)->orig) + kfree_skb(NFCT_FRAG6_CB(skb)->orig); + + kfree_skb(skb); +} + +static inline void frag_free_queue(struct nf_ct_frag6_queue *fq) +{ + atomic_sub(sizeof(struct nf_ct_frag6_queue), &nf_ct_frag6_mem); + kfree(fq); +} + +static inline struct nf_ct_frag6_queue *frag_alloc_queue(void) +{ + struct nf_ct_frag6_queue *fq = kmalloc(sizeof(struct nf_ct_frag6_queue), GFP_ATOMIC); + + if (!fq) + return NULL; + atomic_add(sizeof(struct nf_ct_frag6_queue), &nf_ct_frag6_mem); + return fq; +} + +/* Destruction primitives. */ + +/* Complete destruction of fq. */ +static void nf_ct_frag6_destroy(struct nf_ct_frag6_queue *fq) +{ + struct sk_buff *fp; + + BUG_TRAP(fq->last_in&COMPLETE); + BUG_TRAP(del_timer(&fq->timer) == 0); + + /* Release all fragment data. */ + fp = fq->fragments; + while (fp) { + struct sk_buff *xp = fp->next; + + frag_kfree_skb(fp); + fp = xp; + } + + frag_free_queue(fq); +} + +static __inline__ void fq_put(struct nf_ct_frag6_queue *fq) +{ + if (atomic_dec_and_test(&fq->refcnt)) + nf_ct_frag6_destroy(fq); +} + +/* Kill fq entry. It is not destroyed immediately, + * because caller (and someone more) holds reference count. + */ +static __inline__ void fq_kill(struct nf_ct_frag6_queue *fq) +{ + if (del_timer(&fq->timer)) + atomic_dec(&fq->refcnt); + + if (!(fq->last_in & COMPLETE)) { + fq_unlink(fq); + atomic_dec(&fq->refcnt); + fq->last_in |= COMPLETE; + } +} + +static void nf_ct_frag6_evictor(void) +{ + struct nf_ct_frag6_queue *fq; + struct list_head *tmp; + + for (;;) { + if (atomic_read(&nf_ct_frag6_mem) <= nf_ct_frag6_low_thresh) + return; + read_lock(&nf_ct_frag6_lock); + if (list_empty(&nf_ct_frag6_lru_list)) { + read_unlock(&nf_ct_frag6_lock); + return; + } + tmp = nf_ct_frag6_lru_list.next; + fq = list_entry(tmp, struct nf_ct_frag6_queue, lru_list); + atomic_inc(&fq->refcnt); + read_unlock(&nf_ct_frag6_lock); + + spin_lock(&fq->lock); + if (!(fq->last_in&COMPLETE)) + fq_kill(fq); + spin_unlock(&fq->lock); + + fq_put(fq); + } +} + +static void nf_ct_frag6_expire(unsigned long data) +{ + struct nf_ct_frag6_queue *fq = (struct nf_ct_frag6_queue *) data; + + spin_lock(&fq->lock); + + if (fq->last_in & COMPLETE) + goto out; + + fq_kill(fq); + +out: + spin_unlock(&fq->lock); + fq_put(fq); +} + +/* Creation primitives. */ + + +static struct nf_ct_frag6_queue *nf_ct_frag6_intern(unsigned int hash, + struct nf_ct_frag6_queue *fq_in) +{ + struct nf_ct_frag6_queue *fq; + + write_lock(&nf_ct_frag6_lock); +#ifdef CONFIG_SMP + for (fq = nf_ct_frag6_hash[hash]; fq; fq = fq->next) { + if (fq->id == fq_in->id && + !ipv6_addr_cmp(&fq_in->saddr, &fq->saddr) && + !ipv6_addr_cmp(&fq_in->daddr, &fq->daddr)) { + atomic_inc(&fq->refcnt); + write_unlock(&nf_ct_frag6_lock); + fq_in->last_in |= COMPLETE; + fq_put(fq_in); + return fq; + } + } +#endif + fq = fq_in; + + if (!mod_timer(&fq->timer, jiffies + nf_ct_frag6_timeout)) + atomic_inc(&fq->refcnt); + + atomic_inc(&fq->refcnt); + if ((fq->next = nf_ct_frag6_hash[hash]) != NULL) + fq->next->pprev = &fq->next; + nf_ct_frag6_hash[hash] = fq; + fq->pprev = &nf_ct_frag6_hash[hash]; + INIT_LIST_HEAD(&fq->lru_list); + list_add_tail(&fq->lru_list, &nf_ct_frag6_lru_list); + nf_ct_frag6_nqueues++; + write_unlock(&nf_ct_frag6_lock); + return fq; +} + + +static struct nf_ct_frag6_queue * +nf_ct_frag6_create(unsigned int hash, u32 id, struct in6_addr *src, struct in6_addr *dst) +{ + struct nf_ct_frag6_queue *fq; + + if ((fq = frag_alloc_queue()) == NULL) { + DEBUGP("Can't alloc new queue\n"); + goto oom; + } + + memset(fq, 0, sizeof(struct nf_ct_frag6_queue)); + + fq->id = id; + ipv6_addr_copy(&fq->saddr, src); + ipv6_addr_copy(&fq->daddr, dst); + + init_timer(&fq->timer); + fq->timer.function = nf_ct_frag6_expire; + fq->timer.data = (long) fq; + fq->lock = SPIN_LOCK_UNLOCKED; + atomic_set(&fq->refcnt, 1); + + return nf_ct_frag6_intern(hash, fq); + +oom: + return NULL; +} + +static __inline__ struct nf_ct_frag6_queue * +fq_find(u32 id, struct in6_addr *src, struct in6_addr *dst) +{ + struct nf_ct_frag6_queue *fq; + unsigned int hash = ip6qhashfn(id, src, dst); + + read_lock(&nf_ct_frag6_lock); + for (fq = nf_ct_frag6_hash[hash]; fq; fq = fq->next) { + if (fq->id == id && + !ipv6_addr_cmp(src, &fq->saddr) && + !ipv6_addr_cmp(dst, &fq->daddr)) { + atomic_inc(&fq->refcnt); + read_unlock(&nf_ct_frag6_lock); + return fq; + } + } + read_unlock(&nf_ct_frag6_lock); + + return nf_ct_frag6_create(hash, id, src, dst); +} + + +static int nf_ct_frag6_queue(struct nf_ct_frag6_queue *fq, struct sk_buff *skb, + struct frag_hdr *fhdr, int nhoff) +{ + struct sk_buff *prev, *next; + int offset, end; + + if (fq->last_in & COMPLETE) { + DEBUGP("Allready completed\n"); + goto err; + } + + offset = ntohs(fhdr->frag_off) & ~0x7; + end = offset + (ntohs(skb->nh.ipv6h->payload_len) - + ((u8 *) (fhdr + 1) - (u8 *) (skb->nh.ipv6h + 1))); + + if ((unsigned int)end > IPV6_MAXPLEN) { + DEBUGP("offset is too large.\n"); + return -1; + } + + if (skb->ip_summed == CHECKSUM_HW) + skb->csum = csum_sub(skb->csum, + csum_partial(skb->nh.raw, + (u8*)(fhdr + 1) - skb->nh.raw, + 0)); + + /* Is this the final fragment? */ + if (!(fhdr->frag_off & htons(IP6_MF))) { + /* If we already have some bits beyond end + * or have different end, the segment is corrupted. + */ + if (end < fq->len || + ((fq->last_in & LAST_IN) && end != fq->len)) { + DEBUGP("already received last fragment\n"); + goto err; + } + fq->last_in |= LAST_IN; + fq->len = end; + } else { + /* Check if the fragment is rounded to 8 bytes. + * Required by the RFC. + */ + if (end & 0x7) { + /* RFC2460 says always send parameter problem in + * this case. -DaveM + */ + DEBUGP("the end of this fragment is not rounded to 8 bytes.\n"); + return -1; + } + if (end > fq->len) { + /* Some bits beyond end -> corruption. */ + if (fq->last_in & LAST_IN) { + DEBUGP("last packet already reached.\n"); + goto err; + } + fq->len = end; + } + } + + if (end == offset) + goto err; + + /* Point into the IP datagram 'data' part. */ + if (!pskb_pull(skb, (u8 *) (fhdr + 1) - skb->data)) { + DEBUGP("queue: message is too short.\n"); + goto err; + } + if (end-offset < skb->len) { + if (pskb_trim(skb, end - offset)) { + DEBUGP("Can't trim\n"); + goto err; + } + if (skb->ip_summed != CHECKSUM_UNNECESSARY) + skb->ip_summed = CHECKSUM_NONE; + } + + /* Find out which fragments are in front and at the back of us + * in the chain of fragments so far. We must know where to put + * this fragment, right? + */ + prev = NULL; + for (next = fq->fragments; next != NULL; next = next->next) { + if (NFCT_FRAG6_CB(next)->offset >= offset) + break; /* bingo! */ + prev = next; + } + + /* We found where to put this one. Check for overlap with + * preceding fragment, and, if needed, align things so that + * any overlaps are eliminated. + */ + if (prev) { + int i = (NFCT_FRAG6_CB(prev)->offset + prev->len) - offset; + + if (i > 0) { + offset += i; + if (end <= offset) { + DEBUGP("overlap\n"); + goto err; + } + if (!pskb_pull(skb, i)) { + DEBUGP("Can't pull\n"); + goto err; + } + if (skb->ip_summed != CHECKSUM_UNNECESSARY) + skb->ip_summed = CHECKSUM_NONE; + } + } + + /* Look for overlap with succeeding segments. + * If we can merge fragments, do it. + */ + while (next && NFCT_FRAG6_CB(next)->offset < end) { + /* overlap is 'i' bytes */ + int i = end - NFCT_FRAG6_CB(next)->offset; + + if (i < next->len) { + /* Eat head of the next overlapped fragment + * and leave the loop. The next ones cannot overlap. + */ + DEBUGP("Eat head of the overlapped parts.: %d", i); + if (!pskb_pull(next, i)) + goto err; + + /* next fragment */ + NFCT_FRAG6_CB(next)->offset += i; + fq->meat -= i; + if (next->ip_summed != CHECKSUM_UNNECESSARY) + next->ip_summed = CHECKSUM_NONE; + break; + } else { + struct sk_buff *free_it = next; + + /* Old fragmnet is completely overridden with + * new one drop it. + */ + next = next->next; + + if (prev) + prev->next = next; + else + fq->fragments = next; + + fq->meat -= free_it->len; + frag_kfree_skb(free_it); + } + } + + NFCT_FRAG6_CB(skb)->offset = offset; + + /* Insert this fragment in the chain of fragments. */ + skb->next = next; + if (prev) + prev->next = skb; + else + fq->fragments = skb; + + skb->dev = NULL; + skb_get_timestamp(skb, &fq->stamp); + fq->meat += skb->len; + atomic_add(skb->truesize, &nf_ct_frag6_mem); + + /* The first fragment. + * nhoffset is obtained from the first fragment, of course. + */ + if (offset == 0) { + fq->nhoffset = nhoff; + fq->last_in |= FIRST_IN; + } + write_lock(&nf_ct_frag6_lock); + list_move_tail(&fq->lru_list, &nf_ct_frag6_lru_list); + write_unlock(&nf_ct_frag6_lock); + return 0; + +err: + return -1; +} + +/* + * Check if this packet is complete. + * Returns NULL on failure by any reason, and pointer + * to current nexthdr field in reassembled frame. + * + * It is called with locked fq, and caller must check that + * queue is eligible for reassembly i.e. it is not COMPLETE, + * the last and the first frames arrived and all the bits are here. + */ +static struct sk_buff * +nf_ct_frag6_reasm(struct nf_ct_frag6_queue *fq, struct net_device *dev) +{ + struct sk_buff *fp, *op, *head = fq->fragments; + int payload_len; + + fq_kill(fq); + + BUG_TRAP(head != NULL); + BUG_TRAP(NFCT_FRAG6_CB(head)->offset == 0); + + /* Unfragmented part is taken from the first segment. */ + payload_len = (head->data - head->nh.raw) - sizeof(struct ipv6hdr) + fq->len - sizeof(struct frag_hdr); + if (payload_len > IPV6_MAXPLEN) { + DEBUGP("payload len is too large.\n"); + goto out_oversize; + } + + /* Head of list must not be cloned. */ + if (skb_cloned(head) && pskb_expand_head(head, 0, 0, GFP_ATOMIC)) { + DEBUGP("skb is cloned but can't expand head"); + goto out_oom; + } + + /* If the first fragment is fragmented itself, we split + * it to two chunks: the first with data and paged part + * and the second, holding only fragments. */ + if (skb_shinfo(head)->frag_list) { + struct sk_buff *clone; + int i, plen = 0; + + if ((clone = alloc_skb(0, GFP_ATOMIC)) == NULL) { + DEBUGP("Can't alloc skb\n"); + goto out_oom; + } + clone->next = head->next; + head->next = clone; + skb_shinfo(clone)->frag_list = skb_shinfo(head)->frag_list; + skb_shinfo(head)->frag_list = NULL; + for (i=0; inr_frags; i++) + plen += skb_shinfo(head)->frags[i].size; + clone->len = clone->data_len = head->data_len - plen; + head->data_len -= clone->len; + head->len -= clone->len; + clone->csum = 0; + clone->ip_summed = head->ip_summed; + + NFCT_FRAG6_CB(clone)->orig = NULL; + atomic_add(clone->truesize, &nf_ct_frag6_mem); + } + + /* We have to remove fragment header from datagram and to relocate + * header in order to calculate ICV correctly. */ + head->nh.raw[fq->nhoffset] = head->h.raw[0]; + memmove(head->head + sizeof(struct frag_hdr), head->head, + (head->data - head->head) - sizeof(struct frag_hdr)); + head->mac.raw += sizeof(struct frag_hdr); + head->nh.raw += sizeof(struct frag_hdr); + + skb_shinfo(head)->frag_list = head->next; + head->h.raw = head->data; + skb_push(head, head->data - head->nh.raw); + atomic_sub(head->truesize, &nf_ct_frag6_mem); + + for (fp=head->next; fp; fp = fp->next) { + head->data_len += fp->len; + head->len += fp->len; + if (head->ip_summed != fp->ip_summed) + head->ip_summed = CHECKSUM_NONE; + else if (head->ip_summed == CHECKSUM_HW) + head->csum = csum_add(head->csum, fp->csum); + head->truesize += fp->truesize; + atomic_sub(fp->truesize, &nf_ct_frag6_mem); + } + + head->next = NULL; + head->dev = dev; + skb_set_timestamp(head, &fq->stamp); + head->nh.ipv6h->payload_len = htons(payload_len); + + /* Yes, and fold redundant checksum back. 8) */ + if (head->ip_summed == CHECKSUM_HW) + head->csum = csum_partial(head->nh.raw, head->h.raw-head->nh.raw, head->csum); + + fq->fragments = NULL; + + /* all original skbs are linked into the NFCT_FRAG6_CB(head).orig */ + fp = skb_shinfo(head)->frag_list; + if (NFCT_FRAG6_CB(fp)->orig == NULL) + /* at above code, head skb is divided into two skbs. */ + fp = fp->next; + + op = NFCT_FRAG6_CB(head)->orig; + for (; fp; fp = fp->next) { + struct sk_buff *orig = NFCT_FRAG6_CB(fp)->orig; + + op->next = orig; + op = orig; + NFCT_FRAG6_CB(fp)->orig = NULL; + } + + return head; + +out_oversize: + if (net_ratelimit()) + printk(KERN_DEBUG "nf_ct_frag6_reasm: payload len = %d\n", payload_len); + goto out_fail; +out_oom: + if (net_ratelimit()) + printk(KERN_DEBUG "nf_ct_frag6_reasm: no memory for reassembly\n"); +out_fail: + return NULL; +} + +/* + * find the header just before Fragment Header. + * + * if success return 0 and set ... + * (*prevhdrp): the value of "Next Header Field" in the header + * just before Fragment Header. + * (*prevhoff): the offset of "Next Header Field" in the header + * just before Fragment Header. + * (*fhoff) : the offset of Fragment Header. + * + * Based on ipv6_skip_hdr() in net/ipv6/exthdr.c + * + */ +static int +find_prev_fhdr(struct sk_buff *skb, u8 *prevhdrp, int *prevhoff, int *fhoff) +{ + u8 nexthdr = skb->nh.ipv6h->nexthdr; + u8 prev_nhoff = (u8 *)&skb->nh.ipv6h->nexthdr - skb->data; + int start = (u8 *)(skb->nh.ipv6h+1) - skb->data; + int len = skb->len - start; + u8 prevhdr = NEXTHDR_IPV6; + + while (nexthdr != NEXTHDR_FRAGMENT) { + struct ipv6_opt_hdr hdr; + int hdrlen; + + if (!ipv6_ext_hdr(nexthdr)) { + return -1; + } + if (len < (int)sizeof(struct ipv6_opt_hdr)) { + DEBUGP("too short\n"); + return -1; + } + if (nexthdr == NEXTHDR_NONE) { + DEBUGP("next header is none\n"); + return -1; + } + if (skb_copy_bits(skb, start, &hdr, sizeof(hdr))) + BUG(); + if (nexthdr == NEXTHDR_AUTH) + hdrlen = (hdr.hdrlen+2)<<2; + else + hdrlen = ipv6_optlen(&hdr); + + prevhdr = nexthdr; + prev_nhoff = start; + + nexthdr = hdr.nexthdr; + len -= hdrlen; + start += hdrlen; + } + + if (len < 0) + return -1; + + *prevhdrp = prevhdr; + *prevhoff = prev_nhoff; + *fhoff = start; + + return 0; +} + +struct sk_buff *nf_ct_frag6_gather(struct sk_buff *skb) +{ + struct sk_buff *clone; + struct net_device *dev = skb->dev; + struct frag_hdr *fhdr; + struct nf_ct_frag6_queue *fq; + struct ipv6hdr *hdr; + int fhoff, nhoff; + u8 prevhdr; + struct sk_buff *ret_skb = NULL; + + /* Jumbo payload inhibits frag. header */ + if (skb->nh.ipv6h->payload_len == 0) { + DEBUGP("payload len = 0\n"); + return skb; + } + + if (find_prev_fhdr(skb, &prevhdr, &nhoff, &fhoff) < 0) + return skb; + + clone = skb_clone(skb, GFP_ATOMIC); + if (clone == NULL) { + DEBUGP("Can't clone skb\n"); + return skb; + } + + NFCT_FRAG6_CB(clone)->orig = skb; + + if (!pskb_may_pull(clone, fhoff + sizeof(*fhdr))) { + DEBUGP("message is too short.\n"); + goto ret_orig; + } + + clone->h.raw = clone->data + fhoff; + hdr = clone->nh.ipv6h; + fhdr = (struct frag_hdr *)clone->h.raw; + + if (!(fhdr->frag_off & htons(0xFFF9))) { + DEBUGP("Invalid fragment offset\n"); + /* It is not a fragmented frame */ + goto ret_orig; + } + + if (atomic_read(&nf_ct_frag6_mem) > nf_ct_frag6_high_thresh) + nf_ct_frag6_evictor(); + + fq = fq_find(fhdr->identification, &hdr->saddr, &hdr->daddr); + if (fq == NULL) { + DEBUGP("Can't find and can't create new queue\n"); + goto ret_orig; + } + + spin_lock(&fq->lock); + + if (nf_ct_frag6_queue(fq, clone, fhdr, nhoff) < 0) { + spin_unlock(&fq->lock); + DEBUGP("Can't insert skb to queue\n"); + fq_put(fq); + goto ret_orig; + } + + if (fq->last_in == (FIRST_IN|LAST_IN) && fq->meat == fq->len) { + ret_skb = nf_ct_frag6_reasm(fq, dev); + if (ret_skb == NULL) + DEBUGP("Can't reassemble fragmented packets\n"); + } + spin_unlock(&fq->lock); + + fq_put(fq); + return ret_skb; + +ret_orig: + kfree_skb(clone); + return skb; +} + +void nf_ct_frag6_output(unsigned int hooknum, struct sk_buff *skb, + struct net_device *in, struct net_device *out, + int (*okfn)(struct sk_buff *)) +{ + struct sk_buff *s, *s2; + + for (s = NFCT_FRAG6_CB(skb)->orig; s;) { + nf_conntrack_put_reasm(s->nfct_reasm); + nf_conntrack_get_reasm(skb); + s->nfct_reasm = skb; + + s2 = s->next; + NF_HOOK_THRESH(PF_INET6, hooknum, s, in, out, okfn, + NF_IP6_PRI_CONNTRACK_DEFRAG + 1); + s = s2; + } + nf_conntrack_put_reasm(skb); +} + +int nf_ct_frag6_kfree_frags(struct sk_buff *skb) +{ + struct sk_buff *s, *s2; + + for (s = NFCT_FRAG6_CB(skb)->orig; s; s = s2) { + + s2 = s->next; + kfree_skb(s); + } + + kfree_skb(skb); + + return 0; +} + +int nf_ct_frag6_init(void) +{ + nf_ct_frag6_hash_rnd = (u32) ((num_physpages ^ (num_physpages>>7)) ^ + (jiffies ^ (jiffies >> 6))); + + init_timer(&nf_ct_frag6_secret_timer); + nf_ct_frag6_secret_timer.function = nf_ct_frag6_secret_rebuild; + nf_ct_frag6_secret_timer.expires = jiffies + + nf_ct_frag6_secret_interval; + add_timer(&nf_ct_frag6_secret_timer); + + return 0; +} + +void nf_ct_frag6_cleanup(void) +{ + del_timer(&nf_ct_frag6_secret_timer); + nf_ct_frag6_evictor(); +} diff --git a/net/ipv6/raw.c b/net/ipv6/raw.c index a1265a320b11..651c79b41eeb 100644 --- a/net/ipv6/raw.c +++ b/net/ipv6/raw.c @@ -174,8 +174,10 @@ int ipv6_raw_deliver(struct sk_buff *skb, int nexthdr) struct sk_buff *clone = skb_clone(skb, GFP_ATOMIC); /* Not releasing hash table! */ - if (clone) + if (clone) { + nf_reset(clone); rawv6_rcv(sk, clone); + } } sk = __raw_v6_lookup(sk_next(sk), nexthdr, daddr, saddr, IP6CB(skb)->iif); diff --git a/net/netfilter/Kconfig b/net/netfilter/Kconfig index 8296b38bf270..a84f9221e5f0 100644 --- a/net/netfilter/Kconfig +++ b/net/netfilter/Kconfig @@ -1,3 +1,6 @@ +menu "Core Netfilter Configuration" + depends on NET && NETFILTER + config NETFILTER_NETLINK tristate "Netfilter netlink interface" help @@ -22,3 +25,74 @@ config NETFILTER_NETLINK_LOG and is also scheduled to replace the old syslog-based ipt_LOG and ip6t_LOG modules. +config NF_CONNTRACK + tristate "Layer 3 Independent Connection tracking (EXPERIMENTAL)" + depends on EXPERIMENTAL && IP_NF_CONNTRACK=n + default n + ---help--- + Connection tracking keeps a record of what packets have passed + through your machine, in order to figure out how they are related + into connections. + + Layer 3 independent connection tracking is experimental scheme + which generalize ip_conntrack to support other layer 3 protocols. + + To compile it as a module, choose M here. If unsure, say N. + +config NF_CT_ACCT + bool "Connection tracking flow accounting" + depends on NF_CONNTRACK + help + If this option is enabled, the connection tracking code will + keep per-flow packet and byte counters. + + Those counters can be used for flow-based accounting or the + `connbytes' match. + + If unsure, say `N'. + +config NF_CONNTRACK_MARK + bool 'Connection mark tracking support' + depends on NF_CONNTRACK + help + This option enables support for connection marks, used by the + `CONNMARK' target and `connmark' match. Similar to the mark value + of packets, but this mark value is kept in the conntrack session + instead of the individual packets. + +config NF_CONNTRACK_EVENTS + bool "Connection tracking events" + depends on NF_CONNTRACK + help + If this option is enabled, the connection tracking code will + provide a notifier chain that can be used by other kernel code + to get notified aboutchanges in the connection tracking state. + + If unsure, say `N'. + +config NF_CT_PROTO_SCTP + tristate 'SCTP protocol on new connection tracking support (EXPERIMENTAL)' + depends on EXPERIMENTAL && NF_CONNTRACK + default n + help + With this option enabled, the layer 3 independent connection + tracking code will be able to do state tracking on SCTP connections. + + If you want to compile it as a module, say M here and read + Documentation/modules.txt. If unsure, say `N'. + +config NF_CONNTRACK_FTP + tristate "FTP support on new connection tracking (EXPERIMENTAL)" + depends on EXPERIMENTAL && NF_CONNTRACK + help + Tracking FTP connections is problematic: special helpers are + required for tracking them, and doing masquerading and other forms + of Network Address Translation on them. + + This is FTP support on Layer 3 independent connection tracking. + Layer 3 independent connection tracking is experimental scheme + which generalize ip_conntrack to support other layer 3 protocols. + + To compile it as a module, choose M here. If unsure, say N. + +endmenu diff --git a/net/netfilter/Makefile b/net/netfilter/Makefile index b3b44f8b415a..55f019ad2c08 100644 --- a/net/netfilter/Makefile +++ b/net/netfilter/Makefile @@ -5,3 +5,11 @@ obj-$(CONFIG_NETFILTER) = netfilter.o obj-$(CONFIG_NETFILTER_NETLINK) += nfnetlink.o obj-$(CONFIG_NETFILTER_NETLINK_QUEUE) += nfnetlink_queue.o obj-$(CONFIG_NETFILTER_NETLINK_LOG) += nfnetlink_log.o + +nf_conntrack-objs := nf_conntrack_core.o nf_conntrack_standalone.o nf_conntrack_l3proto_generic.o nf_conntrack_proto_generic.o nf_conntrack_proto_tcp.o nf_conntrack_proto_udp.o + +obj-$(CONFIG_NF_CONNTRACK) += nf_conntrack.o +obj-$(CONFIG_NF_CONNTRACK_FTP) += nf_conntrack_ftp.o + +# SCTP protocol connection tracking +obj-$(CONFIG_NF_CT_PROTO_SCTP) += nf_conntrack_proto_sctp.o diff --git a/net/netfilter/nf_conntrack_core.c b/net/netfilter/nf_conntrack_core.c new file mode 100644 index 000000000000..9a67c796b385 --- /dev/null +++ b/net/netfilter/nf_conntrack_core.c @@ -0,0 +1,1538 @@ +/* Connection state tracking for netfilter. This is separated from, + but required by, the NAT layer; it can also be used by an iptables + extension. */ + +/* (C) 1999-2001 Paul `Rusty' Russell + * (C) 2002-2005 Netfilter Core Team + * (C) 2003,2004 USAGI/WIDE Project + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 as + * published by the Free Software Foundation. + * + * 23 Apr 2001: Harald Welte + * - new API and handling of conntrack/nat helpers + * - now capable of multiple expectations for one master + * 16 Jul 2002: Harald Welte + * - add usage/reference counts to ip_conntrack_expect + * - export ip_conntrack[_expect]_{find_get,put} functions + * 16 Dec 2003: Yasuyuki Kozakai @USAGI + * - generalize L3 protocol denendent part. + * 23 Mar 2004: Yasuyuki Kozakai @USAGI + * - add support various size of conntrack structures. + * + * Derived from net/ipv4/netfilter/ip_conntrack_core.c + */ + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +/* This rwlock protects the main hash table, protocol/helper/expected + registrations, conntrack timers*/ +#define ASSERT_READ_LOCK(x) +#define ASSERT_WRITE_LOCK(x) + +#include +#include +#include +#include +#include +#include + +#define NF_CONNTRACK_VERSION "0.4.1" + +#if 0 +#define DEBUGP printk +#else +#define DEBUGP(format, args...) +#endif + +DEFINE_RWLOCK(nf_conntrack_lock); + +/* nf_conntrack_standalone needs this */ +atomic_t nf_conntrack_count = ATOMIC_INIT(0); + +void (*nf_conntrack_destroyed)(struct nf_conn *conntrack) = NULL; +LIST_HEAD(nf_conntrack_expect_list); +struct nf_conntrack_protocol **nf_ct_protos[PF_MAX]; +struct nf_conntrack_l3proto *nf_ct_l3protos[PF_MAX]; +static LIST_HEAD(helpers); +unsigned int nf_conntrack_htable_size = 0; +int nf_conntrack_max; +struct list_head *nf_conntrack_hash; +static kmem_cache_t *nf_conntrack_expect_cachep; +struct nf_conn nf_conntrack_untracked; +unsigned int nf_ct_log_invalid; +static LIST_HEAD(unconfirmed); +static int nf_conntrack_vmalloc; + +#ifdef CONFIG_NF_CONNTRACK_EVENTS +struct notifier_block *nf_conntrack_chain; +struct notifier_block *nf_conntrack_expect_chain; + +DEFINE_PER_CPU(struct nf_conntrack_ecache, nf_conntrack_ecache); + +/* deliver cached events and clear cache entry - must be called with locally + * disabled softirqs */ +static inline void +__nf_ct_deliver_cached_events(struct nf_conntrack_ecache *ecache) +{ + DEBUGP("ecache: delivering events for %p\n", ecache->ct); + if (nf_ct_is_confirmed(ecache->ct) && !nf_ct_is_dying(ecache->ct) + && ecache->events) + notifier_call_chain(&nf_conntrack_chain, ecache->events, + ecache->ct); + + ecache->events = 0; + nf_ct_put(ecache->ct); + ecache->ct = NULL; +} + +/* Deliver all cached events for a particular conntrack. This is called + * by code prior to async packet handling for freeing the skb */ +void nf_ct_deliver_cached_events(const struct nf_conn *ct) +{ + struct nf_conntrack_ecache *ecache; + + local_bh_disable(); + ecache = &__get_cpu_var(nf_conntrack_ecache); + if (ecache->ct == ct) + __nf_ct_deliver_cached_events(ecache); + local_bh_enable(); +} + +/* Deliver cached events for old pending events, if current conntrack != old */ +void __nf_ct_event_cache_init(struct nf_conn *ct) +{ + struct nf_conntrack_ecache *ecache; + + /* take care of delivering potentially old events */ + ecache = &__get_cpu_var(nf_conntrack_ecache); + BUG_ON(ecache->ct == ct); + if (ecache->ct) + __nf_ct_deliver_cached_events(ecache); + /* initialize for this conntrack/packet */ + ecache->ct = ct; + nf_conntrack_get(&ct->ct_general); +} + +/* flush the event cache - touches other CPU's data and must not be called + * while packets are still passing through the code */ +static void nf_ct_event_cache_flush(void) +{ + struct nf_conntrack_ecache *ecache; + int cpu; + + for_each_cpu(cpu) { + ecache = &per_cpu(nf_conntrack_ecache, cpu); + if (ecache->ct) + nf_ct_put(ecache->ct); + } +} +#else +static inline void nf_ct_event_cache_flush(void) {} +#endif /* CONFIG_NF_CONNTRACK_EVENTS */ + +DEFINE_PER_CPU(struct ip_conntrack_stat, nf_conntrack_stat); +EXPORT_PER_CPU_SYMBOL(nf_conntrack_stat); + +/* + * This scheme offers various size of "struct nf_conn" dependent on + * features(helper, nat, ...) + */ + +#define NF_CT_FEATURES_NAMELEN 256 +static struct { + /* name of slab cache. printed in /proc/slabinfo */ + char *name; + + /* size of slab cache */ + size_t size; + + /* slab cache pointer */ + kmem_cache_t *cachep; + + /* allocated slab cache + modules which uses this slab cache */ + int use; + + /* Initialization */ + int (*init_conntrack)(struct nf_conn *, u_int32_t); + +} nf_ct_cache[NF_CT_F_NUM]; + +/* protect members of nf_ct_cache except of "use" */ +DEFINE_RWLOCK(nf_ct_cache_lock); + +/* This avoids calling kmem_cache_create() with same name simultaneously */ +DECLARE_MUTEX(nf_ct_cache_mutex); + +extern struct nf_conntrack_protocol nf_conntrack_generic_protocol; +struct nf_conntrack_protocol * +nf_ct_find_proto(u_int16_t l3proto, u_int8_t protocol) +{ + if (unlikely(nf_ct_protos[l3proto] == NULL)) + return &nf_conntrack_generic_protocol; + + return nf_ct_protos[l3proto][protocol]; +} + +static int nf_conntrack_hash_rnd_initted; +static unsigned int nf_conntrack_hash_rnd; + +static u_int32_t __hash_conntrack(const struct nf_conntrack_tuple *tuple, + unsigned int size, unsigned int rnd) +{ + unsigned int a, b; + a = jhash((void *)tuple->src.u3.all, sizeof(tuple->src.u3.all), + ((tuple->src.l3num) << 16) | tuple->dst.protonum); + b = jhash((void *)tuple->dst.u3.all, sizeof(tuple->dst.u3.all), + (tuple->src.u.all << 16) | tuple->dst.u.all); + + return jhash_2words(a, b, rnd) % size; +} + +static inline u_int32_t hash_conntrack(const struct nf_conntrack_tuple *tuple) +{ + return __hash_conntrack(tuple, nf_conntrack_htable_size, + nf_conntrack_hash_rnd); +} + +/* Initialize "struct nf_conn" which has spaces for helper */ +static int +init_conntrack_for_helper(struct nf_conn *conntrack, u_int32_t features) +{ + + conntrack->help = (union nf_conntrack_help *) + (((unsigned long)conntrack->data + + (__alignof__(union nf_conntrack_help) - 1)) + & (~((unsigned long)(__alignof__(union nf_conntrack_help) -1)))); + return 0; +} + +int nf_conntrack_register_cache(u_int32_t features, const char *name, + size_t size, + int (*init)(struct nf_conn *, u_int32_t)) +{ + int ret = 0; + char *cache_name; + kmem_cache_t *cachep; + + DEBUGP("nf_conntrack_register_cache: features=0x%x, name=%s, size=%d\n", + features, name, size); + + if (features < NF_CT_F_BASIC || features >= NF_CT_F_NUM) { + DEBUGP("nf_conntrack_register_cache: invalid features.: 0x%x\n", + features); + return -EINVAL; + } + + down(&nf_ct_cache_mutex); + + write_lock_bh(&nf_ct_cache_lock); + /* e.g: multiple helpers are loaded */ + if (nf_ct_cache[features].use > 0) { + DEBUGP("nf_conntrack_register_cache: already resisterd.\n"); + if ((!strncmp(nf_ct_cache[features].name, name, + NF_CT_FEATURES_NAMELEN)) + && nf_ct_cache[features].size == size + && nf_ct_cache[features].init_conntrack == init) { + DEBUGP("nf_conntrack_register_cache: reusing.\n"); + nf_ct_cache[features].use++; + ret = 0; + } else + ret = -EBUSY; + + write_unlock_bh(&nf_ct_cache_lock); + up(&nf_ct_cache_mutex); + return ret; + } + write_unlock_bh(&nf_ct_cache_lock); + + /* + * The memory space for name of slab cache must be alive until + * cache is destroyed. + */ + cache_name = kmalloc(sizeof(char)*NF_CT_FEATURES_NAMELEN, GFP_ATOMIC); + if (cache_name == NULL) { + DEBUGP("nf_conntrack_register_cache: can't alloc cache_name\n"); + ret = -ENOMEM; + goto out_up_mutex; + } + + if (strlcpy(cache_name, name, NF_CT_FEATURES_NAMELEN) + >= NF_CT_FEATURES_NAMELEN) { + printk("nf_conntrack_register_cache: name too long\n"); + ret = -EINVAL; + goto out_free_name; + } + + cachep = kmem_cache_create(cache_name, size, 0, 0, + NULL, NULL); + if (!cachep) { + printk("nf_conntrack_register_cache: Can't create slab cache " + "for the features = 0x%x\n", features); + ret = -ENOMEM; + goto out_free_name; + } + + write_lock_bh(&nf_ct_cache_lock); + nf_ct_cache[features].use = 1; + nf_ct_cache[features].size = size; + nf_ct_cache[features].init_conntrack = init; + nf_ct_cache[features].cachep = cachep; + nf_ct_cache[features].name = cache_name; + write_unlock_bh(&nf_ct_cache_lock); + + goto out_up_mutex; + +out_free_name: + kfree(cache_name); +out_up_mutex: + up(&nf_ct_cache_mutex); + return ret; +} + +/* FIXME: In the current, only nf_conntrack_cleanup() can call this function. */ +void nf_conntrack_unregister_cache(u_int32_t features) +{ + kmem_cache_t *cachep; + char *name; + + /* + * This assures that kmem_cache_create() isn't called before destroying + * slab cache. + */ + DEBUGP("nf_conntrack_unregister_cache: 0x%04x\n", features); + down(&nf_ct_cache_mutex); + + write_lock_bh(&nf_ct_cache_lock); + if (--nf_ct_cache[features].use > 0) { + write_unlock_bh(&nf_ct_cache_lock); + up(&nf_ct_cache_mutex); + return; + } + cachep = nf_ct_cache[features].cachep; + name = nf_ct_cache[features].name; + nf_ct_cache[features].cachep = NULL; + nf_ct_cache[features].name = NULL; + nf_ct_cache[features].init_conntrack = NULL; + nf_ct_cache[features].size = 0; + write_unlock_bh(&nf_ct_cache_lock); + + synchronize_net(); + + kmem_cache_destroy(cachep); + kfree(name); + + up(&nf_ct_cache_mutex); +} + +int +nf_ct_get_tuple(const struct sk_buff *skb, + unsigned int nhoff, + unsigned int dataoff, + u_int16_t l3num, + u_int8_t protonum, + struct nf_conntrack_tuple *tuple, + const struct nf_conntrack_l3proto *l3proto, + const struct nf_conntrack_protocol *protocol) +{ + NF_CT_TUPLE_U_BLANK(tuple); + + tuple->src.l3num = l3num; + if (l3proto->pkt_to_tuple(skb, nhoff, tuple) == 0) + return 0; + + tuple->dst.protonum = protonum; + tuple->dst.dir = IP_CT_DIR_ORIGINAL; + + return protocol->pkt_to_tuple(skb, dataoff, tuple); +} + +int +nf_ct_invert_tuple(struct nf_conntrack_tuple *inverse, + const struct nf_conntrack_tuple *orig, + const struct nf_conntrack_l3proto *l3proto, + const struct nf_conntrack_protocol *protocol) +{ + NF_CT_TUPLE_U_BLANK(inverse); + + inverse->src.l3num = orig->src.l3num; + if (l3proto->invert_tuple(inverse, orig) == 0) + return 0; + + inverse->dst.dir = !orig->dst.dir; + + inverse->dst.protonum = orig->dst.protonum; + return protocol->invert_tuple(inverse, orig); +} + +/* nf_conntrack_expect helper functions */ +static void nf_ct_unlink_expect(struct nf_conntrack_expect *exp) +{ + ASSERT_WRITE_LOCK(&nf_conntrack_lock); + NF_CT_ASSERT(!timer_pending(&exp_timeout)); + list_del(&exp->list); + NF_CT_STAT_INC(expect_delete); + exp->master->expecting--; + nf_conntrack_expect_put(exp); +} + +static void expectation_timed_out(unsigned long ul_expect) +{ + struct nf_conntrack_expect *exp = (void *)ul_expect; + + write_lock_bh(&nf_conntrack_lock); + nf_ct_unlink_expect(exp); + write_unlock_bh(&nf_conntrack_lock); + nf_conntrack_expect_put(exp); +} + +/* If an expectation for this connection is found, it gets delete from + * global list then returned. */ +static struct nf_conntrack_expect * +find_expectation(const struct nf_conntrack_tuple *tuple) +{ + struct nf_conntrack_expect *i; + + list_for_each_entry(i, &nf_conntrack_expect_list, list) { + /* If master is not in hash table yet (ie. packet hasn't left + this machine yet), how can other end know about expected? + Hence these are not the droids you are looking for (if + master ct never got confirmed, we'd hold a reference to it + and weird things would happen to future packets). */ + if (nf_ct_tuple_mask_cmp(tuple, &i->tuple, &i->mask) + && nf_ct_is_confirmed(i->master)) { + if (i->flags & NF_CT_EXPECT_PERMANENT) { + atomic_inc(&i->use); + return i; + } else if (del_timer(&i->timeout)) { + nf_ct_unlink_expect(i); + return i; + } + } + } + return NULL; +} + +/* delete all expectations for this conntrack */ +static void remove_expectations(struct nf_conn *ct) +{ + struct nf_conntrack_expect *i, *tmp; + + /* Optimization: most connection never expect any others. */ + if (ct->expecting == 0) + return; + + list_for_each_entry_safe(i, tmp, &nf_conntrack_expect_list, list) { + if (i->master == ct && del_timer(&i->timeout)) { + nf_ct_unlink_expect(i); + nf_conntrack_expect_put(i); + } + } +} + +static void +clean_from_lists(struct nf_conn *ct) +{ + unsigned int ho, hr; + + DEBUGP("clean_from_lists(%p)\n", ct); + ASSERT_WRITE_LOCK(&nf_conntrack_lock); + + ho = hash_conntrack(&ct->tuplehash[IP_CT_DIR_ORIGINAL].tuple); + hr = hash_conntrack(&ct->tuplehash[IP_CT_DIR_REPLY].tuple); + LIST_DELETE(&nf_conntrack_hash[ho], &ct->tuplehash[IP_CT_DIR_ORIGINAL]); + LIST_DELETE(&nf_conntrack_hash[hr], &ct->tuplehash[IP_CT_DIR_REPLY]); + + /* Destroy all pending expectations */ + remove_expectations(ct); +} + +static void +destroy_conntrack(struct nf_conntrack *nfct) +{ + struct nf_conn *ct = (struct nf_conn *)nfct; + struct nf_conntrack_l3proto *l3proto; + struct nf_conntrack_protocol *proto; + + DEBUGP("destroy_conntrack(%p)\n", ct); + NF_CT_ASSERT(atomic_read(&nfct->use) == 0); + NF_CT_ASSERT(!timer_pending(&ct->timeout)); + + nf_conntrack_event(IPCT_DESTROY, ct); + set_bit(IPS_DYING_BIT, &ct->status); + + /* To make sure we don't get any weird locking issues here: + * destroy_conntrack() MUST NOT be called with a write lock + * to nf_conntrack_lock!!! -HW */ + l3proto = nf_ct_find_l3proto(ct->tuplehash[IP_CT_DIR_REPLY].tuple.src.l3num); + if (l3proto && l3proto->destroy) + l3proto->destroy(ct); + + proto = nf_ct_find_proto(ct->tuplehash[IP_CT_DIR_REPLY].tuple.src.l3num, + ct->tuplehash[IP_CT_DIR_REPLY].tuple.dst.protonum); + if (proto && proto->destroy) + proto->destroy(ct); + + if (nf_conntrack_destroyed) + nf_conntrack_destroyed(ct); + + write_lock_bh(&nf_conntrack_lock); + /* Expectations will have been removed in clean_from_lists, + * except TFTP can create an expectation on the first packet, + * before connection is in the list, so we need to clean here, + * too. */ + remove_expectations(ct); + + /* We overload first tuple to link into unconfirmed list. */ + if (!nf_ct_is_confirmed(ct)) { + BUG_ON(list_empty(&ct->tuplehash[IP_CT_DIR_ORIGINAL].list)); + list_del(&ct->tuplehash[IP_CT_DIR_ORIGINAL].list); + } + + NF_CT_STAT_INC(delete); + write_unlock_bh(&nf_conntrack_lock); + + if (ct->master) + nf_ct_put(ct->master); + + DEBUGP("destroy_conntrack: returning ct=%p to slab\n", ct); + nf_conntrack_free(ct); +} + +static void death_by_timeout(unsigned long ul_conntrack) +{ + struct nf_conn *ct = (void *)ul_conntrack; + + write_lock_bh(&nf_conntrack_lock); + /* Inside lock so preempt is disabled on module removal path. + * Otherwise we can get spurious warnings. */ + NF_CT_STAT_INC(delete_list); + clean_from_lists(ct); + write_unlock_bh(&nf_conntrack_lock); + nf_ct_put(ct); +} + +static inline int +conntrack_tuple_cmp(const struct nf_conntrack_tuple_hash *i, + const struct nf_conntrack_tuple *tuple, + const struct nf_conn *ignored_conntrack) +{ + ASSERT_READ_LOCK(&nf_conntrack_lock); + return nf_ct_tuplehash_to_ctrack(i) != ignored_conntrack + && nf_ct_tuple_equal(tuple, &i->tuple); +} + +static struct nf_conntrack_tuple_hash * +__nf_conntrack_find(const struct nf_conntrack_tuple *tuple, + const struct nf_conn *ignored_conntrack) +{ + struct nf_conntrack_tuple_hash *h; + unsigned int hash = hash_conntrack(tuple); + + ASSERT_READ_LOCK(&nf_conntrack_lock); + list_for_each_entry(h, &nf_conntrack_hash[hash], list) { + if (conntrack_tuple_cmp(h, tuple, ignored_conntrack)) { + NF_CT_STAT_INC(found); + return h; + } + NF_CT_STAT_INC(searched); + } + + return NULL; +} + +/* Find a connection corresponding to a tuple. */ +struct nf_conntrack_tuple_hash * +nf_conntrack_find_get(const struct nf_conntrack_tuple *tuple, + const struct nf_conn *ignored_conntrack) +{ + struct nf_conntrack_tuple_hash *h; + + read_lock_bh(&nf_conntrack_lock); + h = __nf_conntrack_find(tuple, ignored_conntrack); + if (h) + atomic_inc(&nf_ct_tuplehash_to_ctrack(h)->ct_general.use); + read_unlock_bh(&nf_conntrack_lock); + + return h; +} + +/* Confirm a connection given skb; places it in hash table */ +int +__nf_conntrack_confirm(struct sk_buff **pskb) +{ + unsigned int hash, repl_hash; + struct nf_conn *ct; + enum ip_conntrack_info ctinfo; + + ct = nf_ct_get(*pskb, &ctinfo); + + /* ipt_REJECT uses nf_conntrack_attach to attach related + ICMP/TCP RST packets in other direction. Actual packet + which created connection will be IP_CT_NEW or for an + expected connection, IP_CT_RELATED. */ + if (CTINFO2DIR(ctinfo) != IP_CT_DIR_ORIGINAL) + return NF_ACCEPT; + + hash = hash_conntrack(&ct->tuplehash[IP_CT_DIR_ORIGINAL].tuple); + repl_hash = hash_conntrack(&ct->tuplehash[IP_CT_DIR_REPLY].tuple); + + /* We're not in hash table, and we refuse to set up related + connections for unconfirmed conns. But packet copies and + REJECT will give spurious warnings here. */ + /* NF_CT_ASSERT(atomic_read(&ct->ct_general.use) == 1); */ + + /* No external references means noone else could have + confirmed us. */ + NF_CT_ASSERT(!nf_ct_is_confirmed(ct)); + DEBUGP("Confirming conntrack %p\n", ct); + + write_lock_bh(&nf_conntrack_lock); + + /* See if there's one in the list already, including reverse: + NAT could have grabbed it without realizing, since we're + not in the hash. If there is, we lost race. */ + if (!LIST_FIND(&nf_conntrack_hash[hash], + conntrack_tuple_cmp, + struct nf_conntrack_tuple_hash *, + &ct->tuplehash[IP_CT_DIR_ORIGINAL].tuple, NULL) + && !LIST_FIND(&nf_conntrack_hash[repl_hash], + conntrack_tuple_cmp, + struct nf_conntrack_tuple_hash *, + &ct->tuplehash[IP_CT_DIR_REPLY].tuple, NULL)) { + /* Remove from unconfirmed list */ + list_del(&ct->tuplehash[IP_CT_DIR_ORIGINAL].list); + + list_prepend(&nf_conntrack_hash[hash], + &ct->tuplehash[IP_CT_DIR_ORIGINAL]); + list_prepend(&nf_conntrack_hash[repl_hash], + &ct->tuplehash[IP_CT_DIR_REPLY]); + /* Timer relative to confirmation time, not original + setting time, otherwise we'd get timer wrap in + weird delay cases. */ + ct->timeout.expires += jiffies; + add_timer(&ct->timeout); + atomic_inc(&ct->ct_general.use); + set_bit(IPS_CONFIRMED_BIT, &ct->status); + NF_CT_STAT_INC(insert); + write_unlock_bh(&nf_conntrack_lock); + if (ct->helper) + nf_conntrack_event_cache(IPCT_HELPER, *pskb); +#ifdef CONFIG_NF_NAT_NEEDED + if (test_bit(IPS_SRC_NAT_DONE_BIT, &ct->status) || + test_bit(IPS_DST_NAT_DONE_BIT, &ct->status)) + nf_conntrack_event_cache(IPCT_NATINFO, *pskb); +#endif + nf_conntrack_event_cache(master_ct(ct) ? + IPCT_RELATED : IPCT_NEW, *pskb); + return NF_ACCEPT; + } + + NF_CT_STAT_INC(insert_failed); + write_unlock_bh(&nf_conntrack_lock); + return NF_DROP; +} + +/* Returns true if a connection correspondings to the tuple (required + for NAT). */ +int +nf_conntrack_tuple_taken(const struct nf_conntrack_tuple *tuple, + const struct nf_conn *ignored_conntrack) +{ + struct nf_conntrack_tuple_hash *h; + + read_lock_bh(&nf_conntrack_lock); + h = __nf_conntrack_find(tuple, ignored_conntrack); + read_unlock_bh(&nf_conntrack_lock); + + return h != NULL; +} + +/* There's a small race here where we may free a just-assured + connection. Too bad: we're in trouble anyway. */ +static inline int unreplied(const struct nf_conntrack_tuple_hash *i) +{ + return !(test_bit(IPS_ASSURED_BIT, + &nf_ct_tuplehash_to_ctrack(i)->status)); +} + +static int early_drop(struct list_head *chain) +{ + /* Traverse backwards: gives us oldest, which is roughly LRU */ + struct nf_conntrack_tuple_hash *h; + struct nf_conn *ct = NULL; + int dropped = 0; + + read_lock_bh(&nf_conntrack_lock); + h = LIST_FIND_B(chain, unreplied, struct nf_conntrack_tuple_hash *); + if (h) { + ct = nf_ct_tuplehash_to_ctrack(h); + atomic_inc(&ct->ct_general.use); + } + read_unlock_bh(&nf_conntrack_lock); + + if (!ct) + return dropped; + + if (del_timer(&ct->timeout)) { + death_by_timeout((unsigned long)ct); + dropped = 1; + NF_CT_STAT_INC(early_drop); + } + nf_ct_put(ct); + return dropped; +} + +static inline int helper_cmp(const struct nf_conntrack_helper *i, + const struct nf_conntrack_tuple *rtuple) +{ + return nf_ct_tuple_mask_cmp(rtuple, &i->tuple, &i->mask); +} + +static struct nf_conntrack_helper * +nf_ct_find_helper(const struct nf_conntrack_tuple *tuple) +{ + return LIST_FIND(&helpers, helper_cmp, + struct nf_conntrack_helper *, + tuple); +} + +static struct nf_conn * +__nf_conntrack_alloc(const struct nf_conntrack_tuple *orig, + const struct nf_conntrack_tuple *repl, + const struct nf_conntrack_l3proto *l3proto) +{ + struct nf_conn *conntrack = NULL; + u_int32_t features = 0; + + if (!nf_conntrack_hash_rnd_initted) { + get_random_bytes(&nf_conntrack_hash_rnd, 4); + nf_conntrack_hash_rnd_initted = 1; + } + + if (nf_conntrack_max + && atomic_read(&nf_conntrack_count) >= nf_conntrack_max) { + unsigned int hash = hash_conntrack(orig); + /* Try dropping from this hash chain. */ + if (!early_drop(&nf_conntrack_hash[hash])) { + if (net_ratelimit()) + printk(KERN_WARNING + "nf_conntrack: table full, dropping" + " packet.\n"); + return ERR_PTR(-ENOMEM); + } + } + + /* find features needed by this conntrack. */ + features = l3proto->get_features(orig); + read_lock_bh(&nf_conntrack_lock); + if (nf_ct_find_helper(repl) != NULL) + features |= NF_CT_F_HELP; + read_unlock_bh(&nf_conntrack_lock); + + DEBUGP("nf_conntrack_alloc: features=0x%x\n", features); + + read_lock_bh(&nf_ct_cache_lock); + + if (!nf_ct_cache[features].use) { + DEBUGP("nf_conntrack_alloc: not supported features = 0x%x\n", + features); + goto out; + } + + conntrack = kmem_cache_alloc(nf_ct_cache[features].cachep, GFP_ATOMIC); + if (conntrack == NULL) { + DEBUGP("nf_conntrack_alloc: Can't alloc conntrack from cache\n"); + goto out; + } + + memset(conntrack, 0, nf_ct_cache[features].size); + conntrack->features = features; + if (nf_ct_cache[features].init_conntrack && + nf_ct_cache[features].init_conntrack(conntrack, features) < 0) { + DEBUGP("nf_conntrack_alloc: failed to init\n"); + kmem_cache_free(nf_ct_cache[features].cachep, conntrack); + conntrack = NULL; + goto out; + } + + atomic_set(&conntrack->ct_general.use, 1); + conntrack->ct_general.destroy = destroy_conntrack; + conntrack->tuplehash[IP_CT_DIR_ORIGINAL].tuple = *orig; + conntrack->tuplehash[IP_CT_DIR_REPLY].tuple = *repl; + /* Don't set timer yet: wait for confirmation */ + init_timer(&conntrack->timeout); + conntrack->timeout.data = (unsigned long)conntrack; + conntrack->timeout.function = death_by_timeout; + + atomic_inc(&nf_conntrack_count); +out: + read_unlock_bh(&nf_ct_cache_lock); + return conntrack; +} + +struct nf_conn *nf_conntrack_alloc(const struct nf_conntrack_tuple *orig, + const struct nf_conntrack_tuple *repl) +{ + struct nf_conntrack_l3proto *l3proto; + + l3proto = nf_ct_find_l3proto(orig->src.l3num); + return __nf_conntrack_alloc(orig, repl, l3proto); +} + +void nf_conntrack_free(struct nf_conn *conntrack) +{ + u_int32_t features = conntrack->features; + NF_CT_ASSERT(features >= NF_CT_F_BASIC && features < NF_CT_F_NUM); + DEBUGP("nf_conntrack_free: features = 0x%x, conntrack=%p\n", features, + conntrack); + kmem_cache_free(nf_ct_cache[features].cachep, conntrack); + atomic_dec(&nf_conntrack_count); +} + +/* Allocate a new conntrack: we return -ENOMEM if classification + failed due to stress. Otherwise it really is unclassifiable. */ +static struct nf_conntrack_tuple_hash * +init_conntrack(const struct nf_conntrack_tuple *tuple, + struct nf_conntrack_l3proto *l3proto, + struct nf_conntrack_protocol *protocol, + struct sk_buff *skb, + unsigned int dataoff) +{ + struct nf_conn *conntrack; + struct nf_conntrack_tuple repl_tuple; + struct nf_conntrack_expect *exp; + + if (!nf_ct_invert_tuple(&repl_tuple, tuple, l3proto, protocol)) { + DEBUGP("Can't invert tuple.\n"); + return NULL; + } + + conntrack = __nf_conntrack_alloc(tuple, &repl_tuple, l3proto); + if (conntrack == NULL || IS_ERR(conntrack)) { + DEBUGP("Can't allocate conntrack.\n"); + return (struct nf_conntrack_tuple_hash *)conntrack; + } + + if (!protocol->new(conntrack, skb, dataoff)) { + nf_conntrack_free(conntrack); + DEBUGP("init conntrack: can't track with proto module\n"); + return NULL; + } + + write_lock_bh(&nf_conntrack_lock); + exp = find_expectation(tuple); + + if (exp) { + DEBUGP("conntrack: expectation arrives ct=%p exp=%p\n", + conntrack, exp); + /* Welcome, Mr. Bond. We've been expecting you... */ + __set_bit(IPS_EXPECTED_BIT, &conntrack->status); + conntrack->master = exp->master; +#ifdef CONFIG_NF_CONNTRACK_MARK + conntrack->mark = exp->master->mark; +#endif + nf_conntrack_get(&conntrack->master->ct_general); + NF_CT_STAT_INC(expect_new); + } else { + conntrack->helper = nf_ct_find_helper(&repl_tuple); + + NF_CT_STAT_INC(new); + } + + /* Overload tuple linked list to put us in unconfirmed list. */ + list_add(&conntrack->tuplehash[IP_CT_DIR_ORIGINAL].list, &unconfirmed); + + write_unlock_bh(&nf_conntrack_lock); + + if (exp) { + if (exp->expectfn) + exp->expectfn(conntrack, exp); + nf_conntrack_expect_put(exp); + } + + return &conntrack->tuplehash[IP_CT_DIR_ORIGINAL]; +} + +/* On success, returns conntrack ptr, sets skb->nfct and ctinfo */ +static inline struct nf_conn * +resolve_normal_ct(struct sk_buff *skb, + unsigned int dataoff, + u_int16_t l3num, + u_int8_t protonum, + struct nf_conntrack_l3proto *l3proto, + struct nf_conntrack_protocol *proto, + int *set_reply, + enum ip_conntrack_info *ctinfo) +{ + struct nf_conntrack_tuple tuple; + struct nf_conntrack_tuple_hash *h; + struct nf_conn *ct; + + if (!nf_ct_get_tuple(skb, (unsigned int)(skb->nh.raw - skb->data), + dataoff, l3num, protonum, &tuple, l3proto, + proto)) { + DEBUGP("resolve_normal_ct: Can't get tuple\n"); + return NULL; + } + + /* look for tuple match */ + h = nf_conntrack_find_get(&tuple, NULL); + if (!h) { + h = init_conntrack(&tuple, l3proto, proto, skb, dataoff); + if (!h) + return NULL; + if (IS_ERR(h)) + return (void *)h; + } + ct = nf_ct_tuplehash_to_ctrack(h); + + /* It exists; we have (non-exclusive) reference. */ + if (NF_CT_DIRECTION(h) == IP_CT_DIR_REPLY) { + *ctinfo = IP_CT_ESTABLISHED + IP_CT_IS_REPLY; + /* Please set reply bit if this packet OK */ + *set_reply = 1; + } else { + /* Once we've had two way comms, always ESTABLISHED. */ + if (test_bit(IPS_SEEN_REPLY_BIT, &ct->status)) { + DEBUGP("nf_conntrack_in: normal packet for %p\n", ct); + *ctinfo = IP_CT_ESTABLISHED; + } else if (test_bit(IPS_EXPECTED_BIT, &ct->status)) { + DEBUGP("nf_conntrack_in: related packet for %p\n", ct); + *ctinfo = IP_CT_RELATED; + } else { + DEBUGP("nf_conntrack_in: new packet for %p\n", ct); + *ctinfo = IP_CT_NEW; + } + *set_reply = 0; + } + skb->nfct = &ct->ct_general; + skb->nfctinfo = *ctinfo; + return ct; +} + +unsigned int +nf_conntrack_in(int pf, unsigned int hooknum, struct sk_buff **pskb) +{ + struct nf_conn *ct; + enum ip_conntrack_info ctinfo; + struct nf_conntrack_l3proto *l3proto; + struct nf_conntrack_protocol *proto; + unsigned int dataoff; + u_int8_t protonum; + int set_reply = 0; + int ret; + + /* Previously seen (loopback or untracked)? Ignore. */ + if ((*pskb)->nfct) { + NF_CT_STAT_INC(ignore); + return NF_ACCEPT; + } + + l3proto = nf_ct_find_l3proto((u_int16_t)pf); + if ((ret = l3proto->prepare(pskb, hooknum, &dataoff, &protonum)) <= 0) { + DEBUGP("not prepared to track yet or error occured\n"); + return -ret; + } + + proto = nf_ct_find_proto((u_int16_t)pf, protonum); + + /* It may be an special packet, error, unclean... + * inverse of the return code tells to the netfilter + * core what to do with the packet. */ + if (proto->error != NULL && + (ret = proto->error(*pskb, dataoff, &ctinfo, pf, hooknum)) <= 0) { + NF_CT_STAT_INC(error); + NF_CT_STAT_INC(invalid); + return -ret; + } + + ct = resolve_normal_ct(*pskb, dataoff, pf, protonum, l3proto, proto, + &set_reply, &ctinfo); + if (!ct) { + /* Not valid part of a connection */ + NF_CT_STAT_INC(invalid); + return NF_ACCEPT; + } + + if (IS_ERR(ct)) { + /* Too stressed to deal. */ + NF_CT_STAT_INC(drop); + return NF_DROP; + } + + NF_CT_ASSERT((*pskb)->nfct); + + ret = proto->packet(ct, *pskb, dataoff, ctinfo, pf, hooknum); + if (ret < 0) { + /* Invalid: inverse of the return code tells + * the netfilter core what to do */ + DEBUGP("nf_conntrack_in: Can't track with proto module\n"); + nf_conntrack_put((*pskb)->nfct); + (*pskb)->nfct = NULL; + NF_CT_STAT_INC(invalid); + return -ret; + } + + if (set_reply && !test_and_set_bit(IPS_SEEN_REPLY_BIT, &ct->status)) + nf_conntrack_event_cache(IPCT_STATUS, *pskb); + + return ret; +} + +int nf_ct_invert_tuplepr(struct nf_conntrack_tuple *inverse, + const struct nf_conntrack_tuple *orig) +{ + return nf_ct_invert_tuple(inverse, orig, + nf_ct_find_l3proto(orig->src.l3num), + nf_ct_find_proto(orig->src.l3num, + orig->dst.protonum)); +} + +/* Would two expected things clash? */ +static inline int expect_clash(const struct nf_conntrack_expect *a, + const struct nf_conntrack_expect *b) +{ + /* Part covered by intersection of masks must be unequal, + otherwise they clash */ + struct nf_conntrack_tuple intersect_mask; + int count; + + intersect_mask.src.l3num = a->mask.src.l3num & b->mask.src.l3num; + intersect_mask.src.u.all = a->mask.src.u.all & b->mask.src.u.all; + intersect_mask.dst.u.all = a->mask.dst.u.all & b->mask.dst.u.all; + intersect_mask.dst.protonum = a->mask.dst.protonum + & b->mask.dst.protonum; + + for (count = 0; count < NF_CT_TUPLE_L3SIZE; count++){ + intersect_mask.src.u3.all[count] = + a->mask.src.u3.all[count] & b->mask.src.u3.all[count]; + } + + for (count = 0; count < NF_CT_TUPLE_L3SIZE; count++){ + intersect_mask.dst.u3.all[count] = + a->mask.dst.u3.all[count] & b->mask.dst.u3.all[count]; + } + + return nf_ct_tuple_mask_cmp(&a->tuple, &b->tuple, &intersect_mask); +} + +static inline int expect_matches(const struct nf_conntrack_expect *a, + const struct nf_conntrack_expect *b) +{ + return a->master == b->master + && nf_ct_tuple_equal(&a->tuple, &b->tuple) + && nf_ct_tuple_equal(&a->mask, &b->mask); +} + +/* Generally a bad idea to call this: could have matched already. */ +void nf_conntrack_unexpect_related(struct nf_conntrack_expect *exp) +{ + struct nf_conntrack_expect *i; + + write_lock_bh(&nf_conntrack_lock); + /* choose the the oldest expectation to evict */ + list_for_each_entry_reverse(i, &nf_conntrack_expect_list, list) { + if (expect_matches(i, exp) && del_timer(&i->timeout)) { + nf_ct_unlink_expect(i); + write_unlock_bh(&nf_conntrack_lock); + nf_conntrack_expect_put(i); + return; + } + } + write_unlock_bh(&nf_conntrack_lock); +} + +/* We don't increase the master conntrack refcount for non-fulfilled + * conntracks. During the conntrack destruction, the expectations are + * always killed before the conntrack itself */ +struct nf_conntrack_expect *nf_conntrack_expect_alloc(struct nf_conn *me) +{ + struct nf_conntrack_expect *new; + + new = kmem_cache_alloc(nf_conntrack_expect_cachep, GFP_ATOMIC); + if (!new) { + DEBUGP("expect_related: OOM allocating expect\n"); + return NULL; + } + new->master = me; + atomic_set(&new->use, 1); + return new; +} + +void nf_conntrack_expect_put(struct nf_conntrack_expect *exp) +{ + if (atomic_dec_and_test(&exp->use)) + kmem_cache_free(nf_conntrack_expect_cachep, exp); +} + +static void nf_conntrack_expect_insert(struct nf_conntrack_expect *exp) +{ + atomic_inc(&exp->use); + exp->master->expecting++; + list_add(&exp->list, &nf_conntrack_expect_list); + + init_timer(&exp->timeout); + exp->timeout.data = (unsigned long)exp; + exp->timeout.function = expectation_timed_out; + exp->timeout.expires = jiffies + exp->master->helper->timeout * HZ; + add_timer(&exp->timeout); + + atomic_inc(&exp->use); + NF_CT_STAT_INC(expect_create); +} + +/* Race with expectations being used means we could have none to find; OK. */ +static void evict_oldest_expect(struct nf_conn *master) +{ + struct nf_conntrack_expect *i; + + list_for_each_entry_reverse(i, &nf_conntrack_expect_list, list) { + if (i->master == master) { + if (del_timer(&i->timeout)) { + nf_ct_unlink_expect(i); + nf_conntrack_expect_put(i); + } + break; + } + } +} + +static inline int refresh_timer(struct nf_conntrack_expect *i) +{ + if (!del_timer(&i->timeout)) + return 0; + + i->timeout.expires = jiffies + i->master->helper->timeout*HZ; + add_timer(&i->timeout); + return 1; +} + +int nf_conntrack_expect_related(struct nf_conntrack_expect *expect) +{ + struct nf_conntrack_expect *i; + int ret; + + DEBUGP("nf_conntrack_expect_related %p\n", related_to); + DEBUGP("tuple: "); NF_CT_DUMP_TUPLE(&expect->tuple); + DEBUGP("mask: "); NF_CT_DUMP_TUPLE(&expect->mask); + + write_lock_bh(&nf_conntrack_lock); + list_for_each_entry(i, &nf_conntrack_expect_list, list) { + if (expect_matches(i, expect)) { + /* Refresh timer: if it's dying, ignore.. */ + if (refresh_timer(i)) { + ret = 0; + goto out; + } + } else if (expect_clash(i, expect)) { + ret = -EBUSY; + goto out; + } + } + /* Will be over limit? */ + if (expect->master->helper->max_expected && + expect->master->expecting >= expect->master->helper->max_expected) + evict_oldest_expect(expect->master); + + nf_conntrack_expect_insert(expect); + nf_conntrack_expect_event(IPEXP_NEW, expect); + ret = 0; +out: + write_unlock_bh(&nf_conntrack_lock); + return ret; +} + +/* Alter reply tuple (maybe alter helper). This is for NAT, and is + implicitly racy: see __nf_conntrack_confirm */ +void nf_conntrack_alter_reply(struct nf_conn *conntrack, + const struct nf_conntrack_tuple *newreply) +{ + write_lock_bh(&nf_conntrack_lock); + /* Should be unconfirmed, so not in hash table yet */ + NF_CT_ASSERT(!nf_ct_is_confirmed(conntrack)); + + DEBUGP("Altering reply tuple of %p to ", conntrack); + NF_CT_DUMP_TUPLE(newreply); + + conntrack->tuplehash[IP_CT_DIR_REPLY].tuple = *newreply; + if (!conntrack->master && conntrack->expecting == 0) + conntrack->helper = nf_ct_find_helper(newreply); + write_unlock_bh(&nf_conntrack_lock); +} + +int nf_conntrack_helper_register(struct nf_conntrack_helper *me) +{ + int ret; + BUG_ON(me->timeout == 0); + + ret = nf_conntrack_register_cache(NF_CT_F_HELP, "nf_conntrack:help", + sizeof(struct nf_conn) + + sizeof(union nf_conntrack_help) + + __alignof__(union nf_conntrack_help), + init_conntrack_for_helper); + if (ret < 0) { + printk(KERN_ERR "nf_conntrack_helper_reigster: Unable to create slab cache for conntracks\n"); + return ret; + } + write_lock_bh(&nf_conntrack_lock); + list_prepend(&helpers, me); + write_unlock_bh(&nf_conntrack_lock); + + return 0; +} + +static inline int unhelp(struct nf_conntrack_tuple_hash *i, + const struct nf_conntrack_helper *me) +{ + if (nf_ct_tuplehash_to_ctrack(i)->helper == me) { + nf_conntrack_event(IPCT_HELPER, nf_ct_tuplehash_to_ctrack(i)); + nf_ct_tuplehash_to_ctrack(i)->helper = NULL; + } + return 0; +} + +void nf_conntrack_helper_unregister(struct nf_conntrack_helper *me) +{ + unsigned int i; + struct nf_conntrack_expect *exp, *tmp; + + /* Need write lock here, to delete helper. */ + write_lock_bh(&nf_conntrack_lock); + LIST_DELETE(&helpers, me); + + /* Get rid of expectations */ + list_for_each_entry_safe(exp, tmp, &nf_conntrack_expect_list, list) { + if (exp->master->helper == me && del_timer(&exp->timeout)) { + nf_ct_unlink_expect(exp); + nf_conntrack_expect_put(exp); + } + } + + /* Get rid of expecteds, set helpers to NULL. */ + LIST_FIND_W(&unconfirmed, unhelp, struct nf_conntrack_tuple_hash*, me); + for (i = 0; i < nf_conntrack_htable_size; i++) + LIST_FIND_W(&nf_conntrack_hash[i], unhelp, + struct nf_conntrack_tuple_hash *, me); + write_unlock_bh(&nf_conntrack_lock); + + /* Someone could be still looking at the helper in a bh. */ + synchronize_net(); +} + +/* Refresh conntrack for this many jiffies and do accounting if do_acct is 1 */ +void __nf_ct_refresh_acct(struct nf_conn *ct, + enum ip_conntrack_info ctinfo, + const struct sk_buff *skb, + unsigned long extra_jiffies, + int do_acct) +{ + int event = 0; + + NF_CT_ASSERT(ct->timeout.data == (unsigned long)ct); + NF_CT_ASSERT(skb); + + write_lock_bh(&nf_conntrack_lock); + + /* If not in hash table, timer will not be active yet */ + if (!nf_ct_is_confirmed(ct)) { + ct->timeout.expires = extra_jiffies; + event = IPCT_REFRESH; + } else { + /* Need del_timer for race avoidance (may already be dying). */ + if (del_timer(&ct->timeout)) { + ct->timeout.expires = jiffies + extra_jiffies; + add_timer(&ct->timeout); + event = IPCT_REFRESH; + } + } + +#ifdef CONFIG_NF_CT_ACCT + if (do_acct) { + ct->counters[CTINFO2DIR(ctinfo)].packets++; + ct->counters[CTINFO2DIR(ctinfo)].bytes += + skb->len - (unsigned int)(skb->nh.raw - skb->data); + if ((ct->counters[CTINFO2DIR(ctinfo)].packets & 0x80000000) + || (ct->counters[CTINFO2DIR(ctinfo)].bytes & 0x80000000)) + event |= IPCT_COUNTER_FILLING; + } +#endif + + write_unlock_bh(&nf_conntrack_lock); + + /* must be unlocked when calling event cache */ + if (event) + nf_conntrack_event_cache(event, skb); +} + +/* Used by ipt_REJECT and ip6t_REJECT. */ +void __nf_conntrack_attach(struct sk_buff *nskb, struct sk_buff *skb) +{ + struct nf_conn *ct; + enum ip_conntrack_info ctinfo; + + /* This ICMP is in reverse direction to the packet which caused it */ + ct = nf_ct_get(skb, &ctinfo); + if (CTINFO2DIR(ctinfo) == IP_CT_DIR_ORIGINAL) + ctinfo = IP_CT_RELATED + IP_CT_IS_REPLY; + else + ctinfo = IP_CT_RELATED; + + /* Attach to new skbuff, and increment count */ + nskb->nfct = &ct->ct_general; + nskb->nfctinfo = ctinfo; + nf_conntrack_get(nskb->nfct); +} + +static inline int +do_iter(const struct nf_conntrack_tuple_hash *i, + int (*iter)(struct nf_conn *i, void *data), + void *data) +{ + return iter(nf_ct_tuplehash_to_ctrack(i), data); +} + +/* Bring out ya dead! */ +static struct nf_conntrack_tuple_hash * +get_next_corpse(int (*iter)(struct nf_conn *i, void *data), + void *data, unsigned int *bucket) +{ + struct nf_conntrack_tuple_hash *h = NULL; + + write_lock_bh(&nf_conntrack_lock); + for (; *bucket < nf_conntrack_htable_size; (*bucket)++) { + h = LIST_FIND_W(&nf_conntrack_hash[*bucket], do_iter, + struct nf_conntrack_tuple_hash *, iter, data); + if (h) + break; + } + if (!h) + h = LIST_FIND_W(&unconfirmed, do_iter, + struct nf_conntrack_tuple_hash *, iter, data); + if (h) + atomic_inc(&nf_ct_tuplehash_to_ctrack(h)->ct_general.use); + write_unlock_bh(&nf_conntrack_lock); + + return h; +} + +void +nf_ct_iterate_cleanup(int (*iter)(struct nf_conn *i, void *data), void *data) +{ + struct nf_conntrack_tuple_hash *h; + unsigned int bucket = 0; + + while ((h = get_next_corpse(iter, data, &bucket)) != NULL) { + struct nf_conn *ct = nf_ct_tuplehash_to_ctrack(h); + /* Time to push up daises... */ + if (del_timer(&ct->timeout)) + death_by_timeout((unsigned long)ct); + /* ... else the timer will get him soon. */ + + nf_ct_put(ct); + } +} + +static int kill_all(struct nf_conn *i, void *data) +{ + return 1; +} + +static void free_conntrack_hash(struct list_head *hash, int vmalloced, int size) +{ + if (vmalloced) + vfree(hash); + else + free_pages((unsigned long)hash, + get_order(sizeof(struct list_head) * size)); +} + +/* Mishearing the voices in his head, our hero wonders how he's + supposed to kill the mall. */ +void nf_conntrack_cleanup(void) +{ + int i; + + /* This makes sure all current packets have passed through + netfilter framework. Roll on, two-stage module + delete... */ + synchronize_net(); + + nf_ct_event_cache_flush(); + i_see_dead_people: + nf_ct_iterate_cleanup(kill_all, NULL); + if (atomic_read(&nf_conntrack_count) != 0) { + schedule(); + goto i_see_dead_people; + } + + for (i = 0; i < NF_CT_F_NUM; i++) { + if (nf_ct_cache[i].use == 0) + continue; + + NF_CT_ASSERT(nf_ct_cache[i].use == 1); + nf_ct_cache[i].use = 1; + nf_conntrack_unregister_cache(i); + } + kmem_cache_destroy(nf_conntrack_expect_cachep); + free_conntrack_hash(nf_conntrack_hash, nf_conntrack_vmalloc, + nf_conntrack_htable_size); +} + +static struct list_head *alloc_hashtable(int size, int *vmalloced) +{ + struct list_head *hash; + unsigned int i; + + *vmalloced = 0; + hash = (void*)__get_free_pages(GFP_KERNEL, + get_order(sizeof(struct list_head) + * size)); + if (!hash) { + *vmalloced = 1; + printk(KERN_WARNING "nf_conntrack: falling back to vmalloc.\n"); + hash = vmalloc(sizeof(struct list_head) * size); + } + + if (hash) + for (i = 0; i < size; i++) + INIT_LIST_HEAD(&hash[i]); + + return hash; +} + +int set_hashsize(const char *val, struct kernel_param *kp) +{ + int i, bucket, hashsize, vmalloced; + int old_vmalloced, old_size; + int rnd; + struct list_head *hash, *old_hash; + struct nf_conntrack_tuple_hash *h; + + /* On boot, we can set this without any fancy locking. */ + if (!nf_conntrack_htable_size) + return param_set_uint(val, kp); + + hashsize = simple_strtol(val, NULL, 0); + if (!hashsize) + return -EINVAL; + + hash = alloc_hashtable(hashsize, &vmalloced); + if (!hash) + return -ENOMEM; + + /* We have to rehahs for the new table anyway, so we also can + * use a newrandom seed */ + get_random_bytes(&rnd, 4); + + write_lock_bh(&nf_conntrack_lock); + for (i = 0; i < nf_conntrack_htable_size; i++) { + while (!list_empty(&nf_conntrack_hash[i])) { + h = list_entry(nf_conntrack_hash[i].next, + struct nf_conntrack_tuple_hash, list); + list_del(&h->list); + bucket = __hash_conntrack(&h->tuple, hashsize, rnd); + list_add_tail(&h->list, &hash[bucket]); + } + } + old_size = nf_conntrack_htable_size; + old_vmalloced = nf_conntrack_vmalloc; + old_hash = nf_conntrack_hash; + + nf_conntrack_htable_size = hashsize; + nf_conntrack_vmalloc = vmalloced; + nf_conntrack_hash = hash; + nf_conntrack_hash_rnd = rnd; + write_unlock_bh(&nf_conntrack_lock); + + free_conntrack_hash(old_hash, old_vmalloced, old_size); + return 0; +} + +module_param_call(hashsize, set_hashsize, param_get_uint, + &nf_conntrack_htable_size, 0600); + +int __init nf_conntrack_init(void) +{ + unsigned int i; + int ret; + + /* Idea from tcp.c: use 1/16384 of memory. On i386: 32MB + * machine has 256 buckets. >= 1GB machines have 8192 buckets. */ + if (!nf_conntrack_htable_size) { + nf_conntrack_htable_size + = (((num_physpages << PAGE_SHIFT) / 16384) + / sizeof(struct list_head)); + if (num_physpages > (1024 * 1024 * 1024 / PAGE_SIZE)) + nf_conntrack_htable_size = 8192; + if (nf_conntrack_htable_size < 16) + nf_conntrack_htable_size = 16; + } + nf_conntrack_max = 8 * nf_conntrack_htable_size; + + printk("nf_conntrack version %s (%u buckets, %d max)\n", + NF_CONNTRACK_VERSION, nf_conntrack_htable_size, + nf_conntrack_max); + + nf_conntrack_hash = alloc_hashtable(nf_conntrack_htable_size, + &nf_conntrack_vmalloc); + if (!nf_conntrack_hash) { + printk(KERN_ERR "Unable to create nf_conntrack_hash\n"); + goto err_out; + } + + ret = nf_conntrack_register_cache(NF_CT_F_BASIC, "nf_conntrack:basic", + sizeof(struct nf_conn), NULL); + if (ret < 0) { + printk(KERN_ERR "Unable to create nf_conn slab cache\n"); + goto err_free_hash; + } + + nf_conntrack_expect_cachep = kmem_cache_create("nf_conntrack_expect", + sizeof(struct nf_conntrack_expect), + 0, 0, NULL, NULL); + if (!nf_conntrack_expect_cachep) { + printk(KERN_ERR "Unable to create nf_expect slab cache\n"); + goto err_free_conntrack_slab; + } + + /* Don't NEED lock here, but good form anyway. */ + write_lock_bh(&nf_conntrack_lock); + for (i = 0; i < PF_MAX; i++) + nf_ct_l3protos[i] = &nf_conntrack_generic_l3proto; + write_unlock_bh(&nf_conntrack_lock); + + /* Set up fake conntrack: + - to never be deleted, not in any hashes */ + atomic_set(&nf_conntrack_untracked.ct_general.use, 1); + /* - and look it like as a confirmed connection */ + set_bit(IPS_CONFIRMED_BIT, &nf_conntrack_untracked.status); + + return ret; + +err_free_conntrack_slab: + nf_conntrack_unregister_cache(NF_CT_F_BASIC); +err_free_hash: + free_conntrack_hash(nf_conntrack_hash, nf_conntrack_vmalloc, + nf_conntrack_htable_size); +err_out: + return -ENOMEM; +} diff --git a/net/netfilter/nf_conntrack_ftp.c b/net/netfilter/nf_conntrack_ftp.c new file mode 100644 index 000000000000..65080e269f27 --- /dev/null +++ b/net/netfilter/nf_conntrack_ftp.c @@ -0,0 +1,698 @@ +/* FTP extension for connection tracking. */ + +/* (C) 1999-2001 Paul `Rusty' Russell + * (C) 2002-2004 Netfilter Core Team + * (C) 2003,2004 USAGI/WIDE Project + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 as + * published by the Free Software Foundation. + * + * 16 Dec 2003: Yasuyuki Kozakai @USAGI + * - enable working with Layer 3 protocol independent connection tracking. + * - track EPRT and EPSV commands with IPv6 address. + * + * Derived from net/ipv4/netfilter/ip_conntrack_ftp.c + */ + +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include +#include +#include + +MODULE_LICENSE("GPL"); +MODULE_AUTHOR("Rusty Russell "); +MODULE_DESCRIPTION("ftp connection tracking helper"); + +/* This is slow, but it's simple. --RR */ +static char *ftp_buffer; + +static DEFINE_SPINLOCK(nf_ftp_lock); + +#define MAX_PORTS 8 +static u_int16_t ports[MAX_PORTS]; +static unsigned int ports_c; +module_param_array(ports, ushort, &ports_c, 0400); + +static int loose; +module_param(loose, int, 0600); + +unsigned int (*nf_nat_ftp_hook)(struct sk_buff **pskb, + enum ip_conntrack_info ctinfo, + enum ip_ct_ftp_type type, + unsigned int matchoff, + unsigned int matchlen, + struct nf_conntrack_expect *exp, + u32 *seq); +EXPORT_SYMBOL_GPL(nf_nat_ftp_hook); + +#if 0 +#define DEBUGP printk +#else +#define DEBUGP(format, args...) +#endif + +static int try_rfc959(const char *, size_t, struct nf_conntrack_man *, char); +static int try_eprt(const char *, size_t, struct nf_conntrack_man *, char); +static int try_epsv_response(const char *, size_t, struct nf_conntrack_man *, + char); + +static struct ftp_search { + enum ip_conntrack_dir dir; + const char *pattern; + size_t plen; + char skip; + char term; + enum ip_ct_ftp_type ftptype; + int (*getnum)(const char *, size_t, struct nf_conntrack_man *, char); +} search[] = { + { + IP_CT_DIR_ORIGINAL, + "PORT", sizeof("PORT") - 1, ' ', '\r', + IP_CT_FTP_PORT, + try_rfc959, + }, + { + IP_CT_DIR_REPLY, + "227 ", sizeof("227 ") - 1, '(', ')', + IP_CT_FTP_PASV, + try_rfc959, + }, + { + IP_CT_DIR_ORIGINAL, + "EPRT", sizeof("EPRT") - 1, ' ', '\r', + IP_CT_FTP_EPRT, + try_eprt, + }, + { + IP_CT_DIR_REPLY, + "229 ", sizeof("229 ") - 1, '(', ')', + IP_CT_FTP_EPSV, + try_epsv_response, + }, +}; + +/* This code is based on inet_pton() in glibc-2.2.4 */ +static int +get_ipv6_addr(const char *src, size_t dlen, struct in6_addr *dst, u_int8_t term) +{ + static const char xdigits[] = "0123456789abcdef"; + u_int8_t tmp[16], *tp, *endp, *colonp; + int ch, saw_xdigit; + u_int32_t val; + size_t clen = 0; + + tp = memset(tmp, '\0', sizeof(tmp)); + endp = tp + sizeof(tmp); + colonp = NULL; + + /* Leading :: requires some special handling. */ + if (*src == ':'){ + if (*++src != ':') { + DEBUGP("invalid \":\" at the head of addr\n"); + return 0; + } + clen++; + } + + saw_xdigit = 0; + val = 0; + while ((clen < dlen) && (*src != term)) { + const char *pch; + + ch = tolower(*src++); + clen++; + + pch = strchr(xdigits, ch); + if (pch != NULL) { + val <<= 4; + val |= (pch - xdigits); + if (val > 0xffff) + return 0; + + saw_xdigit = 1; + continue; + } + if (ch != ':') { + DEBUGP("get_ipv6_addr: invalid char. \'%c\'\n", ch); + return 0; + } + + if (!saw_xdigit) { + if (colonp) { + DEBUGP("invalid location of \"::\".\n"); + return 0; + } + colonp = tp; + continue; + } else if (*src == term) { + DEBUGP("trancated IPv6 addr\n"); + return 0; + } + + if (tp + 2 > endp) + return 0; + *tp++ = (u_int8_t) (val >> 8) & 0xff; + *tp++ = (u_int8_t) val & 0xff; + + saw_xdigit = 0; + val = 0; + continue; + } + if (saw_xdigit) { + if (tp + 2 > endp) + return 0; + *tp++ = (u_int8_t) (val >> 8) & 0xff; + *tp++ = (u_int8_t) val & 0xff; + } + if (colonp != NULL) { + /* + * Since some memmove()'s erroneously fail to handle + * overlapping regions, we'll do the shift by hand. + */ + const int n = tp - colonp; + int i; + + if (tp == endp) + return 0; + + for (i = 1; i <= n; i++) { + endp[- i] = colonp[n - i]; + colonp[n - i] = 0; + } + tp = endp; + } + if (tp != endp || (*src != term)) + return 0; + + memcpy(dst->s6_addr, tmp, sizeof(dst->s6_addr)); + return clen; +} + +static int try_number(const char *data, size_t dlen, u_int32_t array[], + int array_size, char sep, char term) +{ + u_int32_t i, len; + + memset(array, 0, sizeof(array[0])*array_size); + + /* Keep data pointing at next char. */ + for (i = 0, len = 0; len < dlen && i < array_size; len++, data++) { + if (*data >= '0' && *data <= '9') { + array[i] = array[i]*10 + *data - '0'; + } + else if (*data == sep) + i++; + else { + /* Unexpected character; true if it's the + terminator and we're finished. */ + if (*data == term && i == array_size - 1) + return len; + + DEBUGP("Char %u (got %u nums) `%u' unexpected\n", + len, i, *data); + return 0; + } + } + DEBUGP("Failed to fill %u numbers separated by %c\n", array_size, sep); + + return 0; +} + +/* Returns 0, or length of numbers: 192,168,1,1,5,6 */ +static int try_rfc959(const char *data, size_t dlen, + struct nf_conntrack_man *cmd, char term) +{ + int length; + u_int32_t array[6]; + + length = try_number(data, dlen, array, 6, ',', term); + if (length == 0) + return 0; + + cmd->u3.ip = htonl((array[0] << 24) | (array[1] << 16) | + (array[2] << 8) | array[3]); + cmd->u.tcp.port = htons((array[4] << 8) | array[5]); + return length; +} + +/* Grab port: number up to delimiter */ +static int get_port(const char *data, int start, size_t dlen, char delim, + u_int16_t *port) +{ + u_int16_t tmp_port = 0; + int i; + + for (i = start; i < dlen; i++) { + /* Finished? */ + if (data[i] == delim) { + if (tmp_port == 0) + break; + *port = htons(tmp_port); + DEBUGP("get_port: return %d\n", tmp_port); + return i + 1; + } + else if (data[i] >= '0' && data[i] <= '9') + tmp_port = tmp_port*10 + data[i] - '0'; + else { /* Some other crap */ + DEBUGP("get_port: invalid char.\n"); + break; + } + } + return 0; +} + +/* Returns 0, or length of numbers: |1|132.235.1.2|6275| or |2|3ffe::1|6275| */ +static int try_eprt(const char *data, size_t dlen, struct nf_conntrack_man *cmd, + char term) +{ + char delim; + int length; + + /* First character is delimiter, then "1" for IPv4 or "2" for IPv6, + then delimiter again. */ + if (dlen <= 3) { + DEBUGP("EPRT: too short\n"); + return 0; + } + delim = data[0]; + if (isdigit(delim) || delim < 33 || delim > 126 || data[2] != delim) { + DEBUGP("try_eprt: invalid delimitter.\n"); + return 0; + } + + if ((cmd->l3num == PF_INET && data[1] != '1') || + (cmd->l3num == PF_INET6 && data[1] != '2')) { + DEBUGP("EPRT: invalid protocol number.\n"); + return 0; + } + + DEBUGP("EPRT: Got %c%c%c\n", delim, data[1], delim); + + if (data[1] == '1') { + u_int32_t array[4]; + + /* Now we have IP address. */ + length = try_number(data + 3, dlen - 3, array, 4, '.', delim); + if (length != 0) + cmd->u3.ip = htonl((array[0] << 24) | (array[1] << 16) + | (array[2] << 8) | array[3]); + } else { + /* Now we have IPv6 address. */ + length = get_ipv6_addr(data + 3, dlen - 3, + (struct in6_addr *)cmd->u3.ip6, delim); + } + + if (length == 0) + return 0; + DEBUGP("EPRT: Got IP address!\n"); + /* Start offset includes initial "|1|", and trailing delimiter */ + return get_port(data, 3 + length + 1, dlen, delim, &cmd->u.tcp.port); +} + +/* Returns 0, or length of numbers: |||6446| */ +static int try_epsv_response(const char *data, size_t dlen, + struct nf_conntrack_man *cmd, char term) +{ + char delim; + + /* Three delimiters. */ + if (dlen <= 3) return 0; + delim = data[0]; + if (isdigit(delim) || delim < 33 || delim > 126 + || data[1] != delim || data[2] != delim) + return 0; + + return get_port(data, 3, dlen, delim, &cmd->u.tcp.port); +} + +/* Return 1 for match, 0 for accept, -1 for partial. */ +static int find_pattern(const char *data, size_t dlen, + const char *pattern, size_t plen, + char skip, char term, + unsigned int *numoff, + unsigned int *numlen, + struct nf_conntrack_man *cmd, + int (*getnum)(const char *, size_t, + struct nf_conntrack_man *, char)) +{ + size_t i; + + DEBUGP("find_pattern `%s': dlen = %u\n", pattern, dlen); + if (dlen == 0) + return 0; + + if (dlen <= plen) { + /* Short packet: try for partial? */ + if (strnicmp(data, pattern, dlen) == 0) + return -1; + else return 0; + } + + if (strnicmp(data, pattern, plen) != 0) { +#if 0 + size_t i; + + DEBUGP("ftp: string mismatch\n"); + for (i = 0; i < plen; i++) { + DEBUGP("ftp:char %u `%c'(%u) vs `%c'(%u)\n", + i, data[i], data[i], + pattern[i], pattern[i]); + } +#endif + return 0; + } + + DEBUGP("Pattern matches!\n"); + /* Now we've found the constant string, try to skip + to the 'skip' character */ + for (i = plen; data[i] != skip; i++) + if (i == dlen - 1) return -1; + + /* Skip over the last character */ + i++; + + DEBUGP("Skipped up to `%c'!\n", skip); + + *numoff = i; + *numlen = getnum(data + i, dlen - i, cmd, term); + if (!*numlen) + return -1; + + DEBUGP("Match succeeded!\n"); + return 1; +} + +/* Look up to see if we're just after a \n. */ +static int find_nl_seq(u32 seq, const struct ip_ct_ftp_master *info, int dir) +{ + unsigned int i; + + for (i = 0; i < info->seq_aft_nl_num[dir]; i++) + if (info->seq_aft_nl[dir][i] == seq) + return 1; + return 0; +} + +/* We don't update if it's older than what we have. */ +static void update_nl_seq(u32 nl_seq, struct ip_ct_ftp_master *info, int dir, + struct sk_buff *skb) +{ + unsigned int i, oldest = NUM_SEQ_TO_REMEMBER; + + /* Look for oldest: if we find exact match, we're done. */ + for (i = 0; i < info->seq_aft_nl_num[dir]; i++) { + if (info->seq_aft_nl[dir][i] == nl_seq) + return; + + if (oldest == info->seq_aft_nl_num[dir] + || before(info->seq_aft_nl[dir][i], oldest)) + oldest = i; + } + + if (info->seq_aft_nl_num[dir] < NUM_SEQ_TO_REMEMBER) { + info->seq_aft_nl[dir][info->seq_aft_nl_num[dir]++] = nl_seq; + nf_conntrack_event_cache(IPCT_HELPINFO_VOLATILE, skb); + } else if (oldest != NUM_SEQ_TO_REMEMBER) { + info->seq_aft_nl[dir][oldest] = nl_seq; + nf_conntrack_event_cache(IPCT_HELPINFO_VOLATILE, skb); + } +} + +static int help(struct sk_buff **pskb, + unsigned int protoff, + struct nf_conn *ct, + enum ip_conntrack_info ctinfo) +{ + unsigned int dataoff, datalen; + struct tcphdr _tcph, *th; + char *fb_ptr; + int ret; + u32 seq; + int dir = CTINFO2DIR(ctinfo); + unsigned int matchlen, matchoff; + struct ip_ct_ftp_master *ct_ftp_info = &ct->help->ct_ftp_info; + struct nf_conntrack_expect *exp; + struct nf_conntrack_man cmd = {}; + + unsigned int i; + int found = 0, ends_in_nl; + + /* Until there's been traffic both ways, don't look in packets. */ + if (ctinfo != IP_CT_ESTABLISHED + && ctinfo != IP_CT_ESTABLISHED+IP_CT_IS_REPLY) { + DEBUGP("ftp: Conntrackinfo = %u\n", ctinfo); + return NF_ACCEPT; + } + + th = skb_header_pointer(*pskb, protoff, sizeof(_tcph), &_tcph); + if (th == NULL) + return NF_ACCEPT; + + dataoff = protoff + th->doff * 4; + /* No data? */ + if (dataoff >= (*pskb)->len) { + DEBUGP("ftp: dataoff(%u) >= skblen(%u)\n", dataoff, + (*pskb)->len); + return NF_ACCEPT; + } + datalen = (*pskb)->len - dataoff; + + spin_lock_bh(&nf_ftp_lock); + fb_ptr = skb_header_pointer(*pskb, dataoff, datalen, ftp_buffer); + BUG_ON(fb_ptr == NULL); + + ends_in_nl = (fb_ptr[datalen - 1] == '\n'); + seq = ntohl(th->seq) + datalen; + + /* Look up to see if we're just after a \n. */ + if (!find_nl_seq(ntohl(th->seq), ct_ftp_info, dir)) { + /* Now if this ends in \n, update ftp info. */ + DEBUGP("nf_conntrack_ftp_help: wrong seq pos %s(%u) or %s(%u)\n", + ct_ftp_info->seq_aft_nl_num[dir] > 0 ? "" : "(UNSET)", + ct_ftp_info->seq_aft_nl[dir][0], + ct_ftp_info->seq_aft_nl_num[dir] > 1 ? "" : "(UNSET)", + ct_ftp_info->seq_aft_nl[dir][1]); + ret = NF_ACCEPT; + goto out_update_nl; + } + + /* Initialize IP/IPv6 addr to expected address (it's not mentioned + in EPSV responses) */ + cmd.l3num = ct->tuplehash[dir].tuple.src.l3num; + memcpy(cmd.u3.all, &ct->tuplehash[dir].tuple.src.u3.all, + sizeof(cmd.u3.all)); + + for (i = 0; i < ARRAY_SIZE(search); i++) { + if (search[i].dir != dir) continue; + + found = find_pattern(fb_ptr, datalen, + search[i].pattern, + search[i].plen, + search[i].skip, + search[i].term, + &matchoff, &matchlen, + &cmd, + search[i].getnum); + if (found) break; + } + if (found == -1) { + /* We don't usually drop packets. After all, this is + connection tracking, not packet filtering. + However, it is necessary for accurate tracking in + this case. */ + if (net_ratelimit()) + printk("conntrack_ftp: partial %s %u+%u\n", + search[i].pattern, + ntohl(th->seq), datalen); + ret = NF_DROP; + goto out; + } else if (found == 0) { /* No match */ + ret = NF_ACCEPT; + goto out_update_nl; + } + + DEBUGP("conntrack_ftp: match `%.*s' (%u bytes at %u)\n", + (int)matchlen, fb_ptr + matchoff, + matchlen, ntohl(th->seq) + matchoff); + + exp = nf_conntrack_expect_alloc(ct); + if (exp == NULL) { + ret = NF_DROP; + goto out; + } + + /* We refer to the reverse direction ("!dir") tuples here, + * because we're expecting something in the other direction. + * Doesn't matter unless NAT is happening. */ + exp->tuple.dst.u3 = ct->tuplehash[!dir].tuple.dst.u3; + + /* Update the ftp info */ + if ((cmd.l3num == ct->tuplehash[dir].tuple.src.l3num) && + memcmp(&cmd.u3.all, &ct->tuplehash[dir].tuple.src.u3.all, + sizeof(cmd.u3.all))) { + /* Enrico Scholz's passive FTP to partially RNAT'd ftp + server: it really wants us to connect to a + different IP address. Simply don't record it for + NAT. */ + if (cmd.l3num == PF_INET) { + DEBUGP("conntrack_ftp: NOT RECORDING: %u,%u,%u,%u != %u.%u.%u.%u\n", + NIPQUAD(cmd.u3.ip), + NIPQUAD(ct->tuplehash[dir].tuple.src.u3.ip)); + } else { + DEBUGP("conntrack_ftp: NOT RECORDING: %x:%x:%x:%x:%x:%x:%x:%x != %x:%x:%x:%x:%x:%x:%x:%x\n", + NIP6(*((struct in6_addr *)cmd.u3.ip6)), + NIP6(*((struct in6_addr *)ct->tuplehash[dir] + .tuple.src.u3.ip6))); + } + + /* Thanks to Cristiano Lincoln Mattos + for reporting this potential + problem (DMZ machines opening holes to internal + networks, or the packet filter itself). */ + if (!loose) { + ret = NF_ACCEPT; + goto out_put_expect; + } + memcpy(&exp->tuple.dst.u3, &cmd.u3.all, + sizeof(exp->tuple.dst.u3)); + } + + exp->tuple.src.u3 = ct->tuplehash[!dir].tuple.src.u3; + exp->tuple.src.l3num = cmd.l3num; + exp->tuple.src.u.tcp.port = 0; + exp->tuple.dst.u.tcp.port = cmd.u.tcp.port; + exp->tuple.dst.protonum = IPPROTO_TCP; + + exp->mask = (struct nf_conntrack_tuple) + { .src = { .l3num = 0xFFFF, + .u = { .tcp = { 0 }}, + }, + .dst = { .protonum = 0xFF, + .u = { .tcp = { 0xFFFF }}, + }, + }; + if (cmd.l3num == PF_INET) { + exp->mask.src.u3.ip = 0xFFFFFFFF; + exp->mask.dst.u3.ip = 0xFFFFFFFF; + } else { + memset(exp->mask.src.u3.ip6, 0xFF, + sizeof(exp->mask.src.u3.ip6)); + memset(exp->mask.dst.u3.ip6, 0xFF, + sizeof(exp->mask.src.u3.ip6)); + } + + exp->expectfn = NULL; + exp->flags = 0; + + /* Now, NAT might want to mangle the packet, and register the + * (possibly changed) expectation itself. */ + if (nf_nat_ftp_hook) + ret = nf_nat_ftp_hook(pskb, ctinfo, search[i].ftptype, + matchoff, matchlen, exp, &seq); + else { + /* Can't expect this? Best to drop packet now. */ + if (nf_conntrack_expect_related(exp) != 0) + ret = NF_DROP; + else + ret = NF_ACCEPT; + } + +out_put_expect: + nf_conntrack_expect_put(exp); + +out_update_nl: + /* Now if this ends in \n, update ftp info. Seq may have been + * adjusted by NAT code. */ + if (ends_in_nl) + update_nl_seq(seq, ct_ftp_info, dir, *pskb); + out: + spin_unlock_bh(&nf_ftp_lock); + return ret; +} + +static struct nf_conntrack_helper ftp[MAX_PORTS][2]; +static char ftp_names[MAX_PORTS][2][sizeof("ftp-65535")]; + +/* don't make this __exit, since it's called from __init ! */ +static void fini(void) +{ + int i, j; + for (i = 0; i < ports_c; i++) { + for (j = 0; j < 2; j++) { + if (ftp[i][j].me == NULL) + continue; + + DEBUGP("nf_ct_ftp: unregistering helper for pf: %d " + "port: %d\n", + ftp[i][j].tuple.src.l3num, ports[i]); + nf_conntrack_helper_unregister(&ftp[i][j]); + } + } + + kfree(ftp_buffer); +} + +static int __init init(void) +{ + int i, j = -1, ret = 0; + char *tmpname; + + ftp_buffer = kmalloc(65536, GFP_KERNEL); + if (!ftp_buffer) + return -ENOMEM; + + if (ports_c == 0) + ports[ports_c++] = FTP_PORT; + + /* FIXME should be configurable whether IPv4 and IPv6 FTP connections + are tracked or not - YK */ + for (i = 0; i < ports_c; i++) { + memset(&ftp[i], 0, sizeof(struct nf_conntrack_helper)); + + ftp[i][0].tuple.src.l3num = PF_INET; + ftp[i][1].tuple.src.l3num = PF_INET6; + for (j = 0; j < 2; j++) { + ftp[i][j].tuple.src.u.tcp.port = htons(ports[i]); + ftp[i][j].tuple.dst.protonum = IPPROTO_TCP; + ftp[i][j].mask.src.u.tcp.port = 0xFFFF; + ftp[i][j].mask.dst.protonum = 0xFF; + ftp[i][j].max_expected = 1; + ftp[i][j].timeout = 5 * 60; /* 5 Minutes */ + ftp[i][j].me = THIS_MODULE; + ftp[i][j].help = help; + tmpname = &ftp_names[i][j][0]; + if (ports[i] == FTP_PORT) + sprintf(tmpname, "ftp"); + else + sprintf(tmpname, "ftp-%d", ports[i]); + ftp[i][j].name = tmpname; + + DEBUGP("nf_ct_ftp: registering helper for pf: %d " + "port: %d\n", + ftp[i][j].tuple.src.l3num, ports[i]); + ret = nf_conntrack_helper_register(&ftp[i][j]); + if (ret) { + printk("nf_ct_ftp: failed to register helper " + " for pf: %d port: %d\n", + ftp[i][j].tuple.src.l3num, ports[i]); + fini(); + return ret; + } + } + } + + return 0; +} + +module_init(init); +module_exit(fini); diff --git a/net/netfilter/nf_conntrack_l3proto_generic.c b/net/netfilter/nf_conntrack_l3proto_generic.c new file mode 100644 index 000000000000..7de4f06c63c5 --- /dev/null +++ b/net/netfilter/nf_conntrack_l3proto_generic.c @@ -0,0 +1,98 @@ +/* + * (C) 2003,2004 USAGI/WIDE Project + * + * Based largely upon the original ip_conntrack code which + * had the following copyright information: + * + * (C) 1999-2001 Paul `Rusty' Russell + * (C) 2002-2004 Netfilter Core Team + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 as + * published by the Free Software Foundation. + * + * Author: + * Yasuyuki Kozakai @USAGI + */ + +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include +#include +#include +#include +#include +#include + +#if 0 +#define DEBUGP printk +#else +#define DEBUGP(format, args...) +#endif + +DECLARE_PER_CPU(struct nf_conntrack_stat, nf_conntrack_stat); + +static int generic_pkt_to_tuple(const struct sk_buff *skb, unsigned int nhoff, + struct nf_conntrack_tuple *tuple) +{ + memset(&tuple->src.u3, 0, sizeof(tuple->src.u3)); + memset(&tuple->dst.u3, 0, sizeof(tuple->dst.u3)); + + return 1; +} + +static int generic_invert_tuple(struct nf_conntrack_tuple *tuple, + const struct nf_conntrack_tuple *orig) +{ + memset(&tuple->src.u3, 0, sizeof(tuple->src.u3)); + memset(&tuple->dst.u3, 0, sizeof(tuple->dst.u3)); + + return 1; +} + +static int generic_print_tuple(struct seq_file *s, + const struct nf_conntrack_tuple *tuple) +{ + return 0; +} + +static int generic_print_conntrack(struct seq_file *s, + const struct nf_conn *conntrack) +{ + return 0; +} + +static int +generic_prepare(struct sk_buff **pskb, unsigned int hooknum, + unsigned int *dataoff, u_int8_t *protonum) +{ + /* Never track !!! */ + return -NF_ACCEPT; +} + + +static u_int32_t generic_get_features(const struct nf_conntrack_tuple *tuple) + +{ + return NF_CT_F_BASIC; +} + +struct nf_conntrack_l3proto nf_conntrack_generic_l3proto = { + .l3proto = PF_UNSPEC, + .name = "unknown", + .pkt_to_tuple = generic_pkt_to_tuple, + .invert_tuple = generic_invert_tuple, + .print_tuple = generic_print_tuple, + .print_conntrack = generic_print_conntrack, + .prepare = generic_prepare, + .get_features = generic_get_features, + .me = THIS_MODULE, +}; diff --git a/net/netfilter/nf_conntrack_proto_generic.c b/net/netfilter/nf_conntrack_proto_generic.c new file mode 100644 index 000000000000..36425f6c833f --- /dev/null +++ b/net/netfilter/nf_conntrack_proto_generic.c @@ -0,0 +1,85 @@ +/* (C) 1999-2001 Paul `Rusty' Russell + * (C) 2002-2004 Netfilter Core Team + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 as + * published by the Free Software Foundation. + * + * 16 Dec 2003: Yasuyuki Kozakai @USAGI + * - enable working with L3 protocol independent connection tracking. + * + * Derived from net/ipv4/netfilter/ip_conntrack_proto_generic.c + */ + +#include +#include +#include +#include +#include + +unsigned long nf_ct_generic_timeout = 600*HZ; + +static int generic_pkt_to_tuple(const struct sk_buff *skb, + unsigned int dataoff, + struct nf_conntrack_tuple *tuple) +{ + tuple->src.u.all = 0; + tuple->dst.u.all = 0; + + return 1; +} + +static int generic_invert_tuple(struct nf_conntrack_tuple *tuple, + const struct nf_conntrack_tuple *orig) +{ + tuple->src.u.all = 0; + tuple->dst.u.all = 0; + + return 1; +} + +/* Print out the per-protocol part of the tuple. */ +static int generic_print_tuple(struct seq_file *s, + const struct nf_conntrack_tuple *tuple) +{ + return 0; +} + +/* Print out the private part of the conntrack. */ +static int generic_print_conntrack(struct seq_file *s, + const struct nf_conn *state) +{ + return 0; +} + +/* Returns verdict for packet, or -1 for invalid. */ +static int packet(struct nf_conn *conntrack, + const struct sk_buff *skb, + unsigned int dataoff, + enum ip_conntrack_info ctinfo, + int pf, + unsigned int hooknum) +{ + nf_ct_refresh_acct(conntrack, ctinfo, skb, nf_ct_generic_timeout); + return NF_ACCEPT; +} + +/* Called when a new connection for this protocol found. */ +static int new(struct nf_conn *conntrack, const struct sk_buff *skb, + unsigned int dataoff) +{ + return 1; +} + +struct nf_conntrack_protocol nf_conntrack_generic_protocol = +{ + .l3proto = PF_UNSPEC, + .proto = 0, + .name = "unknown", + .pkt_to_tuple = generic_pkt_to_tuple, + .invert_tuple = generic_invert_tuple, + .print_tuple = generic_print_tuple, + .print_conntrack = generic_print_conntrack, + .packet = packet, + .new = new, +}; diff --git a/net/netfilter/nf_conntrack_proto_sctp.c b/net/netfilter/nf_conntrack_proto_sctp.c new file mode 100644 index 000000000000..3a600f77b4e0 --- /dev/null +++ b/net/netfilter/nf_conntrack_proto_sctp.c @@ -0,0 +1,670 @@ +/* + * Connection tracking protocol helper module for SCTP. + * + * SCTP is defined in RFC 2960. References to various sections in this code + * are to this RFC. + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 as + * published by the Free Software Foundation. + * + * 17 Oct 2004: Yasuyuki Kozakai @USAGI + * - enable working with L3 protocol independent connection tracking. + * + * Derived from net/ipv4/ip_conntrack_sctp.c + */ + +/* + * Added support for proc manipulation of timeouts. + */ + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include +#include + +#if 0 +#define DEBUGP(format, ...) printk(format, ## __VA_ARGS__) +#else +#define DEBUGP(format, args...) +#endif + +/* Protects conntrack->proto.sctp */ +static DEFINE_RWLOCK(sctp_lock); + +/* FIXME: Examine ipfilter's timeouts and conntrack transitions more + closely. They're more complex. --RR + + And so for me for SCTP :D -Kiran */ + +static const char *sctp_conntrack_names[] = { + "NONE", + "CLOSED", + "COOKIE_WAIT", + "COOKIE_ECHOED", + "ESTABLISHED", + "SHUTDOWN_SENT", + "SHUTDOWN_RECD", + "SHUTDOWN_ACK_SENT", +}; + +#define SECS * HZ +#define MINS * 60 SECS +#define HOURS * 60 MINS +#define DAYS * 24 HOURS + +static unsigned long nf_ct_sctp_timeout_closed = 10 SECS; +static unsigned long nf_ct_sctp_timeout_cookie_wait = 3 SECS; +static unsigned long nf_ct_sctp_timeout_cookie_echoed = 3 SECS; +static unsigned long nf_ct_sctp_timeout_established = 5 DAYS; +static unsigned long nf_ct_sctp_timeout_shutdown_sent = 300 SECS / 1000; +static unsigned long nf_ct_sctp_timeout_shutdown_recd = 300 SECS / 1000; +static unsigned long nf_ct_sctp_timeout_shutdown_ack_sent = 3 SECS; + +static unsigned long * sctp_timeouts[] += { NULL, /* SCTP_CONNTRACK_NONE */ + &nf_ct_sctp_timeout_closed, /* SCTP_CONNTRACK_CLOSED */ + &nf_ct_sctp_timeout_cookie_wait, /* SCTP_CONNTRACK_COOKIE_WAIT */ + &nf_ct_sctp_timeout_cookie_echoed, /* SCTP_CONNTRACK_COOKIE_ECHOED */ + &nf_ct_sctp_timeout_established, /* SCTP_CONNTRACK_ESTABLISHED */ + &nf_ct_sctp_timeout_shutdown_sent, /* SCTP_CONNTRACK_SHUTDOWN_SENT */ + &nf_ct_sctp_timeout_shutdown_recd, /* SCTP_CONNTRACK_SHUTDOWN_RECD */ + &nf_ct_sctp_timeout_shutdown_ack_sent /* SCTP_CONNTRACK_SHUTDOWN_ACK_SENT */ + }; + +#define sNO SCTP_CONNTRACK_NONE +#define sCL SCTP_CONNTRACK_CLOSED +#define sCW SCTP_CONNTRACK_COOKIE_WAIT +#define sCE SCTP_CONNTRACK_COOKIE_ECHOED +#define sES SCTP_CONNTRACK_ESTABLISHED +#define sSS SCTP_CONNTRACK_SHUTDOWN_SENT +#define sSR SCTP_CONNTRACK_SHUTDOWN_RECD +#define sSA SCTP_CONNTRACK_SHUTDOWN_ACK_SENT +#define sIV SCTP_CONNTRACK_MAX + +/* + These are the descriptions of the states: + +NOTE: These state names are tantalizingly similar to the states of an +SCTP endpoint. But the interpretation of the states is a little different, +considering that these are the states of the connection and not of an end +point. Please note the subtleties. -Kiran + +NONE - Nothing so far. +COOKIE WAIT - We have seen an INIT chunk in the original direction, or also + an INIT_ACK chunk in the reply direction. +COOKIE ECHOED - We have seen a COOKIE_ECHO chunk in the original direction. +ESTABLISHED - We have seen a COOKIE_ACK in the reply direction. +SHUTDOWN_SENT - We have seen a SHUTDOWN chunk in the original direction. +SHUTDOWN_RECD - We have seen a SHUTDOWN chunk in the reply directoin. +SHUTDOWN_ACK_SENT - We have seen a SHUTDOWN_ACK chunk in the direction opposite + to that of the SHUTDOWN chunk. +CLOSED - We have seen a SHUTDOWN_COMPLETE chunk in the direction of + the SHUTDOWN chunk. Connection is closed. +*/ + +/* TODO + - I have assumed that the first INIT is in the original direction. + This messes things when an INIT comes in the reply direction in CLOSED + state. + - Check the error type in the reply dir before transitioning from +cookie echoed to closed. + - Sec 5.2.4 of RFC 2960 + - Multi Homing support. +*/ + +/* SCTP conntrack state transitions */ +static enum sctp_conntrack sctp_conntracks[2][9][SCTP_CONNTRACK_MAX] = { + { +/* ORIGINAL */ +/* sNO, sCL, sCW, sCE, sES, sSS, sSR, sSA */ +/* init */ {sCW, sCW, sCW, sCE, sES, sSS, sSR, sSA}, +/* init_ack */ {sCL, sCL, sCW, sCE, sES, sSS, sSR, sSA}, +/* abort */ {sCL, sCL, sCL, sCL, sCL, sCL, sCL, sCL}, +/* shutdown */ {sCL, sCL, sCW, sCE, sSS, sSS, sSR, sSA}, +/* shutdown_ack */ {sSA, sCL, sCW, sCE, sES, sSA, sSA, sSA}, +/* error */ {sCL, sCL, sCW, sCE, sES, sSS, sSR, sSA},/* Cant have Stale cookie*/ +/* cookie_echo */ {sCL, sCL, sCE, sCE, sES, sSS, sSR, sSA},/* 5.2.4 - Big TODO */ +/* cookie_ack */ {sCL, sCL, sCW, sCE, sES, sSS, sSR, sSA},/* Cant come in orig dir */ +/* shutdown_comp*/ {sCL, sCL, sCW, sCE, sES, sSS, sSR, sCL} + }, + { +/* REPLY */ +/* sNO, sCL, sCW, sCE, sES, sSS, sSR, sSA */ +/* init */ {sIV, sCL, sCW, sCE, sES, sSS, sSR, sSA},/* INIT in sCL Big TODO */ +/* init_ack */ {sIV, sCL, sCW, sCE, sES, sSS, sSR, sSA}, +/* abort */ {sIV, sCL, sCL, sCL, sCL, sCL, sCL, sCL}, +/* shutdown */ {sIV, sCL, sCW, sCE, sSR, sSS, sSR, sSA}, +/* shutdown_ack */ {sIV, sCL, sCW, sCE, sES, sSA, sSA, sSA}, +/* error */ {sIV, sCL, sCW, sCL, sES, sSS, sSR, sSA}, +/* cookie_echo */ {sIV, sCL, sCW, sCE, sES, sSS, sSR, sSA},/* Cant come in reply dir */ +/* cookie_ack */ {sIV, sCL, sCW, sES, sES, sSS, sSR, sSA}, +/* shutdown_comp*/ {sIV, sCL, sCW, sCE, sES, sSS, sSR, sCL} + } +}; + +static int sctp_pkt_to_tuple(const struct sk_buff *skb, + unsigned int dataoff, + struct nf_conntrack_tuple *tuple) +{ + sctp_sctphdr_t _hdr, *hp; + + DEBUGP(__FUNCTION__); + DEBUGP("\n"); + + /* Actually only need first 8 bytes. */ + hp = skb_header_pointer(skb, dataoff, 8, &_hdr); + if (hp == NULL) + return 0; + + tuple->src.u.sctp.port = hp->source; + tuple->dst.u.sctp.port = hp->dest; + return 1; +} + +static int sctp_invert_tuple(struct nf_conntrack_tuple *tuple, + const struct nf_conntrack_tuple *orig) +{ + DEBUGP(__FUNCTION__); + DEBUGP("\n"); + + tuple->src.u.sctp.port = orig->dst.u.sctp.port; + tuple->dst.u.sctp.port = orig->src.u.sctp.port; + return 1; +} + +/* Print out the per-protocol part of the tuple. */ +static int sctp_print_tuple(struct seq_file *s, + const struct nf_conntrack_tuple *tuple) +{ + DEBUGP(__FUNCTION__); + DEBUGP("\n"); + + return seq_printf(s, "sport=%hu dport=%hu ", + ntohs(tuple->src.u.sctp.port), + ntohs(tuple->dst.u.sctp.port)); +} + +/* Print out the private part of the conntrack. */ +static int sctp_print_conntrack(struct seq_file *s, + const struct nf_conn *conntrack) +{ + enum sctp_conntrack state; + + DEBUGP(__FUNCTION__); + DEBUGP("\n"); + + read_lock_bh(&sctp_lock); + state = conntrack->proto.sctp.state; + read_unlock_bh(&sctp_lock); + + return seq_printf(s, "%s ", sctp_conntrack_names[state]); +} + +#define for_each_sctp_chunk(skb, sch, _sch, offset, dataoff, count) \ +for (offset = dataoff + sizeof(sctp_sctphdr_t), count = 0; \ + offset < skb->len && \ + (sch = skb_header_pointer(skb, offset, sizeof(_sch), &_sch)); \ + offset += (htons(sch->length) + 3) & ~3, count++) + +/* Some validity checks to make sure the chunks are fine */ +static int do_basic_checks(struct nf_conn *conntrack, + const struct sk_buff *skb, + unsigned int dataoff, + char *map) +{ + u_int32_t offset, count; + sctp_chunkhdr_t _sch, *sch; + int flag; + + DEBUGP(__FUNCTION__); + DEBUGP("\n"); + + flag = 0; + + for_each_sctp_chunk (skb, sch, _sch, offset, dataoff, count) { + DEBUGP("Chunk Num: %d Type: %d\n", count, sch->type); + + if (sch->type == SCTP_CID_INIT + || sch->type == SCTP_CID_INIT_ACK + || sch->type == SCTP_CID_SHUTDOWN_COMPLETE) { + flag = 1; + } + + /* Cookie Ack/Echo chunks not the first OR + Init / Init Ack / Shutdown compl chunks not the only chunks */ + if ((sch->type == SCTP_CID_COOKIE_ACK + || sch->type == SCTP_CID_COOKIE_ECHO + || flag) + && count !=0 ) { + DEBUGP("Basic checks failed\n"); + return 1; + } + + if (map) { + set_bit(sch->type, (void *)map); + } + } + + DEBUGP("Basic checks passed\n"); + return 0; +} + +static int new_state(enum ip_conntrack_dir dir, + enum sctp_conntrack cur_state, + int chunk_type) +{ + int i; + + DEBUGP(__FUNCTION__); + DEBUGP("\n"); + + DEBUGP("Chunk type: %d\n", chunk_type); + + switch (chunk_type) { + case SCTP_CID_INIT: + DEBUGP("SCTP_CID_INIT\n"); + i = 0; break; + case SCTP_CID_INIT_ACK: + DEBUGP("SCTP_CID_INIT_ACK\n"); + i = 1; break; + case SCTP_CID_ABORT: + DEBUGP("SCTP_CID_ABORT\n"); + i = 2; break; + case SCTP_CID_SHUTDOWN: + DEBUGP("SCTP_CID_SHUTDOWN\n"); + i = 3; break; + case SCTP_CID_SHUTDOWN_ACK: + DEBUGP("SCTP_CID_SHUTDOWN_ACK\n"); + i = 4; break; + case SCTP_CID_ERROR: + DEBUGP("SCTP_CID_ERROR\n"); + i = 5; break; + case SCTP_CID_COOKIE_ECHO: + DEBUGP("SCTP_CID_COOKIE_ECHO\n"); + i = 6; break; + case SCTP_CID_COOKIE_ACK: + DEBUGP("SCTP_CID_COOKIE_ACK\n"); + i = 7; break; + case SCTP_CID_SHUTDOWN_COMPLETE: + DEBUGP("SCTP_CID_SHUTDOWN_COMPLETE\n"); + i = 8; break; + default: + /* Other chunks like DATA, SACK, HEARTBEAT and + its ACK do not cause a change in state */ + DEBUGP("Unknown chunk type, Will stay in %s\n", + sctp_conntrack_names[cur_state]); + return cur_state; + } + + DEBUGP("dir: %d cur_state: %s chunk_type: %d new_state: %s\n", + dir, sctp_conntrack_names[cur_state], chunk_type, + sctp_conntrack_names[sctp_conntracks[dir][i][cur_state]]); + + return sctp_conntracks[dir][i][cur_state]; +} + +/* Returns verdict for packet, or -1 for invalid. */ +static int sctp_packet(struct nf_conn *conntrack, + const struct sk_buff *skb, + unsigned int dataoff, + enum ip_conntrack_info ctinfo, + int pf, + unsigned int hooknum) +{ + enum sctp_conntrack newconntrack, oldsctpstate; + sctp_sctphdr_t _sctph, *sh; + sctp_chunkhdr_t _sch, *sch; + u_int32_t offset, count; + char map[256 / sizeof (char)] = {0}; + + DEBUGP(__FUNCTION__); + DEBUGP("\n"); + + sh = skb_header_pointer(skb, dataoff, sizeof(_sctph), &_sctph); + if (sh == NULL) + return -1; + + if (do_basic_checks(conntrack, skb, dataoff, map) != 0) + return -1; + + /* Check the verification tag (Sec 8.5) */ + if (!test_bit(SCTP_CID_INIT, (void *)map) + && !test_bit(SCTP_CID_SHUTDOWN_COMPLETE, (void *)map) + && !test_bit(SCTP_CID_COOKIE_ECHO, (void *)map) + && !test_bit(SCTP_CID_ABORT, (void *)map) + && !test_bit(SCTP_CID_SHUTDOWN_ACK, (void *)map) + && (sh->vtag != conntrack->proto.sctp.vtag[CTINFO2DIR(ctinfo)])) { + DEBUGP("Verification tag check failed\n"); + return -1; + } + + oldsctpstate = newconntrack = SCTP_CONNTRACK_MAX; + for_each_sctp_chunk (skb, sch, _sch, offset, dataoff, count) { + write_lock_bh(&sctp_lock); + + /* Special cases of Verification tag check (Sec 8.5.1) */ + if (sch->type == SCTP_CID_INIT) { + /* Sec 8.5.1 (A) */ + if (sh->vtag != 0) { + write_unlock_bh(&sctp_lock); + return -1; + } + } else if (sch->type == SCTP_CID_ABORT) { + /* Sec 8.5.1 (B) */ + if (!(sh->vtag == conntrack->proto.sctp.vtag[CTINFO2DIR(ctinfo)]) + && !(sh->vtag == conntrack->proto.sctp.vtag + [1 - CTINFO2DIR(ctinfo)])) { + write_unlock_bh(&sctp_lock); + return -1; + } + } else if (sch->type == SCTP_CID_SHUTDOWN_COMPLETE) { + /* Sec 8.5.1 (C) */ + if (!(sh->vtag == conntrack->proto.sctp.vtag[CTINFO2DIR(ctinfo)]) + && !(sh->vtag == conntrack->proto.sctp.vtag + [1 - CTINFO2DIR(ctinfo)] + && (sch->flags & 1))) { + write_unlock_bh(&sctp_lock); + return -1; + } + } else if (sch->type == SCTP_CID_COOKIE_ECHO) { + /* Sec 8.5.1 (D) */ + if (!(sh->vtag == conntrack->proto.sctp.vtag[CTINFO2DIR(ctinfo)])) { + write_unlock_bh(&sctp_lock); + return -1; + } + } + + oldsctpstate = conntrack->proto.sctp.state; + newconntrack = new_state(CTINFO2DIR(ctinfo), oldsctpstate, sch->type); + + /* Invalid */ + if (newconntrack == SCTP_CONNTRACK_MAX) { + DEBUGP("nf_conntrack_sctp: Invalid dir=%i ctype=%u conntrack=%u\n", + CTINFO2DIR(ctinfo), sch->type, oldsctpstate); + write_unlock_bh(&sctp_lock); + return -1; + } + + /* If it is an INIT or an INIT ACK note down the vtag */ + if (sch->type == SCTP_CID_INIT + || sch->type == SCTP_CID_INIT_ACK) { + sctp_inithdr_t _inithdr, *ih; + + ih = skb_header_pointer(skb, offset + sizeof(sctp_chunkhdr_t), + sizeof(_inithdr), &_inithdr); + if (ih == NULL) { + write_unlock_bh(&sctp_lock); + return -1; + } + DEBUGP("Setting vtag %x for dir %d\n", + ih->init_tag, !CTINFO2DIR(ctinfo)); + conntrack->proto.sctp.vtag[!CTINFO2DIR(ctinfo)] = ih->init_tag; + } + + conntrack->proto.sctp.state = newconntrack; + if (oldsctpstate != newconntrack) + nf_conntrack_event_cache(IPCT_PROTOINFO, skb); + write_unlock_bh(&sctp_lock); + } + + nf_ct_refresh_acct(conntrack, ctinfo, skb, *sctp_timeouts[newconntrack]); + + if (oldsctpstate == SCTP_CONNTRACK_COOKIE_ECHOED + && CTINFO2DIR(ctinfo) == IP_CT_DIR_REPLY + && newconntrack == SCTP_CONNTRACK_ESTABLISHED) { + DEBUGP("Setting assured bit\n"); + set_bit(IPS_ASSURED_BIT, &conntrack->status); + nf_conntrack_event_cache(IPCT_STATUS, skb); + } + + return NF_ACCEPT; +} + +/* Called when a new connection for this protocol found. */ +static int sctp_new(struct nf_conn *conntrack, const struct sk_buff *skb, + unsigned int dataoff) +{ + enum sctp_conntrack newconntrack; + sctp_sctphdr_t _sctph, *sh; + sctp_chunkhdr_t _sch, *sch; + u_int32_t offset, count; + char map[256 / sizeof (char)] = {0}; + + DEBUGP(__FUNCTION__); + DEBUGP("\n"); + + sh = skb_header_pointer(skb, dataoff, sizeof(_sctph), &_sctph); + if (sh == NULL) + return 0; + + if (do_basic_checks(conntrack, skb, dataoff, map) != 0) + return 0; + + /* If an OOTB packet has any of these chunks discard (Sec 8.4) */ + if ((test_bit (SCTP_CID_ABORT, (void *)map)) + || (test_bit (SCTP_CID_SHUTDOWN_COMPLETE, (void *)map)) + || (test_bit (SCTP_CID_COOKIE_ACK, (void *)map))) { + return 0; + } + + newconntrack = SCTP_CONNTRACK_MAX; + for_each_sctp_chunk (skb, sch, _sch, offset, dataoff, count) { + /* Don't need lock here: this conntrack not in circulation yet */ + newconntrack = new_state(IP_CT_DIR_ORIGINAL, + SCTP_CONNTRACK_NONE, sch->type); + + /* Invalid: delete conntrack */ + if (newconntrack == SCTP_CONNTRACK_MAX) { + DEBUGP("nf_conntrack_sctp: invalid new deleting.\n"); + return 0; + } + + /* Copy the vtag into the state info */ + if (sch->type == SCTP_CID_INIT) { + if (sh->vtag == 0) { + sctp_inithdr_t _inithdr, *ih; + + ih = skb_header_pointer(skb, offset + sizeof(sctp_chunkhdr_t), + sizeof(_inithdr), &_inithdr); + if (ih == NULL) + return 0; + + DEBUGP("Setting vtag %x for new conn\n", + ih->init_tag); + + conntrack->proto.sctp.vtag[IP_CT_DIR_REPLY] = + ih->init_tag; + } else { + /* Sec 8.5.1 (A) */ + return 0; + } + } + /* If it is a shutdown ack OOTB packet, we expect a return + shutdown complete, otherwise an ABORT Sec 8.4 (5) and (8) */ + else { + DEBUGP("Setting vtag %x for new conn OOTB\n", + sh->vtag); + conntrack->proto.sctp.vtag[IP_CT_DIR_REPLY] = sh->vtag; + } + + conntrack->proto.sctp.state = newconntrack; + } + + return 1; +} + +struct nf_conntrack_protocol nf_conntrack_protocol_sctp4 = { + .l3proto = PF_INET, + .proto = IPPROTO_SCTP, + .name = "sctp", + .pkt_to_tuple = sctp_pkt_to_tuple, + .invert_tuple = sctp_invert_tuple, + .print_tuple = sctp_print_tuple, + .print_conntrack = sctp_print_conntrack, + .packet = sctp_packet, + .new = sctp_new, + .destroy = NULL, + .me = THIS_MODULE +}; + +struct nf_conntrack_protocol nf_conntrack_protocol_sctp6 = { + .l3proto = PF_INET6, + .proto = IPPROTO_SCTP, + .name = "sctp", + .pkt_to_tuple = sctp_pkt_to_tuple, + .invert_tuple = sctp_invert_tuple, + .print_tuple = sctp_print_tuple, + .print_conntrack = sctp_print_conntrack, + .packet = sctp_packet, + .new = sctp_new, + .destroy = NULL, + .me = THIS_MODULE +}; + +#ifdef CONFIG_SYSCTL +static ctl_table nf_ct_sysctl_table[] = { + { + .ctl_name = NET_NF_CONNTRACK_SCTP_TIMEOUT_CLOSED, + .procname = "nf_conntrack_sctp_timeout_closed", + .data = &nf_ct_sctp_timeout_closed, + .maxlen = sizeof(unsigned int), + .mode = 0644, + .proc_handler = &proc_dointvec_jiffies, + }, + { + .ctl_name = NET_NF_CONNTRACK_SCTP_TIMEOUT_COOKIE_WAIT, + .procname = "nf_conntrack_sctp_timeout_cookie_wait", + .data = &nf_ct_sctp_timeout_cookie_wait, + .maxlen = sizeof(unsigned int), + .mode = 0644, + .proc_handler = &proc_dointvec_jiffies, + }, + { + .ctl_name = NET_NF_CONNTRACK_SCTP_TIMEOUT_COOKIE_ECHOED, + .procname = "nf_conntrack_sctp_timeout_cookie_echoed", + .data = &nf_ct_sctp_timeout_cookie_echoed, + .maxlen = sizeof(unsigned int), + .mode = 0644, + .proc_handler = &proc_dointvec_jiffies, + }, + { + .ctl_name = NET_NF_CONNTRACK_SCTP_TIMEOUT_ESTABLISHED, + .procname = "nf_conntrack_sctp_timeout_established", + .data = &nf_ct_sctp_timeout_established, + .maxlen = sizeof(unsigned int), + .mode = 0644, + .proc_handler = &proc_dointvec_jiffies, + }, + { + .ctl_name = NET_NF_CONNTRACK_SCTP_TIMEOUT_SHUTDOWN_SENT, + .procname = "nf_conntrack_sctp_timeout_shutdown_sent", + .data = &nf_ct_sctp_timeout_shutdown_sent, + .maxlen = sizeof(unsigned int), + .mode = 0644, + .proc_handler = &proc_dointvec_jiffies, + }, + { + .ctl_name = NET_NF_CONNTRACK_SCTP_TIMEOUT_SHUTDOWN_RECD, + .procname = "nf_conntrack_sctp_timeout_shutdown_recd", + .data = &nf_ct_sctp_timeout_shutdown_recd, + .maxlen = sizeof(unsigned int), + .mode = 0644, + .proc_handler = &proc_dointvec_jiffies, + }, + { + .ctl_name = NET_NF_CONNTRACK_SCTP_TIMEOUT_SHUTDOWN_ACK_SENT, + .procname = "nf_conntrack_sctp_timeout_shutdown_ack_sent", + .data = &nf_ct_sctp_timeout_shutdown_ack_sent, + .maxlen = sizeof(unsigned int), + .mode = 0644, + .proc_handler = &proc_dointvec_jiffies, + }, + { .ctl_name = 0 } +}; + +static ctl_table nf_ct_netfilter_table[] = { + { + .ctl_name = NET_NETFILTER, + .procname = "netfilter", + .mode = 0555, + .child = nf_ct_sysctl_table, + }, + { .ctl_name = 0 } +}; + +static ctl_table nf_ct_net_table[] = { + { + .ctl_name = CTL_NET, + .procname = "net", + .mode = 0555, + .child = nf_ct_netfilter_table, + }, + { .ctl_name = 0 } +}; + +static struct ctl_table_header *nf_ct_sysctl_header; +#endif + +int __init init(void) +{ + int ret; + + ret = nf_conntrack_protocol_register(&nf_conntrack_protocol_sctp4); + if (ret) { + printk("nf_conntrack_proto_sctp4: protocol register failed\n"); + goto out; + } + ret = nf_conntrack_protocol_register(&nf_conntrack_protocol_sctp6); + if (ret) { + printk("nf_conntrack_proto_sctp6: protocol register failed\n"); + goto cleanup_sctp4; + } + +#ifdef CONFIG_SYSCTL + nf_ct_sysctl_header = register_sysctl_table(nf_ct_net_table, 0); + if (nf_ct_sysctl_header == NULL) { + printk("nf_conntrack_proto_sctp: can't register to sysctl.\n"); + goto cleanup; + } +#endif + + return ret; + +#ifdef CONFIG_SYSCTL + cleanup: + nf_conntrack_protocol_unregister(&nf_conntrack_protocol_sctp6); +#endif + cleanup_sctp4: + nf_conntrack_protocol_unregister(&nf_conntrack_protocol_sctp4); + out: + DEBUGP("SCTP conntrack module loading %s\n", + ret ? "failed": "succeeded"); + return ret; +} + +void __exit fini(void) +{ + nf_conntrack_protocol_unregister(&nf_conntrack_protocol_sctp6); + nf_conntrack_protocol_unregister(&nf_conntrack_protocol_sctp4); +#ifdef CONFIG_SYSCTL + unregister_sysctl_table(nf_ct_sysctl_header); +#endif + DEBUGP("SCTP conntrack module unloaded\n"); +} + +module_init(init); +module_exit(fini); + +MODULE_LICENSE("GPL"); +MODULE_AUTHOR("Kiran Kumar Immidi"); +MODULE_DESCRIPTION("Netfilter connection tracking protocol helper for SCTP"); diff --git a/net/netfilter/nf_conntrack_proto_tcp.c b/net/netfilter/nf_conntrack_proto_tcp.c new file mode 100644 index 000000000000..83d90dd624f0 --- /dev/null +++ b/net/netfilter/nf_conntrack_proto_tcp.c @@ -0,0 +1,1162 @@ +/* (C) 1999-2001 Paul `Rusty' Russell + * (C) 2002-2004 Netfilter Core Team + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 as + * published by the Free Software Foundation. + * + * Jozsef Kadlecsik : + * - Real stateful connection tracking + * - Modified state transitions table + * - Window scaling support added + * - SACK support added + * + * Willy Tarreau: + * - State table bugfixes + * - More robust state changes + * - Tuning timer parameters + * + * 27 Oct 2004: Yasuyuki Kozakai @USAGI + * - genelized Layer 3 protocol part. + * + * Derived from net/ipv4/netfilter/ip_conntrack_proto_tcp.c + * + * version 2.2 + */ + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include + +#include +#include +#include +#include +#include + +#if 0 +#define DEBUGP printk +#define DEBUGP_VARS +#else +#define DEBUGP(format, args...) +#endif + +/* Protects conntrack->proto.tcp */ +static DEFINE_RWLOCK(tcp_lock); + +/* "Be conservative in what you do, + be liberal in what you accept from others." + If it's non-zero, we mark only out of window RST segments as INVALID. */ +int nf_ct_tcp_be_liberal = 0; + +/* When connection is picked up from the middle, how many packets are required + to pass in each direction when we assume we are in sync - if any side uses + window scaling, we lost the game. + If it is set to zero, we disable picking up already established + connections. */ +int nf_ct_tcp_loose = 3; + +/* Max number of the retransmitted packets without receiving an (acceptable) + ACK from the destination. If this number is reached, a shorter timer + will be started. */ +int nf_ct_tcp_max_retrans = 3; + + /* FIXME: Examine ipfilter's timeouts and conntrack transitions more + closely. They're more complex. --RR */ + +static const char *tcp_conntrack_names[] = { + "NONE", + "SYN_SENT", + "SYN_RECV", + "ESTABLISHED", + "FIN_WAIT", + "CLOSE_WAIT", + "LAST_ACK", + "TIME_WAIT", + "CLOSE", + "LISTEN" +}; + +#define SECS * HZ +#define MINS * 60 SECS +#define HOURS * 60 MINS +#define DAYS * 24 HOURS + +unsigned long nf_ct_tcp_timeout_syn_sent = 2 MINS; +unsigned long nf_ct_tcp_timeout_syn_recv = 60 SECS; +unsigned long nf_ct_tcp_timeout_established = 5 DAYS; +unsigned long nf_ct_tcp_timeout_fin_wait = 2 MINS; +unsigned long nf_ct_tcp_timeout_close_wait = 60 SECS; +unsigned long nf_ct_tcp_timeout_last_ack = 30 SECS; +unsigned long nf_ct_tcp_timeout_time_wait = 2 MINS; +unsigned long nf_ct_tcp_timeout_close = 10 SECS; + +/* RFC1122 says the R2 limit should be at least 100 seconds. + Linux uses 15 packets as limit, which corresponds + to ~13-30min depending on RTO. */ +unsigned long nf_ct_tcp_timeout_max_retrans = 5 MINS; + +static unsigned long * tcp_timeouts[] += { NULL, /* TCP_CONNTRACK_NONE */ + &nf_ct_tcp_timeout_syn_sent, /* TCP_CONNTRACK_SYN_SENT, */ + &nf_ct_tcp_timeout_syn_recv, /* TCP_CONNTRACK_SYN_RECV, */ + &nf_ct_tcp_timeout_established, /* TCP_CONNTRACK_ESTABLISHED, */ + &nf_ct_tcp_timeout_fin_wait, /* TCP_CONNTRACK_FIN_WAIT, */ + &nf_ct_tcp_timeout_close_wait, /* TCP_CONNTRACK_CLOSE_WAIT, */ + &nf_ct_tcp_timeout_last_ack, /* TCP_CONNTRACK_LAST_ACK, */ + &nf_ct_tcp_timeout_time_wait, /* TCP_CONNTRACK_TIME_WAIT, */ + &nf_ct_tcp_timeout_close, /* TCP_CONNTRACK_CLOSE, */ + NULL, /* TCP_CONNTRACK_LISTEN */ + }; + +#define sNO TCP_CONNTRACK_NONE +#define sSS TCP_CONNTRACK_SYN_SENT +#define sSR TCP_CONNTRACK_SYN_RECV +#define sES TCP_CONNTRACK_ESTABLISHED +#define sFW TCP_CONNTRACK_FIN_WAIT +#define sCW TCP_CONNTRACK_CLOSE_WAIT +#define sLA TCP_CONNTRACK_LAST_ACK +#define sTW TCP_CONNTRACK_TIME_WAIT +#define sCL TCP_CONNTRACK_CLOSE +#define sLI TCP_CONNTRACK_LISTEN +#define sIV TCP_CONNTRACK_MAX +#define sIG TCP_CONNTRACK_IGNORE + +/* What TCP flags are set from RST/SYN/FIN/ACK. */ +enum tcp_bit_set { + TCP_SYN_SET, + TCP_SYNACK_SET, + TCP_FIN_SET, + TCP_ACK_SET, + TCP_RST_SET, + TCP_NONE_SET, +}; + +/* + * The TCP state transition table needs a few words... + * + * We are the man in the middle. All the packets go through us + * but might get lost in transit to the destination. + * It is assumed that the destinations can't receive segments + * we haven't seen. + * + * The checked segment is in window, but our windows are *not* + * equivalent with the ones of the sender/receiver. We always + * try to guess the state of the current sender. + * + * The meaning of the states are: + * + * NONE: initial state + * SYN_SENT: SYN-only packet seen + * SYN_RECV: SYN-ACK packet seen + * ESTABLISHED: ACK packet seen + * FIN_WAIT: FIN packet seen + * CLOSE_WAIT: ACK seen (after FIN) + * LAST_ACK: FIN seen (after FIN) + * TIME_WAIT: last ACK seen + * CLOSE: closed connection + * + * LISTEN state is not used. + * + * Packets marked as IGNORED (sIG): + * if they may be either invalid or valid + * and the receiver may send back a connection + * closing RST or a SYN/ACK. + * + * Packets marked as INVALID (sIV): + * if they are invalid + * or we do not support the request (simultaneous open) + */ +static enum tcp_conntrack tcp_conntracks[2][6][TCP_CONNTRACK_MAX] = { + { +/* ORIGINAL */ +/* sNO, sSS, sSR, sES, sFW, sCW, sLA, sTW, sCL, sLI */ +/*syn*/ { sSS, sSS, sIG, sIG, sIG, sIG, sIG, sSS, sSS, sIV }, +/* + * sNO -> sSS Initialize a new connection + * sSS -> sSS Retransmitted SYN + * sSR -> sIG Late retransmitted SYN? + * sES -> sIG Error: SYNs in window outside the SYN_SENT state + * are errors. Receiver will reply with RST + * and close the connection. + * Or we are not in sync and hold a dead connection. + * sFW -> sIG + * sCW -> sIG + * sLA -> sIG + * sTW -> sSS Reopened connection (RFC 1122). + * sCL -> sSS + */ +/* sNO, sSS, sSR, sES, sFW, sCW, sLA, sTW, sCL, sLI */ +/*synack*/ { sIV, sIV, sIV, sIV, sIV, sIV, sIV, sIV, sIV, sIV }, +/* + * A SYN/ACK from the client is always invalid: + * - either it tries to set up a simultaneous open, which is + * not supported; + * - or the firewall has just been inserted between the two hosts + * during the session set-up. The SYN will be retransmitted + * by the true client (or it'll time out). + */ +/* sNO, sSS, sSR, sES, sFW, sCW, sLA, sTW, sCL, sLI */ +/*fin*/ { sIV, sIV, sFW, sFW, sLA, sLA, sLA, sTW, sCL, sIV }, +/* + * sNO -> sIV Too late and no reason to do anything... + * sSS -> sIV Client migth not send FIN in this state: + * we enforce waiting for a SYN/ACK reply first. + * sSR -> sFW Close started. + * sES -> sFW + * sFW -> sLA FIN seen in both directions, waiting for + * the last ACK. + * Migth be a retransmitted FIN as well... + * sCW -> sLA + * sLA -> sLA Retransmitted FIN. Remain in the same state. + * sTW -> sTW + * sCL -> sCL + */ +/* sNO, sSS, sSR, sES, sFW, sCW, sLA, sTW, sCL, sLI */ +/*ack*/ { sES, sIV, sES, sES, sCW, sCW, sTW, sTW, sCL, sIV }, +/* + * sNO -> sES Assumed. + * sSS -> sIV ACK is invalid: we haven't seen a SYN/ACK yet. + * sSR -> sES Established state is reached. + * sES -> sES :-) + * sFW -> sCW Normal close request answered by ACK. + * sCW -> sCW + * sLA -> sTW Last ACK detected. + * sTW -> sTW Retransmitted last ACK. Remain in the same state. + * sCL -> sCL + */ +/* sNO, sSS, sSR, sES, sFW, sCW, sLA, sTW, sCL, sLI */ +/*rst*/ { sIV, sCL, sCL, sCL, sCL, sCL, sCL, sCL, sCL, sIV }, +/*none*/ { sIV, sIV, sIV, sIV, sIV, sIV, sIV, sIV, sIV, sIV } + }, + { +/* REPLY */ +/* sNO, sSS, sSR, sES, sFW, sCW, sLA, sTW, sCL, sLI */ +/*syn*/ { sIV, sIV, sIV, sIV, sIV, sIV, sIV, sIV, sIV, sIV }, +/* + * sNO -> sIV Never reached. + * sSS -> sIV Simultaneous open, not supported + * sSR -> sIV Simultaneous open, not supported. + * sES -> sIV Server may not initiate a connection. + * sFW -> sIV + * sCW -> sIV + * sLA -> sIV + * sTW -> sIV Reopened connection, but server may not do it. + * sCL -> sIV + */ +/* sNO, sSS, sSR, sES, sFW, sCW, sLA, sTW, sCL, sLI */ +/*synack*/ { sIV, sSR, sSR, sIG, sIG, sIG, sIG, sIG, sIG, sIV }, +/* + * sSS -> sSR Standard open. + * sSR -> sSR Retransmitted SYN/ACK. + * sES -> sIG Late retransmitted SYN/ACK? + * sFW -> sIG Might be SYN/ACK answering ignored SYN + * sCW -> sIG + * sLA -> sIG + * sTW -> sIG + * sCL -> sIG + */ +/* sNO, sSS, sSR, sES, sFW, sCW, sLA, sTW, sCL, sLI */ +/*fin*/ { sIV, sIV, sFW, sFW, sLA, sLA, sLA, sTW, sCL, sIV }, +/* + * sSS -> sIV Server might not send FIN in this state. + * sSR -> sFW Close started. + * sES -> sFW + * sFW -> sLA FIN seen in both directions. + * sCW -> sLA + * sLA -> sLA Retransmitted FIN. + * sTW -> sTW + * sCL -> sCL + */ +/* sNO, sSS, sSR, sES, sFW, sCW, sLA, sTW, sCL, sLI */ +/*ack*/ { sIV, sIV, sSR, sES, sCW, sCW, sTW, sTW, sCL, sIV }, +/* + * sSS -> sIV Might be a half-open connection. + * sSR -> sSR Might answer late resent SYN. + * sES -> sES :-) + * sFW -> sCW Normal close request answered by ACK. + * sCW -> sCW + * sLA -> sTW Last ACK detected. + * sTW -> sTW Retransmitted last ACK. + * sCL -> sCL + */ +/* sNO, sSS, sSR, sES, sFW, sCW, sLA, sTW, sCL, sLI */ +/*rst*/ { sIV, sCL, sCL, sCL, sCL, sCL, sCL, sCL, sCL, sIV }, +/*none*/ { sIV, sIV, sIV, sIV, sIV, sIV, sIV, sIV, sIV, sIV } + } +}; + +static int tcp_pkt_to_tuple(const struct sk_buff *skb, + unsigned int dataoff, + struct nf_conntrack_tuple *tuple) +{ + struct tcphdr _hdr, *hp; + + /* Actually only need first 8 bytes. */ + hp = skb_header_pointer(skb, dataoff, 8, &_hdr); + if (hp == NULL) + return 0; + + tuple->src.u.tcp.port = hp->source; + tuple->dst.u.tcp.port = hp->dest; + + return 1; +} + +static int tcp_invert_tuple(struct nf_conntrack_tuple *tuple, + const struct nf_conntrack_tuple *orig) +{ + tuple->src.u.tcp.port = orig->dst.u.tcp.port; + tuple->dst.u.tcp.port = orig->src.u.tcp.port; + return 1; +} + +/* Print out the per-protocol part of the tuple. */ +static int tcp_print_tuple(struct seq_file *s, + const struct nf_conntrack_tuple *tuple) +{ + return seq_printf(s, "sport=%hu dport=%hu ", + ntohs(tuple->src.u.tcp.port), + ntohs(tuple->dst.u.tcp.port)); +} + +/* Print out the private part of the conntrack. */ +static int tcp_print_conntrack(struct seq_file *s, + const struct nf_conn *conntrack) +{ + enum tcp_conntrack state; + + read_lock_bh(&tcp_lock); + state = conntrack->proto.tcp.state; + read_unlock_bh(&tcp_lock); + + return seq_printf(s, "%s ", tcp_conntrack_names[state]); +} + +static unsigned int get_conntrack_index(const struct tcphdr *tcph) +{ + if (tcph->rst) return TCP_RST_SET; + else if (tcph->syn) return (tcph->ack ? TCP_SYNACK_SET : TCP_SYN_SET); + else if (tcph->fin) return TCP_FIN_SET; + else if (tcph->ack) return TCP_ACK_SET; + else return TCP_NONE_SET; +} + +/* TCP connection tracking based on 'Real Stateful TCP Packet Filtering + in IP Filter' by Guido van Rooij. + + http://www.nluug.nl/events/sane2000/papers.html + http://www.iae.nl/users/guido/papers/tcp_filtering.ps.gz + + The boundaries and the conditions are changed according to RFC793: + the packet must intersect the window (i.e. segments may be + after the right or before the left edge) and thus receivers may ACK + segments after the right edge of the window. + + td_maxend = max(sack + max(win,1)) seen in reply packets + td_maxwin = max(max(win, 1)) + (sack - ack) seen in sent packets + td_maxwin += seq + len - sender.td_maxend + if seq + len > sender.td_maxend + td_end = max(seq + len) seen in sent packets + + I. Upper bound for valid data: seq <= sender.td_maxend + II. Lower bound for valid data: seq + len >= sender.td_end - receiver.td_maxwin + III. Upper bound for valid ack: sack <= receiver.td_end + IV. Lower bound for valid ack: ack >= receiver.td_end - MAXACKWINDOW + + where sack is the highest right edge of sack block found in the packet. + + The upper bound limit for a valid ack is not ignored - + we doesn't have to deal with fragments. +*/ + +static inline __u32 segment_seq_plus_len(__u32 seq, + size_t len, + unsigned int dataoff, + struct tcphdr *tcph) +{ + /* XXX Should I use payload length field in IP/IPv6 header ? + * - YK */ + return (seq + len - dataoff - tcph->doff*4 + + (tcph->syn ? 1 : 0) + (tcph->fin ? 1 : 0)); +} + +/* Fixme: what about big packets? */ +#define MAXACKWINCONST 66000 +#define MAXACKWINDOW(sender) \ + ((sender)->td_maxwin > MAXACKWINCONST ? (sender)->td_maxwin \ + : MAXACKWINCONST) + +/* + * Simplified tcp_parse_options routine from tcp_input.c + */ +static void tcp_options(const struct sk_buff *skb, + unsigned int dataoff, + struct tcphdr *tcph, + struct ip_ct_tcp_state *state) +{ + unsigned char buff[(15 * 4) - sizeof(struct tcphdr)]; + unsigned char *ptr; + int length = (tcph->doff*4) - sizeof(struct tcphdr); + + if (!length) + return; + + ptr = skb_header_pointer(skb, dataoff + sizeof(struct tcphdr), + length, buff); + BUG_ON(ptr == NULL); + + state->td_scale = + state->flags = 0; + + while (length > 0) { + int opcode=*ptr++; + int opsize; + + switch (opcode) { + case TCPOPT_EOL: + return; + case TCPOPT_NOP: /* Ref: RFC 793 section 3.1 */ + length--; + continue; + default: + opsize=*ptr++; + if (opsize < 2) /* "silly options" */ + return; + if (opsize > length) + break; /* don't parse partial options */ + + if (opcode == TCPOPT_SACK_PERM + && opsize == TCPOLEN_SACK_PERM) + state->flags |= IP_CT_TCP_FLAG_SACK_PERM; + else if (opcode == TCPOPT_WINDOW + && opsize == TCPOLEN_WINDOW) { + state->td_scale = *(u_int8_t *)ptr; + + if (state->td_scale > 14) { + /* See RFC1323 */ + state->td_scale = 14; + } + state->flags |= + IP_CT_TCP_FLAG_WINDOW_SCALE; + } + ptr += opsize - 2; + length -= opsize; + } + } +} + +static void tcp_sack(const struct sk_buff *skb, unsigned int dataoff, + struct tcphdr *tcph, __u32 *sack) +{ + unsigned char buff[(15 * 4) - sizeof(struct tcphdr)]; + unsigned char *ptr; + int length = (tcph->doff*4) - sizeof(struct tcphdr); + __u32 tmp; + + if (!length) + return; + + ptr = skb_header_pointer(skb, dataoff + sizeof(struct tcphdr), + length, buff); + BUG_ON(ptr == NULL); + + /* Fast path for timestamp-only option */ + if (length == TCPOLEN_TSTAMP_ALIGNED*4 + && *(__u32 *)ptr == + __constant_ntohl((TCPOPT_NOP << 24) + | (TCPOPT_NOP << 16) + | (TCPOPT_TIMESTAMP << 8) + | TCPOLEN_TIMESTAMP)) + return; + + while (length > 0) { + int opcode = *ptr++; + int opsize, i; + + switch (opcode) { + case TCPOPT_EOL: + return; + case TCPOPT_NOP: /* Ref: RFC 793 section 3.1 */ + length--; + continue; + default: + opsize = *ptr++; + if (opsize < 2) /* "silly options" */ + return; + if (opsize > length) + break; /* don't parse partial options */ + + if (opcode == TCPOPT_SACK + && opsize >= (TCPOLEN_SACK_BASE + + TCPOLEN_SACK_PERBLOCK) + && !((opsize - TCPOLEN_SACK_BASE) + % TCPOLEN_SACK_PERBLOCK)) { + for (i = 0; + i < (opsize - TCPOLEN_SACK_BASE); + i += TCPOLEN_SACK_PERBLOCK) { + memcpy(&tmp, (__u32 *)(ptr + i) + 1, + sizeof(__u32)); + tmp = ntohl(tmp); + + if (after(tmp, *sack)) + *sack = tmp; + } + return; + } + ptr += opsize - 2; + length -= opsize; + } + } +} + +static int tcp_in_window(struct ip_ct_tcp *state, + enum ip_conntrack_dir dir, + unsigned int index, + const struct sk_buff *skb, + unsigned int dataoff, + struct tcphdr *tcph, + int pf) +{ + struct ip_ct_tcp_state *sender = &state->seen[dir]; + struct ip_ct_tcp_state *receiver = &state->seen[!dir]; + __u32 seq, ack, sack, end, win, swin; + int res; + + /* + * Get the required data from the packet. + */ + seq = ntohl(tcph->seq); + ack = sack = ntohl(tcph->ack_seq); + win = ntohs(tcph->window); + end = segment_seq_plus_len(seq, skb->len, dataoff, tcph); + + if (receiver->flags & IP_CT_TCP_FLAG_SACK_PERM) + tcp_sack(skb, dataoff, tcph, &sack); + + DEBUGP("tcp_in_window: START\n"); + DEBUGP("tcp_in_window: src=%u.%u.%u.%u:%hu dst=%u.%u.%u.%u:%hu " + "seq=%u ack=%u sack=%u win=%u end=%u\n", + NIPQUAD(iph->saddr), ntohs(tcph->source), + NIPQUAD(iph->daddr), ntohs(tcph->dest), + seq, ack, sack, win, end); + DEBUGP("tcp_in_window: sender end=%u maxend=%u maxwin=%u scale=%i " + "receiver end=%u maxend=%u maxwin=%u scale=%i\n", + sender->td_end, sender->td_maxend, sender->td_maxwin, + sender->td_scale, + receiver->td_end, receiver->td_maxend, receiver->td_maxwin, + receiver->td_scale); + + if (sender->td_end == 0) { + /* + * Initialize sender data. + */ + if (tcph->syn && tcph->ack) { + /* + * Outgoing SYN-ACK in reply to a SYN. + */ + sender->td_end = + sender->td_maxend = end; + sender->td_maxwin = (win == 0 ? 1 : win); + + tcp_options(skb, dataoff, tcph, sender); + /* + * RFC 1323: + * Both sides must send the Window Scale option + * to enable window scaling in either direction. + */ + if (!(sender->flags & IP_CT_TCP_FLAG_WINDOW_SCALE + && receiver->flags & IP_CT_TCP_FLAG_WINDOW_SCALE)) + sender->td_scale = + receiver->td_scale = 0; + } else { + /* + * We are in the middle of a connection, + * its history is lost for us. + * Let's try to use the data from the packet. + */ + sender->td_end = end; + sender->td_maxwin = (win == 0 ? 1 : win); + sender->td_maxend = end + sender->td_maxwin; + } + } else if (((state->state == TCP_CONNTRACK_SYN_SENT + && dir == IP_CT_DIR_ORIGINAL) + || (state->state == TCP_CONNTRACK_SYN_RECV + && dir == IP_CT_DIR_REPLY)) + && after(end, sender->td_end)) { + /* + * RFC 793: "if a TCP is reinitialized ... then it need + * not wait at all; it must only be sure to use sequence + * numbers larger than those recently used." + */ + sender->td_end = + sender->td_maxend = end; + sender->td_maxwin = (win == 0 ? 1 : win); + + tcp_options(skb, dataoff, tcph, sender); + } + + if (!(tcph->ack)) { + /* + * If there is no ACK, just pretend it was set and OK. + */ + ack = sack = receiver->td_end; + } else if (((tcp_flag_word(tcph) & (TCP_FLAG_ACK|TCP_FLAG_RST)) == + (TCP_FLAG_ACK|TCP_FLAG_RST)) + && (ack == 0)) { + /* + * Broken TCP stacks, that set ACK in RST packets as well + * with zero ack value. + */ + ack = sack = receiver->td_end; + } + + if (seq == end + && (!tcph->rst + || (seq == 0 && state->state == TCP_CONNTRACK_SYN_SENT))) + /* + * Packets contains no data: we assume it is valid + * and check the ack value only. + * However RST segments are always validated by their + * SEQ number, except when seq == 0 (reset sent answering + * SYN. + */ + seq = end = sender->td_end; + + DEBUGP("tcp_in_window: src=%u.%u.%u.%u:%hu dst=%u.%u.%u.%u:%hu " + "seq=%u ack=%u sack =%u win=%u end=%u\n", + NIPQUAD(iph->saddr), ntohs(tcph->source), + NIPQUAD(iph->daddr), ntohs(tcph->dest), + seq, ack, sack, win, end); + DEBUGP("tcp_in_window: sender end=%u maxend=%u maxwin=%u scale=%i " + "receiver end=%u maxend=%u maxwin=%u scale=%i\n", + sender->td_end, sender->td_maxend, sender->td_maxwin, + sender->td_scale, + receiver->td_end, receiver->td_maxend, receiver->td_maxwin, + receiver->td_scale); + + DEBUGP("tcp_in_window: I=%i II=%i III=%i IV=%i\n", + before(seq, sender->td_maxend + 1), + after(end, sender->td_end - receiver->td_maxwin - 1), + before(sack, receiver->td_end + 1), + after(ack, receiver->td_end - MAXACKWINDOW(sender))); + + if (sender->loose || receiver->loose || + (before(seq, sender->td_maxend + 1) && + after(end, sender->td_end - receiver->td_maxwin - 1) && + before(sack, receiver->td_end + 1) && + after(ack, receiver->td_end - MAXACKWINDOW(sender)))) { + /* + * Take into account window scaling (RFC 1323). + */ + if (!tcph->syn) + win <<= sender->td_scale; + + /* + * Update sender data. + */ + swin = win + (sack - ack); + if (sender->td_maxwin < swin) + sender->td_maxwin = swin; + if (after(end, sender->td_end)) + sender->td_end = end; + /* + * Update receiver data. + */ + if (after(end, sender->td_maxend)) + receiver->td_maxwin += end - sender->td_maxend; + if (after(sack + win, receiver->td_maxend - 1)) { + receiver->td_maxend = sack + win; + if (win == 0) + receiver->td_maxend++; + } + + /* + * Check retransmissions. + */ + if (index == TCP_ACK_SET) { + if (state->last_dir == dir + && state->last_seq == seq + && state->last_ack == ack + && state->last_end == end) + state->retrans++; + else { + state->last_dir = dir; + state->last_seq = seq; + state->last_ack = ack; + state->last_end = end; + state->retrans = 0; + } + } + /* + * Close the window of disabled window tracking :-) + */ + if (sender->loose) + sender->loose--; + + res = 1; + } else { + if (LOG_INVALID(IPPROTO_TCP)) + nf_log_packet(pf, 0, skb, NULL, NULL, NULL, + "nf_ct_tcp: %s ", + before(seq, sender->td_maxend + 1) ? + after(end, sender->td_end - receiver->td_maxwin - 1) ? + before(sack, receiver->td_end + 1) ? + after(ack, receiver->td_end - MAXACKWINDOW(sender)) ? "BUG" + : "ACK is under the lower bound (possible overly delayed ACK)" + : "ACK is over the upper bound (ACKed data not seen yet)" + : "SEQ is under the lower bound (already ACKed data retransmitted)" + : "SEQ is over the upper bound (over the window of the receiver)"); + + res = nf_ct_tcp_be_liberal; + } + + DEBUGP("tcp_in_window: res=%i sender end=%u maxend=%u maxwin=%u " + "receiver end=%u maxend=%u maxwin=%u\n", + res, sender->td_end, sender->td_maxend, sender->td_maxwin, + receiver->td_end, receiver->td_maxend, receiver->td_maxwin); + + return res; +} + +#ifdef CONFIG_IP_NF_NAT_NEEDED +/* Update sender->td_end after NAT successfully mangled the packet */ +/* Caller must linearize skb at tcp header. */ +void nf_conntrack_tcp_update(struct sk_buff *skb, + unsigned int dataoff, + struct nf_conn *conntrack, + int dir) +{ + struct tcphdr *tcph = (void *)skb->data + dataoff; + __u32 end; +#ifdef DEBUGP_VARS + struct ip_ct_tcp_state *sender = &conntrack->proto.tcp.seen[dir]; + struct ip_ct_tcp_state *receiver = &conntrack->proto.tcp.seen[!dir]; +#endif + + end = segment_seq_plus_len(ntohl(tcph->seq), skb->len, dataoff, tcph); + + write_lock_bh(&tcp_lock); + /* + * We have to worry for the ack in the reply packet only... + */ + if (after(end, conntrack->proto.tcp.seen[dir].td_end)) + conntrack->proto.tcp.seen[dir].td_end = end; + conntrack->proto.tcp.last_end = end; + write_unlock_bh(&tcp_lock); + DEBUGP("tcp_update: sender end=%u maxend=%u maxwin=%u scale=%i " + "receiver end=%u maxend=%u maxwin=%u scale=%i\n", + sender->td_end, sender->td_maxend, sender->td_maxwin, + sender->td_scale, + receiver->td_end, receiver->td_maxend, receiver->td_maxwin, + receiver->td_scale); +} + +#endif + +#define TH_FIN 0x01 +#define TH_SYN 0x02 +#define TH_RST 0x04 +#define TH_PUSH 0x08 +#define TH_ACK 0x10 +#define TH_URG 0x20 +#define TH_ECE 0x40 +#define TH_CWR 0x80 + +/* table of valid flag combinations - ECE and CWR are always valid */ +static u8 tcp_valid_flags[(TH_FIN|TH_SYN|TH_RST|TH_PUSH|TH_ACK|TH_URG) + 1] = +{ + [TH_SYN] = 1, + [TH_SYN|TH_ACK] = 1, + [TH_SYN|TH_ACK|TH_PUSH] = 1, + [TH_RST] = 1, + [TH_RST|TH_ACK] = 1, + [TH_RST|TH_ACK|TH_PUSH] = 1, + [TH_FIN|TH_ACK] = 1, + [TH_ACK] = 1, + [TH_ACK|TH_PUSH] = 1, + [TH_ACK|TH_URG] = 1, + [TH_ACK|TH_URG|TH_PUSH] = 1, + [TH_FIN|TH_ACK|TH_PUSH] = 1, + [TH_FIN|TH_ACK|TH_URG] = 1, + [TH_FIN|TH_ACK|TH_URG|TH_PUSH] = 1, +}; + +/* Protect conntrack agaist broken packets. Code taken from ipt_unclean.c. */ +static int tcp_error(struct sk_buff *skb, + unsigned int dataoff, + enum ip_conntrack_info *ctinfo, + int pf, + unsigned int hooknum, + int(*csum)(const struct sk_buff *,unsigned int)) +{ + struct tcphdr _tcph, *th; + unsigned int tcplen = skb->len - dataoff; + u_int8_t tcpflags; + + /* Smaller that minimal TCP header? */ + th = skb_header_pointer(skb, dataoff, sizeof(_tcph), &_tcph); + if (th == NULL) { + if (LOG_INVALID(IPPROTO_TCP)) + nf_log_packet(pf, 0, skb, NULL, NULL, NULL, + "nf_ct_tcp: short packet "); + return -NF_ACCEPT; + } + + /* Not whole TCP header or malformed packet */ + if (th->doff*4 < sizeof(struct tcphdr) || tcplen < th->doff*4) { + if (LOG_INVALID(IPPROTO_TCP)) + nf_log_packet(pf, 0, skb, NULL, NULL, NULL, + "nf_ct_tcp: truncated/malformed packet "); + return -NF_ACCEPT; + } + + /* Checksum invalid? Ignore. + * We skip checking packets on the outgoing path + * because the semantic of CHECKSUM_HW is different there + * and moreover root might send raw packets. + */ + /* FIXME: Source route IP option packets --RR */ + if (((pf == PF_INET && hooknum == NF_IP_PRE_ROUTING) || + (pf == PF_INET6 && hooknum == NF_IP6_PRE_ROUTING)) + && skb->ip_summed != CHECKSUM_UNNECESSARY + && csum(skb, dataoff)) { + if (LOG_INVALID(IPPROTO_TCP)) + nf_log_packet(pf, 0, skb, NULL, NULL, NULL, + "nf_ct_tcp: bad TCP checksum "); + return -NF_ACCEPT; + } + + /* Check TCP flags. */ + tcpflags = (((u_int8_t *)th)[13] & ~(TH_ECE|TH_CWR)); + if (!tcp_valid_flags[tcpflags]) { + if (LOG_INVALID(IPPROTO_TCP)) + nf_log_packet(pf, 0, skb, NULL, NULL, NULL, + "nf_ct_tcp: invalid TCP flag combination "); + return -NF_ACCEPT; + } + + return NF_ACCEPT; +} + +static int csum4(const struct sk_buff *skb, unsigned int dataoff) +{ + return csum_tcpudp_magic(skb->nh.iph->saddr, skb->nh.iph->daddr, + skb->len - dataoff, IPPROTO_TCP, + skb->ip_summed == CHECKSUM_HW ? skb->csum + : skb_checksum(skb, dataoff, + skb->len - dataoff, 0)); +} + +static int csum6(const struct sk_buff *skb, unsigned int dataoff) +{ + return csum_ipv6_magic(&skb->nh.ipv6h->saddr, &skb->nh.ipv6h->daddr, + skb->len - dataoff, IPPROTO_TCP, + skb->ip_summed == CHECKSUM_HW ? skb->csum + : skb_checksum(skb, dataoff, skb->len - dataoff, + 0)); +} + +static int tcp_error4(struct sk_buff *skb, + unsigned int dataoff, + enum ip_conntrack_info *ctinfo, + int pf, + unsigned int hooknum) +{ + return tcp_error(skb, dataoff, ctinfo, pf, hooknum, csum4); +} + +static int tcp_error6(struct sk_buff *skb, + unsigned int dataoff, + enum ip_conntrack_info *ctinfo, + int pf, + unsigned int hooknum) +{ + return tcp_error(skb, dataoff, ctinfo, pf, hooknum, csum6); +} + +/* Returns verdict for packet, or -1 for invalid. */ +static int tcp_packet(struct nf_conn *conntrack, + const struct sk_buff *skb, + unsigned int dataoff, + enum ip_conntrack_info ctinfo, + int pf, + unsigned int hooknum) +{ + enum tcp_conntrack new_state, old_state; + enum ip_conntrack_dir dir; + struct tcphdr *th, _tcph; + unsigned long timeout; + unsigned int index; + + th = skb_header_pointer(skb, dataoff, sizeof(_tcph), &_tcph); + BUG_ON(th == NULL); + + write_lock_bh(&tcp_lock); + old_state = conntrack->proto.tcp.state; + dir = CTINFO2DIR(ctinfo); + index = get_conntrack_index(th); + new_state = tcp_conntracks[dir][index][old_state]; + + switch (new_state) { + case TCP_CONNTRACK_IGNORE: + /* Either SYN in ORIGINAL + * or SYN/ACK in REPLY. */ + if (index == TCP_SYNACK_SET + && conntrack->proto.tcp.last_index == TCP_SYN_SET + && conntrack->proto.tcp.last_dir != dir + && ntohl(th->ack_seq) == + conntrack->proto.tcp.last_end) { + /* This SYN/ACK acknowledges a SYN that we earlier + * ignored as invalid. This means that the client and + * the server are both in sync, while the firewall is + * not. We kill this session and block the SYN/ACK so + * that the client cannot but retransmit its SYN and + * thus initiate a clean new session. + */ + write_unlock_bh(&tcp_lock); + if (LOG_INVALID(IPPROTO_TCP)) + nf_log_packet(pf, 0, skb, NULL, NULL, NULL, + "nf_ct_tcp: killing out of sync session "); + if (del_timer(&conntrack->timeout)) + conntrack->timeout.function((unsigned long) + conntrack); + return -NF_DROP; + } + conntrack->proto.tcp.last_index = index; + conntrack->proto.tcp.last_dir = dir; + conntrack->proto.tcp.last_seq = ntohl(th->seq); + conntrack->proto.tcp.last_end = + segment_seq_plus_len(ntohl(th->seq), skb->len, dataoff, th); + + write_unlock_bh(&tcp_lock); + if (LOG_INVALID(IPPROTO_TCP)) + nf_log_packet(pf, 0, skb, NULL, NULL, NULL, + "nf_ct_tcp: invalid packed ignored "); + return NF_ACCEPT; + case TCP_CONNTRACK_MAX: + /* Invalid packet */ + DEBUGP("nf_ct_tcp: Invalid dir=%i index=%u ostate=%u\n", + dir, get_conntrack_index(th), + old_state); + write_unlock_bh(&tcp_lock); + if (LOG_INVALID(IPPROTO_TCP)) + nf_log_packet(pf, 0, skb, NULL, NULL, NULL, + "nf_ct_tcp: invalid state "); + return -NF_ACCEPT; + case TCP_CONNTRACK_SYN_SENT: + if (old_state < TCP_CONNTRACK_TIME_WAIT) + break; + if ((conntrack->proto.tcp.seen[dir].flags & + IP_CT_TCP_FLAG_CLOSE_INIT) + || after(ntohl(th->seq), + conntrack->proto.tcp.seen[dir].td_end)) { + /* Attempt to reopen a closed connection. + * Delete this connection and look up again. */ + write_unlock_bh(&tcp_lock); + if (del_timer(&conntrack->timeout)) + conntrack->timeout.function((unsigned long) + conntrack); + return -NF_REPEAT; + } + case TCP_CONNTRACK_CLOSE: + if (index == TCP_RST_SET + && test_bit(IPS_SEEN_REPLY_BIT, &conntrack->status) + && conntrack->proto.tcp.last_index == TCP_SYN_SET + && ntohl(th->ack_seq) == conntrack->proto.tcp.last_end) { + /* RST sent to invalid SYN we had let trough + * SYN was in window then, tear down connection. + * We skip window checking, because packet might ACK + * segments we ignored in the SYN. */ + goto in_window; + } + /* Just fall trough */ + default: + /* Keep compilers happy. */ + break; + } + + if (!tcp_in_window(&conntrack->proto.tcp, dir, index, + skb, dataoff, th, pf)) { + write_unlock_bh(&tcp_lock); + return -NF_ACCEPT; + } + in_window: + /* From now on we have got in-window packets */ + conntrack->proto.tcp.last_index = index; + + DEBUGP("tcp_conntracks: src=%u.%u.%u.%u:%hu dst=%u.%u.%u.%u:%hu " + "syn=%i ack=%i fin=%i rst=%i old=%i new=%i\n", + NIPQUAD(iph->saddr), ntohs(th->source), + NIPQUAD(iph->daddr), ntohs(th->dest), + (th->syn ? 1 : 0), (th->ack ? 1 : 0), + (th->fin ? 1 : 0), (th->rst ? 1 : 0), + old_state, new_state); + + conntrack->proto.tcp.state = new_state; + if (old_state != new_state + && (new_state == TCP_CONNTRACK_FIN_WAIT + || new_state == TCP_CONNTRACK_CLOSE)) + conntrack->proto.tcp.seen[dir].flags |= IP_CT_TCP_FLAG_CLOSE_INIT; + timeout = conntrack->proto.tcp.retrans >= nf_ct_tcp_max_retrans + && *tcp_timeouts[new_state] > nf_ct_tcp_timeout_max_retrans + ? nf_ct_tcp_timeout_max_retrans : *tcp_timeouts[new_state]; + write_unlock_bh(&tcp_lock); + + nf_conntrack_event_cache(IPCT_PROTOINFO_VOLATILE, skb); + if (new_state != old_state) + nf_conntrack_event_cache(IPCT_PROTOINFO, skb); + + if (!test_bit(IPS_SEEN_REPLY_BIT, &conntrack->status)) { + /* If only reply is a RST, we can consider ourselves not to + have an established connection: this is a fairly common + problem case, so we can delete the conntrack + immediately. --RR */ + if (th->rst) { + if (del_timer(&conntrack->timeout)) + conntrack->timeout.function((unsigned long) + conntrack); + return NF_ACCEPT; + } + } else if (!test_bit(IPS_ASSURED_BIT, &conntrack->status) + && (old_state == TCP_CONNTRACK_SYN_RECV + || old_state == TCP_CONNTRACK_ESTABLISHED) + && new_state == TCP_CONNTRACK_ESTABLISHED) { + /* Set ASSURED if we see see valid ack in ESTABLISHED + after SYN_RECV or a valid answer for a picked up + connection. */ + set_bit(IPS_ASSURED_BIT, &conntrack->status); + nf_conntrack_event_cache(IPCT_STATUS, skb); + } + nf_ct_refresh_acct(conntrack, ctinfo, skb, timeout); + + return NF_ACCEPT; +} + +/* Called when a new connection for this protocol found. */ +static int tcp_new(struct nf_conn *conntrack, + const struct sk_buff *skb, + unsigned int dataoff) +{ + enum tcp_conntrack new_state; + struct tcphdr *th, _tcph; +#ifdef DEBUGP_VARS + struct ip_ct_tcp_state *sender = &conntrack->proto.tcp.seen[0]; + struct ip_ct_tcp_state *receiver = &conntrack->proto.tcp.seen[1]; +#endif + + th = skb_header_pointer(skb, dataoff, sizeof(_tcph), &_tcph); + BUG_ON(th == NULL); + + /* Don't need lock here: this conntrack not in circulation yet */ + new_state + = tcp_conntracks[0][get_conntrack_index(th)] + [TCP_CONNTRACK_NONE]; + + /* Invalid: delete conntrack */ + if (new_state >= TCP_CONNTRACK_MAX) { + DEBUGP("nf_ct_tcp: invalid new deleting.\n"); + return 0; + } + + if (new_state == TCP_CONNTRACK_SYN_SENT) { + /* SYN packet */ + conntrack->proto.tcp.seen[0].td_end = + segment_seq_plus_len(ntohl(th->seq), skb->len, + dataoff, th); + conntrack->proto.tcp.seen[0].td_maxwin = ntohs(th->window); + if (conntrack->proto.tcp.seen[0].td_maxwin == 0) + conntrack->proto.tcp.seen[0].td_maxwin = 1; + conntrack->proto.tcp.seen[0].td_maxend = + conntrack->proto.tcp.seen[0].td_end; + + tcp_options(skb, dataoff, th, &conntrack->proto.tcp.seen[0]); + conntrack->proto.tcp.seen[1].flags = 0; + conntrack->proto.tcp.seen[0].loose = + conntrack->proto.tcp.seen[1].loose = 0; + } else if (nf_ct_tcp_loose == 0) { + /* Don't try to pick up connections. */ + return 0; + } else { + /* + * We are in the middle of a connection, + * its history is lost for us. + * Let's try to use the data from the packet. + */ + conntrack->proto.tcp.seen[0].td_end = + segment_seq_plus_len(ntohl(th->seq), skb->len, + dataoff, th); + conntrack->proto.tcp.seen[0].td_maxwin = ntohs(th->window); + if (conntrack->proto.tcp.seen[0].td_maxwin == 0) + conntrack->proto.tcp.seen[0].td_maxwin = 1; + conntrack->proto.tcp.seen[0].td_maxend = + conntrack->proto.tcp.seen[0].td_end + + conntrack->proto.tcp.seen[0].td_maxwin; + conntrack->proto.tcp.seen[0].td_scale = 0; + + /* We assume SACK. Should we assume window scaling too? */ + conntrack->proto.tcp.seen[0].flags = + conntrack->proto.tcp.seen[1].flags = IP_CT_TCP_FLAG_SACK_PERM; + conntrack->proto.tcp.seen[0].loose = + conntrack->proto.tcp.seen[1].loose = nf_ct_tcp_loose; + } + + conntrack->proto.tcp.seen[1].td_end = 0; + conntrack->proto.tcp.seen[1].td_maxend = 0; + conntrack->proto.tcp.seen[1].td_maxwin = 1; + conntrack->proto.tcp.seen[1].td_scale = 0; + + /* tcp_packet will set them */ + conntrack->proto.tcp.state = TCP_CONNTRACK_NONE; + conntrack->proto.tcp.last_index = TCP_NONE_SET; + + DEBUGP("tcp_new: sender end=%u maxend=%u maxwin=%u scale=%i " + "receiver end=%u maxend=%u maxwin=%u scale=%i\n", + sender->td_end, sender->td_maxend, sender->td_maxwin, + sender->td_scale, + receiver->td_end, receiver->td_maxend, receiver->td_maxwin, + receiver->td_scale); + return 1; +} + +struct nf_conntrack_protocol nf_conntrack_protocol_tcp4 = +{ + .l3proto = PF_INET, + .proto = IPPROTO_TCP, + .name = "tcp", + .pkt_to_tuple = tcp_pkt_to_tuple, + .invert_tuple = tcp_invert_tuple, + .print_tuple = tcp_print_tuple, + .print_conntrack = tcp_print_conntrack, + .packet = tcp_packet, + .new = tcp_new, + .error = tcp_error4, +}; + +struct nf_conntrack_protocol nf_conntrack_protocol_tcp6 = +{ + .l3proto = PF_INET6, + .proto = IPPROTO_TCP, + .name = "tcp", + .pkt_to_tuple = tcp_pkt_to_tuple, + .invert_tuple = tcp_invert_tuple, + .print_tuple = tcp_print_tuple, + .print_conntrack = tcp_print_conntrack, + .packet = tcp_packet, + .new = tcp_new, + .error = tcp_error6, +}; + +EXPORT_SYMBOL(nf_conntrack_protocol_tcp4); +EXPORT_SYMBOL(nf_conntrack_protocol_tcp6); diff --git a/net/netfilter/nf_conntrack_proto_udp.c b/net/netfilter/nf_conntrack_proto_udp.c new file mode 100644 index 000000000000..3cae7ce420dd --- /dev/null +++ b/net/netfilter/nf_conntrack_proto_udp.c @@ -0,0 +1,216 @@ +/* (C) 1999-2001 Paul `Rusty' Russell + * (C) 2002-2004 Netfilter Core Team + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 as + * published by the Free Software Foundation. + * + * 16 Dec 2003: Yasuyuki Kozakai @USAGI + * - enable working with Layer 3 protocol independent connection tracking. + * + * Derived from net/ipv4/netfilter/ip_conntrack_proto_udp.c + */ + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +unsigned long nf_ct_udp_timeout = 30*HZ; +unsigned long nf_ct_udp_timeout_stream = 180*HZ; + +static int udp_pkt_to_tuple(const struct sk_buff *skb, + unsigned int dataoff, + struct nf_conntrack_tuple *tuple) +{ + struct udphdr _hdr, *hp; + + /* Actually only need first 8 bytes. */ + hp = skb_header_pointer(skb, dataoff, sizeof(_hdr), &_hdr); + if (hp == NULL) + return 0; + + tuple->src.u.udp.port = hp->source; + tuple->dst.u.udp.port = hp->dest; + + return 1; +} + +static int udp_invert_tuple(struct nf_conntrack_tuple *tuple, + const struct nf_conntrack_tuple *orig) +{ + tuple->src.u.udp.port = orig->dst.u.udp.port; + tuple->dst.u.udp.port = orig->src.u.udp.port; + return 1; +} + +/* Print out the per-protocol part of the tuple. */ +static int udp_print_tuple(struct seq_file *s, + const struct nf_conntrack_tuple *tuple) +{ + return seq_printf(s, "sport=%hu dport=%hu ", + ntohs(tuple->src.u.udp.port), + ntohs(tuple->dst.u.udp.port)); +} + +/* Print out the private part of the conntrack. */ +static int udp_print_conntrack(struct seq_file *s, + const struct nf_conn *conntrack) +{ + return 0; +} + +/* Returns verdict for packet, and may modify conntracktype */ +static int udp_packet(struct nf_conn *conntrack, + const struct sk_buff *skb, + unsigned int dataoff, + enum ip_conntrack_info ctinfo, + int pf, + unsigned int hooknum) +{ + /* If we've seen traffic both ways, this is some kind of UDP + stream. Extend timeout. */ + if (test_bit(IPS_SEEN_REPLY_BIT, &conntrack->status)) { + nf_ct_refresh_acct(conntrack, ctinfo, skb, + nf_ct_udp_timeout_stream); + /* Also, more likely to be important, and not a probe */ + if (!test_and_set_bit(IPS_ASSURED_BIT, &conntrack->status)) + nf_conntrack_event_cache(IPCT_STATUS, skb); + } else + nf_ct_refresh_acct(conntrack, ctinfo, skb, nf_ct_udp_timeout); + + return NF_ACCEPT; +} + +/* Called when a new connection for this protocol found. */ +static int udp_new(struct nf_conn *conntrack, const struct sk_buff *skb, + unsigned int dataoff) +{ + return 1; +} + +static int udp_error(struct sk_buff *skb, unsigned int dataoff, + enum ip_conntrack_info *ctinfo, + int pf, + unsigned int hooknum, + int (*csum)(const struct sk_buff *, unsigned int)) +{ + unsigned int udplen = skb->len - dataoff; + struct udphdr _hdr, *hdr; + + /* Header is too small? */ + hdr = skb_header_pointer(skb, dataoff, sizeof(_hdr), &_hdr); + if (hdr == NULL) { + if (LOG_INVALID(IPPROTO_UDP)) + nf_log_packet(pf, 0, skb, NULL, NULL, NULL, + "nf_ct_udp: short packet "); + return -NF_ACCEPT; + } + + /* Truncated/malformed packets */ + if (ntohs(hdr->len) > udplen || ntohs(hdr->len) < sizeof(*hdr)) { + if (LOG_INVALID(IPPROTO_UDP)) + nf_log_packet(pf, 0, skb, NULL, NULL, NULL, + "nf_ct_udp: truncated/malformed packet "); + return -NF_ACCEPT; + } + + /* Packet with no checksum */ + if (!hdr->check) + return NF_ACCEPT; + + /* Checksum invalid? Ignore. + * We skip checking packets on the outgoing path + * because the semantic of CHECKSUM_HW is different there + * and moreover root might send raw packets. + * FIXME: Source route IP option packets --RR */ + if (((pf == PF_INET && hooknum == NF_IP_PRE_ROUTING) || + (pf == PF_INET6 && hooknum == NF_IP6_PRE_ROUTING)) + && skb->ip_summed != CHECKSUM_UNNECESSARY + && csum(skb, dataoff)) { + if (LOG_INVALID(IPPROTO_UDP)) + nf_log_packet(pf, 0, skb, NULL, NULL, NULL, + "nf_ct_udp: bad UDP checksum "); + return -NF_ACCEPT; + } + + return NF_ACCEPT; +} + +static int csum4(const struct sk_buff *skb, unsigned int dataoff) +{ + return csum_tcpudp_magic(skb->nh.iph->saddr, skb->nh.iph->daddr, + skb->len - dataoff, IPPROTO_UDP, + skb->ip_summed == CHECKSUM_HW ? skb->csum + : skb_checksum(skb, dataoff, + skb->len - dataoff, 0)); +} + +static int csum6(const struct sk_buff *skb, unsigned int dataoff) +{ + return csum_ipv6_magic(&skb->nh.ipv6h->saddr, &skb->nh.ipv6h->daddr, + skb->len - dataoff, IPPROTO_UDP, + skb->ip_summed == CHECKSUM_HW ? skb->csum + : skb_checksum(skb, dataoff, skb->len - dataoff, + 0)); +} + +static int udp_error4(struct sk_buff *skb, + unsigned int dataoff, + enum ip_conntrack_info *ctinfo, + int pf, + unsigned int hooknum) +{ + return udp_error(skb, dataoff, ctinfo, pf, hooknum, csum4); +} + +static int udp_error6(struct sk_buff *skb, + unsigned int dataoff, + enum ip_conntrack_info *ctinfo, + int pf, + unsigned int hooknum) +{ + return udp_error(skb, dataoff, ctinfo, pf, hooknum, csum6); +} + +struct nf_conntrack_protocol nf_conntrack_protocol_udp4 = +{ + .l3proto = PF_INET, + .proto = IPPROTO_UDP, + .name = "udp", + .pkt_to_tuple = udp_pkt_to_tuple, + .invert_tuple = udp_invert_tuple, + .print_tuple = udp_print_tuple, + .print_conntrack = udp_print_conntrack, + .packet = udp_packet, + .new = udp_new, + .error = udp_error4, +}; + +struct nf_conntrack_protocol nf_conntrack_protocol_udp6 = +{ + .l3proto = PF_INET6, + .proto = IPPROTO_UDP, + .name = "udp", + .pkt_to_tuple = udp_pkt_to_tuple, + .invert_tuple = udp_invert_tuple, + .print_tuple = udp_print_tuple, + .print_conntrack = udp_print_conntrack, + .packet = udp_packet, + .new = udp_new, + .error = udp_error6, +}; + +EXPORT_SYMBOL(nf_conntrack_protocol_udp4); +EXPORT_SYMBOL(nf_conntrack_protocol_udp6); diff --git a/net/netfilter/nf_conntrack_standalone.c b/net/netfilter/nf_conntrack_standalone.c new file mode 100644 index 000000000000..45224db4fe2f --- /dev/null +++ b/net/netfilter/nf_conntrack_standalone.c @@ -0,0 +1,869 @@ +/* This file contains all the functions required for the standalone + nf_conntrack module. + + These are not required by the compatibility layer. +*/ + +/* (C) 1999-2001 Paul `Rusty' Russell + * (C) 2002-2004 Netfilter Core Team + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 as + * published by the Free Software Foundation. + * + * 16 Dec 2003: Yasuyuki Kozakai @USAGI + * - generalize L3 protocol dependent part. + * + * Derived from net/ipv4/netfilter/ip_conntrack_standalone.c + */ + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#ifdef CONFIG_SYSCTL +#include +#endif + +#define ASSERT_READ_LOCK(x) +#define ASSERT_WRITE_LOCK(x) + +#include +#include +#include +#include +#include +#include + +#if 0 +#define DEBUGP printk +#else +#define DEBUGP(format, args...) +#endif + +MODULE_LICENSE("GPL"); + +extern atomic_t nf_conntrack_count; +DECLARE_PER_CPU(struct ip_conntrack_stat, nf_conntrack_stat); + +static int kill_l3proto(struct nf_conn *i, void *data) +{ + return (i->tuplehash[IP_CT_DIR_ORIGINAL].tuple.src.l3num == + ((struct nf_conntrack_l3proto *)data)->l3proto); +} + +static int kill_proto(struct nf_conn *i, void *data) +{ + struct nf_conntrack_protocol *proto; + proto = (struct nf_conntrack_protocol *)data; + return (i->tuplehash[IP_CT_DIR_ORIGINAL].tuple.dst.protonum == + proto->proto) && + (i->tuplehash[IP_CT_DIR_ORIGINAL].tuple.src.l3num == + proto->l3proto); +} + +#ifdef CONFIG_PROC_FS +static int +print_tuple(struct seq_file *s, const struct nf_conntrack_tuple *tuple, + struct nf_conntrack_l3proto *l3proto, + struct nf_conntrack_protocol *proto) +{ + return l3proto->print_tuple(s, tuple) || proto->print_tuple(s, tuple); +} + +#ifdef CONFIG_NF_CT_ACCT +static unsigned int +seq_print_counters(struct seq_file *s, + const struct ip_conntrack_counter *counter) +{ + return seq_printf(s, "packets=%llu bytes=%llu ", + (unsigned long long)counter->packets, + (unsigned long long)counter->bytes); +} +#else +#define seq_print_counters(x, y) 0 +#endif + +struct ct_iter_state { + unsigned int bucket; +}; + +static struct list_head *ct_get_first(struct seq_file *seq) +{ + struct ct_iter_state *st = seq->private; + + for (st->bucket = 0; + st->bucket < nf_conntrack_htable_size; + st->bucket++) { + if (!list_empty(&nf_conntrack_hash[st->bucket])) + return nf_conntrack_hash[st->bucket].next; + } + return NULL; +} + +static struct list_head *ct_get_next(struct seq_file *seq, struct list_head *head) +{ + struct ct_iter_state *st = seq->private; + + head = head->next; + while (head == &nf_conntrack_hash[st->bucket]) { + if (++st->bucket >= nf_conntrack_htable_size) + return NULL; + head = nf_conntrack_hash[st->bucket].next; + } + return head; +} + +static struct list_head *ct_get_idx(struct seq_file *seq, loff_t pos) +{ + struct list_head *head = ct_get_first(seq); + + if (head) + while (pos && (head = ct_get_next(seq, head))) + pos--; + return pos ? NULL : head; +} + +static void *ct_seq_start(struct seq_file *seq, loff_t *pos) +{ + read_lock_bh(&nf_conntrack_lock); + return ct_get_idx(seq, *pos); +} + +static void *ct_seq_next(struct seq_file *s, void *v, loff_t *pos) +{ + (*pos)++; + return ct_get_next(s, v); +} + +static void ct_seq_stop(struct seq_file *s, void *v) +{ + read_unlock_bh(&nf_conntrack_lock); +} + +/* return 0 on success, 1 in case of error */ +static int ct_seq_show(struct seq_file *s, void *v) +{ + const struct nf_conntrack_tuple_hash *hash = v; + const struct nf_conn *conntrack = nf_ct_tuplehash_to_ctrack(hash); + struct nf_conntrack_l3proto *l3proto; + struct nf_conntrack_protocol *proto; + + ASSERT_READ_LOCK(&nf_conntrack_lock); + NF_CT_ASSERT(conntrack); + + /* we only want to print DIR_ORIGINAL */ + if (NF_CT_DIRECTION(hash)) + return 0; + + l3proto = nf_ct_find_l3proto(conntrack->tuplehash[IP_CT_DIR_ORIGINAL] + .tuple.src.l3num); + + NF_CT_ASSERT(l3proto); + proto = nf_ct_find_proto(conntrack->tuplehash[IP_CT_DIR_ORIGINAL] + .tuple.src.l3num, + conntrack->tuplehash[IP_CT_DIR_ORIGINAL] + .tuple.dst.protonum); + NF_CT_ASSERT(proto); + + if (seq_printf(s, "%-8s %u %-8s %u %ld ", + l3proto->name, + conntrack->tuplehash[IP_CT_DIR_ORIGINAL].tuple.src.l3num, + proto->name, + conntrack->tuplehash[IP_CT_DIR_ORIGINAL].tuple.dst.protonum, + timer_pending(&conntrack->timeout) + ? (long)(conntrack->timeout.expires - jiffies)/HZ : 0) != 0) + return -ENOSPC; + + if (l3proto->print_conntrack(s, conntrack)) + return -ENOSPC; + + if (proto->print_conntrack(s, conntrack)) + return -ENOSPC; + + if (print_tuple(s, &conntrack->tuplehash[IP_CT_DIR_ORIGINAL].tuple, + l3proto, proto)) + return -ENOSPC; + + if (seq_print_counters(s, &conntrack->counters[IP_CT_DIR_ORIGINAL])) + return -ENOSPC; + + if (!(test_bit(IPS_SEEN_REPLY_BIT, &conntrack->status))) + if (seq_printf(s, "[UNREPLIED] ")) + return -ENOSPC; + + if (print_tuple(s, &conntrack->tuplehash[IP_CT_DIR_REPLY].tuple, + l3proto, proto)) + return -ENOSPC; + + if (seq_print_counters(s, &conntrack->counters[IP_CT_DIR_REPLY])) + return -ENOSPC; + + if (test_bit(IPS_ASSURED_BIT, &conntrack->status)) + if (seq_printf(s, "[ASSURED] ")) + return -ENOSPC; + +#if defined(CONFIG_NF_CONNTRACK_MARK) + if (seq_printf(s, "mark=%u ", conntrack->mark)) + return -ENOSPC; +#endif + + if (seq_printf(s, "use=%u\n", atomic_read(&conntrack->ct_general.use))) + return -ENOSPC; + + return 0; +} + +static struct seq_operations ct_seq_ops = { + .start = ct_seq_start, + .next = ct_seq_next, + .stop = ct_seq_stop, + .show = ct_seq_show +}; + +static int ct_open(struct inode *inode, struct file *file) +{ + struct seq_file *seq; + struct ct_iter_state *st; + int ret; + + st = kmalloc(sizeof(struct ct_iter_state), GFP_KERNEL); + if (st == NULL) + return -ENOMEM; + ret = seq_open(file, &ct_seq_ops); + if (ret) + goto out_free; + seq = file->private_data; + seq->private = st; + memset(st, 0, sizeof(struct ct_iter_state)); + return ret; +out_free: + kfree(st); + return ret; +} + +static struct file_operations ct_file_ops = { + .owner = THIS_MODULE, + .open = ct_open, + .read = seq_read, + .llseek = seq_lseek, + .release = seq_release_private, +}; + +/* expects */ +static void *exp_seq_start(struct seq_file *s, loff_t *pos) +{ + struct list_head *e = &nf_conntrack_expect_list; + loff_t i; + + /* strange seq_file api calls stop even if we fail, + * thus we need to grab lock since stop unlocks */ + read_lock_bh(&nf_conntrack_lock); + + if (list_empty(e)) + return NULL; + + for (i = 0; i <= *pos; i++) { + e = e->next; + if (e == &nf_conntrack_expect_list) + return NULL; + } + return e; +} + +static void *exp_seq_next(struct seq_file *s, void *v, loff_t *pos) +{ + struct list_head *e = v; + + ++*pos; + e = e->next; + + if (e == &nf_conntrack_expect_list) + return NULL; + + return e; +} + +static void exp_seq_stop(struct seq_file *s, void *v) +{ + read_unlock_bh(&nf_conntrack_lock); +} + +static int exp_seq_show(struct seq_file *s, void *v) +{ + struct nf_conntrack_expect *expect = v; + + if (expect->timeout.function) + seq_printf(s, "%ld ", timer_pending(&expect->timeout) + ? (long)(expect->timeout.expires - jiffies)/HZ : 0); + else + seq_printf(s, "- "); + seq_printf(s, "l3proto = %u proto=%u ", + expect->tuple.src.l3num, + expect->tuple.dst.protonum); + print_tuple(s, &expect->tuple, + nf_ct_find_l3proto(expect->tuple.src.l3num), + nf_ct_find_proto(expect->tuple.src.l3num, + expect->tuple.dst.protonum)); + return seq_putc(s, '\n'); +} + +static struct seq_operations exp_seq_ops = { + .start = exp_seq_start, + .next = exp_seq_next, + .stop = exp_seq_stop, + .show = exp_seq_show +}; + +static int exp_open(struct inode *inode, struct file *file) +{ + return seq_open(file, &exp_seq_ops); +} + +static struct file_operations exp_file_ops = { + .owner = THIS_MODULE, + .open = exp_open, + .read = seq_read, + .llseek = seq_lseek, + .release = seq_release +}; + +static void *ct_cpu_seq_start(struct seq_file *seq, loff_t *pos) +{ + int cpu; + + if (*pos == 0) + return SEQ_START_TOKEN; + + for (cpu = *pos-1; cpu < NR_CPUS; ++cpu) { + if (!cpu_possible(cpu)) + continue; + *pos = cpu + 1; + return &per_cpu(nf_conntrack_stat, cpu); + } + + return NULL; +} + +static void *ct_cpu_seq_next(struct seq_file *seq, void *v, loff_t *pos) +{ + int cpu; + + for (cpu = *pos; cpu < NR_CPUS; ++cpu) { + if (!cpu_possible(cpu)) + continue; + *pos = cpu + 1; + return &per_cpu(nf_conntrack_stat, cpu); + } + + return NULL; +} + +static void ct_cpu_seq_stop(struct seq_file *seq, void *v) +{ +} + +static int ct_cpu_seq_show(struct seq_file *seq, void *v) +{ + unsigned int nr_conntracks = atomic_read(&nf_conntrack_count); + struct ip_conntrack_stat *st = v; + + if (v == SEQ_START_TOKEN) { + seq_printf(seq, "entries searched found new invalid ignore delete delete_list insert insert_failed drop early_drop icmp_error expect_new expect_create expect_delete\n"); + return 0; + } + + seq_printf(seq, "%08x %08x %08x %08x %08x %08x %08x %08x " + "%08x %08x %08x %08x %08x %08x %08x %08x \n", + nr_conntracks, + st->searched, + st->found, + st->new, + st->invalid, + st->ignore, + st->delete, + st->delete_list, + st->insert, + st->insert_failed, + st->drop, + st->early_drop, + st->error, + + st->expect_new, + st->expect_create, + st->expect_delete + ); + return 0; +} + +static struct seq_operations ct_cpu_seq_ops = { + .start = ct_cpu_seq_start, + .next = ct_cpu_seq_next, + .stop = ct_cpu_seq_stop, + .show = ct_cpu_seq_show, +}; + +static int ct_cpu_seq_open(struct inode *inode, struct file *file) +{ + return seq_open(file, &ct_cpu_seq_ops); +} + +static struct file_operations ct_cpu_seq_fops = { + .owner = THIS_MODULE, + .open = ct_cpu_seq_open, + .read = seq_read, + .llseek = seq_lseek, + .release = seq_release_private, +}; +#endif /* CONFIG_PROC_FS */ + +/* Sysctl support */ + +#ifdef CONFIG_SYSCTL + +/* From nf_conntrack_core.c */ +extern int nf_conntrack_max; +extern unsigned int nf_conntrack_htable_size; + +/* From nf_conntrack_proto_tcp.c */ +extern unsigned long nf_ct_tcp_timeout_syn_sent; +extern unsigned long nf_ct_tcp_timeout_syn_recv; +extern unsigned long nf_ct_tcp_timeout_established; +extern unsigned long nf_ct_tcp_timeout_fin_wait; +extern unsigned long nf_ct_tcp_timeout_close_wait; +extern unsigned long nf_ct_tcp_timeout_last_ack; +extern unsigned long nf_ct_tcp_timeout_time_wait; +extern unsigned long nf_ct_tcp_timeout_close; +extern unsigned long nf_ct_tcp_timeout_max_retrans; +extern int nf_ct_tcp_loose; +extern int nf_ct_tcp_be_liberal; +extern int nf_ct_tcp_max_retrans; + +/* From nf_conntrack_proto_udp.c */ +extern unsigned long nf_ct_udp_timeout; +extern unsigned long nf_ct_udp_timeout_stream; + +/* From nf_conntrack_proto_generic.c */ +extern unsigned long nf_ct_generic_timeout; + +/* Log invalid packets of a given protocol */ +static int log_invalid_proto_min = 0; +static int log_invalid_proto_max = 255; + +static struct ctl_table_header *nf_ct_sysctl_header; + +static ctl_table nf_ct_sysctl_table[] = { + { + .ctl_name = NET_NF_CONNTRACK_MAX, + .procname = "nf_conntrack_max", + .data = &nf_conntrack_max, + .maxlen = sizeof(int), + .mode = 0644, + .proc_handler = &proc_dointvec, + }, + { + .ctl_name = NET_NF_CONNTRACK_COUNT, + .procname = "nf_conntrack_count", + .data = &nf_conntrack_count, + .maxlen = sizeof(int), + .mode = 0444, + .proc_handler = &proc_dointvec, + }, + { + .ctl_name = NET_NF_CONNTRACK_BUCKETS, + .procname = "nf_conntrack_buckets", + .data = &nf_conntrack_htable_size, + .maxlen = sizeof(unsigned int), + .mode = 0444, + .proc_handler = &proc_dointvec, + }, + { + .ctl_name = NET_NF_CONNTRACK_TCP_TIMEOUT_SYN_SENT, + .procname = "nf_conntrack_tcp_timeout_syn_sent", + .data = &nf_ct_tcp_timeout_syn_sent, + .maxlen = sizeof(unsigned int), + .mode = 0644, + .proc_handler = &proc_dointvec_jiffies, + }, + { + .ctl_name = NET_NF_CONNTRACK_TCP_TIMEOUT_SYN_RECV, + .procname = "nf_conntrack_tcp_timeout_syn_recv", + .data = &nf_ct_tcp_timeout_syn_recv, + .maxlen = sizeof(unsigned int), + .mode = 0644, + .proc_handler = &proc_dointvec_jiffies, + }, + { + .ctl_name = NET_NF_CONNTRACK_TCP_TIMEOUT_ESTABLISHED, + .procname = "nf_conntrack_tcp_timeout_established", + .data = &nf_ct_tcp_timeout_established, + .maxlen = sizeof(unsigned int), + .mode = 0644, + .proc_handler = &proc_dointvec_jiffies, + }, + { + .ctl_name = NET_NF_CONNTRACK_TCP_TIMEOUT_FIN_WAIT, + .procname = "nf_conntrack_tcp_timeout_fin_wait", + .data = &nf_ct_tcp_timeout_fin_wait, + .maxlen = sizeof(unsigned int), + .mode = 0644, + .proc_handler = &proc_dointvec_jiffies, + }, + { + .ctl_name = NET_NF_CONNTRACK_TCP_TIMEOUT_CLOSE_WAIT, + .procname = "nf_conntrack_tcp_timeout_close_wait", + .data = &nf_ct_tcp_timeout_close_wait, + .maxlen = sizeof(unsigned int), + .mode = 0644, + .proc_handler = &proc_dointvec_jiffies, + }, + { + .ctl_name = NET_NF_CONNTRACK_TCP_TIMEOUT_LAST_ACK, + .procname = "nf_conntrack_tcp_timeout_last_ack", + .data = &nf_ct_tcp_timeout_last_ack, + .maxlen = sizeof(unsigned int), + .mode = 0644, + .proc_handler = &proc_dointvec_jiffies, + }, + { + .ctl_name = NET_NF_CONNTRACK_TCP_TIMEOUT_TIME_WAIT, + .procname = "nf_conntrack_tcp_timeout_time_wait", + .data = &nf_ct_tcp_timeout_time_wait, + .maxlen = sizeof(unsigned int), + .mode = 0644, + .proc_handler = &proc_dointvec_jiffies, + }, + { + .ctl_name = NET_NF_CONNTRACK_TCP_TIMEOUT_CLOSE, + .procname = "nf_conntrack_tcp_timeout_close", + .data = &nf_ct_tcp_timeout_close, + .maxlen = sizeof(unsigned int), + .mode = 0644, + .proc_handler = &proc_dointvec_jiffies, + }, + { + .ctl_name = NET_NF_CONNTRACK_UDP_TIMEOUT, + .procname = "nf_conntrack_udp_timeout", + .data = &nf_ct_udp_timeout, + .maxlen = sizeof(unsigned int), + .mode = 0644, + .proc_handler = &proc_dointvec_jiffies, + }, + { + .ctl_name = NET_NF_CONNTRACK_UDP_TIMEOUT_STREAM, + .procname = "nf_conntrack_udp_timeout_stream", + .data = &nf_ct_udp_timeout_stream, + .maxlen = sizeof(unsigned int), + .mode = 0644, + .proc_handler = &proc_dointvec_jiffies, + }, + { + .ctl_name = NET_NF_CONNTRACK_GENERIC_TIMEOUT, + .procname = "nf_conntrack_generic_timeout", + .data = &nf_ct_generic_timeout, + .maxlen = sizeof(unsigned int), + .mode = 0644, + .proc_handler = &proc_dointvec_jiffies, + }, + { + .ctl_name = NET_NF_CONNTRACK_LOG_INVALID, + .procname = "nf_conntrack_log_invalid", + .data = &nf_ct_log_invalid, + .maxlen = sizeof(unsigned int), + .mode = 0644, + .proc_handler = &proc_dointvec_minmax, + .strategy = &sysctl_intvec, + .extra1 = &log_invalid_proto_min, + .extra2 = &log_invalid_proto_max, + }, + { + .ctl_name = NET_NF_CONNTRACK_TCP_TIMEOUT_MAX_RETRANS, + .procname = "nf_conntrack_tcp_timeout_max_retrans", + .data = &nf_ct_tcp_timeout_max_retrans, + .maxlen = sizeof(unsigned int), + .mode = 0644, + .proc_handler = &proc_dointvec_jiffies, + }, + { + .ctl_name = NET_NF_CONNTRACK_TCP_LOOSE, + .procname = "nf_conntrack_tcp_loose", + .data = &nf_ct_tcp_loose, + .maxlen = sizeof(unsigned int), + .mode = 0644, + .proc_handler = &proc_dointvec, + }, + { + .ctl_name = NET_NF_CONNTRACK_TCP_BE_LIBERAL, + .procname = "nf_conntrack_tcp_be_liberal", + .data = &nf_ct_tcp_be_liberal, + .maxlen = sizeof(unsigned int), + .mode = 0644, + .proc_handler = &proc_dointvec, + }, + { + .ctl_name = NET_NF_CONNTRACK_TCP_MAX_RETRANS, + .procname = "nf_conntrack_tcp_max_retrans", + .data = &nf_ct_tcp_max_retrans, + .maxlen = sizeof(unsigned int), + .mode = 0644, + .proc_handler = &proc_dointvec, + }, + + { .ctl_name = 0 } +}; + +#define NET_NF_CONNTRACK_MAX 2089 + +static ctl_table nf_ct_netfilter_table[] = { + { + .ctl_name = NET_NETFILTER, + .procname = "netfilter", + .mode = 0555, + .child = nf_ct_sysctl_table, + }, + { + .ctl_name = NET_NF_CONNTRACK_MAX, + .procname = "nf_conntrack_max", + .data = &nf_conntrack_max, + .maxlen = sizeof(int), + .mode = 0644, + .proc_handler = &proc_dointvec, + }, + { .ctl_name = 0 } +}; + +static ctl_table nf_ct_net_table[] = { + { + .ctl_name = CTL_NET, + .procname = "net", + .mode = 0555, + .child = nf_ct_netfilter_table, + }, + { .ctl_name = 0 } +}; +EXPORT_SYMBOL(nf_ct_log_invalid); +#endif /* CONFIG_SYSCTL */ + +static int init_or_cleanup(int init) +{ +#ifdef CONFIG_PROC_FS + struct proc_dir_entry *proc, *proc_exp, *proc_stat; +#endif + int ret = 0; + + if (!init) goto cleanup; + + ret = nf_conntrack_init(); + if (ret < 0) + goto cleanup_nothing; + +#ifdef CONFIG_PROC_FS + proc = proc_net_fops_create("nf_conntrack", 0440, &ct_file_ops); + if (!proc) goto cleanup_init; + + proc_exp = proc_net_fops_create("nf_conntrack_expect", 0440, + &exp_file_ops); + if (!proc_exp) goto cleanup_proc; + + proc_stat = create_proc_entry("nf_conntrack", S_IRUGO, proc_net_stat); + if (!proc_stat) + goto cleanup_proc_exp; + + proc_stat->proc_fops = &ct_cpu_seq_fops; + proc_stat->owner = THIS_MODULE; +#endif +#ifdef CONFIG_SYSCTL + nf_ct_sysctl_header = register_sysctl_table(nf_ct_net_table, 0); + if (nf_ct_sysctl_header == NULL) { + printk("nf_conntrack: can't register to sysctl.\n"); + ret = -ENOMEM; + goto cleanup_proc_stat; + } +#endif + + return ret; + + cleanup: +#ifdef CONFIG_SYSCTL + unregister_sysctl_table(nf_ct_sysctl_header); + cleanup_proc_stat: +#endif +#ifdef CONFIG_PROC_FS + proc_net_remove("nf_conntrack_stat"); + cleanup_proc_exp: + proc_net_remove("nf_conntrack_expect"); + cleanup_proc: + proc_net_remove("nf_conntrack"); + cleanup_init: +#endif /* CNFIG_PROC_FS */ + nf_conntrack_cleanup(); + cleanup_nothing: + return ret; +} + +int nf_conntrack_l3proto_register(struct nf_conntrack_l3proto *proto) +{ + int ret = 0; + + write_lock_bh(&nf_conntrack_lock); + if (nf_ct_l3protos[proto->l3proto] != &nf_conntrack_generic_l3proto) { + ret = -EBUSY; + goto out; + } + nf_ct_l3protos[proto->l3proto] = proto; +out: + write_unlock_bh(&nf_conntrack_lock); + + return ret; +} + +void nf_conntrack_l3proto_unregister(struct nf_conntrack_l3proto *proto) +{ + write_lock_bh(&nf_conntrack_lock); + nf_ct_l3protos[proto->l3proto] = &nf_conntrack_generic_l3proto; + write_unlock_bh(&nf_conntrack_lock); + + /* Somebody could be still looking at the proto in bh. */ + synchronize_net(); + + /* Remove all contrack entries for this protocol */ + nf_ct_iterate_cleanup(kill_l3proto, proto); +} + +/* FIXME: Allow NULL functions and sub in pointers to generic for + them. --RR */ +int nf_conntrack_protocol_register(struct nf_conntrack_protocol *proto) +{ + int ret = 0; + +retry: + write_lock_bh(&nf_conntrack_lock); + if (nf_ct_protos[proto->l3proto]) { + if (nf_ct_protos[proto->l3proto][proto->proto] + != &nf_conntrack_generic_protocol) { + ret = -EBUSY; + goto out_unlock; + } + } else { + /* l3proto may be loaded latter. */ + struct nf_conntrack_protocol **proto_array; + int i; + + write_unlock_bh(&nf_conntrack_lock); + + proto_array = (struct nf_conntrack_protocol **) + kmalloc(MAX_NF_CT_PROTO * + sizeof(struct nf_conntrack_protocol *), + GFP_KERNEL); + if (proto_array == NULL) { + ret = -ENOMEM; + goto out; + } + for (i = 0; i < MAX_NF_CT_PROTO; i++) + proto_array[i] = &nf_conntrack_generic_protocol; + + write_lock_bh(&nf_conntrack_lock); + if (nf_ct_protos[proto->l3proto]) { + /* bad timing, but no problem */ + write_unlock_bh(&nf_conntrack_lock); + kfree(proto_array); + } else { + nf_ct_protos[proto->l3proto] = proto_array; + write_unlock_bh(&nf_conntrack_lock); + } + + /* + * Just once because array is never freed until unloading + * nf_conntrack.ko + */ + goto retry; + } + + nf_ct_protos[proto->l3proto][proto->proto] = proto; + +out_unlock: + write_unlock_bh(&nf_conntrack_lock); +out: + return ret; +} + +void nf_conntrack_protocol_unregister(struct nf_conntrack_protocol *proto) +{ + write_lock_bh(&nf_conntrack_lock); + nf_ct_protos[proto->l3proto][proto->proto] + = &nf_conntrack_generic_protocol; + write_unlock_bh(&nf_conntrack_lock); + + /* Somebody could be still looking at the proto in bh. */ + synchronize_net(); + + /* Remove all contrack entries for this protocol */ + nf_ct_iterate_cleanup(kill_proto, proto); +} + +static int __init init(void) +{ + return init_or_cleanup(1); +} + +static void __exit fini(void) +{ + init_or_cleanup(0); +} + +module_init(init); +module_exit(fini); + +/* Some modules need us, but don't depend directly on any symbol. + They should call this. */ +void need_nf_conntrack(void) +{ +} + +#ifdef CONFIG_NF_CONNTRACK_EVENTS +EXPORT_SYMBOL_GPL(nf_conntrack_chain); +EXPORT_SYMBOL_GPL(nf_conntrack_expect_chain); +EXPORT_SYMBOL_GPL(nf_conntrack_register_notifier); +EXPORT_SYMBOL_GPL(nf_conntrack_unregister_notifier); +EXPORT_SYMBOL_GPL(__nf_ct_event_cache_init); +EXPORT_PER_CPU_SYMBOL_GPL(nf_conntrack_ecache); +EXPORT_SYMBOL_GPL(nf_ct_deliver_cached_events); +#endif +EXPORT_SYMBOL(nf_conntrack_l3proto_register); +EXPORT_SYMBOL(nf_conntrack_l3proto_unregister); +EXPORT_SYMBOL(nf_conntrack_protocol_register); +EXPORT_SYMBOL(nf_conntrack_protocol_unregister); +EXPORT_SYMBOL(nf_ct_invert_tuplepr); +EXPORT_SYMBOL(nf_conntrack_alter_reply); +EXPORT_SYMBOL(nf_conntrack_destroyed); +EXPORT_SYMBOL(need_nf_conntrack); +EXPORT_SYMBOL(nf_conntrack_helper_register); +EXPORT_SYMBOL(nf_conntrack_helper_unregister); +EXPORT_SYMBOL(nf_ct_iterate_cleanup); +EXPORT_SYMBOL(__nf_ct_refresh_acct); +EXPORT_SYMBOL(nf_ct_protos); +EXPORT_SYMBOL(nf_ct_find_proto); +EXPORT_SYMBOL(nf_ct_l3protos); +EXPORT_SYMBOL(nf_conntrack_expect_alloc); +EXPORT_SYMBOL(nf_conntrack_expect_put); +EXPORT_SYMBOL(nf_conntrack_expect_related); +EXPORT_SYMBOL(nf_conntrack_unexpect_related); +EXPORT_SYMBOL(nf_conntrack_tuple_taken); +EXPORT_SYMBOL(nf_conntrack_htable_size); +EXPORT_SYMBOL(nf_conntrack_lock); +EXPORT_SYMBOL(nf_conntrack_hash); +EXPORT_SYMBOL(nf_conntrack_untracked); +EXPORT_SYMBOL_GPL(nf_conntrack_find_get); +#ifdef CONFIG_IP_NF_NAT_NEEDED +EXPORT_SYMBOL(nf_conntrack_tcp_update); +#endif +EXPORT_SYMBOL(__nf_conntrack_confirm); +EXPORT_SYMBOL(nf_ct_get_tuple); +EXPORT_SYMBOL(nf_ct_invert_tuple); +EXPORT_SYMBOL(nf_conntrack_in); +EXPORT_SYMBOL(__nf_conntrack_attach); -- cgit v1.2.3 From bfa83a9e03cf8d501c6272999843470afecb32ed Mon Sep 17 00:00:00 2001 From: Thomas Graf Date: Thu, 10 Nov 2005 02:25:51 +0100 Subject: [NETLINK]: Type-safe netlink messages/attributes interface Introduces a new type-safe interface for netlink message and attributes handling. The interface is fully binary compatible with the old interface towards userspace. Besides type safety, this interface features attribute validation capabilities, simplified message contstruction, and documentation. The resulting netlink code should be smaller, less error prone and easier to understand. Signed-off-by: Thomas Graf Signed-off-by: David S. Miller --- include/linux/netlink.h | 24 +- include/net/netlink.h | 877 ++++++++++++++++++++++++++++++++++++++++++++++++ net/netlink/Makefile | 2 +- net/netlink/attr.c | 328 ++++++++++++++++++ 4 files changed, 1229 insertions(+), 2 deletions(-) create mode 100644 include/net/netlink.h create mode 100644 net/netlink/attr.c (limited to 'net') diff --git a/include/linux/netlink.h b/include/linux/netlink.h index ba25ca874c20..6a2ccf78a356 100644 --- a/include/linux/netlink.h +++ b/include/linux/netlink.h @@ -71,7 +71,8 @@ struct nlmsghdr #define NLMSG_ALIGNTO 4 #define NLMSG_ALIGN(len) ( ((len)+NLMSG_ALIGNTO-1) & ~(NLMSG_ALIGNTO-1) ) -#define NLMSG_LENGTH(len) ((len)+NLMSG_ALIGN(sizeof(struct nlmsghdr))) +#define NLMSG_HDRLEN ((int) NLMSG_ALIGN(sizeof(struct nlmsghdr))) +#define NLMSG_LENGTH(len) ((len)+NLMSG_ALIGN(NLMSG_HDRLEN)) #define NLMSG_SPACE(len) NLMSG_ALIGN(NLMSG_LENGTH(len)) #define NLMSG_DATA(nlh) ((void*)(((char*)nlh) + NLMSG_LENGTH(0))) #define NLMSG_NEXT(nlh,len) ((len) -= NLMSG_ALIGN((nlh)->nlmsg_len), \ @@ -86,6 +87,8 @@ struct nlmsghdr #define NLMSG_DONE 0x3 /* End of a dump */ #define NLMSG_OVERRUN 0x4 /* Data lost */ +#define NLMSG_MIN_TYPE 0x10 /* < 0x10: reserved control messages */ + struct nlmsgerr { int error; @@ -108,6 +111,25 @@ enum { NETLINK_CONNECTED, }; +/* + * <------- NLA_HDRLEN ------> <-- NLA_ALIGN(payload)--> + * +---------------------+- - -+- - - - - - - - - -+- - -+ + * | Header | Pad | Payload | Pad | + * | (struct nlattr) | ing | | ing | + * +---------------------+- - -+- - - - - - - - - -+- - -+ + * <-------------- nlattr->nla_len --------------> + */ + +struct nlattr +{ + __u16 nla_len; + __u16 nla_type; +}; + +#define NLA_ALIGNTO 4 +#define NLA_ALIGN(len) (((len) + NLA_ALIGNTO - 1) & ~(NLA_ALIGNTO - 1)) +#define NLA_HDRLEN ((int) NLA_ALIGN(sizeof(struct nlattr))) + #ifdef __KERNEL__ #include diff --git a/include/net/netlink.h b/include/net/netlink.h new file mode 100644 index 000000000000..c99e22db9632 --- /dev/null +++ b/include/net/netlink.h @@ -0,0 +1,877 @@ +#ifndef __NET_NETLINK_H +#define __NET_NETLINK_H + +#include +#include + +/* ======================================================================== + * Netlink Messages and Attributes Interface (As Seen On TV) + * ------------------------------------------------------------------------ + * Messages Interface + * ------------------------------------------------------------------------ + * + * Message Format: + * <--- nlmsg_total_size(payload) ---> + * <-- nlmsg_msg_size(payload) -> + * +----------+- - -+-------------+- - -+-------- - - + * | nlmsghdr | Pad | Payload | Pad | nlmsghdr + * +----------+- - -+-------------+- - -+-------- - - + * nlmsg_data(nlh)---^ ^ + * nlmsg_next(nlh)-----------------------+ + * + * Payload Format: + * <---------------------- nlmsg_len(nlh) ---------------------> + * <------ hdrlen ------> <- nlmsg_attrlen(nlh, hdrlen) -> + * +----------------------+- - -+--------------------------------+ + * | Family Header | Pad | Attributes | + * +----------------------+- - -+--------------------------------+ + * nlmsg_attrdata(nlh, hdrlen)---^ + * + * Data Structures: + * struct nlmsghdr netlink message header + * + * Message Construction: + * nlmsg_new() create a new netlink message + * nlmsg_put() add a netlink message to an skb + * nlmsg_put_answer() callback based nlmsg_put() + * nlmsg_end() finanlize netlink message + * nlmsg_cancel() cancel message construction + * nlmsg_free() free a netlink message + * + * Message Sending: + * nlmsg_multicast() multicast message to several groups + * nlmsg_unicast() unicast a message to a single socket + * + * Message Length Calculations: + * nlmsg_msg_size(payload) length of message w/o padding + * nlmsg_total_size(payload) length of message w/ padding + * nlmsg_padlen(payload) length of padding at tail + * + * Message Payload Access: + * nlmsg_data(nlh) head of message payload + * nlmsg_len(nlh) length of message payload + * nlmsg_attrdata(nlh, hdrlen) head of attributes data + * nlmsg_attrlen(nlh, hdrlen) length of attributes data + * + * Message Parsing: + * nlmsg_ok(nlh, remaining) does nlh fit into remaining bytes? + * nlmsg_next(nlh, remaining) get next netlink message + * nlmsg_parse() parse attributes of a message + * nlmsg_find_attr() find an attribute in a message + * nlmsg_for_each_msg() loop over all messages + * nlmsg_validate() validate netlink message incl. attrs + * nlmsg_for_each_attr() loop over all attributes + * + * ------------------------------------------------------------------------ + * Attributes Interface + * ------------------------------------------------------------------------ + * + * Attribute Format: + * <------- nla_total_size(payload) -------> + * <---- nla_attr_size(payload) -----> + * +----------+- - -+- - - - - - - - - +- - -+-------- - - + * | Header | Pad | Payload | Pad | Header + * +----------+- - -+- - - - - - - - - +- - -+-------- - - + * <- nla_len(nla) -> ^ + * nla_data(nla)----^ | + * nla_next(nla)-----------------------------' + * + * Data Structures: + * struct nlattr netlink attribtue header + * + * Attribute Construction: + * nla_reserve(skb, type, len) reserve skb tailroom for an attribute + * nla_put(skb, type, len, data) add attribute to skb + * + * Attribute Construction for Basic Types: + * nla_put_u8(skb, type, value) add u8 attribute to skb + * nla_put_u16(skb, type, value) add u16 attribute to skb + * nla_put_u32(skb, type, value) add u32 attribute to skb + * nla_put_u64(skb, type, value) add u64 attribute to skb + * nla_put_string(skb, type, str) add string attribute to skb + * nla_put_flag(skb, type) add flag attribute to skb + * nla_put_msecs(skb, type, jiffies) add msecs attribute to skb + * + * Exceptions Based Attribute Construction: + * NLA_PUT(skb, type, len, data) add attribute to skb + * NLA_PUT_U8(skb, type, value) add u8 attribute to skb + * NLA_PUT_U16(skb, type, value) add u16 attribute to skb + * NLA_PUT_U32(skb, type, value) add u32 attribute to skb + * NLA_PUT_U64(skb, type, value) add u64 attribute to skb + * NLA_PUT_STRING(skb, type, str) add string attribute to skb + * NLA_PUT_FLAG(skb, type) add flag attribute to skb + * NLA_PUT_MSECS(skb, type, jiffies) add msecs attribute to skb + * + * The meaning of these functions is equal to their lower case + * variants but they jump to the label nla_put_failure in case + * of a failure. + * + * Nested Attributes Construction: + * nla_nest_start(skb, type) start a nested attribute + * nla_nest_end(skb, nla) finalize a nested attribute + * nla_nest_cancel(skb, nla) cancel nested attribute construction + * + * Attribute Length Calculations: + * nla_attr_size(payload) length of attribute w/o padding + * nla_total_size(payload) length of attribute w/ padding + * nla_padlen(payload) length of padding + * + * Attribute Payload Access: + * nla_data(nla) head of attribute payload + * nla_len(nla) length of attribute payload + * + * Attribute Payload Access for Basic Types: + * nla_get_u8(nla) get payload for a u8 attribute + * nla_get_u16(nla) get payload for a u16 attribute + * nla_get_u32(nla) get payload for a u32 attribute + * nla_get_u64(nla) get payload for a u64 attribute + * nla_get_flag(nla) return 1 if flag is true + * nla_get_msecs(nla) get payload for a msecs attribute + * + * Attribute Misc: + * nla_memcpy(dest, nla, count) copy attribute into memory + * nla_memcmp(nla, data, size) compare attribute with memory area + * nla_strlcpy(dst, nla, size) copy attribute to a sized string + * nla_strcmp(nla, str) compare attribute with string + * + * Attribute Parsing: + * nla_ok(nla, remaining) does nla fit into remaining bytes? + * nla_next(nla, remaining) get next netlink attribute + * nla_validate() validate a stream of attributes + * nla_find() find attribute in stream of attributes + * nla_parse() parse and validate stream of attrs + * nla_parse_nested() parse nested attribuets + * nla_for_each_attr() loop over all attributes + *========================================================================= + */ + + /** + * Standard attribute types to specify validation policy + */ +enum { + NLA_UNSPEC, + NLA_U8, + NLA_U16, + NLA_U32, + NLA_U64, + NLA_STRING, + NLA_FLAG, + NLA_MSECS, + NLA_NESTED, + __NLA_TYPE_MAX, +}; + +#define NLA_TYPE_MAX (__NLA_TYPE_MAX - 1) + +/** + * struct nla_policy - attribute validation policy + * @type: Type of attribute or NLA_UNSPEC + * @minlen: Minimal length of payload required to be available + * + * Policies are defined as arrays of this struct, the array must be + * accessible by attribute type up to the highest identifier to be expected. + * + * Example: + * static struct nla_policy my_policy[ATTR_MAX+1] __read_mostly = { + * [ATTR_FOO] = { .type = NLA_U16 }, + * [ATTR_BAR] = { .type = NLA_STRING }, + * [ATTR_BAZ] = { .minlen = sizeof(struct mystruct) }, + * }; + */ +struct nla_policy { + u16 type; + u16 minlen; +}; + +extern int nla_validate(struct nlattr *head, int len, int maxtype, + struct nla_policy *policy); +extern int nla_parse(struct nlattr *tb[], int maxtype, + struct nlattr *head, int len, + struct nla_policy *policy); +extern struct nlattr * nla_find(struct nlattr *head, int len, int attrtype); +extern size_t nla_strlcpy(char *dst, const struct nlattr *nla, + size_t dstsize); +extern int nla_memcpy(void *dest, struct nlattr *src, int count); +extern int nla_memcmp(const struct nlattr *nla, const void *data, + size_t size); +extern int nla_strcmp(const struct nlattr *nla, const char *str); +extern struct nlattr * __nla_reserve(struct sk_buff *skb, int attrtype, + int attrlen); +extern struct nlattr * nla_reserve(struct sk_buff *skb, int attrtype, + int attrlen); +extern void __nla_put(struct sk_buff *skb, int attrtype, + int attrlen, const void *data); +extern int nla_put(struct sk_buff *skb, int attrtype, + int attrlen, const void *data); + +/************************************************************************** + * Netlink Messages + **************************************************************************/ + +/** + * nlmsg_msg_size - length of netlink message not including padding + * @payload: length of message payload + */ +static inline int nlmsg_msg_size(int payload) +{ + return NLMSG_HDRLEN + payload; +} + +/** + * nlmsg_total_size - length of netlink message including padding + * @payload: length of message payload + */ +static inline int nlmsg_total_size(int payload) +{ + return NLMSG_ALIGN(nlmsg_msg_size(payload)); +} + +/** + * nlmsg_padlen - length of padding at the message's tail + * @payload: length of message payload + */ +static inline int nlmsg_padlen(int payload) +{ + return nlmsg_total_size(payload) - nlmsg_msg_size(payload); +} + +/** + * nlmsg_data - head of message payload + * @nlh: netlink messsage header + */ +static inline void *nlmsg_data(const struct nlmsghdr *nlh) +{ + return (unsigned char *) nlh + NLMSG_HDRLEN; +} + +/** + * nlmsg_len - length of message payload + * @nlh: netlink message header + */ +static inline int nlmsg_len(const struct nlmsghdr *nlh) +{ + return nlh->nlmsg_len - NLMSG_HDRLEN; +} + +/** + * nlmsg_attrdata - head of attributes data + * @nlh: netlink message header + * @hdrlen: length of family specific header + */ +static inline struct nlattr *nlmsg_attrdata(const struct nlmsghdr *nlh, + int hdrlen) +{ + unsigned char *data = nlmsg_data(nlh); + return (struct nlattr *) (data + NLMSG_ALIGN(hdrlen)); +} + +/** + * nlmsg_attrlen - length of attributes data + * @nlh: netlink message header + * @hdrlen: length of family specific header + */ +static inline int nlmsg_attrlen(const struct nlmsghdr *nlh, int hdrlen) +{ + return nlmsg_len(nlh) - NLMSG_ALIGN(hdrlen); +} + +/** + * nlmsg_ok - check if the netlink message fits into the remaining bytes + * @nlh: netlink message header + * @remaining: number of bytes remaining in message stream + */ +static inline int nlmsg_ok(const struct nlmsghdr *nlh, int remaining) +{ + return (remaining >= sizeof(struct nlmsghdr) && + nlh->nlmsg_len >= sizeof(struct nlmsghdr) && + nlh->nlmsg_len <= remaining); +} + +/** + * nlmsg_next - next netlink message in message stream + * @nlh: netlink message header + * @remaining: number of bytes remaining in message stream + * + * Returns the next netlink message in the message stream and + * decrements remaining by the size of the current message. + */ +static inline struct nlmsghdr *nlmsg_next(struct nlmsghdr *nlh, int *remaining) +{ + int totlen = NLMSG_ALIGN(nlh->nlmsg_len); + + *remaining -= totlen; + + return (struct nlmsghdr *) ((unsigned char *) nlh + totlen); +} + +/** + * nlmsg_parse - parse attributes of a netlink message + * @nlh: netlink message header + * @hdrlen: length of family specific header + * @tb: destination array with maxtype+1 elements + * @maxtype: maximum attribute type to be expected + * @policy: validation policy + * + * See nla_parse() + */ +static inline int nlmsg_parse(struct nlmsghdr *nlh, int hdrlen, + struct nlattr *tb[], int maxtype, + struct nla_policy *policy) +{ + if (nlh->nlmsg_len < nlmsg_msg_size(hdrlen)) + return -EINVAL; + + return nla_parse(tb, maxtype, nlmsg_attrdata(nlh, hdrlen), + nlmsg_attrlen(nlh, hdrlen), policy); +} + +/** + * nlmsg_find_attr - find a specific attribute in a netlink message + * @nlh: netlink message header + * @hdrlen: length of familiy specific header + * @attrtype: type of attribute to look for + * + * Returns the first attribute which matches the specified type. + */ +static inline struct nlattr *nlmsg_find_attr(struct nlmsghdr *nlh, + int hdrlen, int attrtype) +{ + return nla_find(nlmsg_attrdata(nlh, hdrlen), + nlmsg_attrlen(nlh, hdrlen), attrtype); +} + +/** + * nlmsg_validate - validate a netlink message including attributes + * @nlh: netlinket message header + * @hdrlen: length of familiy specific header + * @maxtype: maximum attribute type to be expected + * @policy: validation policy + */ +static inline int nlmsg_validate(struct nlmsghdr *nlh, int hdrlen, int maxtype, + struct nla_policy *policy) +{ + if (nlh->nlmsg_len < nlmsg_msg_size(hdrlen)) + return -EINVAL; + + return nla_validate(nlmsg_attrdata(nlh, hdrlen), + nlmsg_attrlen(nlh, hdrlen), maxtype, policy); +} + +/** + * nlmsg_for_each_attr - iterate over a stream of attributes + * @pos: loop counter, set to current attribute + * @nlh: netlink message header + * @hdrlen: length of familiy specific header + * @rem: initialized to len, holds bytes currently remaining in stream + */ +#define nlmsg_for_each_attr(pos, nlh, hdrlen, rem) \ + nla_for_each_attr(pos, nlmsg_attrdata(nlh, hdrlen), \ + nlmsg_attrlen(nlh, hdrlen), rem) + +#if 0 +/* FIXME: Enable once all users have been converted */ + +/** + * __nlmsg_put - Add a new netlink message to an skb + * @skb: socket buffer to store message in + * @pid: netlink process id + * @seq: sequence number of message + * @type: message type + * @payload: length of message payload + * @flags: message flags + * + * The caller is responsible to ensure that the skb provides enough + * tailroom for both the netlink header and payload. + */ +static inline struct nlmsghdr *__nlmsg_put(struct sk_buff *skb, u32 pid, + u32 seq, int type, int payload, + int flags) +{ + struct nlmsghdr *nlh; + + nlh = (struct nlmsghdr *) skb_put(skb, nlmsg_total_size(payload)); + nlh->nlmsg_type = type; + nlh->nlmsg_len = nlmsg_msg_size(payload); + nlh->nlmsg_flags = flags; + nlh->nlmsg_pid = pid; + nlh->nlmsg_seq = seq; + + memset((unsigned char *) nlmsg_data(nlh) + payload, 0, + nlmsg_padlen(payload)); + + return nlh; +} +#endif + +/** + * nlmsg_put - Add a new netlink message to an skb + * @skb: socket buffer to store message in + * @pid: netlink process id + * @seq: sequence number of message + * @type: message type + * @payload: length of message payload + * @flags: message flags + * + * Returns NULL if the tailroom of the skb is insufficient to store + * the message header and payload. + */ +static inline struct nlmsghdr *nlmsg_put(struct sk_buff *skb, u32 pid, u32 seq, + int type, int payload, int flags) +{ + if (unlikely(skb_tailroom(skb) < nlmsg_total_size(payload))) + return NULL; + + return __nlmsg_put(skb, pid, seq, type, payload, flags); +} + +/** + * nlmsg_put_answer - Add a new callback based netlink message to an skb + * @skb: socket buffer to store message in + * @cb: netlink callback + * @type: message type + * @payload: length of message payload + * @flags: message flags + * + * Returns NULL if the tailroom of the skb is insufficient to store + * the message header and payload. + */ +static inline struct nlmsghdr *nlmsg_put_answer(struct sk_buff *skb, + struct netlink_callback *cb, + int type, int payload, + int flags) +{ + return nlmsg_put(skb, NETLINK_CB(cb->skb).pid, cb->nlh->nlmsg_seq, + type, payload, flags); +} + +/** + * nlmsg_new - Allocate a new netlink message + * @size: maximum size of message + * + * Use NLMSG_GOODSIZE if size isn't know and you need a good default size. + */ +static inline struct sk_buff *nlmsg_new(int size) +{ + return alloc_skb(NLMSG_GOODSIZE, GFP_KERNEL); +} + +/** + * nlmsg_end - Finalize a netlink message + * @skb: socket buffer the message is stored in + * @nlh: netlink message header + * + * Corrects the netlink message header to include the appeneded + * attributes. Only necessary if attributes have been added to + * the message. + * + * Returns the total data length of the skb. + */ +static inline int nlmsg_end(struct sk_buff *skb, struct nlmsghdr *nlh) +{ + nlh->nlmsg_len = skb->tail - (unsigned char *) nlh; + + return skb->len; +} + +/** + * nlmsg_cancel - Cancel construction of a netlink message + * @skb: socket buffer the message is stored in + * @nlh: netlink message header + * + * Removes the complete netlink message including all + * attributes from the socket buffer again. Returns -1. + */ +static inline int nlmsg_cancel(struct sk_buff *skb, struct nlmsghdr *nlh) +{ + skb_trim(skb, (unsigned char *) nlh - skb->data); + + return -1; +} + +/** + * nlmsg_free - free a netlink message + * @skb: socket buffer of netlink message + */ +static inline void nlmsg_free(struct sk_buff *skb) +{ + kfree_skb(skb); +} + +/** + * nlmsg_multicast - multicast a netlink message + * @sk: netlink socket to spread messages to + * @skb: netlink message as socket buffer + * @pid: own netlink pid to avoid sending to yourself + * @group: multicast group id + */ +static inline int nlmsg_multicast(struct sock *sk, struct sk_buff *skb, + u32 pid, unsigned int group) +{ + int err; + + NETLINK_CB(skb).dst_group = group; + + err = netlink_broadcast(sk, skb, pid, group, GFP_KERNEL); + if (err > 0) + err = 0; + + return err; +} + +/** + * nlmsg_unicast - unicast a netlink message + * @sk: netlink socket to spread message to + * @skb: netlink message as socket buffer + * @pid: netlink pid of the destination socket + */ +static inline int nlmsg_unicast(struct sock *sk, struct sk_buff *skb, u32 pid) +{ + int err; + + err = netlink_unicast(sk, skb, pid, MSG_DONTWAIT); + if (err > 0) + err = 0; + + return err; +} + +/** + * nlmsg_for_each_msg - iterate over a stream of messages + * @pos: loop counter, set to current message + * @head: head of message stream + * @len: length of message stream + * @rem: initialized to len, holds bytes currently remaining in stream + */ +#define nlmsg_for_each_msg(pos, head, len, rem) \ + for (pos = head, rem = len; \ + nlmsg_ok(pos, rem); \ + pos = nlmsg_next(pos, &(rem))) + +/************************************************************************** + * Netlink Attributes + **************************************************************************/ + +/** + * nla_attr_size - length of attribute not including padding + * @payload: length of payload + */ +static inline int nla_attr_size(int payload) +{ + return NLA_HDRLEN + payload; +} + +/** + * nla_total_size - total length of attribute including padding + * @payload: length of payload + */ +static inline int nla_total_size(int payload) +{ + return NLA_ALIGN(nla_attr_size(payload)); +} + +/** + * nla_padlen - length of padding at the tail of attribute + * @payload: length of payload + */ +static inline int nla_padlen(int payload) +{ + return nla_total_size(payload) - nla_attr_size(payload); +} + +/** + * nla_data - head of payload + * @nla: netlink attribute + */ +static inline void *nla_data(const struct nlattr *nla) +{ + return (char *) nla + NLA_HDRLEN; +} + +/** + * nla_len - length of payload + * @nla: netlink attribute + */ +static inline int nla_len(const struct nlattr *nla) +{ + return nla->nla_len - NLA_HDRLEN; +} + +/** + * nla_ok - check if the netlink attribute fits into the remaining bytes + * @nla: netlink attribute + * @remaining: number of bytes remaining in attribute stream + */ +static inline int nla_ok(const struct nlattr *nla, int remaining) +{ + return remaining >= sizeof(*nla) && + nla->nla_len >= sizeof(*nla) && + nla->nla_len <= remaining; +} + +/** + * nla_next - next netlink attribte in attribute stream + * @nla: netlink attribute + * @remaining: number of bytes remaining in attribute stream + * + * Returns the next netlink attribute in the attribute stream and + * decrements remaining by the size of the current attribute. + */ +static inline struct nlattr *nla_next(const struct nlattr *nla, int *remaining) +{ + int totlen = NLA_ALIGN(nla->nla_len); + + *remaining -= totlen; + return (struct nlattr *) ((char *) nla + totlen); +} + +/** + * nla_parse_nested - parse nested attributes + * @tb: destination array with maxtype+1 elements + * @maxtype: maximum attribute type to be expected + * @nla: attribute containing the nested attributes + * @policy: validation policy + * + * See nla_parse() + */ +static inline int nla_parse_nested(struct nlattr *tb[], int maxtype, + struct nlattr *nla, + struct nla_policy *policy) +{ + return nla_parse(tb, maxtype, nla_data(nla), nla_len(nla), policy); +} +/** + * nla_put_u8 - Add a u16 netlink attribute to a socket buffer + * @skb: socket buffer to add attribute to + * @attrtype: attribute type + * @value: numeric value + */ +static inline int nla_put_u8(struct sk_buff *skb, int attrtype, u8 value) +{ + return nla_put(skb, attrtype, sizeof(u8), &value); +} + +/** + * nla_put_u16 - Add a u16 netlink attribute to a socket buffer + * @skb: socket buffer to add attribute to + * @attrtype: attribute type + * @value: numeric value + */ +static inline int nla_put_u16(struct sk_buff *skb, int attrtype, u16 value) +{ + return nla_put(skb, attrtype, sizeof(u16), &value); +} + +/** + * nla_put_u32 - Add a u32 netlink attribute to a socket buffer + * @skb: socket buffer to add attribute to + * @attrtype: attribute type + * @value: numeric value + */ +static inline int nla_put_u32(struct sk_buff *skb, int attrtype, u32 value) +{ + return nla_put(skb, attrtype, sizeof(u32), &value); +} + +/** + * nla_put_64 - Add a u64 netlink attribute to a socket buffer + * @skb: socket buffer to add attribute to + * @attrtype: attribute type + * @value: numeric value + */ +static inline int nla_put_u64(struct sk_buff *skb, int attrtype, u64 value) +{ + return nla_put(skb, attrtype, sizeof(u64), &value); +} + +/** + * nla_put_string - Add a string netlink attribute to a socket buffer + * @skb: socket buffer to add attribute to + * @attrtype: attribute type + * @str: NUL terminated string + */ +static inline int nla_put_string(struct sk_buff *skb, int attrtype, + const char *str) +{ + return nla_put(skb, attrtype, strlen(str) + 1, str); +} + +/** + * nla_put_flag - Add a flag netlink attribute to a socket buffer + * @skb: socket buffer to add attribute to + * @attrtype: attribute type + */ +static inline int nla_put_flag(struct sk_buff *skb, int attrtype) +{ + return nla_put(skb, attrtype, 0, NULL); +} + +/** + * nla_put_msecs - Add a msecs netlink attribute to a socket buffer + * @skb: socket buffer to add attribute to + * @attrtype: attribute type + * @jiffies: number of msecs in jiffies + */ +static inline int nla_put_msecs(struct sk_buff *skb, int attrtype, + unsigned long jiffies) +{ + u64 tmp = jiffies_to_msecs(jiffies); + return nla_put(skb, attrtype, sizeof(u64), &tmp); +} + +#define NLA_PUT(skb, attrtype, attrlen, data) \ + do { \ + if (nla_put(skb, attrtype, attrlen, data) < 0) \ + goto nla_put_failure; \ + } while(0) + +#define NLA_PUT_TYPE(skb, type, attrtype, value) \ + do { \ + type __tmp = value; \ + NLA_PUT(skb, attrtype, sizeof(type), &__tmp); \ + } while(0) + +#define NLA_PUT_U8(skb, attrtype, value) \ + NLA_PUT_TYPE(skb, u8, attrtype, value) + +#define NLA_PUT_U16(skb, attrtype, value) \ + NLA_PUT_TYPE(skb, u16, attrtype, value) + +#define NLA_PUT_U32(skb, attrtype, value) \ + NLA_PUT_TYPE(skb, u32, attrtype, value) + +#define NLA_PUT_U64(skb, attrtype, value) \ + NLA_PUT_TYPE(skb, u64, attrtype, value) + +#define NLA_PUT_STRING(skb, attrtype, value) \ + NLA_PUT(skb, attrtype, strlen(value) + 1, value) + +#define NLA_PUT_FLAG(skb, attrtype, value) \ + NLA_PUT(skb, attrtype, 0, NULL) + +#define NLA_PUT_MSECS(skb, attrtype, jiffies) \ + NLA_PUT_U64(skb, attrtype, jiffies_to_msecs(jiffies)) + +/** + * nla_get_u32 - return payload of u32 attribute + * @nla: u32 netlink attribute + */ +static inline u32 nla_get_u32(struct nlattr *nla) +{ + return *(u32 *) nla_data(nla); +} + +/** + * nla_get_u16 - return payload of u16 attribute + * @nla: u16 netlink attribute + */ +static inline u16 nla_get_u16(struct nlattr *nla) +{ + return *(u16 *) nla_data(nla); +} + +/** + * nla_get_u8 - return payload of u8 attribute + * @nla: u8 netlink attribute + */ +static inline u8 nla_get_u8(struct nlattr *nla) +{ + return *(u8 *) nla_data(nla); +} + +/** + * nla_get_u64 - return payload of u64 attribute + * @nla: u64 netlink attribute + */ +static inline u64 nla_get_u64(struct nlattr *nla) +{ + u64 tmp; + + nla_memcpy(&tmp, nla, sizeof(tmp)); + + return tmp; +} + +/** + * nla_get_flag - return payload of flag attribute + * @nla: flag netlink attribute + */ +static inline int nla_get_flag(struct nlattr *nla) +{ + return !!nla; +} + +/** + * nla_get_msecs - return payload of msecs attribute + * @nla: msecs netlink attribute + * + * Returns the number of milliseconds in jiffies. + */ +static inline unsigned long nla_get_msecs(struct nlattr *nla) +{ + u64 msecs = nla_get_u64(nla); + + return msecs_to_jiffies((unsigned long) msecs); +} + +/** + * nla_nest_start - Start a new level of nested attributes + * @skb: socket buffer to add attributes to + * @attrtype: attribute type of container + * + * Returns the container attribute + */ +static inline struct nlattr *nla_nest_start(struct sk_buff *skb, int attrtype) +{ + struct nlattr *start = (struct nlattr *) skb->tail; + + if (nla_put(skb, attrtype, 0, NULL) < 0) + return NULL; + + return start; +} + +/** + * nla_nest_end - Finalize nesting of attributes + * @skb: socket buffer the attribtues are stored in + * @start: container attribute + * + * Corrects the container attribute header to include the all + * appeneded attributes. + * + * Returns the total data length of the skb. + */ +static inline int nla_nest_end(struct sk_buff *skb, struct nlattr *start) +{ + start->nla_len = skb->tail - (unsigned char *) start; + return skb->len; +} + +/** + * nla_nest_cancel - Cancel nesting of attributes + * @skb: socket buffer the message is stored in + * @start: container attribute + * + * Removes the container attribute and including all nested + * attributes. Returns -1. + */ +static inline int nla_nest_cancel(struct sk_buff *skb, struct nlattr *start) +{ + if (start) + skb_trim(skb, (unsigned char *) start - skb->data); + + return -1; +} + +/** + * nla_for_each_attr - iterate over a stream of attributes + * @pos: loop counter, set to current attribute + * @head: head of attribute stream + * @len: length of attribute stream + * @rem: initialized to len, holds bytes currently remaining in stream + */ +#define nla_for_each_attr(pos, head, len, rem) \ + for (pos = head, rem = len; \ + nla_ok(pos, rem); \ + pos = nla_next(pos, &(rem))) + +#endif diff --git a/net/netlink/Makefile b/net/netlink/Makefile index 39d9c2dcd03c..68179015bd39 100644 --- a/net/netlink/Makefile +++ b/net/netlink/Makefile @@ -2,4 +2,4 @@ # Makefile for the netlink driver. # -obj-y := af_netlink.o +obj-y := af_netlink.o attr.o diff --git a/net/netlink/attr.c b/net/netlink/attr.c new file mode 100644 index 000000000000..fffef4ab276f --- /dev/null +++ b/net/netlink/attr.c @@ -0,0 +1,328 @@ +/* + * NETLINK Netlink attributes + * + * Authors: Thomas Graf + * Alexey Kuznetsov + */ + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +static u16 nla_attr_minlen[NLA_TYPE_MAX+1] __read_mostly = { + [NLA_U8] = sizeof(u8), + [NLA_U16] = sizeof(u16), + [NLA_U32] = sizeof(u32), + [NLA_U64] = sizeof(u64), + [NLA_STRING] = 1, + [NLA_NESTED] = NLA_HDRLEN, +}; + +static int validate_nla(struct nlattr *nla, int maxtype, + struct nla_policy *policy) +{ + struct nla_policy *pt; + int minlen = 0; + + if (nla->nla_type <= 0 || nla->nla_type > maxtype) + return 0; + + pt = &policy[nla->nla_type]; + + BUG_ON(pt->type > NLA_TYPE_MAX); + + if (pt->minlen) + minlen = pt->minlen; + else if (pt->type != NLA_UNSPEC) + minlen = nla_attr_minlen[pt->type]; + + if (pt->type == NLA_FLAG && nla_len(nla) > 0) + return -ERANGE; + + if (nla_len(nla) < minlen) + return -ERANGE; + + return 0; +} + +/** + * nla_validate - Validate a stream of attributes + * @head: head of attribute stream + * @len: length of attribute stream + * @maxtype: maximum attribute type to be expected + * @policy: validation policy + * + * Validates all attributes in the specified attribute stream against the + * specified policy. Attributes with a type exceeding maxtype will be + * ignored. See documenation of struct nla_policy for more details. + * + * Returns 0 on success or a negative error code. + */ +int nla_validate(struct nlattr *head, int len, int maxtype, + struct nla_policy *policy) +{ + struct nlattr *nla; + int rem, err; + + nla_for_each_attr(nla, head, len, rem) { + err = validate_nla(nla, maxtype, policy); + if (err < 0) + goto errout; + } + + err = 0; +errout: + return err; +} + +/** + * nla_parse - Parse a stream of attributes into a tb buffer + * @tb: destination array with maxtype+1 elements + * @maxtype: maximum attribute type to be expected + * @head: head of attribute stream + * @len: length of attribute stream + * + * Parses a stream of attributes and stores a pointer to each attribute in + * the tb array accessable via the attribute type. Attributes with a type + * exceeding maxtype will be silently ignored for backwards compatibility + * reasons. policy may be set to NULL if no validation is required. + * + * Returns 0 on success or a negative error code. + */ +int nla_parse(struct nlattr *tb[], int maxtype, struct nlattr *head, int len, + struct nla_policy *policy) +{ + struct nlattr *nla; + int rem, err; + + memset(tb, 0, sizeof(struct nlattr *) * (maxtype + 1)); + + nla_for_each_attr(nla, head, len, rem) { + u16 type = nla->nla_type; + + if (type > 0 && type <= maxtype) { + if (policy) { + err = validate_nla(nla, maxtype, policy); + if (err < 0) + goto errout; + } + + tb[type] = nla; + } + } + + if (unlikely(rem > 0)) + printk(KERN_WARNING "netlink: %d bytes leftover after parsing " + "attributes.\n", rem); + + err = 0; +errout: + return err; +} + +/** + * nla_find - Find a specific attribute in a stream of attributes + * @head: head of attribute stream + * @len: length of attribute stream + * @attrtype: type of attribute to look for + * + * Returns the first attribute in the stream matching the specified type. + */ +struct nlattr *nla_find(struct nlattr *head, int len, int attrtype) +{ + struct nlattr *nla; + int rem; + + nla_for_each_attr(nla, head, len, rem) + if (nla->nla_type == attrtype) + return nla; + + return NULL; +} + +/** + * nla_strlcpy - Copy string attribute payload into a sized buffer + * @dst: where to copy the string to + * @src: attribute to copy the string from + * @dstsize: size of destination buffer + * + * Copies at most dstsize - 1 bytes into the destination buffer. + * The result is always a valid NUL-terminated string. Unlike + * strlcpy the destination buffer is always padded out. + * + * Returns the length of the source buffer. + */ +size_t nla_strlcpy(char *dst, const struct nlattr *nla, size_t dstsize) +{ + size_t srclen = nla_len(nla); + char *src = nla_data(nla); + + if (srclen > 0 && src[srclen - 1] == '\0') + srclen--; + + if (dstsize > 0) { + size_t len = (srclen >= dstsize) ? dstsize - 1 : srclen; + + memset(dst, 0, dstsize); + memcpy(dst, src, len); + } + + return srclen; +} + +/** + * nla_memcpy - Copy a netlink attribute into another memory area + * @dest: where to copy to memcpy + * @src: netlink attribute to copy from + * @count: size of the destination area + * + * Note: The number of bytes copied is limited by the length of + * attribute's payload. memcpy + * + * Returns the number of bytes copied. + */ +int nla_memcpy(void *dest, struct nlattr *src, int count) +{ + int minlen = min_t(int, count, nla_len(src)); + + memcpy(dest, nla_data(src), minlen); + + return minlen; +} + +/** + * nla_memcmp - Compare an attribute with sized memory area + * @nla: netlink attribute + * @data: memory area + * @size: size of memory area + */ +int nla_memcmp(const struct nlattr *nla, const void *data, + size_t size) +{ + int d = nla_len(nla) - size; + + if (d == 0) + d = memcmp(nla_data(nla), data, size); + + return d; +} + +/** + * nla_strcmp - Compare a string attribute against a string + * @nla: netlink string attribute + * @str: another string + */ +int nla_strcmp(const struct nlattr *nla, const char *str) +{ + int len = strlen(str) + 1; + int d = nla_len(nla) - len; + + if (d == 0) + d = memcmp(nla_data(nla), str, len); + + return d; +} + +/** + * __nla_reserve - reserve room for attribute on the skb + * @skb: socket buffer to reserve room on + * @attrtype: attribute type + * @attrlen: length of attribute payload + * + * Adds a netlink attribute header to a socket buffer and reserves + * room for the payload but does not copy it. + * + * The caller is responsible to ensure that the skb provides enough + * tailroom for the attribute header and payload. + */ +struct nlattr *__nla_reserve(struct sk_buff *skb, int attrtype, int attrlen) +{ + struct nlattr *nla; + + nla = (struct nlattr *) skb_put(skb, nla_total_size(attrlen)); + nla->nla_type = attrtype; + nla->nla_len = nla_attr_size(attrlen); + + memset((unsigned char *) nla + nla->nla_len, 0, nla_padlen(attrlen)); + + return nla; +} + +/** + * nla_reserve - reserve room for attribute on the skb + * @skb: socket buffer to reserve room on + * @attrtype: attribute type + * @attrlen: length of attribute payload + * + * Adds a netlink attribute header to a socket buffer and reserves + * room for the payload but does not copy it. + * + * Returns NULL if the tailroom of the skb is insufficient to store + * the attribute header and payload. + */ +struct nlattr *nla_reserve(struct sk_buff *skb, int attrtype, int attrlen) +{ + if (unlikely(skb_tailroom(skb) < nla_total_size(attrlen))) + return NULL; + + return __nla_reserve(skb, attrtype, attrlen); +} + +/** + * __nla_put - Add a netlink attribute to a socket buffer + * @skb: socket buffer to add attribute to + * @attrtype: attribute type + * @attrlen: length of attribute payload + * @data: head of attribute payload + * + * The caller is responsible to ensure that the skb provides enough + * tailroom for the attribute header and payload. + */ +void __nla_put(struct sk_buff *skb, int attrtype, int attrlen, + const void *data) +{ + struct nlattr *nla; + + nla = __nla_reserve(skb, attrtype, attrlen); + memcpy(nla_data(nla), data, attrlen); +} + + +/** + * nla_put - Add a netlink attribute to a socket buffer + * @skb: socket buffer to add attribute to + * @attrtype: attribute type + * @attrlen: length of attribute payload + * @data: head of attribute payload + * + * Returns -1 if the tailroom of the skb is insufficient to store + * the attribute header and payload. + */ +int nla_put(struct sk_buff *skb, int attrtype, int attrlen, const void *data) +{ + if (unlikely(skb_tailroom(skb) < nla_total_size(attrlen))) + return -1; + + __nla_put(skb, attrtype, attrlen, data); + return 0; +} + + +EXPORT_SYMBOL(nla_validate); +EXPORT_SYMBOL(nla_parse); +EXPORT_SYMBOL(nla_find); +EXPORT_SYMBOL(nla_strlcpy); +EXPORT_SYMBOL(__nla_reserve); +EXPORT_SYMBOL(nla_reserve); +EXPORT_SYMBOL(__nla_put); +EXPORT_SYMBOL(nla_put); +EXPORT_SYMBOL(nla_memcpy); +EXPORT_SYMBOL(nla_memcmp); +EXPORT_SYMBOL(nla_strcmp); -- cgit v1.2.3 From a8f74b228826eef1cbe04a05647d61e896f5fd63 Mon Sep 17 00:00:00 2001 From: Thomas Graf Date: Thu, 10 Nov 2005 02:25:52 +0100 Subject: [NETLINK]: Make netlink_callback->done() optional Most netlink families make no use of the done() callback, making it optional gets rid of all unnecessary dummy implementations. Signed-off-by: Thomas Graf Signed-off-by: David S. Miller --- net/core/rtnetlink.c | 8 +------- net/ipv4/inet_diag.c | 9 +-------- net/ipv6/route.c | 2 +- net/netlink/af_netlink.c | 6 ++++-- net/xfrm/xfrm_user.c | 8 +------- 5 files changed, 8 insertions(+), 25 deletions(-) (limited to 'net') diff --git a/net/core/rtnetlink.c b/net/core/rtnetlink.c index 9bed7569ce3f..193fd8637f9f 100644 --- a/net/core/rtnetlink.c +++ b/net/core/rtnetlink.c @@ -462,11 +462,6 @@ void rtmsg_ifinfo(int type, struct net_device *dev, unsigned change) netlink_broadcast(rtnl, skb, 0, RTNLGRP_LINK, GFP_KERNEL); } -static int rtnetlink_done(struct netlink_callback *cb) -{ - return 0; -} - /* Protected by RTNL sempahore. */ static struct rtattr **rta_buf; static int rtattr_max; @@ -533,8 +528,7 @@ rtnetlink_rcv_msg(struct sk_buff *skb, struct nlmsghdr *nlh, int *errp) goto err_inval; if ((*errp = netlink_dump_start(rtnl, skb, nlh, - link->dumpit, - rtnetlink_done)) != 0) { + link->dumpit, NULL)) != 0) { return -1; } rlen = NLMSG_ALIGN(nlh->nlmsg_len); diff --git a/net/ipv4/inet_diag.c b/net/ipv4/inet_diag.c index 71f3c7350c6e..39061ed53cfd 100644 --- a/net/ipv4/inet_diag.c +++ b/net/ipv4/inet_diag.c @@ -724,12 +724,6 @@ done: return skb->len; } -static int inet_diag_dump_done(struct netlink_callback *cb) -{ - return 0; -} - - static __inline__ int inet_diag_rcv_msg(struct sk_buff *skb, struct nlmsghdr *nlh) { @@ -760,8 +754,7 @@ inet_diag_rcv_msg(struct sk_buff *skb, struct nlmsghdr *nlh) goto err_inval; } return netlink_dump_start(idiagnl, skb, nlh, - inet_diag_dump, - inet_diag_dump_done); + inet_diag_dump, NULL); } else { return inet_diag_get_exact(skb, nlh); } diff --git a/net/ipv6/route.c b/net/ipv6/route.c index 227e99ed510c..f7f42c3e96cb 100644 --- a/net/ipv6/route.c +++ b/net/ipv6/route.c @@ -1710,7 +1710,7 @@ static void fib6_dump_end(struct netlink_callback *cb) static int fib6_dump_done(struct netlink_callback *cb) { fib6_dump_end(cb); - return cb->done(cb); + return cb->done ? cb->done(cb) : 0; } int inet6_dump_fib(struct sk_buff *skb, struct netlink_callback *cb) diff --git a/net/netlink/af_netlink.c b/net/netlink/af_netlink.c index 5ca283537bc6..f3fb7e575816 100644 --- a/net/netlink/af_netlink.c +++ b/net/netlink/af_netlink.c @@ -427,7 +427,8 @@ static int netlink_release(struct socket *sock) spin_lock(&nlk->cb_lock); if (nlk->cb) { - nlk->cb->done(nlk->cb); + if (nlk->cb->done) + nlk->cb->done(nlk->cb); netlink_destroy_callback(nlk->cb); nlk->cb = NULL; } @@ -1322,7 +1323,8 @@ static int netlink_dump(struct sock *sk) skb_queue_tail(&sk->sk_receive_queue, skb); sk->sk_data_ready(sk, skb->len); - cb->done(cb); + if (cb->done) + cb->done(cb); nlk->cb = NULL; spin_unlock(&nlk->cb_lock); diff --git a/net/xfrm/xfrm_user.c b/net/xfrm/xfrm_user.c index c35336a0f71b..6996b7e85665 100644 --- a/net/xfrm/xfrm_user.c +++ b/net/xfrm/xfrm_user.c @@ -948,11 +948,6 @@ static struct xfrm_link { [XFRM_MSG_FLUSHPOLICY - XFRM_MSG_BASE] = { .doit = xfrm_flush_policy }, }; -static int xfrm_done(struct netlink_callback *cb) -{ - return 0; -} - static int xfrm_user_rcv_msg(struct sk_buff *skb, struct nlmsghdr *nlh, int *errp) { struct rtattr *xfrma[XFRMA_MAX]; @@ -990,8 +985,7 @@ static int xfrm_user_rcv_msg(struct sk_buff *skb, struct nlmsghdr *nlh, int *err goto err_einval; if ((*errp = netlink_dump_start(xfrm_nl, skb, nlh, - link->dump, - xfrm_done)) != 0) { + link->dump, NULL)) != 0) { return -1; } rlen = NLMSG_ALIGN(nlh->nlmsg_len); -- cgit v1.2.3 From 82ace47a7256fd39d370a6442e0649f75961b831 Mon Sep 17 00:00:00 2001 From: Thomas Graf Date: Thu, 10 Nov 2005 02:25:53 +0100 Subject: [NETLINK]: Generic netlink receive queue processor Introduces netlink_run_queue() to handle the receive queue of a netlink socket in a generic way. Processes as much as there was in the queue upon entry and invokes a callback function for each netlink message found. The callback function may refuse a message by returning a negative error code but setting the error pointer to 0 in which case netlink_run_queue() will return with a qlen != 0. Signed-off-by: Thomas Graf Signed-off-by: David S. Miller --- include/net/netlink.h | 6 ++++ net/netlink/af_netlink.c | 91 ++++++++++++++++++++++++++++++++++++++++++++++++ 2 files changed, 97 insertions(+) (limited to 'net') diff --git a/include/net/netlink.h b/include/net/netlink.h index c99e22db9632..640c26a90cf1 100644 --- a/include/net/netlink.h +++ b/include/net/netlink.h @@ -183,6 +183,12 @@ struct nla_policy { u16 minlen; }; +extern void netlink_run_queue(struct sock *sk, unsigned int *qlen, + int (*cb)(struct sk_buff *, + struct nlmsghdr *, int *)); +extern void netlink_queue_skip(struct nlmsghdr *nlh, + struct sk_buff *skb); + extern int nla_validate(struct nlattr *head, int len, int maxtype, struct nla_policy *policy); extern int nla_parse(struct nlattr *tb[], int maxtype, diff --git a/net/netlink/af_netlink.c b/net/netlink/af_netlink.c index f3fb7e575816..8c38ee6d255e 100644 --- a/net/netlink/af_netlink.c +++ b/net/netlink/af_netlink.c @@ -58,6 +58,7 @@ #include #include +#include #define Nprintk(a...) #define NLGRPSZ(x) (ALIGN(x, sizeof(unsigned long) * 8) / 8) @@ -1411,6 +1412,94 @@ void netlink_ack(struct sk_buff *in_skb, struct nlmsghdr *nlh, int err) netlink_unicast(in_skb->sk, skb, NETLINK_CB(in_skb).pid, MSG_DONTWAIT); } +static int netlink_rcv_skb(struct sk_buff *skb, int (*cb)(struct sk_buff *, + struct nlmsghdr *, int *)) +{ + unsigned int total_len; + struct nlmsghdr *nlh; + int err; + + while (skb->len >= nlmsg_total_size(0)) { + nlh = (struct nlmsghdr *) skb->data; + + if (skb->len < nlh->nlmsg_len) + return 0; + + total_len = min(NLMSG_ALIGN(nlh->nlmsg_len), skb->len); + + if (cb(skb, nlh, &err) < 0) { + /* Not an error, but we have to interrupt processing + * here. Note: that in this case we do not pull + * message from skb, it will be processed later. + */ + if (err == 0) + return -1; + netlink_ack(skb, nlh, err); + } else if (nlh->nlmsg_flags & NLM_F_ACK) + netlink_ack(skb, nlh, 0); + + skb_pull(skb, total_len); + } + + return 0; +} + +/** + * nelink_run_queue - Process netlink receive queue. + * @sk: Netlink socket containing the queue + * @qlen: Place to store queue length upon entry + * @cb: Callback function invoked for each netlink message found + * + * Processes as much as there was in the queue upon entry and invokes + * a callback function for each netlink message found. The callback + * function may refuse a message by returning a negative error code + * but setting the error pointer to 0 in which case this function + * returns with a qlen != 0. + * + * qlen must be initialized to 0 before the initial entry, afterwards + * the function may be called repeatedly until qlen reaches 0. + */ +void netlink_run_queue(struct sock *sk, unsigned int *qlen, + int (*cb)(struct sk_buff *, struct nlmsghdr *, int *)) +{ + struct sk_buff *skb; + + if (!*qlen || *qlen > skb_queue_len(&sk->sk_receive_queue)) + *qlen = skb_queue_len(&sk->sk_receive_queue); + + for (; *qlen; (*qlen)--) { + skb = skb_dequeue(&sk->sk_receive_queue); + if (netlink_rcv_skb(skb, cb)) { + if (skb->len) + skb_queue_head(&sk->sk_receive_queue, skb); + else { + kfree_skb(skb); + (*qlen)--; + } + break; + } + + kfree_skb(skb); + } +} + +/** + * netlink_queue_skip - Skip netlink message while processing queue. + * @nlh: Netlink message to be skipped + * @skb: Socket buffer containing the netlink messages. + * + * Pulls the given netlink message off the socket buffer so the next + * call to netlink_queue_run() will not reconsider the message. + */ +void netlink_queue_skip(struct nlmsghdr *nlh, struct sk_buff *skb) +{ + int msglen = NLMSG_ALIGN(nlh->nlmsg_len); + + if (msglen > skb->len) + msglen = skb->len; + + skb_pull(skb, msglen); +} #ifdef CONFIG_PROC_FS struct nl_seq_iter { @@ -1659,6 +1748,8 @@ out: core_initcall(netlink_proto_init); EXPORT_SYMBOL(netlink_ack); +EXPORT_SYMBOL(netlink_run_queue); +EXPORT_SYMBOL(netlink_queue_skip); EXPORT_SYMBOL(netlink_broadcast); EXPORT_SYMBOL(netlink_dump_start); EXPORT_SYMBOL(netlink_kernel_create); -- cgit v1.2.3 From 88fc2c84312d095545c08a9f871ad1888a688cf6 Mon Sep 17 00:00:00 2001 From: Thomas Graf Date: Thu, 10 Nov 2005 02:25:54 +0100 Subject: [XFRM]: Use generic netlink receive queue processor Signed-off-by: Thomas Graf Signed-off-by: David S. Miller --- net/xfrm/xfrm_user.c | 61 +++++----------------------------------------------- 1 file changed, 5 insertions(+), 56 deletions(-) (limited to 'net') diff --git a/net/xfrm/xfrm_user.c b/net/xfrm/xfrm_user.c index 6996b7e85665..0cdd9a07e043 100644 --- a/net/xfrm/xfrm_user.c +++ b/net/xfrm/xfrm_user.c @@ -18,7 +18,6 @@ #include #include #include -#include #include #include #include @@ -26,6 +25,7 @@ #include #include #include +#include #include static struct sock *xfrm_nl; @@ -979,8 +979,6 @@ static int xfrm_user_rcv_msg(struct sk_buff *skb, struct nlmsghdr *nlh, int *err if ((type == (XFRM_MSG_GETSA - XFRM_MSG_BASE) || type == (XFRM_MSG_GETPOLICY - XFRM_MSG_BASE)) && (nlh->nlmsg_flags & NLM_F_DUMP)) { - u32 rlen; - if (link->dump == NULL) goto err_einval; @@ -988,10 +986,8 @@ static int xfrm_user_rcv_msg(struct sk_buff *skb, struct nlmsghdr *nlh, int *err link->dump, NULL)) != 0) { return -1; } - rlen = NLMSG_ALIGN(nlh->nlmsg_len); - if (rlen > skb->len) - rlen = skb->len; - skb_pull(skb, rlen); + + netlink_queue_skip(nlh, skb); return -1; } @@ -1026,60 +1022,13 @@ err_einval: return -1; } -static int xfrm_user_rcv_skb(struct sk_buff *skb) -{ - int err; - struct nlmsghdr *nlh; - - while (skb->len >= NLMSG_SPACE(0)) { - u32 rlen; - - nlh = (struct nlmsghdr *) skb->data; - if (nlh->nlmsg_len < sizeof(*nlh) || - skb->len < nlh->nlmsg_len) - return 0; - rlen = NLMSG_ALIGN(nlh->nlmsg_len); - if (rlen > skb->len) - rlen = skb->len; - if (xfrm_user_rcv_msg(skb, nlh, &err) < 0) { - if (err == 0) - return -1; - netlink_ack(skb, nlh, err); - } else if (nlh->nlmsg_flags & NLM_F_ACK) - netlink_ack(skb, nlh, 0); - skb_pull(skb, rlen); - } - - return 0; -} - static void xfrm_netlink_rcv(struct sock *sk, int len) { - unsigned int qlen = skb_queue_len(&sk->sk_receive_queue); + unsigned int qlen = 0; do { - struct sk_buff *skb; - down(&xfrm_cfg_sem); - - if (qlen > skb_queue_len(&sk->sk_receive_queue)) - qlen = skb_queue_len(&sk->sk_receive_queue); - - for (; qlen; qlen--) { - skb = skb_dequeue(&sk->sk_receive_queue); - if (xfrm_user_rcv_skb(skb)) { - if (skb->len) - skb_queue_head(&sk->sk_receive_queue, - skb); - else { - kfree_skb(skb); - qlen--; - } - break; - } - kfree_skb(skb); - } - + netlink_run_queue(sk, &qlen, &xfrm_user_rcv_msg); up(&xfrm_cfg_sem); } while (qlen); -- cgit v1.2.3 From 9ac4a16983ea4edf719c390a1a234d956947688d Mon Sep 17 00:00:00 2001 From: Thomas Graf Date: Thu, 10 Nov 2005 02:25:55 +0100 Subject: [RTNETLINK]: Use generic netlink receive queue processor Signed-off-by: Thomas Graf Signed-off-by: David S. Miller --- net/core/rtnetlink.c | 75 ++++------------------------------------------------ 1 file changed, 5 insertions(+), 70 deletions(-) (limited to 'net') diff --git a/net/core/rtnetlink.c b/net/core/rtnetlink.c index 193fd8637f9f..8700379685e0 100644 --- a/net/core/rtnetlink.c +++ b/net/core/rtnetlink.c @@ -49,6 +49,7 @@ #include #include #include +#include DECLARE_MUTEX(rtnl_sem); @@ -519,8 +520,6 @@ rtnetlink_rcv_msg(struct sk_buff *skb, struct nlmsghdr *nlh, int *errp) } if (kind == 2 && nlh->nlmsg_flags&NLM_F_DUMP) { - u32 rlen; - if (link->dumpit == NULL) link = &(rtnetlink_links[PF_UNSPEC][type]); @@ -531,10 +530,8 @@ rtnetlink_rcv_msg(struct sk_buff *skb, struct nlmsghdr *nlh, int *errp) link->dumpit, NULL)) != 0) { return -1; } - rlen = NLMSG_ALIGN(nlh->nlmsg_len); - if (rlen > skb->len) - rlen = skb->len; - skb_pull(skb, rlen); + + netlink_queue_skip(nlh, skb); return -1; } @@ -573,75 +570,13 @@ err_inval: return -1; } -/* - * Process one packet of messages. - * Malformed skbs with wrong lengths of messages are discarded silently. - */ - -static inline int rtnetlink_rcv_skb(struct sk_buff *skb) -{ - int err; - struct nlmsghdr * nlh; - - while (skb->len >= NLMSG_SPACE(0)) { - u32 rlen; - - nlh = (struct nlmsghdr *)skb->data; - if (nlh->nlmsg_len < sizeof(*nlh) || skb->len < nlh->nlmsg_len) - return 0; - rlen = NLMSG_ALIGN(nlh->nlmsg_len); - if (rlen > skb->len) - rlen = skb->len; - if (rtnetlink_rcv_msg(skb, nlh, &err)) { - /* Not error, but we must interrupt processing here: - * Note, that in this case we do not pull message - * from skb, it will be processed later. - */ - if (err == 0) - return -1; - netlink_ack(skb, nlh, err); - } else if (nlh->nlmsg_flags&NLM_F_ACK) - netlink_ack(skb, nlh, 0); - skb_pull(skb, rlen); - } - - return 0; -} - -/* - * rtnetlink input queue processing routine: - * - process as much as there was in the queue upon entry. - * - feed skbs to rtnetlink_rcv_skb, until it refuse a message, - * that will occur, when a dump started. - */ - static void rtnetlink_rcv(struct sock *sk, int len) { - unsigned int qlen = skb_queue_len(&sk->sk_receive_queue); + unsigned int qlen = 0; do { - struct sk_buff *skb; - rtnl_lock(); - - if (qlen > skb_queue_len(&sk->sk_receive_queue)) - qlen = skb_queue_len(&sk->sk_receive_queue); - - for (; qlen; qlen--) { - skb = skb_dequeue(&sk->sk_receive_queue); - if (rtnetlink_rcv_skb(skb)) { - if (skb->len) - skb_queue_head(&sk->sk_receive_queue, - skb); - else { - kfree_skb(skb); - qlen--; - } - break; - } - kfree_skb(skb); - } - + netlink_run_queue(sk, &qlen, &rtnetlink_rcv_msg); up(&rtnl_sem); netdev_run_todo(); -- cgit v1.2.3 From 482a8524f85a7d8c40c6fb5d072e85bc2fef327f Mon Sep 17 00:00:00 2001 From: Thomas Graf Date: Thu, 10 Nov 2005 02:25:56 +0100 Subject: [NETLINK]: Generic netlink family The generic netlink family builds on top of netlink and provides simplifies access for the less demanding netlink users. It solves the problem of protocol numbers running out by introducing a so called controller taking care of id management and name resolving. Generic netlink modules register themself after filling out their id card (struct genl_family), after successful registration the modules are able to register callbacks to command numbers by filling out a struct genl_ops and calling genl_register_op(). The registered callbacks are invoked with attributes parsed making life of simple modules a lot easier. Although generic netlink modules can request static identifiers, it is recommended to use GENL_ID_GENERATE and to let the controller assign a unique identifier to the module. Userspace applications will then ask the controller and lookup the idenfier by the module name. Due to the current multicast implementation of netlink, the number of generic netlink modules is restricted to 1024 to avoid wasting memory for the per socket multiacst subscription bitmask. Signed-off-by: Thomas Graf Signed-off-by: David S. Miller --- include/linux/genetlink.h | 51 ++++ include/net/genetlink.h | 154 ++++++++++++ net/netlink/Makefile | 2 +- net/netlink/genetlink.c | 579 ++++++++++++++++++++++++++++++++++++++++++++++ 4 files changed, 785 insertions(+), 1 deletion(-) create mode 100644 include/linux/genetlink.h create mode 100644 include/net/genetlink.h create mode 100644 net/netlink/genetlink.c (limited to 'net') diff --git a/include/linux/genetlink.h b/include/linux/genetlink.h new file mode 100644 index 000000000000..84f12a41dc01 --- /dev/null +++ b/include/linux/genetlink.h @@ -0,0 +1,51 @@ +#ifndef __LINUX_GENERIC_NETLINK_H +#define __LINUX_GENERIC_NETLINK_H + +#include + +#define GENL_NAMSIZ 16 /* length of family name */ + +#define GENL_MIN_ID NLMSG_MIN_TYPE +#define GENL_MAX_ID 1023 + +struct genlmsghdr { + __u8 cmd; + __u8 version; + __u16 reserved; +}; + +#define GENL_HDRLEN NLMSG_ALIGN(sizeof(struct genlmsghdr)) + +/* + * List of reserved static generic netlink identifiers: + */ +#define GENL_ID_GENERATE 0 +#define GENL_ID_CTRL NLMSG_MIN_TYPE + +/************************************************************************** + * Controller + **************************************************************************/ + +enum { + CTRL_CMD_UNSPEC, + CTRL_CMD_NEWFAMILY, + CTRL_CMD_DELFAMILY, + CTRL_CMD_GETFAMILY, + CTRL_CMD_NEWOPS, + CTRL_CMD_DELOPS, + CTRL_CMD_GETOPS, + __CTRL_CMD_MAX, +}; + +#define CTRL_CMD_MAX (__CTRL_CMD_MAX - 1) + +enum { + CTRL_ATTR_UNSPEC, + CTRL_ATTR_FAMILY_ID, + CTRL_ATTR_FAMILY_NAME, + __CTRL_ATTR_MAX, +}; + +#define CTRL_ATTR_MAX (__CTRL_ATTR_MAX - 1) + +#endif /* __LINUX_GENERIC_NETLINK_H */ diff --git a/include/net/genetlink.h b/include/net/genetlink.h new file mode 100644 index 000000000000..52d8b1a73d52 --- /dev/null +++ b/include/net/genetlink.h @@ -0,0 +1,154 @@ +#ifndef __NET_GENERIC_NETLINK_H +#define __NET_GENERIC_NETLINK_H + +#include +#include + +/** + * struct genl_family - generic netlink family + * @id: protocol family idenfitier + * @hdrsize: length of user specific header in bytes + * @name: name of family + * @version: protocol version + * @maxattr: maximum number of attributes supported + * @attrbuf: buffer to store parsed attributes + * @ops_list: list of all assigned operations + * @family_list: family list + */ +struct genl_family +{ + unsigned int id; + unsigned int hdrsize; + char name[GENL_NAMSIZ]; + unsigned int version; + unsigned int maxattr; + struct module * owner; + struct nlattr ** attrbuf; /* private */ + struct list_head ops_list; /* private */ + struct list_head family_list; /* private */ +}; + +#define GENL_ADMIN_PERM 0x01 + +/** + * struct genl_info - receiving information + * @snd_seq: sending sequence number + * @snd_pid: netlink pid of sender + * @nlhdr: netlink message header + * @genlhdr: generic netlink message header + * @userhdr: user specific header + * @attrs: netlink attributes + */ +struct genl_info +{ + u32 snd_seq; + u32 snd_pid; + struct nlmsghdr * nlhdr; + struct genlmsghdr * genlhdr; + void * userhdr; + struct nlattr ** attrs; +}; + +/** + * struct genl_ops - generic netlink operations + * @cmd: command identifier + * @flags: flags + * @policy: attribute validation policy + * @doit: standard command callback + * @dumpit: callback for dumpers + * @ops_list: operations list + */ +struct genl_ops +{ + unsigned int cmd; + unsigned int flags; + struct nla_policy *policy; + int (*doit)(struct sk_buff *skb, + struct genl_info *info); + int (*dumpit)(struct sk_buff *skb, + struct netlink_callback *cb); + struct list_head ops_list; +}; + +extern int genl_register_family(struct genl_family *family); +extern int genl_unregister_family(struct genl_family *family); +extern int genl_register_ops(struct genl_family *, struct genl_ops *ops); +extern int genl_unregister_ops(struct genl_family *, struct genl_ops *ops); + +extern struct sock *genl_sock; + +/** + * genlmsg_put - Add generic netlink header to netlink message + * @skb: socket buffer holding the message + * @pid: netlink pid the message is addressed to + * @seq: sequence number (usually the one of the sender) + * @type: netlink message type + * @hdrlen: length of the user specific header + * @flags netlink message flags + * @cmd: generic netlink command + * @version: version + * + * Returns pointer to user specific header + */ +static inline void *genlmsg_put(struct sk_buff *skb, u32 pid, u32 seq, + int type, int hdrlen, int flags, + u8 cmd, u8 version) +{ + struct nlmsghdr *nlh; + struct genlmsghdr *hdr; + + nlh = nlmsg_put(skb, pid, seq, type, GENL_HDRLEN + hdrlen, flags); + if (nlh == NULL) + return NULL; + + hdr = nlmsg_data(nlh); + hdr->cmd = cmd; + hdr->version = version; + hdr->reserved = 0; + + return (char *) hdr + GENL_HDRLEN; +} + +/** + * genlmsg_end - Finalize a generic netlink message + * @skb: socket buffer the message is stored in + * @hdr: user specific header + */ +static inline int genlmsg_end(struct sk_buff *skb, void *hdr) +{ + return nlmsg_end(skb, hdr - GENL_HDRLEN - NLMSG_HDRLEN); +} + +/** + * genlmsg_cancel - Cancel construction of a generic netlink message + * @skb: socket buffer the message is stored in + * @hdr: generic netlink message header + */ +static inline int genlmsg_cancel(struct sk_buff *skb, void *hdr) +{ + return nlmsg_cancel(skb, hdr - GENL_HDRLEN - NLMSG_HDRLEN); +} + +/** + * genlmsg_multicast - multicast a netlink message + * @skb: netlink message as socket buffer + * @pid: own netlink pid to avoid sending to yourself + * @group: multicast group id + */ +static inline int genlmsg_multicast(struct sk_buff *skb, u32 pid, + unsigned int group) +{ + return nlmsg_multicast(genl_sock, skb, pid, group); +} + +/** + * genlmsg_unicast - unicast a netlink message + * @skb: netlink message as socket buffer + * @pid: netlink pid of the destination socket + */ +static inline int genlmsg_unicast(struct sk_buff *skb, u32 pid) +{ + return nlmsg_unicast(genl_sock, skb, pid); +} + +#endif /* __NET_GENERIC_NETLINK_H */ diff --git a/net/netlink/Makefile b/net/netlink/Makefile index 68179015bd39..e3589c2de49e 100644 --- a/net/netlink/Makefile +++ b/net/netlink/Makefile @@ -2,4 +2,4 @@ # Makefile for the netlink driver. # -obj-y := af_netlink.o attr.o +obj-y := af_netlink.o attr.o genetlink.o diff --git a/net/netlink/genetlink.c b/net/netlink/genetlink.c new file mode 100644 index 000000000000..287cfcc56951 --- /dev/null +++ b/net/netlink/genetlink.c @@ -0,0 +1,579 @@ +/* + * NETLINK Generic Netlink Family + * + * Authors: Jamal Hadi Salim + * Thomas Graf + */ + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +struct sock *genl_sock = NULL; + +static DECLARE_MUTEX(genl_sem); /* serialization of message processing */ + +static void genl_lock(void) +{ + down(&genl_sem); +} + +static int genl_trylock(void) +{ + return down_trylock(&genl_sem); +} + +static void genl_unlock(void) +{ + up(&genl_sem); + + if (genl_sock && genl_sock->sk_receive_queue.qlen) + genl_sock->sk_data_ready(genl_sock, 0); +} + +#define GENL_FAM_TAB_SIZE 16 +#define GENL_FAM_TAB_MASK (GENL_FAM_TAB_SIZE - 1) + +static struct list_head family_ht[GENL_FAM_TAB_SIZE]; + +static int genl_ctrl_event(int event, void *data); + +static inline unsigned int genl_family_hash(unsigned int id) +{ + return id & GENL_FAM_TAB_MASK; +} + +static inline struct list_head *genl_family_chain(unsigned int id) +{ + return &family_ht[genl_family_hash(id)]; +} + +static struct genl_family *genl_family_find_byid(unsigned int id) +{ + struct genl_family *f; + + list_for_each_entry(f, genl_family_chain(id), family_list) + if (f->id == id) + return f; + + return NULL; +} + +static struct genl_family *genl_family_find_byname(char *name) +{ + struct genl_family *f; + int i; + + for (i = 0; i < GENL_FAM_TAB_SIZE; i++) + list_for_each_entry(f, genl_family_chain(i), family_list) + if (strcmp(f->name, name) == 0) + return f; + + return NULL; +} + +static struct genl_ops *genl_get_cmd(u8 cmd, struct genl_family *family) +{ + struct genl_ops *ops; + + list_for_each_entry(ops, &family->ops_list, ops_list) + if (ops->cmd == cmd) + return ops; + + return NULL; +} + +/* Of course we are going to have problems once we hit + * 2^16 alive types, but that can only happen by year 2K +*/ +static inline u16 genl_generate_id(void) +{ + static u16 id_gen_idx; + int overflowed = 0; + + do { + if (id_gen_idx == 0) + id_gen_idx = GENL_MIN_ID; + + if (++id_gen_idx > GENL_MAX_ID) { + if (!overflowed) { + overflowed = 1; + id_gen_idx = 0; + continue; + } else + return 0; + } + + } while (genl_family_find_byid(id_gen_idx)); + + return id_gen_idx; +} + +/** + * genl_register_ops - register generic netlink operations + * @family: generic netlink family + * @ops: operations to be registered + * + * Registers the specified operations and assigns them to the specified + * family. Either a doit or dumpit callback must be specified or the + * operation will fail. Only one operation structure per command + * identifier may be registered. + * + * See include/net/genetlink.h for more documenation on the operations + * structure. + * + * Returns 0 on success or a negative error code. + */ +int genl_register_ops(struct genl_family *family, struct genl_ops *ops) +{ + int err = -EINVAL; + + if (ops->dumpit == NULL && ops->doit == NULL) + goto errout; + + if (genl_get_cmd(ops->cmd, family)) { + err = -EEXIST; + goto errout; + } + + genl_lock(); + list_add_tail(&ops->ops_list, &family->ops_list); + genl_unlock(); + + genl_ctrl_event(CTRL_CMD_NEWOPS, ops); + err = 0; +errout: + return err; +} + +/** + * genl_unregister_ops - unregister generic netlink operations + * @family: generic netlink family + * @ops: operations to be unregistered + * + * Unregisters the specified operations and unassigns them from the + * specified family. The operation blocks until the current message + * processing has finished and doesn't start again until the + * unregister process has finished. + * + * Note: It is not necessary to unregister all operations before + * unregistering the family, unregistering the family will cause + * all assigned operations to be unregistered automatically. + * + * Returns 0 on success or a negative error code. + */ +int genl_unregister_ops(struct genl_family *family, struct genl_ops *ops) +{ + struct genl_ops *rc; + + genl_lock(); + list_for_each_entry(rc, &family->ops_list, ops_list) { + if (rc == ops) { + list_del(&ops->ops_list); + genl_unlock(); + genl_ctrl_event(CTRL_CMD_DELOPS, ops); + return 0; + } + } + genl_unlock(); + + return -ENOENT; +} + +/** + * genl_register_family - register a generic netlink family + * @family: generic netlink family + * + * Registers the specified family after validating it first. Only one + * family may be registered with the same family name or identifier. + * The family id may equal GENL_ID_GENERATE causing an unique id to + * be automatically generated and assigned. + * + * Return 0 on success or a negative error code. + */ +int genl_register_family(struct genl_family *family) +{ + int err = -EINVAL; + + if (family->id && family->id < GENL_MIN_ID) + goto errout; + + if (family->id > GENL_MAX_ID) + goto errout; + + INIT_LIST_HEAD(&family->ops_list); + + genl_lock(); + + if (genl_family_find_byname(family->name)) { + err = -EEXIST; + goto errout_locked; + } + + if (genl_family_find_byid(family->id)) { + err = -EEXIST; + goto errout_locked; + } + + if (!try_module_get(family->owner)) { + err = -EBUSY; + goto errout_locked; + } + + if (family->id == GENL_ID_GENERATE) { + u16 newid = genl_generate_id(); + + if (!newid) { + err = -ENOMEM; + goto errout_locked; + } + + family->id = newid; + } + + if (family->maxattr) { + family->attrbuf = kmalloc((family->maxattr+1) * + sizeof(struct nlattr *), GFP_KERNEL); + if (family->attrbuf == NULL) { + err = -ENOMEM; + goto errout; + } + } else + family->attrbuf = NULL; + + list_add_tail(&family->family_list, genl_family_chain(family->id)); + genl_unlock(); + + genl_ctrl_event(CTRL_CMD_NEWFAMILY, family); + + return 0; + +errout_locked: + genl_unlock(); +errout: + return err; +} + +/** + * genl_unregister_family - unregister generic netlink family + * @family: generic netlink family + * + * Unregisters the specified family. + * + * Returns 0 on success or a negative error code. + */ +int genl_unregister_family(struct genl_family *family) +{ + struct genl_family *rc; + + genl_lock(); + + list_for_each_entry(rc, genl_family_chain(family->id), family_list) { + if (family->id != rc->id || strcmp(rc->name, family->name)) + continue; + + list_del(&rc->family_list); + INIT_LIST_HEAD(&family->ops_list); + genl_unlock(); + + module_put(family->owner); + kfree(family->attrbuf); + genl_ctrl_event(CTRL_CMD_DELFAMILY, family); + return 0; + } + + genl_unlock(); + + return -ENOENT; +} + +static inline int genl_rcv_msg(struct sk_buff *skb, struct nlmsghdr *nlh, + int *errp) +{ + struct genl_ops *ops; + struct genl_family *family; + struct genl_info info; + struct genlmsghdr *hdr = nlmsg_data(nlh); + int hdrlen, err = -EINVAL; + + if (!(nlh->nlmsg_flags & NLM_F_REQUEST)) + goto ignore; + + if (nlh->nlmsg_type < NLMSG_MIN_TYPE) + goto ignore; + + family = genl_family_find_byid(nlh->nlmsg_type); + if (family == NULL) { + err = -ENOENT; + goto errout; + } + + hdrlen = GENL_HDRLEN + family->hdrsize; + if (nlh->nlmsg_len < nlmsg_msg_size(hdrlen)) + goto errout; + + ops = genl_get_cmd(hdr->cmd, family); + if (ops == NULL) { + err = -EOPNOTSUPP; + goto errout; + } + + if ((ops->flags & GENL_ADMIN_PERM) && security_netlink_recv(skb)) { + err = -EPERM; + goto errout; + } + + if (nlh->nlmsg_flags & NLM_F_DUMP) { + if (ops->dumpit == NULL) { + err = -EOPNOTSUPP; + goto errout; + } + + *errp = err = netlink_dump_start(genl_sock, skb, nlh, + ops->dumpit, NULL); + if (err == 0) + skb_pull(skb, min(NLMSG_ALIGN(nlh->nlmsg_len), + skb->len)); + return -1; + } + + if (ops->doit == NULL) { + err = -EOPNOTSUPP; + goto errout; + } + + if (family->attrbuf) { + err = nlmsg_parse(nlh, hdrlen, family->attrbuf, family->maxattr, + ops->policy); + if (err < 0) + goto errout; + } + + info.snd_seq = nlh->nlmsg_seq; + info.snd_pid = NETLINK_CB(skb).pid; + info.nlhdr = nlh; + info.genlhdr = nlmsg_data(nlh); + info.userhdr = nlmsg_data(nlh) + GENL_HDRLEN; + info.attrs = family->attrbuf; + + *errp = err = ops->doit(skb, &info); + return err; + +ignore: + return 0; + +errout: + *errp = err; + return -1; +} + +static void genl_rcv(struct sock *sk, int len) +{ + unsigned int qlen = 0; + + do { + if (genl_trylock()) + return; + netlink_run_queue(sk, &qlen, &genl_rcv_msg); + genl_unlock(); + } while (qlen && genl_sock && genl_sock->sk_receive_queue.qlen); +} + +/************************************************************************** + * Controller + **************************************************************************/ + +static int ctrl_fill_info(struct genl_family *family, u32 pid, u32 seq, + u32 flags, struct sk_buff *skb, u8 cmd) +{ + void *hdr; + + hdr = genlmsg_put(skb, pid, seq, GENL_ID_CTRL, 0, flags, cmd, + family->version); + if (hdr == NULL) + return -1; + + NLA_PUT_STRING(skb, CTRL_ATTR_FAMILY_NAME, family->name); + NLA_PUT_U16(skb, CTRL_ATTR_FAMILY_ID, family->id); + + return genlmsg_end(skb, hdr); + +nla_put_failure: + return genlmsg_cancel(skb, hdr); +} + +static int ctrl_dumpfamily(struct sk_buff *skb, struct netlink_callback *cb) +{ + + int i, n = 0; + struct genl_family *rt; + int chains_to_skip = cb->args[0]; + int fams_to_skip = cb->args[1]; + + for (i = 0; i < GENL_FAM_TAB_SIZE; i++) { + if (i < chains_to_skip) + continue; + n = 0; + list_for_each_entry(rt, genl_family_chain(i), family_list) { + if (++n < fams_to_skip) + continue; + if (ctrl_fill_info(rt, NETLINK_CB(cb->skb).pid, + cb->nlh->nlmsg_seq, NLM_F_MULTI, + skb, CTRL_CMD_NEWFAMILY) < 0) + goto errout; + } + + fams_to_skip = 0; + } + +errout: + cb->args[0] = i; + cb->args[1] = n; + + return skb->len; +} + +static struct sk_buff *ctrl_build_msg(struct genl_family *family, u32 pid, + int seq, int cmd) +{ + struct sk_buff *skb; + int err; + + skb = nlmsg_new(NLMSG_GOODSIZE); + if (skb == NULL) + return ERR_PTR(-ENOBUFS); + + err = ctrl_fill_info(family, pid, seq, 0, skb, cmd); + if (err < 0) { + nlmsg_free(skb); + return ERR_PTR(err); + } + + return skb; +} + +static struct nla_policy ctrl_policy[CTRL_ATTR_MAX+1] __read_mostly = { + [CTRL_ATTR_FAMILY_ID] = { .type = NLA_U16 }, + [CTRL_ATTR_FAMILY_NAME] = { .type = NLA_STRING }, +}; + +static int ctrl_getfamily(struct sk_buff *skb, struct genl_info *info) +{ + struct sk_buff *msg; + struct genl_family *res = NULL; + int err = -EINVAL; + + if (info->attrs[CTRL_ATTR_FAMILY_ID]) { + u16 id = nla_get_u16(info->attrs[CTRL_ATTR_FAMILY_ID]); + res = genl_family_find_byid(id); + } + + if (info->attrs[CTRL_ATTR_FAMILY_NAME]) { + char name[GENL_NAMSIZ]; + + if (nla_strlcpy(name, info->attrs[CTRL_ATTR_FAMILY_NAME], + GENL_NAMSIZ) >= GENL_NAMSIZ) + goto errout; + + res = genl_family_find_byname(name); + } + + if (res == NULL) { + err = -ENOENT; + goto errout; + } + + msg = ctrl_build_msg(res, info->snd_pid, info->snd_seq, + CTRL_CMD_NEWFAMILY); + if (IS_ERR(msg)) { + err = PTR_ERR(msg); + goto errout; + } + + err = genlmsg_unicast(msg, info->snd_pid); +errout: + return err; +} + +static int genl_ctrl_event(int event, void *data) +{ + struct sk_buff *msg; + + if (genl_sock == NULL) + return 0; + + switch (event) { + case CTRL_CMD_NEWFAMILY: + case CTRL_CMD_DELFAMILY: + msg = ctrl_build_msg(data, 0, 0, event); + if (IS_ERR(msg)) + return PTR_ERR(msg); + + genlmsg_multicast(msg, 0, GENL_ID_CTRL); + break; + } + + return 0; +} + +static struct genl_ops genl_ctrl_ops = { + .cmd = CTRL_CMD_GETFAMILY, + .doit = ctrl_getfamily, + .dumpit = ctrl_dumpfamily, + .policy = ctrl_policy, +}; + +static struct genl_family genl_ctrl = { + .id = GENL_ID_CTRL, + .name = "nlctrl", + .version = 0x1, + .maxattr = CTRL_ATTR_MAX, + .owner = THIS_MODULE, +}; + +static int __init genl_init(void) +{ + int i, err; + + for (i = 0; i < GENL_FAM_TAB_SIZE; i++) + INIT_LIST_HEAD(&family_ht[i]); + + err = genl_register_family(&genl_ctrl); + if (err < 0) + goto errout; + + err = genl_register_ops(&genl_ctrl, &genl_ctrl_ops); + if (err < 0) + goto errout_register; + + netlink_set_nonroot(NETLINK_GENERIC, NL_NONROOT_RECV); + genl_sock = netlink_kernel_create(NETLINK_GENERIC, GENL_MAX_ID, + genl_rcv, THIS_MODULE); + if (genl_sock == NULL) { + panic("GENL: Cannot initialize generic netlink\n"); + return -ENOMEM; + } + + return 0; + +errout_register: + genl_unregister_family(&genl_ctrl); +errout: + panic("GENL: Cannot register controller: %d\n", err); + return err; +} + +subsys_initcall(genl_init); + +EXPORT_SYMBOL(genl_sock); +EXPORT_SYMBOL(genl_register_ops); +EXPORT_SYMBOL(genl_unregister_ops); +EXPORT_SYMBOL(genl_register_family); +EXPORT_SYMBOL(genl_unregister_family); -- cgit v1.2.3 From 940e3318c36394939d805e797d7be39ddaaa7911 Mon Sep 17 00:00:00 2001 From: Trond Myklebust Date: Wed, 9 Nov 2005 21:45:24 -0500 Subject: [PATCH] SUNRPC: don't reencode when looping in call transmit. If the call to xprt_transmit() fails due to socket buffer space exhaustion, we do not need to re-encode the RPC message when we loop back through call_transmit. Re-encoding can actually end up triggering the WARN_ON() in call_decode() if we re-encode something like a read() request and auth->au_rslack has changed. It can also cause us to increment the RPCSEC_GSS sequence number beyond the limits of the allowed window. Signed-off-by: Trond Myklebust Signed-off-by: Linus Torvalds --- net/sunrpc/clnt.c | 32 +++++++++++++++++++++++++++++--- 1 file changed, 29 insertions(+), 3 deletions(-) (limited to 'net') diff --git a/net/sunrpc/clnt.c b/net/sunrpc/clnt.c index 702ede309b06..61c3abeaccae 100644 --- a/net/sunrpc/clnt.c +++ b/net/sunrpc/clnt.c @@ -55,6 +55,7 @@ static void call_bind(struct rpc_task *task); static void call_bind_status(struct rpc_task *task); static void call_transmit(struct rpc_task *task); static void call_status(struct rpc_task *task); +static void call_transmit_status(struct rpc_task *task); static void call_refresh(struct rpc_task *task); static void call_refreshresult(struct rpc_task *task); static void call_timeout(struct rpc_task *task); @@ -672,6 +673,18 @@ call_allocate(struct rpc_task *task) rpc_exit(task, -ERESTARTSYS); } +static inline int +rpc_task_need_encode(struct rpc_task *task) +{ + return task->tk_rqstp->rq_snd_buf.len == 0; +} + +static inline void +rpc_task_force_reencode(struct rpc_task *task) +{ + task->tk_rqstp->rq_snd_buf.len = 0; +} + /* * 3. Encode arguments of an RPC call */ @@ -867,12 +880,14 @@ call_transmit(struct rpc_task *task) if (task->tk_status != 0) return; /* Encode here so that rpcsec_gss can use correct sequence number. */ - if (task->tk_rqstp->rq_bytes_sent == 0) { + if (rpc_task_need_encode(task)) { + task->tk_rqstp->rq_bytes_sent = 0; call_encode(task); /* Did the encode result in an error condition? */ if (task->tk_status != 0) goto out_nosend; } + task->tk_action = call_transmit_status; xprt_transmit(task); if (task->tk_status < 0) return; @@ -884,6 +899,7 @@ call_transmit(struct rpc_task *task) out_nosend: /* release socket write lock before attempting to handle error */ xprt_abort_transmit(task); + rpc_task_force_reencode(task); } /* @@ -915,7 +931,6 @@ call_status(struct rpc_task *task) break; case -ECONNREFUSED: case -ENOTCONN: - req->rq_bytes_sent = 0; if (clnt->cl_autobind) clnt->cl_port = 0; task->tk_action = call_bind; @@ -937,7 +952,18 @@ call_status(struct rpc_task *task) } /* - * 6a. Handle RPC timeout + * 6a. Handle transmission errors. + */ +static void +call_transmit_status(struct rpc_task *task) +{ + if (task->tk_status != -EAGAIN) + rpc_task_force_reencode(task); + call_status(task); +} + +/* + * 6b. Handle RPC timeout * We do not release the request slot, so we keep using the * same XID for all retransmits. */ -- cgit v1.2.3