From 51454ea42c1ab4e0c2828bb0d4d53957976980de Mon Sep 17 00:00:00 2001 From: Niels Dossche Date: Mon, 4 Apr 2022 01:15:24 +0200 Subject: ipv6: fix locking issues with loops over idev->addr_list idev->addr_list needs to be protected by idev->lock. However, it is not always possible to do so while iterating and performing actions on inet6_ifaddr instances. For example, multiple functions (like addrconf_{join,leave}_anycast) eventually call down to other functions that acquire the idev->lock. The current code temporarily unlocked the idev->lock during the loops, which can cause race conditions. Moving the locks up is also not an appropriate solution as the ordering of lock acquisition will be inconsistent with for example mc_lock. This solution adds an additional field to inet6_ifaddr that is used to temporarily add the instances to a temporary list while holding idev->lock. The temporary list can then be traversed without holding idev->lock. This change was done in two places. In addrconf_ifdown, the list_for_each_entry_safe variant of the list loop is also no longer necessary as there is no deletion within that specific loop. Suggested-by: Paolo Abeni Signed-off-by: Niels Dossche Acked-by: Paolo Abeni Link: https://lore.kernel.org/r/20220403231523.45843-1-dossche.niels@gmail.com Signed-off-by: Jakub Kicinski --- net/ipv6/addrconf.c | 30 ++++++++++++++++++++++++------ 1 file changed, 24 insertions(+), 6 deletions(-) (limited to 'net/ipv6/addrconf.c') diff --git a/net/ipv6/addrconf.c b/net/ipv6/addrconf.c index b22504176588..1afc4c024981 100644 --- a/net/ipv6/addrconf.c +++ b/net/ipv6/addrconf.c @@ -797,6 +797,7 @@ static void dev_forward_change(struct inet6_dev *idev) { struct net_device *dev; struct inet6_ifaddr *ifa; + LIST_HEAD(tmp_addr_list); if (!idev) return; @@ -815,14 +816,24 @@ static void dev_forward_change(struct inet6_dev *idev) } } + read_lock_bh(&idev->lock); list_for_each_entry(ifa, &idev->addr_list, if_list) { if (ifa->flags&IFA_F_TENTATIVE) continue; + list_add_tail(&ifa->if_list_aux, &tmp_addr_list); + } + read_unlock_bh(&idev->lock); + + while (!list_empty(&tmp_addr_list)) { + ifa = list_first_entry(&tmp_addr_list, + struct inet6_ifaddr, if_list_aux); + list_del(&ifa->if_list_aux); if (idev->cnf.forwarding) addrconf_join_anycast(ifa); else addrconf_leave_anycast(ifa); } + inet6_netconf_notify_devconf(dev_net(dev), RTM_NEWNETCONF, NETCONFA_FORWARDING, dev->ifindex, &idev->cnf); @@ -3728,7 +3739,8 @@ static int addrconf_ifdown(struct net_device *dev, bool unregister) unsigned long event = unregister ? NETDEV_UNREGISTER : NETDEV_DOWN; struct net *net = dev_net(dev); struct inet6_dev *idev; - struct inet6_ifaddr *ifa, *tmp; + struct inet6_ifaddr *ifa; + LIST_HEAD(tmp_addr_list); bool keep_addr = false; bool was_ready; int state, i; @@ -3820,16 +3832,23 @@ restart: write_lock_bh(&idev->lock); } - list_for_each_entry_safe(ifa, tmp, &idev->addr_list, if_list) { + list_for_each_entry(ifa, &idev->addr_list, if_list) + list_add_tail(&ifa->if_list_aux, &tmp_addr_list); + write_unlock_bh(&idev->lock); + + while (!list_empty(&tmp_addr_list)) { struct fib6_info *rt = NULL; bool keep; + ifa = list_first_entry(&tmp_addr_list, + struct inet6_ifaddr, if_list_aux); + list_del(&ifa->if_list_aux); + addrconf_del_dad_work(ifa); keep = keep_addr && (ifa->flags & IFA_F_PERMANENT) && !addr_is_local(&ifa->addr); - write_unlock_bh(&idev->lock); spin_lock_bh(&ifa->lock); if (keep) { @@ -3860,15 +3879,14 @@ restart: addrconf_leave_solict(ifa->idev, &ifa->addr); } - write_lock_bh(&idev->lock); if (!keep) { + write_lock_bh(&idev->lock); list_del_rcu(&ifa->if_list); + write_unlock_bh(&idev->lock); in6_ifa_put(ifa); } } - write_unlock_bh(&idev->lock); - /* Step 5: Discard anycast and multicast list */ if (unregister) { ipv6_ac_destroy_dev(idev); -- cgit v1.2.3 From f9a2fb73318eb4dbf8cd84866b8b0dd012d8b116 Mon Sep 17 00:00:00 2001 From: Arun Ajith S Date: Fri, 15 Apr 2022 08:34:02 +0000 Subject: net/ipv6: Introduce accept_unsolicited_na knob to implement router-side changes for RFC9131 Add a new neighbour cache entry in STALE state for routers on receiving an unsolicited (gratuitous) neighbour advertisement with target link-layer-address option specified. This is similar to the arp_accept configuration for IPv4. A new sysctl endpoint is created to turn on this behaviour: /proc/sys/net/ipv6/conf/interface/accept_unsolicited_na. Signed-off-by: Arun Ajith S Reviewed-by: David Ahern Signed-off-by: David S. Miller --- Documentation/networking/ip-sysctl.rst | 27 +++ include/linux/ipv6.h | 1 + include/uapi/linux/ipv6.h | 1 + net/ipv6/addrconf.c | 10 + net/ipv6/ndisc.c | 20 +- tools/testing/selftests/net/Makefile | 1 + .../selftests/net/ndisc_unsolicited_na_test.sh | 255 +++++++++++++++++++++ 7 files changed, 314 insertions(+), 1 deletion(-) create mode 100755 tools/testing/selftests/net/ndisc_unsolicited_na_test.sh (limited to 'net/ipv6/addrconf.c') diff --git a/Documentation/networking/ip-sysctl.rst b/Documentation/networking/ip-sysctl.rst index b0024aa7b051..433f2e4a5fed 100644 --- a/Documentation/networking/ip-sysctl.rst +++ b/Documentation/networking/ip-sysctl.rst @@ -2467,6 +2467,33 @@ drop_unsolicited_na - BOOLEAN By default this is turned off. +accept_unsolicited_na - BOOLEAN + Add a new neighbour cache entry in STALE state for routers on receiving an + unsolicited neighbour advertisement with target link-layer address option + specified. This is as per router-side behavior documented in RFC9131. + This has lower precedence than drop_unsolicited_na. + + ==== ====== ====== ============================================== + drop accept fwding behaviour + ---- ------ ------ ---------------------------------------------- + 1 X X Drop NA packet and don't pass up the stack + 0 0 X Pass NA packet up the stack, don't update NC + 0 1 0 Pass NA packet up the stack, don't update NC + 0 1 1 Pass NA packet up the stack, and add a STALE + NC entry + ==== ====== ====== ============================================== + + This will optimize the return path for the initial off-link communication + that is initiated by a directly connected host, by ensuring that + the first-hop router which turns on this setting doesn't have to + buffer the initial return packets to do neighbour-solicitation. + The prerequisite is that the host is configured to send + unsolicited neighbour advertisements on interface bringup. + This setting should be used in conjunction with the ndisc_notify setting + on the host to satisfy this prerequisite. + + By default this is turned off. + enhanced_dad - BOOLEAN Include a nonce option in the IPv6 neighbor solicitation messages used for duplicate address detection per RFC7527. A received DAD NS will only signal diff --git a/include/linux/ipv6.h b/include/linux/ipv6.h index 16870f86c74d..918bfea4ef5f 100644 --- a/include/linux/ipv6.h +++ b/include/linux/ipv6.h @@ -61,6 +61,7 @@ struct ipv6_devconf { __s32 suppress_frag_ndisc; __s32 accept_ra_mtu; __s32 drop_unsolicited_na; + __s32 accept_unsolicited_na; struct ipv6_stable_secret { bool initialized; struct in6_addr secret; diff --git a/include/uapi/linux/ipv6.h b/include/uapi/linux/ipv6.h index d4178dace0bf..549ddeaf788b 100644 --- a/include/uapi/linux/ipv6.h +++ b/include/uapi/linux/ipv6.h @@ -194,6 +194,7 @@ enum { DEVCONF_IOAM6_ID, DEVCONF_IOAM6_ID_WIDE, DEVCONF_NDISC_EVICT_NOCARRIER, + DEVCONF_ACCEPT_UNSOLICITED_NA, DEVCONF_MAX }; diff --git a/net/ipv6/addrconf.c b/net/ipv6/addrconf.c index 1afc4c024981..6473dc84b71d 100644 --- a/net/ipv6/addrconf.c +++ b/net/ipv6/addrconf.c @@ -5587,6 +5587,7 @@ static inline void ipv6_store_devconf(struct ipv6_devconf *cnf, array[DEVCONF_IOAM6_ID] = cnf->ioam6_id; array[DEVCONF_IOAM6_ID_WIDE] = cnf->ioam6_id_wide; array[DEVCONF_NDISC_EVICT_NOCARRIER] = cnf->ndisc_evict_nocarrier; + array[DEVCONF_ACCEPT_UNSOLICITED_NA] = cnf->accept_unsolicited_na; } static inline size_t inet6_ifla6_size(void) @@ -7037,6 +7038,15 @@ static const struct ctl_table addrconf_sysctl[] = { .extra1 = (void *)SYSCTL_ZERO, .extra2 = (void *)SYSCTL_ONE, }, + { + .procname = "accept_unsolicited_na", + .data = &ipv6_devconf.accept_unsolicited_na, + .maxlen = sizeof(int), + .mode = 0644, + .proc_handler = proc_dointvec, + .extra1 = (void *)SYSCTL_ZERO, + .extra2 = (void *)SYSCTL_ONE, + }, { /* sentinel */ } diff --git a/net/ipv6/ndisc.c b/net/ipv6/ndisc.c index fcb288b0ae13..254addad0dd3 100644 --- a/net/ipv6/ndisc.c +++ b/net/ipv6/ndisc.c @@ -979,6 +979,7 @@ static void ndisc_recv_na(struct sk_buff *skb) struct inet6_dev *idev = __in6_dev_get(dev); struct inet6_ifaddr *ifp; struct neighbour *neigh; + bool create_neigh; if (skb->len < sizeof(struct nd_msg)) { ND_PRINTK(2, warn, "NA: packet too short\n"); @@ -999,6 +1000,7 @@ static void ndisc_recv_na(struct sk_buff *skb) /* For some 802.11 wireless deployments (and possibly other networks), * there will be a NA proxy and unsolicitd packets are attacks * and thus should not be accepted. + * drop_unsolicited_na takes precedence over accept_unsolicited_na */ if (!msg->icmph.icmp6_solicited && idev && idev->cnf.drop_unsolicited_na) @@ -1039,7 +1041,23 @@ static void ndisc_recv_na(struct sk_buff *skb) in6_ifa_put(ifp); return; } - neigh = neigh_lookup(&nd_tbl, &msg->target, dev); + /* RFC 9131 updates original Neighbour Discovery RFC 4861. + * An unsolicited NA can now create a neighbour cache entry + * on routers if it has Target LL Address option. + * + * drop accept fwding behaviour + * ---- ------ ------ ---------------------------------------------- + * 1 X X Drop NA packet and don't pass up the stack + * 0 0 X Pass NA packet up the stack, don't update NC + * 0 1 0 Pass NA packet up the stack, don't update NC + * 0 1 1 Pass NA packet up the stack, and add a STALE + * NC entry + * Note that we don't do a (daddr == all-routers-mcast) check. + */ + create_neigh = !msg->icmph.icmp6_solicited && lladdr && + idev && idev->cnf.forwarding && + idev->cnf.accept_unsolicited_na; + neigh = __neigh_lookup(&nd_tbl, &msg->target, dev, create_neigh); if (neigh) { u8 old_flags = neigh->flags; diff --git a/tools/testing/selftests/net/Makefile b/tools/testing/selftests/net/Makefile index 3fe2515aa616..af7f6e6ff182 100644 --- a/tools/testing/selftests/net/Makefile +++ b/tools/testing/selftests/net/Makefile @@ -36,6 +36,7 @@ TEST_PROGS += srv6_end_dt4_l3vpn_test.sh TEST_PROGS += srv6_end_dt6_l3vpn_test.sh TEST_PROGS += vrf_strict_mode_test.sh TEST_PROGS += arp_ndisc_evict_nocarrier.sh +TEST_PROGS += ndisc_unsolicited_na_test.sh TEST_PROGS_EXTENDED := in_netns.sh setup_loopback.sh setup_veth.sh TEST_PROGS_EXTENDED += toeplitz_client.sh toeplitz.sh TEST_GEN_FILES = socket nettest diff --git a/tools/testing/selftests/net/ndisc_unsolicited_na_test.sh b/tools/testing/selftests/net/ndisc_unsolicited_na_test.sh new file mode 100755 index 000000000000..f508657ee126 --- /dev/null +++ b/tools/testing/selftests/net/ndisc_unsolicited_na_test.sh @@ -0,0 +1,255 @@ +#!/bin/bash +# SPDX-License-Identifier: GPL-2.0 + +# This test is for the accept_unsolicited_na feature to +# enable RFC9131 behaviour. The following is the test-matrix. +# drop accept fwding behaviour +# ---- ------ ------ ---------------------------------------------- +# 1 X X Drop NA packet and don't pass up the stack +# 0 0 X Pass NA packet up the stack, don't update NC +# 0 1 0 Pass NA packet up the stack, don't update NC +# 0 1 1 Pass NA packet up the stack, and add a STALE +# NC entry + +ret=0 +# Kselftest framework requirement - SKIP code is 4. +ksft_skip=4 + +PAUSE_ON_FAIL=no +PAUSE=no + +HOST_NS="ns-host" +ROUTER_NS="ns-router" + +HOST_INTF="veth-host" +ROUTER_INTF="veth-router" + +ROUTER_ADDR="2000:20::1" +HOST_ADDR="2000:20::2" +SUBNET_WIDTH=64 +ROUTER_ADDR_WITH_MASK="${ROUTER_ADDR}/${SUBNET_WIDTH}" +HOST_ADDR_WITH_MASK="${HOST_ADDR}/${SUBNET_WIDTH}" + +IP_HOST="ip -6 -netns ${HOST_NS}" +IP_HOST_EXEC="ip netns exec ${HOST_NS}" +IP_ROUTER="ip -6 -netns ${ROUTER_NS}" +IP_ROUTER_EXEC="ip netns exec ${ROUTER_NS}" + +tcpdump_stdout= +tcpdump_stderr= + +log_test() +{ + local rc=$1 + local expected=$2 + local msg="$3" + + if [ ${rc} -eq ${expected} ]; then + printf " TEST: %-60s [ OK ]\n" "${msg}" + nsuccess=$((nsuccess+1)) + else + ret=1 + nfail=$((nfail+1)) + printf " TEST: %-60s [FAIL]\n" "${msg}" + if [ "${PAUSE_ON_FAIL}" = "yes" ]; then + echo + echo "hit enter to continue, 'q' to quit" + read a + [ "$a" = "q" ] && exit 1 + fi + fi + + if [ "${PAUSE}" = "yes" ]; then + echo + echo "hit enter to continue, 'q' to quit" + read a + [ "$a" = "q" ] && exit 1 + fi +} + +setup() +{ + set -e + + local drop_unsolicited_na=$1 + local accept_unsolicited_na=$2 + local forwarding=$3 + + # Setup two namespaces and a veth tunnel across them. + # On end of the tunnel is a router and the other end is a host. + ip netns add ${HOST_NS} + ip netns add ${ROUTER_NS} + ${IP_ROUTER} link add ${ROUTER_INTF} type veth \ + peer name ${HOST_INTF} netns ${HOST_NS} + + # Enable IPv6 on both router and host, and configure static addresses. + # The router here is the DUT + # Setup router configuration as specified by the arguments. + # forwarding=0 case is to check that a non-router + # doesn't add neighbour entries. + ROUTER_CONF=net.ipv6.conf.${ROUTER_INTF} + ${IP_ROUTER_EXEC} sysctl -qw \ + ${ROUTER_CONF}.forwarding=${forwarding} + ${IP_ROUTER_EXEC} sysctl -qw \ + ${ROUTER_CONF}.drop_unsolicited_na=${drop_unsolicited_na} + ${IP_ROUTER_EXEC} sysctl -qw \ + ${ROUTER_CONF}.accept_unsolicited_na=${accept_unsolicited_na} + ${IP_ROUTER_EXEC} sysctl -qw ${ROUTER_CONF}.disable_ipv6=0 + ${IP_ROUTER} addr add ${ROUTER_ADDR_WITH_MASK} dev ${ROUTER_INTF} + + # Turn on ndisc_notify on host interface so that + # the host sends unsolicited NAs. + HOST_CONF=net.ipv6.conf.${HOST_INTF} + ${IP_HOST_EXEC} sysctl -qw ${HOST_CONF}.ndisc_notify=1 + ${IP_HOST_EXEC} sysctl -qw ${HOST_CONF}.disable_ipv6=0 + ${IP_HOST} addr add ${HOST_ADDR_WITH_MASK} dev ${HOST_INTF} + + set +e +} + +start_tcpdump() { + set -e + tcpdump_stdout=`mktemp` + tcpdump_stderr=`mktemp` + ${IP_ROUTER_EXEC} timeout 15s \ + tcpdump --immediate-mode -tpni ${ROUTER_INTF} -c 1 \ + "icmp6 && icmp6[0] == 136 && src ${HOST_ADDR}" \ + > ${tcpdump_stdout} 2> /dev/null + set +e +} + +cleanup_tcpdump() +{ + set -e + [[ ! -z ${tcpdump_stdout} ]] && rm -f ${tcpdump_stdout} + [[ ! -z ${tcpdump_stderr} ]] && rm -f ${tcpdump_stderr} + tcpdump_stdout= + tcpdump_stderr= + set +e +} + +cleanup() +{ + cleanup_tcpdump + ip netns del ${HOST_NS} + ip netns del ${ROUTER_NS} +} + +link_up() { + set -e + ${IP_ROUTER} link set dev ${ROUTER_INTF} up + ${IP_HOST} link set dev ${HOST_INTF} up + set +e +} + +verify_ndisc() { + local drop_unsolicited_na=$1 + local accept_unsolicited_na=$2 + local forwarding=$3 + + neigh_show_output=$(${IP_ROUTER} neigh show \ + to ${HOST_ADDR} dev ${ROUTER_INTF} nud stale) + if [ ${drop_unsolicited_na} -eq 0 ] && \ + [ ${accept_unsolicited_na} -eq 1 ] && \ + [ ${forwarding} -eq 1 ]; then + # Neighbour entry expected to be present for 011 case + [[ ${neigh_show_output} ]] + else + # Neighbour entry expected to be absent for all other cases + [[ -z ${neigh_show_output} ]] + fi +} + +test_unsolicited_na_common() +{ + # Setup the test bed, but keep links down + setup $1 $2 $3 + + # Bring the link up, wait for the NA, + # and add a delay to ensure neighbour processing is done. + link_up + start_tcpdump + + # Verify the neighbour table + verify_ndisc $1 $2 $3 + +} + +test_unsolicited_na_combination() { + test_unsolicited_na_common $1 $2 $3 + test_msg=("test_unsolicited_na: " + "drop_unsolicited_na=$1 " + "accept_unsolicited_na=$2 " + "forwarding=$3") + log_test $? 0 "${test_msg[*]}" + cleanup +} + +test_unsolicited_na_combinations() { + # Args: drop_unsolicited_na accept_unsolicited_na forwarding + + # Expect entry + test_unsolicited_na_combination 0 1 1 + + # Expect no entry + test_unsolicited_na_combination 0 0 0 + test_unsolicited_na_combination 0 0 1 + test_unsolicited_na_combination 0 1 0 + test_unsolicited_na_combination 1 0 0 + test_unsolicited_na_combination 1 0 1 + test_unsolicited_na_combination 1 1 0 + test_unsolicited_na_combination 1 1 1 +} + +############################################################################### +# usage + +usage() +{ + cat < /dev/null + +test_unsolicited_na_combinations + +printf "\nTests passed: %3d\n" ${nsuccess} +printf "Tests failed: %3d\n" ${nfail} + +exit $ret -- cgit v1.2.3 From d09d3ec03f025fdc22e193277b30a7db20816d2b Mon Sep 17 00:00:00 2001 From: Arun Ajith S Date: Tue, 19 Apr 2022 10:59:10 +0000 Subject: net/ipv6: Enforce limits for accept_unsolicited_na sysctl Fix mistake in the original patch where limits were specified but the handler didn't take care of the limits. Signed-off-by: Arun Ajith S Reviewed-by: David Ahern Signed-off-by: David S. Miller --- net/ipv6/addrconf.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'net/ipv6/addrconf.c') diff --git a/net/ipv6/addrconf.c b/net/ipv6/addrconf.c index 6473dc84b71d..f01b8a3e1952 100644 --- a/net/ipv6/addrconf.c +++ b/net/ipv6/addrconf.c @@ -7043,7 +7043,7 @@ static const struct ctl_table addrconf_sysctl[] = { .data = &ipv6_devconf.accept_unsolicited_na, .maxlen = sizeof(int), .mode = 0644, - .proc_handler = proc_dointvec, + .proc_handler = proc_dointvec_minmax, .extra1 = (void *)SYSCTL_ZERO, .extra2 = (void *)SYSCTL_ONE, }, -- cgit v1.2.3 From b52e1cce31ca721e937d517411179f9196ee6135 Mon Sep 17 00:00:00 2001 From: jianghaoran Date: Fri, 29 Apr 2022 13:38:02 +0800 Subject: ipv6: Don't send rs packets to the interface of ARPHRD_TUNNEL ARPHRD_TUNNEL interface can't process rs packets and will generate TX errors ex: ip tunnel add ethn mode ipip local 192.168.1.1 remote 192.168.1.2 ifconfig ethn x.x.x.x ethn: flags=209 mtu 1480 inet x.x.x.x netmask 255.255.255.255 destination x.x.x.x inet6 fe80::5efe:ac1e:3cdb prefixlen 64 scopeid 0x20 tunnel txqueuelen 1000 (IPIP Tunnel) RX packets 0 bytes 0 (0.0 B) RX errors 0 dropped 0 overruns 0 frame 0 TX packets 0 bytes 0 (0.0 B) TX errors 3 dropped 0 overruns 0 carrier 0 collisions 0 Signed-off-by: jianghaoran Link: https://lore.kernel.org/r/20220429053802.246681-1-jianghaoran@kylinos.cn Signed-off-by: Jakub Kicinski --- net/ipv6/addrconf.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) (limited to 'net/ipv6/addrconf.c') diff --git a/net/ipv6/addrconf.c b/net/ipv6/addrconf.c index f01b8a3e1952..7dee662d7019 100644 --- a/net/ipv6/addrconf.c +++ b/net/ipv6/addrconf.c @@ -4219,7 +4219,8 @@ static void addrconf_dad_completed(struct inet6_ifaddr *ifp, bool bump_id, send_rs = send_mld && ipv6_accept_ra(ifp->idev) && ifp->idev->cnf.rtr_solicits != 0 && - (dev->flags&IFF_LOOPBACK) == 0; + (dev->flags & IFF_LOOPBACK) == 0 && + (dev->type != ARPHRD_TUNNEL); read_unlock_bh(&ifp->idev->lock); /* While dad is in progress mld report's source address is in6_addrany. -- cgit v1.2.3 From 425b9c7f51c98443db71ad679893725483b21196 Mon Sep 17 00:00:00 2001 From: Vasily Averin Date: Mon, 2 May 2022 15:15:51 +0300 Subject: memcg: accounting for objects allocated for new netdevice Creating a new netdevice allocates at least ~50Kb of memory for various kernel objects, but only ~5Kb of them are accounted to memcg. As a result, creating an unlimited number of netdevice inside a memcg-limited container does not fall within memcg restrictions, consumes a significant part of the host's memory, can cause global OOM and lead to random kills of host processes. The main consumers of non-accounted memory are: ~10Kb 80+ kernfs nodes ~6Kb ipv6_add_dev() allocations 6Kb __register_sysctl_table() allocations 4Kb neigh_sysctl_register() allocations 4Kb __devinet_sysctl_register() allocations 4Kb __addrconf_sysctl_register() allocations Accounting of these objects allows to increase the share of memcg-related memory up to 60-70% (~38Kb accounted vs ~54Kb total for dummy netdevice on typical VM with default Fedora 35 kernel) and this should be enough to somehow protect the host from misuse inside container. Other related objects are quite small and may not be taken into account to minimize the expected performance degradation. It should be separately mentonied ~300 bytes of percpu allocation of struct ipstats_mib in snmp6_alloc_dev(), on huge multi-cpu nodes it can become the main consumer of memory. This patch does not enables kernfs accounting as it affects other parts of the kernel and should be discussed separately. However, even without kernfs, this patch significantly improves the current situation and allows to take into account more than half of all netdevice allocations. Signed-off-by: Vasily Averin Acked-by: Luis Chamberlain Link: https://lore.kernel.org/r/354a0a5f-9ec3-a25c-3215-304eab2157bc@openvz.org Signed-off-by: Jakub Kicinski --- fs/proc/proc_sysctl.c | 2 +- net/core/neighbour.c | 2 +- net/ipv4/devinet.c | 2 +- net/ipv6/addrconf.c | 8 ++++---- 4 files changed, 7 insertions(+), 7 deletions(-) (limited to 'net/ipv6/addrconf.c') diff --git a/fs/proc/proc_sysctl.c b/fs/proc/proc_sysctl.c index 5851c2a92c0d..c29f39c01a9a 100644 --- a/fs/proc/proc_sysctl.c +++ b/fs/proc/proc_sysctl.c @@ -1333,7 +1333,7 @@ struct ctl_table_header *__register_sysctl_table( nr_entries++; header = kzalloc(sizeof(struct ctl_table_header) + - sizeof(struct ctl_node)*nr_entries, GFP_KERNEL); + sizeof(struct ctl_node)*nr_entries, GFP_KERNEL_ACCOUNT); if (!header) return NULL; diff --git a/net/core/neighbour.c b/net/core/neighbour.c index f64ebd050f6c..47b6c1f0fdbb 100644 --- a/net/core/neighbour.c +++ b/net/core/neighbour.c @@ -3728,7 +3728,7 @@ int neigh_sysctl_register(struct net_device *dev, struct neigh_parms *p, char neigh_path[ sizeof("net//neigh/") + IFNAMSIZ + IFNAMSIZ ]; char *p_name; - t = kmemdup(&neigh_sysctl_template, sizeof(*t), GFP_KERNEL); + t = kmemdup(&neigh_sysctl_template, sizeof(*t), GFP_KERNEL_ACCOUNT); if (!t) goto err; diff --git a/net/ipv4/devinet.c b/net/ipv4/devinet.c index 53a6b14dc50a..89141ba5e9ee 100644 --- a/net/ipv4/devinet.c +++ b/net/ipv4/devinet.c @@ -2573,7 +2573,7 @@ static int __devinet_sysctl_register(struct net *net, char *dev_name, struct devinet_sysctl_table *t; char path[sizeof("net/ipv4/conf/") + IFNAMSIZ]; - t = kmemdup(&devinet_sysctl, sizeof(*t), GFP_KERNEL); + t = kmemdup(&devinet_sysctl, sizeof(*t), GFP_KERNEL_ACCOUNT); if (!t) goto out; diff --git a/net/ipv6/addrconf.c b/net/ipv6/addrconf.c index 7dee662d7019..cde242dca530 100644 --- a/net/ipv6/addrconf.c +++ b/net/ipv6/addrconf.c @@ -335,7 +335,7 @@ static int snmp6_alloc_dev(struct inet6_dev *idev) { int i; - idev->stats.ipv6 = alloc_percpu(struct ipstats_mib); + idev->stats.ipv6 = alloc_percpu_gfp(struct ipstats_mib, GFP_KERNEL_ACCOUNT); if (!idev->stats.ipv6) goto err_ip; @@ -351,7 +351,7 @@ static int snmp6_alloc_dev(struct inet6_dev *idev) if (!idev->stats.icmpv6dev) goto err_icmp; idev->stats.icmpv6msgdev = kzalloc(sizeof(struct icmpv6msg_mib_device), - GFP_KERNEL); + GFP_KERNEL_ACCOUNT); if (!idev->stats.icmpv6msgdev) goto err_icmpmsg; @@ -375,7 +375,7 @@ static struct inet6_dev *ipv6_add_dev(struct net_device *dev) if (dev->mtu < IPV6_MIN_MTU && dev != blackhole_netdev) return ERR_PTR(-EINVAL); - ndev = kzalloc(sizeof(struct inet6_dev), GFP_KERNEL); + ndev = kzalloc(sizeof(*ndev), GFP_KERNEL_ACCOUNT); if (!ndev) return ERR_PTR(err); @@ -7060,7 +7060,7 @@ static int __addrconf_sysctl_register(struct net *net, char *dev_name, struct ctl_table *table; char path[sizeof("net/ipv6/conf/") + IFNAMSIZ]; - table = kmemdup(addrconf_sysctl, sizeof(addrconf_sysctl), GFP_KERNEL); + table = kmemdup(addrconf_sysctl, sizeof(addrconf_sysctl), GFP_KERNEL_ACCOUNT); if (!table) goto out; -- cgit v1.2.3