From 85b8ac01a421791d66c3a458a7f83cfd173fe3fa Mon Sep 17 00:00:00 2001
From: Lorenz Bauer
Date: Fri, 7 Feb 2020 10:37:12 +0000
Subject: bpf, sockmap: Check update requirements after locking

It's currently possible to insert sockets in unexpected states into a
sockmap, due to a TOCTTOU when updating the map from a syscall.
sock_map_update_elem checks that sk->sk_state == TCP_ESTABLISHED, locks
the socket and then calls sock_map_update_common. At this point, the
socket may have transitioned into another state, and the earlier
assumptions don't hold anymore. Crucially, it's conceivable (though very
unlikely) that a socket has become unhashed. This breaks the sockmap's
assumption that it will get a callback via sk->sk_prot->unhash.

Fix this by checking the (fixed) sk_type and sk_protocol without the
lock, followed by a locked check of sk_state.

Unfortunately it's not possible to push the check down into
sock_(map|hash)_update_common, since BPF_SOCK_OPS_PASSIVE_ESTABLISHED_CB
runs before the socket has transitioned from TCP_SYN_RECV into
TCP_ESTABLISHED.

Fixes: 604326b41a6f ("bpf, sockmap: convert to generic sk_msg interface")
Signed-off-by: Lorenz Bauer
Signed-off-by: Daniel Borkmann
Reviewed-by: Jakub Sitnicki
Link: https://lore.kernel.org/bpf/20200207103713.28175-1-lmb@cloudflare.com
---
 net/core/sock_map.c | 16 ++++++++++------
 1 file changed, 10 insertions(+), 6 deletions(-)

diff --git a/net/core/sock_map.c b/net/core/sock_map.c
index 8998e356f423..36a2433e183f 100644
--- a/net/core/sock_map.c
+++ b/net/core/sock_map.c
@@ -416,14 +416,16 @@ static int sock_map_update_elem(struct bpf_map *map, void *key,
 		ret = -EINVAL;
 		goto out;
 	}
-	if (!sock_map_sk_is_suitable(sk) ||
-	    sk->sk_state != TCP_ESTABLISHED) {
+	if (!sock_map_sk_is_suitable(sk)) {
 		ret = -EOPNOTSUPP;
 		goto out;
 	}
 
 	sock_map_sk_acquire(sk);
-	ret = sock_map_update_common(map, idx, sk, flags);
+	if (sk->sk_state != TCP_ESTABLISHED)
+		ret = -EOPNOTSUPP;
+	else
+		ret = sock_map_update_common(map, idx, sk, flags);
 	sock_map_sk_release(sk);
 out:
 	fput(sock->file);
@@ -739,14 +741,16 @@ static int sock_hash_update_elem(struct bpf_map *map, void *key,
 		ret = -EINVAL;
 		goto out;
 	}
-	if (!sock_map_sk_is_suitable(sk) ||
-	    sk->sk_state != TCP_ESTABLISHED) {
+	if (!sock_map_sk_is_suitable(sk)) {
 		ret = -EOPNOTSUPP;
 		goto out;
 	}
 
 	sock_map_sk_acquire(sk);
-	ret = sock_hash_update_common(map, key, sk, flags);
+	if (sk->sk_state != TCP_ESTABLISHED)
+		ret = -EOPNOTSUPP;
+	else
+		ret = sock_hash_update_common(map, key, sk, flags);
 	sock_map_sk_release(sk);
 out:
 	fput(sock->file);
--
cgit v1.2.3

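The shape of this fix is the standard TOCTTOU mitigation: validate
immutable properties before taking the lock, then re-check mutable state
once the lock is held. A minimal userspace sketch of the idea follows;
"struct conn" and its helpers are hypothetical stand-ins for the kernel's
socket and sockmap API, not the kernel code itself.

/*
 * Illustration (not from the patch): check-under-lock pattern.
 */
#include <errno.h>
#include <pthread.h>

enum { CONN_TYPE_STREAM = 1, CONN_ESTABLISHED = 2 };

struct conn {
	pthread_mutex_t lock;
	int type;	/* fixed at creation: safe to check unlocked */
	int state;	/* changes concurrently: check under the lock */
};

static int conn_is_suitable(const struct conn *c)
{
	return c->type == CONN_TYPE_STREAM;	/* immutable property */
}

static int conn_map_update(struct conn *c)
{
	int ret;

	if (!conn_is_suitable(c))	/* cheap unlocked check */
		return -EOPNOTSUPP;

	pthread_mutex_lock(&c->lock);
	/* Re-check mutable state now that it can no longer change. */
	if (c->state != CONN_ESTABLISHED)
		ret = -EOPNOTSUPP;
	else
		ret = 0;	/* ... perform the map update here ... */
	pthread_mutex_unlock(&c->lock);
	return ret;
}

Any state transition that races with the update now either happens before
the locked re-check (and is rejected) or is deferred until the lock drops.
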
From db6a5018b6e008c1d69c6628cdaa9541b8e70940 Mon Sep 17 00:00:00 2001
From: Jakub Sitnicki
Date: Thu, 6 Feb 2020 12:16:50 +0100
Subject: bpf, sockmap: Don't sleep while holding RCU lock on tear-down

rcu_read_lock is needed to protect access to psock inside sock_map_unref
when tearing down the map. However, we can't afford to sleep in lock_sock
while in an RCU read-side critical section.

Grab the RCU lock only after we have locked the socket.

This fixes RCU warnings triggerable on a VM with 1 vCPU when free'ing a
sockmap/sockhash that contains at least one socket:

| =============================
| WARNING: suspicious RCU usage
| 5.5.0-04005-g8fc91b972b73 #450 Not tainted
| -----------------------------
| include/linux/rcupdate.h:272 Illegal context switch in RCU read-side critical section!
|
| other info that might help us debug this:
|
|
| rcu_scheduler_active = 2, debug_locks = 1
| 4 locks held by kworker/0:1/62:
|  #0: ffff88813b019748 ((wq_completion)events){+.+.}, at: process_one_work+0x1d7/0x5e0
|  #1: ffffc900000abe50 ((work_completion)(&map->work)){+.+.}, at: process_one_work+0x1d7/0x5e0
|  #2: ffffffff82065d20 (rcu_read_lock){....}, at: sock_map_free+0x5/0x170
|  #3: ffff8881368c5df8 (&stab->lock){+...}, at: sock_map_free+0x64/0x170
|
| stack backtrace:
| CPU: 0 PID: 62 Comm: kworker/0:1 Not tainted 5.5.0-04005-g8fc91b972b73 #450
| Hardware name: QEMU Standard PC (i440FX + PIIX, 1996), BIOS ?-20190727_073836-buildvm-ppc64le-16.ppc.fedoraproject.org-3.fc31 04/01/2014
| Workqueue: events bpf_map_free_deferred
| Call Trace:
|  dump_stack+0x71/0xa0
|  ___might_sleep+0x105/0x190
|  lock_sock_nested+0x28/0x90
|  sock_map_free+0x95/0x170
|  bpf_map_free_deferred+0x58/0x80
|  process_one_work+0x260/0x5e0
|  worker_thread+0x4d/0x3e0
|  kthread+0x108/0x140
|  ? process_one_work+0x5e0/0x5e0
|  ? kthread_park+0x90/0x90
|  ret_from_fork+0x3a/0x50
|
| =============================
| WARNING: suspicious RCU usage
| 5.5.0-04005-g8fc91b972b73-dirty #452 Not tainted
| -----------------------------
| include/linux/rcupdate.h:272 Illegal context switch in RCU read-side critical section!
|
| other info that might help us debug this:
|
|
| rcu_scheduler_active = 2, debug_locks = 1
| 4 locks held by kworker/0:1/62:
|  #0: ffff88813b019748 ((wq_completion)events){+.+.}, at: process_one_work+0x1d7/0x5e0
|  #1: ffffc900000abe50 ((work_completion)(&map->work)){+.+.}, at: process_one_work+0x1d7/0x5e0
|  #2: ffffffff82065d20 (rcu_read_lock){....}, at: sock_hash_free+0x5/0x1d0
|  #3: ffff888139966e00 (&htab->buckets[i].lock){+...}, at: sock_hash_free+0x92/0x1d0
|
| stack backtrace:
| CPU: 0 PID: 62 Comm: kworker/0:1 Not tainted 5.5.0-04005-g8fc91b972b73-dirty #452
| Hardware name: QEMU Standard PC (i440FX + PIIX, 1996), BIOS ?-20190727_073836-buildvm-ppc64le-16.ppc.fedoraproject.org-3.fc31 04/01/2014
| Workqueue: events bpf_map_free_deferred
| Call Trace:
|  dump_stack+0x71/0xa0
|  ___might_sleep+0x105/0x190
|  lock_sock_nested+0x28/0x90
|  sock_hash_free+0xec/0x1d0
|  bpf_map_free_deferred+0x58/0x80
|  process_one_work+0x260/0x5e0
|  worker_thread+0x4d/0x3e0
|  kthread+0x108/0x140
|  ? process_one_work+0x5e0/0x5e0
|  ? kthread_park+0x90/0x90
|  ret_from_fork+0x3a/0x50

Fixes: 7e81a3530206 ("bpf: Sockmap, ensure sock lock held during tear down")
Signed-off-by: Jakub Sitnicki
Signed-off-by: Daniel Borkmann
Acked-by: John Fastabend
Link: https://lore.kernel.org/bpf/20200206111652.694507-2-jakub@cloudflare.com
---
 net/core/sock_map.c | 8 ++++----
 1 file changed, 4 insertions(+), 4 deletions(-)

diff --git a/net/core/sock_map.c b/net/core/sock_map.c
index 36a2433e183f..992504478e68 100644
--- a/net/core/sock_map.c
+++ b/net/core/sock_map.c
@@ -234,7 +234,6 @@ static void sock_map_free(struct bpf_map *map)
 	int i;
 
 	synchronize_rcu();
-	rcu_read_lock();
 	raw_spin_lock_bh(&stab->lock);
 	for (i = 0; i < stab->map.max_entries; i++) {
 		struct sock **psk = &stab->sks[i];
@@ -243,12 +242,13 @@ static void sock_map_free(struct bpf_map *map)
 		sk = xchg(psk, NULL);
 		if (sk) {
 			lock_sock(sk);
+			rcu_read_lock();
 			sock_map_unref(sk, psk);
+			rcu_read_unlock();
 			release_sock(sk);
 		}
 	}
 	raw_spin_unlock_bh(&stab->lock);
-	rcu_read_unlock();
 
 	synchronize_rcu();
 
@@ -863,19 +863,19 @@ static void sock_hash_free(struct bpf_map *map)
 	int i;
 
 	synchronize_rcu();
-	rcu_read_lock();
 	for (i = 0; i < htab->buckets_num; i++) {
 		bucket = sock_hash_select_bucket(htab, i);
 		raw_spin_lock_bh(&bucket->lock);
 		hlist_for_each_entry_safe(elem, node, &bucket->head, node) {
 			hlist_del_rcu(&elem->node);
 			lock_sock(elem->sk);
+			rcu_read_lock();
 			sock_map_unref(elem->sk, elem);
+			rcu_read_unlock();
 			release_sock(elem->sk);
 		}
 		raw_spin_unlock_bh(&bucket->lock);
 	}
-	rcu_read_unlock();
 
 	bpf_map_area_free(htab->buckets);
 	kfree(htab);
--
cgit v1.2.3

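The rule the fix enforces: an RCU read-side critical section must not
contain anything that can sleep, so the sleeping lock_sock is taken first
and rcu_read_lock nests inside it. A userspace analogue of the corrected
ordering, sketched with liburcu; teardown_one and unref_entry are
hypothetical names, and the calling thread is assumed to be registered
with rcu_register_thread().

/*
 * Illustration (not from the patch): sleep-capable lock outside,
 * RCU read-side section inside.
 * Build (assuming liburcu is installed): cc demo.c -lurcu -lpthread
 */
#include <pthread.h>
#include <urcu.h>

static pthread_mutex_t entry_lock = PTHREAD_MUTEX_INITIALIZER; /* may sleep */

static void unref_entry(void *entry)
{
	(void)entry;	/* reads RCU-protected state; must not sleep */
}

static void teardown_one(void *entry)
{
	pthread_mutex_lock(&entry_lock);	/* sleeping lock: acquire first */
	rcu_read_lock();			/* no sleeping from here on */
	unref_entry(entry);
	rcu_read_unlock();			/* read-side section ends... */
	pthread_mutex_unlock(&entry_lock);	/* ...before the sleeping unlock */
}

Inverting the nesting, as the old code did, is exactly what the
"Illegal context switch in RCU read-side critical section!" splat reports.
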
From 0b2dc83906cf1e694e48003eae5df8fa63f76fd9 Mon Sep 17 00:00:00 2001
From: Jakub Sitnicki
Date: Thu, 6 Feb 2020 12:16:51 +0100
Subject: bpf, sockhash: Synchronize_rcu before free'ing map

We need a synchronize_rcu before free'ing the sockhash because any
outstanding psock references will hold a pointer to the map, and using
them after the free would be a use-after-free.

This is a sister fix for sockhash, following commit 2bb90e5cc90e ("bpf:
sockmap, synchronize_rcu before free'ing map") which addressed sockmap;
it comes from a manual audit.

Fixes: 604326b41a6fb ("bpf, sockmap: convert to generic sk_msg interface")
Signed-off-by: Jakub Sitnicki
Signed-off-by: Daniel Borkmann
Acked-by: John Fastabend
Link: https://lore.kernel.org/bpf/20200206111652.694507-3-jakub@cloudflare.com
---
 net/core/sock_map.c | 4 ++++
 1 file changed, 4 insertions(+)

diff --git a/net/core/sock_map.c b/net/core/sock_map.c
index 992504478e68..085cef5857bb 100644
--- a/net/core/sock_map.c
+++ b/net/core/sock_map.c
@@ -250,6 +250,7 @@ static void sock_map_free(struct bpf_map *map)
 	}
 	raw_spin_unlock_bh(&stab->lock);
 
+	/* wait for psock readers accessing its map link */
 	synchronize_rcu();
 
 	bpf_map_area_free(stab->sks);
@@ -877,6 +878,9 @@ static void sock_hash_free(struct bpf_map *map)
 		raw_spin_unlock_bh(&bucket->lock);
 	}
 
+	/* wait for psock readers accessing its map link */
+	synchronize_rcu();
+
 	bpf_map_area_free(htab->buckets);
 	kfree(htab);
 }
--
cgit v1.2.3

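The free-side discipline generalizes: after unlinking entries, wait out
one grace period before releasing memory that readers may still hold a
pointer into. A liburcu sketch with a hypothetical map type; in the
kernel case the readers are psocks following their map link.

/*
 * Illustration (not from the patch): synchronize_rcu before free.
 */
#include <stdlib.h>
#include <urcu.h>

struct map {
	void **slots;
};

static void map_free(struct map *m)
{
	/* ... all entries are assumed unlinked at this point ... */

	synchronize_rcu();	/* wait for readers of the old slots */

	free(m->slots);		/* now no RCU reader can reference them */
	free(m);
}
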
From 88d6f130e5632bbf419a2e184ec7adcbe241260b Mon Sep 17 00:00:00 2001
From: Martin KaFai Lau
Date: Fri, 7 Feb 2020 00:18:10 -0800
Subject: bpf: Improve bucket_log calculation logic

It was reported that the max_t, ilog2, and roundup_pow_of_two macros
have exponential effects on the number of states in the sparse checker.

This patch breaks them up by calculating the "nbuckets" first so that
the "bucket_log" only needs to take ilog2().

In addition, Linus mentioned:

  Patch looks good, but I'd like to point out that it's not just sparse.

  You can see it with a simple

      make net/core/bpf_sk_storage.i
      grep 'smap->bucket_log = ' net/core/bpf_sk_storage.i | wc

  and see the end result:

      1  365071 2686974

  That's one line (the assignment line) that is 2,686,974 characters in
  length.

  Now, sparse does happen to react particularly badly to that (I didn't
  look to why, but I suspect it's just that evaluating all the types
  that don't actually ever end up getting used ends up being much more
  expensive than it should be), but I bet it's not good for gcc either.

Fixes: 6ac99e8f23d4 ("bpf: Introduce bpf sk local storage")
Reported-by: Randy Dunlap
Reported-by: Luc Van Oostenryck
Suggested-by: Linus Torvalds
Signed-off-by: Martin KaFai Lau
Signed-off-by: Daniel Borkmann
Reviewed-by: Luc Van Oostenryck
Link: https://lore.kernel.org/bpf/20200207081810.3918919-1-kafai@fb.com
---
 net/core/bpf_sk_storage.c | 5 +++--
 1 file changed, 3 insertions(+), 2 deletions(-)

diff --git a/net/core/bpf_sk_storage.c b/net/core/bpf_sk_storage.c
index 458be6b3eda9..3ab23f698221 100644
--- a/net/core/bpf_sk_storage.c
+++ b/net/core/bpf_sk_storage.c
@@ -643,9 +643,10 @@ static struct bpf_map *bpf_sk_storage_map_alloc(union bpf_attr *attr)
 		return ERR_PTR(-ENOMEM);
 
 	bpf_map_init_from_attr(&smap->map, attr);
 
+	nbuckets = roundup_pow_of_two(num_possible_cpus());
 	/* Use at least 2 buckets, select_bucket() is undefined behavior with 1 bucket */
-	smap->bucket_log = max_t(u32, 1, ilog2(roundup_pow_of_two(num_possible_cpus())));
-	nbuckets = 1U << smap->bucket_log;
+	nbuckets = max_t(u32, 2, nbuckets);
+	smap->bucket_log = ilog2(nbuckets);
 	cost = sizeof(*smap->buckets) * nbuckets + sizeof(*smap);
 	ret = bpf_map_charge_init(&smap->map.memory, cost);
--
cgit v1.2.3
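To see the arithmetic the patch untangles, the reworked calculation can
be run standalone. The sketch below uses portable stand-ins for the
kernel's roundup_pow_of_two() and ilog2() macros (simplified
assumptions: 32-bit inputs, no constant folding), making the
intermediate nbuckets step visible instead of one giant nested macro
expansion.

/*
 * Illustration (not from the patch): nbuckets first, then one ilog2().
 */
#include <stdint.h>
#include <stdio.h>

static uint32_t roundup_pow_of_two(uint32_t n)	/* stand-in for kernel macro */
{
	uint32_t p = 1;

	while (p < n)
		p <<= 1;
	return p;
}

static uint32_t ilog2(uint32_t n)		/* stand-in for kernel macro */
{
	uint32_t log = 0;

	while (n >>= 1)
		log++;
	return log;
}

int main(void)
{
	uint32_t ncpus = 6;	/* stand-in for num_possible_cpus() */
	uint32_t nbuckets, bucket_log;

	nbuckets = roundup_pow_of_two(ncpus);	/* 6 -> 8 */
	if (nbuckets < 2)			/* use at least 2 buckets */
		nbuckets = 2;
	bucket_log = ilog2(nbuckets);		/* 8 -> 3 */

	printf("nbuckets=%u bucket_log=%u\n", nbuckets, bucket_log);
	return 0;
}

The result is unchanged from the old one-liner; only the expression tree
the compiler and sparse must expand is drastically smaller.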