summaryrefslogtreecommitdiffstats
diff options
context:
space:
mode:
authorDavid S. Miller <davem@davemloft.net>2015-08-17 14:22:48 -0700
committerDavid S. Miller <davem@davemloft.net>2015-08-17 14:22:48 -0700
commit90eb7fa51ccb9af58c437581af73c4a1d5fd2b9f (patch)
treeb2a932691177a1608f4c91016eea0a71c244987f
parenta1c234f95cae2d293047bb6c36e7a4840dbac815 (diff)
parent30da679e679dbe7bb103c5d9c137ec5a070b7150 (diff)
downloadlinux-90eb7fa51ccb9af58c437581af73c4a1d5fd2b9f.tar.bz2
Merge branch 'bpf_fanout'
Willem de Bruijn says: ==================== packet: add cBPF and eBPF fanout modes Allow programmable fanout modes. Support both classical BPF programs passed directly and extended BPF programs passed by file descriptor. One use case is packet steering by deep packet inspection, for instance for packet steering by application layer header fields. Separate the configuration of the fanout mode and the configuration of the program, to allow dynamic updates to the latter at runtime. Changes v1 -> v2: - follow SO_LOCK_FILTER semantics on filter updates - only accept eBPF programs of type BPF_PROG_TYPE_SOCKET_FILTER - rename PACKET_FANOUT_BPF to PACKET_FANOUT_CBPF to match man 2 bpf usage: "classic" vs. "extended" BPF. ==================== Signed-off-by: David S. Miller <davem@davemloft.net>
-rw-r--r--include/uapi/linux/if_packet.h3
-rw-r--r--net/packet/af_packet.c130
-rw-r--r--net/packet/internal.h5
-rw-r--r--tools/testing/selftests/net/psock_fanout.c69
-rw-r--r--tools/testing/selftests/net/psock_lib.h29
5 files changed, 222 insertions, 14 deletions
diff --git a/include/uapi/linux/if_packet.h b/include/uapi/linux/if_packet.h
index d3d715f8c88f..9e7edfd8141e 100644
--- a/include/uapi/linux/if_packet.h
+++ b/include/uapi/linux/if_packet.h
@@ -55,6 +55,7 @@ struct sockaddr_ll {
#define PACKET_TX_HAS_OFF 19
#define PACKET_QDISC_BYPASS 20
#define PACKET_ROLLOVER_STATS 21
+#define PACKET_FANOUT_DATA 22
#define PACKET_FANOUT_HASH 0
#define PACKET_FANOUT_LB 1
@@ -62,6 +63,8 @@ struct sockaddr_ll {
#define PACKET_FANOUT_ROLLOVER 3
#define PACKET_FANOUT_RND 4
#define PACKET_FANOUT_QM 5
+#define PACKET_FANOUT_CBPF 6
+#define PACKET_FANOUT_EBPF 7
#define PACKET_FANOUT_FLAG_ROLLOVER 0x1000
#define PACKET_FANOUT_FLAG_DEFRAG 0x8000
diff --git a/net/packet/af_packet.c b/net/packet/af_packet.c
index b5afe538bb88..7b8e39a22387 100644
--- a/net/packet/af_packet.c
+++ b/net/packet/af_packet.c
@@ -92,6 +92,7 @@
#ifdef CONFIG_INET
#include <net/inet_common.h>
#endif
+#include <linux/bpf.h>
#include "internal.h"
@@ -1410,6 +1411,22 @@ static unsigned int fanout_demux_qm(struct packet_fanout *f,
return skb_get_queue_mapping(skb) % num;
}
+static unsigned int fanout_demux_bpf(struct packet_fanout *f,
+ struct sk_buff *skb,
+ unsigned int num)
+{
+ struct bpf_prog *prog;
+ unsigned int ret = 0;
+
+ rcu_read_lock();
+ prog = rcu_dereference(f->bpf_prog);
+ if (prog)
+ ret = BPF_PROG_RUN(prog, skb) % num;
+ rcu_read_unlock();
+
+ return ret;
+}
+
static bool fanout_has_flag(struct packet_fanout *f, u16 flag)
{
return f->flags & (flag >> 8);
@@ -1454,6 +1471,10 @@ static int packet_rcv_fanout(struct sk_buff *skb, struct net_device *dev,
case PACKET_FANOUT_ROLLOVER:
idx = fanout_demux_rollover(f, skb, 0, false, num);
break;
+ case PACKET_FANOUT_CBPF:
+ case PACKET_FANOUT_EBPF:
+ idx = fanout_demux_bpf(f, skb, num);
+ break;
}
if (fanout_has_flag(f, PACKET_FANOUT_FLAG_ROLLOVER))
@@ -1502,6 +1523,103 @@ static bool match_fanout_group(struct packet_type *ptype, struct sock *sk)
return false;
}
+static void fanout_init_data(struct packet_fanout *f)
+{
+ switch (f->type) {
+ case PACKET_FANOUT_LB:
+ atomic_set(&f->rr_cur, 0);
+ break;
+ case PACKET_FANOUT_CBPF:
+ case PACKET_FANOUT_EBPF:
+ RCU_INIT_POINTER(f->bpf_prog, NULL);
+ break;
+ }
+}
+
+static void __fanout_set_data_bpf(struct packet_fanout *f, struct bpf_prog *new)
+{
+ struct bpf_prog *old;
+
+ spin_lock(&f->lock);
+ old = rcu_dereference_protected(f->bpf_prog, lockdep_is_held(&f->lock));
+ rcu_assign_pointer(f->bpf_prog, new);
+ spin_unlock(&f->lock);
+
+ if (old) {
+ synchronize_net();
+ bpf_prog_destroy(old);
+ }
+}
+
+static int fanout_set_data_cbpf(struct packet_sock *po, char __user *data,
+ unsigned int len)
+{
+ struct bpf_prog *new;
+ struct sock_fprog fprog;
+ int ret;
+
+ if (sock_flag(&po->sk, SOCK_FILTER_LOCKED))
+ return -EPERM;
+ if (len != sizeof(fprog))
+ return -EINVAL;
+ if (copy_from_user(&fprog, data, len))
+ return -EFAULT;
+
+ ret = bpf_prog_create_from_user(&new, &fprog, NULL);
+ if (ret)
+ return ret;
+
+ __fanout_set_data_bpf(po->fanout, new);
+ return 0;
+}
+
+static int fanout_set_data_ebpf(struct packet_sock *po, char __user *data,
+ unsigned int len)
+{
+ struct bpf_prog *new;
+ u32 fd;
+
+ if (sock_flag(&po->sk, SOCK_FILTER_LOCKED))
+ return -EPERM;
+ if (len != sizeof(fd))
+ return -EINVAL;
+ if (copy_from_user(&fd, data, len))
+ return -EFAULT;
+
+ new = bpf_prog_get(fd);
+ if (IS_ERR(new))
+ return PTR_ERR(new);
+ if (new->type != BPF_PROG_TYPE_SOCKET_FILTER) {
+ bpf_prog_put(new);
+ return -EINVAL;
+ }
+
+ __fanout_set_data_bpf(po->fanout, new);
+ return 0;
+}
+
+static int fanout_set_data(struct packet_sock *po, char __user *data,
+ unsigned int len)
+{
+ switch (po->fanout->type) {
+ case PACKET_FANOUT_CBPF:
+ return fanout_set_data_cbpf(po, data, len);
+ case PACKET_FANOUT_EBPF:
+ return fanout_set_data_ebpf(po, data, len);
+ default:
+ return -EINVAL;
+ };
+}
+
+static void fanout_release_data(struct packet_fanout *f)
+{
+ switch (f->type) {
+ case PACKET_FANOUT_CBPF:
+ case PACKET_FANOUT_EBPF:
+ __fanout_set_data_bpf(f, NULL);
+ };
+}
+
static int fanout_add(struct sock *sk, u16 id, u16 type_flags)
{
struct packet_sock *po = pkt_sk(sk);
@@ -1519,6 +1637,8 @@ static int fanout_add(struct sock *sk, u16 id, u16 type_flags)
case PACKET_FANOUT_CPU:
case PACKET_FANOUT_RND:
case PACKET_FANOUT_QM:
+ case PACKET_FANOUT_CBPF:
+ case PACKET_FANOUT_EBPF:
break;
default:
return -EINVAL;
@@ -1561,10 +1681,10 @@ static int fanout_add(struct sock *sk, u16 id, u16 type_flags)
match->id = id;
match->type = type;
match->flags = flags;
- atomic_set(&match->rr_cur, 0);
INIT_LIST_HEAD(&match->list);
spin_lock_init(&match->lock);
atomic_set(&match->sk_ref, 0);
+ fanout_init_data(match);
match->prot_hook.type = po->prot_hook.type;
match->prot_hook.dev = po->prot_hook.dev;
match->prot_hook.func = packet_rcv_fanout;
@@ -1610,6 +1730,7 @@ static void fanout_release(struct sock *sk)
if (atomic_dec_and_test(&f->sk_ref)) {
list_del(&f->list);
dev_remove_pack(&f->prot_hook);
+ fanout_release_data(f);
kfree(f);
}
mutex_unlock(&fanout_mutex);
@@ -3529,6 +3650,13 @@ packet_setsockopt(struct socket *sock, int level, int optname, char __user *optv
return fanout_add(sk, val & 0xffff, val >> 16);
}
+ case PACKET_FANOUT_DATA:
+ {
+ if (!po->fanout)
+ return -EINVAL;
+
+ return fanout_set_data(po, optval, optlen);
+ }
case PACKET_TX_HAS_OFF:
{
unsigned int val;
diff --git a/net/packet/internal.h b/net/packet/internal.h
index e20b3e8829b8..9ee46314b7d7 100644
--- a/net/packet/internal.h
+++ b/net/packet/internal.h
@@ -79,7 +79,10 @@ struct packet_fanout {
u16 id;
u8 type;
u8 flags;
- atomic_t rr_cur;
+ union {
+ atomic_t rr_cur;
+ struct bpf_prog __rcu *bpf_prog;
+ };
struct list_head list;
struct sock *arr[PACKET_FANOUT_MAX];
spinlock_t lock;
diff --git a/tools/testing/selftests/net/psock_fanout.c b/tools/testing/selftests/net/psock_fanout.c
index 08c2a36ef7a9..412459369686 100644
--- a/tools/testing/selftests/net/psock_fanout.c
+++ b/tools/testing/selftests/net/psock_fanout.c
@@ -19,6 +19,8 @@
* - PACKET_FANOUT_LB
* - PACKET_FANOUT_CPU
* - PACKET_FANOUT_ROLLOVER
+ * - PACKET_FANOUT_CBPF
+ * - PACKET_FANOUT_EBPF
*
* Todo:
* - functionality: PACKET_FANOUT_FLAG_DEFRAG
@@ -44,7 +46,9 @@
#include <arpa/inet.h>
#include <errno.h>
#include <fcntl.h>
+#include <linux/unistd.h> /* for __NR_bpf */
#include <linux/filter.h>
+#include <linux/bpf.h>
#include <linux/if_packet.h>
#include <net/ethernet.h>
#include <netinet/ip.h>
@@ -91,6 +95,51 @@ static int sock_fanout_open(uint16_t typeflags, int num_packets)
return fd;
}
+static void sock_fanout_set_ebpf(int fd)
+{
+ const int len_off = __builtin_offsetof(struct __sk_buff, len);
+ struct bpf_insn prog[] = {
+ { BPF_ALU64 | BPF_MOV | BPF_X, 6, 1, 0, 0 },
+ { BPF_LDX | BPF_W | BPF_MEM, 0, 6, len_off, 0 },
+ { BPF_JMP | BPF_JGE | BPF_K, 0, 0, 1, DATA_LEN },
+ { BPF_JMP | BPF_JA | BPF_K, 0, 0, 4, 0 },
+ { BPF_LD | BPF_B | BPF_ABS, 0, 0, 0, 0x50 },
+ { BPF_JMP | BPF_JEQ | BPF_K, 0, 0, 2, DATA_CHAR },
+ { BPF_JMP | BPF_JEQ | BPF_K, 0, 0, 1, DATA_CHAR_1 },
+ { BPF_ALU | BPF_MOV | BPF_K, 0, 0, 0, 0 },
+ { BPF_JMP | BPF_EXIT, 0, 0, 0, 0 }
+ };
+ char log_buf[512];
+ union bpf_attr attr;
+ int pfd;
+
+ memset(&attr, 0, sizeof(attr));
+ attr.prog_type = BPF_PROG_TYPE_SOCKET_FILTER;
+ attr.insns = (unsigned long) prog;
+ attr.insn_cnt = sizeof(prog) / sizeof(prog[0]);
+ attr.license = (unsigned long) "GPL";
+ attr.log_buf = (unsigned long) log_buf,
+ attr.log_size = sizeof(log_buf),
+ attr.log_level = 1,
+
+ pfd = syscall(__NR_bpf, BPF_PROG_LOAD, &attr, sizeof(attr));
+ if (pfd < 0) {
+ perror("bpf");
+ fprintf(stderr, "bpf verifier:\n%s\n", log_buf);
+ exit(1);
+ }
+
+ if (setsockopt(fd, SOL_PACKET, PACKET_FANOUT_DATA, &pfd, sizeof(pfd))) {
+ perror("fanout data ebpf");
+ exit(1);
+ }
+
+ if (close(pfd)) {
+ perror("close ebpf");
+ exit(1);
+ }
+}
+
static char *sock_fanout_open_ring(int fd)
{
struct tpacket_req req = {
@@ -115,8 +164,8 @@ static char *sock_fanout_open_ring(int fd)
ring = mmap(0, req.tp_block_size * req.tp_block_nr,
PROT_READ | PROT_WRITE, MAP_SHARED, fd, 0);
- if (!ring) {
- fprintf(stderr, "packetsock ring mmap\n");
+ if (ring == MAP_FAILED) {
+ perror("packetsock ring mmap");
exit(1);
}
@@ -209,6 +258,7 @@ static int test_datapath(uint16_t typeflags, int port_off,
{
const int expect0[] = { 0, 0 };
char *rings[2];
+ uint8_t type = typeflags & 0xFF;
int fds[2], fds_udp[2][2], ret;
fprintf(stderr, "test: datapath 0x%hx\n", typeflags);
@@ -219,6 +269,11 @@ static int test_datapath(uint16_t typeflags, int port_off,
fprintf(stderr, "ERROR: failed open\n");
exit(1);
}
+ if (type == PACKET_FANOUT_CBPF)
+ sock_setfilter(fds[0], SOL_PACKET, PACKET_FANOUT_DATA);
+ else if (type == PACKET_FANOUT_EBPF)
+ sock_fanout_set_ebpf(fds[0]);
+
rings[0] = sock_fanout_open_ring(fds[0]);
rings[1] = sock_fanout_open_ring(fds[1]);
pair_udp_open(fds_udp[0], PORT_BASE);
@@ -227,11 +282,11 @@ static int test_datapath(uint16_t typeflags, int port_off,
/* Send data, but not enough to overflow a queue */
pair_udp_send(fds_udp[0], 15);
- pair_udp_send(fds_udp[1], 5);
+ pair_udp_send_char(fds_udp[1], 5, DATA_CHAR_1);
ret = sock_fanout_read(fds, rings, expect1);
/* Send more data, overflow the queue */
- pair_udp_send(fds_udp[0], 15);
+ pair_udp_send_char(fds_udp[0], 15, DATA_CHAR_1);
/* TODO: ensure consistent order between expect1 and expect2 */
ret |= sock_fanout_read(fds, rings, expect2);
@@ -275,6 +330,7 @@ int main(int argc, char **argv)
const int expect_rb[2][2] = { { 15, 5 }, { 20, 15 } };
const int expect_cpu0[2][2] = { { 20, 0 }, { 20, 0 } };
const int expect_cpu1[2][2] = { { 0, 20 }, { 0, 20 } };
+ const int expect_bpf[2][2] = { { 15, 5 }, { 15, 20 } };
int port_off = 2, tries = 5, ret;
test_control_single();
@@ -296,6 +352,11 @@ int main(int argc, char **argv)
ret |= test_datapath(PACKET_FANOUT_ROLLOVER,
port_off, expect_rb[0], expect_rb[1]);
+ ret |= test_datapath(PACKET_FANOUT_CBPF,
+ port_off, expect_bpf[0], expect_bpf[1]);
+ ret |= test_datapath(PACKET_FANOUT_EBPF,
+ port_off, expect_bpf[0], expect_bpf[1]);
+
set_cpuaffinity(0);
ret |= test_datapath(PACKET_FANOUT_CPU, port_off,
expect_cpu0[0], expect_cpu0[1]);
diff --git a/tools/testing/selftests/net/psock_lib.h b/tools/testing/selftests/net/psock_lib.h
index 37da54ac85a9..24bc7ec1be7d 100644
--- a/tools/testing/selftests/net/psock_lib.h
+++ b/tools/testing/selftests/net/psock_lib.h
@@ -30,6 +30,7 @@
#define DATA_LEN 100
#define DATA_CHAR 'a'
+#define DATA_CHAR_1 'b'
#define PORT_BASE 8000
@@ -37,29 +38,36 @@
# define __maybe_unused __attribute__ ((__unused__))
#endif
-static __maybe_unused void pair_udp_setfilter(int fd)
+static __maybe_unused void sock_setfilter(int fd, int lvl, int optnum)
{
struct sock_filter bpf_filter[] = {
{ 0x80, 0, 0, 0x00000000 }, /* LD pktlen */
- { 0x35, 0, 5, DATA_LEN }, /* JGE DATA_LEN [f goto nomatch]*/
+ { 0x35, 0, 4, DATA_LEN }, /* JGE DATA_LEN [f goto nomatch]*/
{ 0x30, 0, 0, 0x00000050 }, /* LD ip[80] */
- { 0x15, 0, 3, DATA_CHAR }, /* JEQ DATA_CHAR [f goto nomatch]*/
- { 0x30, 0, 0, 0x00000051 }, /* LD ip[81] */
- { 0x15, 0, 1, DATA_CHAR }, /* JEQ DATA_CHAR [f goto nomatch]*/
+ { 0x15, 1, 0, DATA_CHAR }, /* JEQ DATA_CHAR [t goto match]*/
+ { 0x15, 0, 1, DATA_CHAR_1}, /* JEQ DATA_CHAR_1 [t goto match]*/
{ 0x06, 0, 0, 0x00000060 }, /* RET match */
{ 0x06, 0, 0, 0x00000000 }, /* RET no match */
};
struct sock_fprog bpf_prog;
+ if (lvl == SOL_PACKET && optnum == PACKET_FANOUT_DATA)
+ bpf_filter[5].code = 0x16; /* RET A */
+
bpf_prog.filter = bpf_filter;
bpf_prog.len = sizeof(bpf_filter) / sizeof(struct sock_filter);
- if (setsockopt(fd, SOL_SOCKET, SO_ATTACH_FILTER, &bpf_prog,
+ if (setsockopt(fd, lvl, optnum, &bpf_prog,
sizeof(bpf_prog))) {
perror("setsockopt SO_ATTACH_FILTER");
exit(1);
}
}
+static __maybe_unused void pair_udp_setfilter(int fd)
+{
+ sock_setfilter(fd, SOL_SOCKET, SO_ATTACH_FILTER);
+}
+
static __maybe_unused void pair_udp_open(int fds[], uint16_t port)
{
struct sockaddr_in saddr, daddr;
@@ -96,11 +104,11 @@ static __maybe_unused void pair_udp_open(int fds[], uint16_t port)
}
}
-static __maybe_unused void pair_udp_send(int fds[], int num)
+static __maybe_unused void pair_udp_send_char(int fds[], int num, char payload)
{
char buf[DATA_LEN], rbuf[DATA_LEN];
- memset(buf, DATA_CHAR, sizeof(buf));
+ memset(buf, payload, sizeof(buf));
while (num--) {
/* Should really handle EINTR and EAGAIN */
if (write(fds[0], buf, sizeof(buf)) != sizeof(buf)) {
@@ -118,6 +126,11 @@ static __maybe_unused void pair_udp_send(int fds[], int num)
}
}
+static __maybe_unused void pair_udp_send(int fds[], int num)
+{
+ return pair_udp_send_char(fds, num, DATA_CHAR);
+}
+
static __maybe_unused void pair_udp_close(int fds[])
{
close(fds[0]);