From 4749c3ef854e3a5d3dd3cc0ccd2dcb7e05d583bd Mon Sep 17 00:00:00 2001 From: Florian Westphal Date: Thu, 30 Apr 2015 12:12:00 +0200 Subject: net: sched: remove TC_MUNGED bits Not used. pedit sets TC_MUNGED when packet content was altered, but all the core does is unset MUNGED again and then set OK2MUNGE. And the latter isn't tested anywhere. So lets remove both TC_MUNGED and TC_OK2MUNGE. Signed-off-by: Florian Westphal Acked-by: Alexei Starovoitov Acked-by: Daniel Borkmann Acked-by: Jamal Hadi Salim Signed-off-by: David S. Miller --- net/sched/act_api.c | 5 ----- net/sched/act_pedit.c | 5 +---- 2 files changed, 1 insertion(+), 9 deletions(-) (limited to 'net/sched') diff --git a/net/sched/act_api.c b/net/sched/act_api.c index 3d43e4979f27..af427a3dbcba 100644 --- a/net/sched/act_api.c +++ b/net/sched/act_api.c @@ -392,11 +392,6 @@ int tcf_action_exec(struct sk_buff *skb, const struct list_head *actions, list_for_each_entry(a, actions, list) { repeat: ret = a->ops->act(skb, a, res); - if (TC_MUNGED & skb->tc_verd) { - /* copied already, allow trampling */ - skb->tc_verd = SET_TC_OK2MUNGE(skb->tc_verd); - skb->tc_verd = CLR_TC_MUNGED(skb->tc_verd); - } if (ret == TC_ACT_REPEAT) goto repeat; /* we need a ttl - JHS */ if (ret != TC_ACT_PIPE) diff --git a/net/sched/act_pedit.c b/net/sched/act_pedit.c index 59649d588d79..17e6d6669c7f 100644 --- a/net/sched/act_pedit.c +++ b/net/sched/act_pedit.c @@ -108,7 +108,7 @@ static int tcf_pedit(struct sk_buff *skb, const struct tc_action *a, struct tcf_result *res) { struct tcf_pedit *p = a->priv; - int i, munged = 0; + int i; unsigned int off; if (skb_unclone(skb, GFP_ATOMIC)) @@ -156,11 +156,8 @@ static int tcf_pedit(struct sk_buff *skb, const struct tc_action *a, *ptr = ((*ptr & tkey->mask) ^ tkey->val); if (ptr == &_data) skb_store_bits(skb, off + offset, ptr, 4); - munged++; } - if (munged) - skb->tc_verd = SET_TC_MUNGED(skb->tc_verd); goto done; } else WARN(1, "pedit BUG: index %d\n", p->tcf_index); -- cgit v1.2.3 From 087c1a601ad7f851a2d31f5fa0e5e9dfc766df55 Mon Sep 17 00:00:00 2001 From: Alexei Starovoitov Date: Thu, 30 Apr 2015 20:14:07 -0700 Subject: net: sched: run ingress qdisc without locks TC classifiers/actions were converted to RCU by John in the series: http://thread.gmane.org/gmane.linux.network/329739/focus=329739 and many follow on patches. This is the last patch from that series that finally drops ingress spin_lock. Single cpu ingress+u32 performance goes from 22.9 Mpps to 24.5 Mpps. In two cpu case when both cores are receiving traffic on the same device and go into the same ingress+u32 the performance jumps from 4.5 + 4.5 Mpps to 23.5 + 23.5 Mpps Signed-off-by: John Fastabend Signed-off-by: Alexei Starovoitov Signed-off-by: Jamal Hadi Salim Acked-by: Daniel Borkmann Signed-off-by: David S. 
Miller --- net/core/dev.c | 2 -- net/sched/sch_ingress.c | 5 +++-- 2 files changed, 3 insertions(+), 4 deletions(-) (limited to 'net/sched') diff --git a/net/core/dev.c b/net/core/dev.c index c7ba0388f1be..74a5b62f7568 100644 --- a/net/core/dev.c +++ b/net/core/dev.c @@ -3547,10 +3547,8 @@ static int ing_filter(struct sk_buff *skb, struct netdev_queue *rxq) q = rcu_dereference(rxq->qdisc); if (q != &noop_qdisc) { - spin_lock(qdisc_lock(q)); if (likely(!test_bit(__QDISC_STATE_DEACTIVATED, &q->state))) result = qdisc_enqueue_root(skb, q); - spin_unlock(qdisc_lock(q)); } return result; diff --git a/net/sched/sch_ingress.c b/net/sched/sch_ingress.c index 4cdbfb85686a..a89cc3278bfb 100644 --- a/net/sched/sch_ingress.c +++ b/net/sched/sch_ingress.c @@ -65,11 +65,11 @@ static int ingress_enqueue(struct sk_buff *skb, struct Qdisc *sch) result = tc_classify(skb, fl, &res); - qdisc_bstats_update(sch, skb); + qdisc_bstats_update_cpu(sch, skb); switch (result) { case TC_ACT_SHOT: result = TC_ACT_SHOT; - qdisc_qstats_drop(sch); + qdisc_qstats_drop_cpu(sch); break; case TC_ACT_STOLEN: case TC_ACT_QUEUED: @@ -91,6 +91,7 @@ static int ingress_enqueue(struct sk_buff *skb, struct Qdisc *sch) static int ingress_init(struct Qdisc *sch, struct nlattr *opt) { net_inc_ingress_queue(); + sch->flags |= TCQ_F_CPUSTATS; return 0; } -- cgit v1.2.3 From 342db221829f8341e51839e40c47be137dfc6ebe Mon Sep 17 00:00:00 2001 From: Tom Herbert Date: Fri, 1 May 2015 11:30:13 -0700 Subject: sched: Call skb_get_hash_perturb in sch_fq_codel Call skb_get_hash_perturb instead of doing skb_flow_dissect and then jhash by hand. Signed-off-by: Tom Herbert Signed-off-by: David S. Miller --- net/sched/sch_fq_codel.c | 11 ++--------- 1 file changed, 2 insertions(+), 9 deletions(-) (limited to 'net/sched') diff --git a/net/sched/sch_fq_codel.c b/net/sched/sch_fq_codel.c index 1e52decb7b59..a6fc53d69513 100644 --- a/net/sched/sch_fq_codel.c +++ b/net/sched/sch_fq_codel.c @@ -23,7 +23,6 @@ #include #include #include -#include #include /* Fair Queue CoDel. @@ -68,15 +67,9 @@ struct fq_codel_sched_data { }; static unsigned int fq_codel_hash(const struct fq_codel_sched_data *q, - const struct sk_buff *skb) + struct sk_buff *skb) { - struct flow_keys keys; - unsigned int hash; - - skb_flow_dissect(skb, &keys); - hash = jhash_3words((__force u32)keys.dst, - (__force u32)keys.src ^ keys.ip_proto, - (__force u32)keys.ports, q->perturbation); + u32 hash = skb_get_hash_perturb(skb, q->perturbation); return reciprocal_scale(hash, q->flows_cnt); } -- cgit v1.2.3 From f969777ac35506133777ac7674c394e8e298e623 Mon Sep 17 00:00:00 2001 From: Tom Herbert Date: Fri, 1 May 2015 11:30:14 -0700 Subject: sched: Call skb_get_hash_perturb in sch_hhf Call skb_get_hash_perturb instead of doing skb_flow_dissect and then jhash by hand. Signed-off-by: Tom Herbert Signed-off-by: David S. 
Miller --- net/sched/sch_hhf.c | 19 +------------------ 1 file changed, 1 insertion(+), 18 deletions(-) (limited to 'net/sched') diff --git a/net/sched/sch_hhf.c b/net/sched/sch_hhf.c index 15d3aabfe250..9d15cb6b8cb1 100644 --- a/net/sched/sch_hhf.c +++ b/net/sched/sch_hhf.c @@ -9,7 +9,6 @@ #include #include #include -#include #include #include @@ -176,22 +175,6 @@ static u32 hhf_time_stamp(void) return jiffies; } -static unsigned int skb_hash(const struct hhf_sched_data *q, - const struct sk_buff *skb) -{ - struct flow_keys keys; - unsigned int hash; - - if (skb->sk && skb->sk->sk_hash) - return skb->sk->sk_hash; - - skb_flow_dissect(skb, &keys); - hash = jhash_3words((__force u32)keys.dst, - (__force u32)keys.src ^ keys.ip_proto, - (__force u32)keys.ports, q->perturbation); - return hash; -} - /* Looks up a heavy-hitter flow in a chaining list of table T. */ static struct hh_flow_state *seek_list(const u32 hash, struct list_head *head, @@ -280,7 +263,7 @@ static enum wdrr_bucket_idx hhf_classify(struct sk_buff *skb, struct Qdisc *sch) } /* Get hashed flow-id of the skb. */ - hash = skb_hash(q, skb); + hash = skb_get_hash_perturb(skb, q->perturbation); /* Check if this packet belongs to an already established HH flow. */ flow_pos = hash & HHF_BIT_MASK; -- cgit v1.2.3 From 63c0ad4d4135d3bdb81a1ee42436f3a403632a3e Mon Sep 17 00:00:00 2001 From: Tom Herbert Date: Fri, 1 May 2015 11:30:15 -0700 Subject: sched: Call skb_get_hash_perturb in sch_sfb Call skb_get_hash_perturb instead of doing skb_flow_dissect and then jhash by hand. Signed-off-by: Tom Herbert Signed-off-by: David S. Miller --- net/sched/sch_sfb.c | 24 ++++++++---------------- 1 file changed, 8 insertions(+), 16 deletions(-) (limited to 'net/sched') diff --git a/net/sched/sch_sfb.c b/net/sched/sch_sfb.c index 5819dd82630d..4b815193326c 100644 --- a/net/sched/sch_sfb.c +++ b/net/sched/sch_sfb.c @@ -26,7 +26,6 @@ #include #include #include -#include /* * SFB uses two B[l][n] : L x N arrays of bins (L levels, N bins per level) @@ -285,9 +284,9 @@ static int sfb_enqueue(struct sk_buff *skb, struct Qdisc *sch) int i; u32 p_min = ~0; u32 minqlen = ~0; - u32 r, slot, salt, sfbhash; + u32 r, sfbhash; + u32 slot = q->slot; int ret = NET_XMIT_SUCCESS | __NET_XMIT_BYPASS; - struct flow_keys keys; if (unlikely(sch->q.qlen >= q->limit)) { qdisc_qstats_overlimit(sch); @@ -309,22 +308,17 @@ static int sfb_enqueue(struct sk_buff *skb, struct Qdisc *sch) fl = rcu_dereference_bh(q->filter_list); if (fl) { + u32 salt; + /* If using external classifiers, get result and record it. 
*/ if (!sfb_classify(skb, fl, &ret, &salt)) goto other_drop; - keys.src = salt; - keys.dst = 0; - keys.ports = 0; + sfbhash = jhash_1word(salt, q->bins[slot].perturbation); } else { - skb_flow_dissect(skb, &keys); + sfbhash = skb_get_hash_perturb(skb, q->bins[slot].perturbation); } - slot = q->slot; - sfbhash = jhash_3words((__force u32)keys.dst, - (__force u32)keys.src, - (__force u32)keys.ports, - q->bins[slot].perturbation); if (!sfbhash) sfbhash = 1; sfb_skb_cb(skb)->hashes[slot] = sfbhash; @@ -356,10 +350,8 @@ static int sfb_enqueue(struct sk_buff *skb, struct Qdisc *sch) if (unlikely(p_min >= SFB_MAX_PROB)) { /* Inelastic flow */ if (q->double_buffering) { - sfbhash = jhash_3words((__force u32)keys.dst, - (__force u32)keys.src, - (__force u32)keys.ports, - q->bins[slot].perturbation); + sfbhash = skb_get_hash_perturb(skb, + q->bins[slot].perturbation); if (!sfbhash) sfbhash = 1; sfb_skb_cb(skb)->hashes[slot] = sfbhash; -- cgit v1.2.3 From ada1dba04c273dbabefff6a9a9f8c2bdcb61a858 Mon Sep 17 00:00:00 2001 From: Tom Herbert Date: Fri, 1 May 2015 11:30:16 -0700 Subject: sched: Call skb_get_hash_perturb in sch_sfq Call skb_get_hash_perturb instead of doing skb_flow_dissect and then jhash by hand. Signed-off-by: Tom Herbert Signed-off-by: David S. Miller --- net/sched/sch_sfq.c | 27 ++------------------------- 1 file changed, 2 insertions(+), 25 deletions(-) (limited to 'net/sched') diff --git a/net/sched/sch_sfq.c b/net/sched/sch_sfq.c index b877140beda5..7d1492663360 100644 --- a/net/sched/sch_sfq.c +++ b/net/sched/sch_sfq.c @@ -23,7 +23,6 @@ #include #include #include -#include #include @@ -156,30 +155,10 @@ static inline struct sfq_head *sfq_dep_head(struct sfq_sched_data *q, sfq_index return &q->dep[val - SFQ_MAX_FLOWS]; } -/* - * In order to be able to quickly rehash our queue when timer changes - * q->perturbation, we store flow_keys in skb->cb[] - */ -struct sfq_skb_cb { - struct flow_keys keys; -}; - -static inline struct sfq_skb_cb *sfq_skb_cb(const struct sk_buff *skb) -{ - qdisc_cb_private_validate(skb, sizeof(struct sfq_skb_cb)); - return (struct sfq_skb_cb *)qdisc_skb_cb(skb)->data; -} - static unsigned int sfq_hash(const struct sfq_sched_data *q, const struct sk_buff *skb) { - const struct flow_keys *keys = &sfq_skb_cb(skb)->keys; - unsigned int hash; - - hash = jhash_3words((__force u32)keys->dst, - (__force u32)keys->src ^ keys->ip_proto, - (__force u32)keys->ports, q->perturbation); - return hash & (q->divisor - 1); + return skb_get_hash_perturb(skb, q->perturbation) & (q->divisor - 1); } static unsigned int sfq_classify(struct sk_buff *skb, struct Qdisc *sch, @@ -196,10 +175,8 @@ static unsigned int sfq_classify(struct sk_buff *skb, struct Qdisc *sch, return TC_H_MIN(skb->priority); fl = rcu_dereference_bh(q->filter_list); - if (!fl) { - skb_flow_dissect(skb, &sfq_skb_cb(skb)->keys); + if (!fl) return sfq_hash(q, skb) + 1; - } *qerr = NET_XMIT_SUCCESS | __NET_XMIT_BYPASS; result = tc_classify(skb, fl, &res); -- cgit v1.2.3 From 2e99403d28c182aa7ffb2d4ef34472df4873a4dd Mon Sep 17 00:00:00 2001 From: Tom Herbert Date: Fri, 1 May 2015 11:30:18 -0700 Subject: sch_choke: Use flow_keys_digest Call make_flow_keys_digest to get a digest from flow keys and use that to pass skbuff cb and for comparing flows. Signed-off-by: Tom Herbert Signed-off-by: David S. 
Miller --- net/sched/sch_choke.c | 14 ++++---------- 1 file changed, 4 insertions(+), 10 deletions(-) (limited to 'net/sched') diff --git a/net/sched/sch_choke.c b/net/sched/sch_choke.c index c009eb9045ce..dfe3da75594c 100644 --- a/net/sched/sch_choke.c +++ b/net/sched/sch_choke.c @@ -133,16 +133,10 @@ static void choke_drop_by_idx(struct Qdisc *sch, unsigned int idx) --sch->q.qlen; } -/* private part of skb->cb[] that a qdisc is allowed to use - * is limited to QDISC_CB_PRIV_LEN bytes. - * As a flow key might be too large, we store a part of it only. - */ -#define CHOKE_K_LEN min_t(u32, sizeof(struct flow_keys), QDISC_CB_PRIV_LEN - 3) - struct choke_skb_cb { u16 classid; u8 keys_valid; - u8 keys[QDISC_CB_PRIV_LEN - 3]; + struct flow_keys_digest keys; }; static inline struct choke_skb_cb *choke_skb_cb(const struct sk_buff *skb) @@ -177,18 +171,18 @@ static bool choke_match_flow(struct sk_buff *skb1, if (!choke_skb_cb(skb1)->keys_valid) { choke_skb_cb(skb1)->keys_valid = 1; skb_flow_dissect(skb1, &temp); - memcpy(&choke_skb_cb(skb1)->keys, &temp, CHOKE_K_LEN); + make_flow_keys_digest(&choke_skb_cb(skb1)->keys, &temp); } if (!choke_skb_cb(skb2)->keys_valid) { choke_skb_cb(skb2)->keys_valid = 1; skb_flow_dissect(skb2, &temp); - memcpy(&choke_skb_cb(skb2)->keys, &temp, CHOKE_K_LEN); + make_flow_keys_digest(&choke_skb_cb(skb2)->keys, &temp); } return !memcmp(&choke_skb_cb(skb1)->keys, &choke_skb_cb(skb2)->keys, - CHOKE_K_LEN); + sizeof(choke_skb_cb(skb1)->keys)); } /* -- cgit v1.2.3 From 80ba92fa1a92dea128283f69f55b02242e213650 Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Fri, 8 May 2015 15:05:12 -0700 Subject: codel: add ce_threshold attribute For DCTCP or similar ECN based deployments on fabrics with shallow buffers, hosts are responsible for a good part of the buffering. This patch adds an optional ce_threshold to codel & fq_codel qdiscs, so that DCTCP can have feedback from queuing in the host. A DCTCP enabled egress port simply have a queue occupancy threshold above which ECT packets get CE mark. In codel language this translates to a sojourn time, so that one doesn't have to worry about bytes or bandwidth but delays. This makes the host an active participant in the health of the whole network. This also helps experimenting DCTCP in a setup without DCTCP compliant fabric. On following example, ce_threshold is set to 1ms, and we can see from 'ldelay xxx us' that TCP is not trying to go around the 5ms codel target. Queue has more capacity to absorb inelastic bursts (say from UDP traffic), as queues are maintained to an optimal level. 
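Conceptually, the new check runs at dequeue time: if a packet's sojourn time in the queue exceeds ce_threshold and the packet is ECN-capable, it is CE marked and a counter is bumped. A minimal standalone sketch of that idea, with illustrative names and plain microsecond timestamps (the actual kernel code, visible in the include/net/codel.h hunk further below, works on codel_time_t and calls INET_ECN_set_ce()):

#include <stdbool.h>
#include <stdint.h>

struct pkt {
	uint64_t enqueue_us;   /* timestamp recorded at enqueue */
	bool ect;              /* ECN-capable transport? */
	bool ce;               /* congestion experienced */
};

/* Mark CE once the sojourn time crosses the configured threshold. */
static void maybe_ce_mark(struct pkt *p, uint64_t now_us,
			  uint64_t ce_threshold_us, uint64_t *ce_mark)
{
	if (p->ect && now_us - p->enqueue_us > ce_threshold_us) {
		p->ce = true;
		(*ce_mark)++;  /* surfaces as ce_mark in qdisc stats */
	}
}

The sample output referenced above: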
lpaa23:~# ./tc -s -d qd sh dev eth1 qdisc mq 1: dev eth1 root Sent 87910654696 bytes 58065331 pkt (dropped 0, overlimits 0 requeues 42961) backlog 3108242b 364p requeues 42961 qdisc codel 8063: dev eth1 parent 1:1 limit 1000p target 5.0ms ce_threshold 1.0ms interval 100.0ms Sent 7363778701 bytes 4863809 pkt (dropped 0, overlimits 0 requeues 5503) rate 2348Mbit 193919pps backlog 255866b 46p requeues 5503 count 0 lastcount 0 ldelay 1.0ms drop_next 0us maxpacket 68130 ecn_mark 0 drop_overlimit 0 ce_mark 72384 qdisc codel 8064: dev eth1 parent 1:2 limit 1000p target 5.0ms ce_threshold 1.0ms interval 100.0ms Sent 7636486190 bytes 5043942 pkt (dropped 0, overlimits 0 requeues 5186) rate 2319Mbit 191538pps backlog 207418b 64p requeues 5186 count 0 lastcount 0 ldelay 694us drop_next 0us maxpacket 68130 ecn_mark 0 drop_overlimit 0 ce_mark 69873 qdisc codel 8065: dev eth1 parent 1:3 limit 1000p target 5.0ms ce_threshold 1.0ms interval 100.0ms Sent 11569360142 bytes 7641602 pkt (dropped 0, overlimits 0 requeues 5554) rate 3041Mbit 251096pps backlog 210446b 59p requeues 5554 count 0 lastcount 0 ldelay 889us drop_next 0us maxpacket 68130 ecn_mark 0 drop_overlimit 0 ce_mark 37780 ... Signed-off-by: Eric Dumazet Cc: Florian Westphal Cc: Daniel Borkmann Cc: Glenn Judd Cc: Nandita Dukkipati Cc: Neal Cardwell Cc: Yuchung Cheng Acked-by: Neal Cardwell Signed-off-by: David S. Miller --- include/net/codel.h | 12 +++++++++++- include/uapi/linux/pkt_sched.h | 4 ++++ net/sched/sch_codel.c | 15 +++++++++++++-- net/sched/sch_fq_codel.c | 15 ++++++++++++++- 4 files changed, 42 insertions(+), 4 deletions(-) (limited to 'net/sched') diff --git a/include/net/codel.h b/include/net/codel.h index aeee28081245..8c0f78f209e8 100644 --- a/include/net/codel.h +++ b/include/net/codel.h @@ -7,7 +7,7 @@ * Copyright (C) 2011-2012 Kathleen Nichols * Copyright (C) 2011-2012 Van Jacobson * Copyright (C) 2012 Michael D. 
Taht - * Copyright (C) 2012 Eric Dumazet + * Copyright (C) 2012,2015 Eric Dumazet * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions @@ -119,11 +119,13 @@ static inline u32 codel_time_to_us(codel_time_t val) /** * struct codel_params - contains codel parameters * @target: target queue size (in time units) + * @ce_threshold: threshold for marking packets with ECN CE * @interval: width of moving time window * @ecn: is Explicit Congestion Notification enabled */ struct codel_params { codel_time_t target; + codel_time_t ce_threshold; codel_time_t interval; bool ecn; }; @@ -159,17 +161,22 @@ struct codel_vars { * @maxpacket: largest packet we've seen so far * @drop_count: temp count of dropped packets in dequeue() * ecn_mark: number of packets we ECN marked instead of dropping + * ce_mark: number of packets CE marked because sojourn time was above ce_threshold */ struct codel_stats { u32 maxpacket; u32 drop_count; u32 ecn_mark; + u32 ce_mark; }; +#define CODEL_DISABLED_THRESHOLD INT_MAX + static void codel_params_init(struct codel_params *params) { params->interval = MS2TIME(100); params->target = MS2TIME(5); + params->ce_threshold = CODEL_DISABLED_THRESHOLD; params->ecn = false; } @@ -350,6 +357,9 @@ static struct sk_buff *codel_dequeue(struct Qdisc *sch, vars->rec_inv_sqrt); } end: + if (skb && codel_time_after(vars->ldelay, params->ce_threshold) && + INET_ECN_set_ce(skb)) + stats->ce_mark++; return skb; } #endif diff --git a/include/uapi/linux/pkt_sched.h b/include/uapi/linux/pkt_sched.h index 534b84710745..69d88b309cc7 100644 --- a/include/uapi/linux/pkt_sched.h +++ b/include/uapi/linux/pkt_sched.h @@ -679,6 +679,7 @@ enum { TCA_CODEL_LIMIT, TCA_CODEL_INTERVAL, TCA_CODEL_ECN, + TCA_CODEL_CE_THRESHOLD, __TCA_CODEL_MAX }; @@ -695,6 +696,7 @@ struct tc_codel_xstats { __u32 drop_overlimit; /* number of time max qdisc packet limit was hit */ __u32 ecn_mark; /* number of packets we ECN marked instead of dropped */ __u32 dropping; /* are we in dropping state ? */ + __u32 ce_mark; /* number of CE marked packets because of ce_threshold */ }; /* FQ_CODEL */ @@ -707,6 +709,7 @@ enum { TCA_FQ_CODEL_ECN, TCA_FQ_CODEL_FLOWS, TCA_FQ_CODEL_QUANTUM, + TCA_FQ_CODEL_CE_THRESHOLD, __TCA_FQ_CODEL_MAX }; @@ -730,6 +733,7 @@ struct tc_fq_codel_qd_stats { */ __u32 new_flows_len; /* count of flows in new list */ __u32 old_flows_len; /* count of flows in old list */ + __u32 ce_mark; /* packets above ce_threshold */ }; struct tc_fq_codel_cl_stats { diff --git a/net/sched/sch_codel.c b/net/sched/sch_codel.c index de28f8e968e8..1474b6560fac 100644 --- a/net/sched/sch_codel.c +++ b/net/sched/sch_codel.c @@ -6,7 +6,7 @@ * * Implemented on linux by : * Copyright (C) 2012 Michael D. 
Taht - * Copyright (C) 2012 Eric Dumazet + * Copyright (C) 2012,2015 Eric Dumazet * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions @@ -109,6 +109,7 @@ static const struct nla_policy codel_policy[TCA_CODEL_MAX + 1] = { [TCA_CODEL_LIMIT] = { .type = NLA_U32 }, [TCA_CODEL_INTERVAL] = { .type = NLA_U32 }, [TCA_CODEL_ECN] = { .type = NLA_U32 }, + [TCA_CODEL_CE_THRESHOLD]= { .type = NLA_U32 }, }; static int codel_change(struct Qdisc *sch, struct nlattr *opt) @@ -133,6 +134,12 @@ static int codel_change(struct Qdisc *sch, struct nlattr *opt) q->params.target = ((u64)target * NSEC_PER_USEC) >> CODEL_SHIFT; } + if (tb[TCA_CODEL_CE_THRESHOLD]) { + u64 val = nla_get_u32(tb[TCA_CODEL_CE_THRESHOLD]); + + q->params.ce_threshold = (val * NSEC_PER_USEC) >> CODEL_SHIFT; + } + if (tb[TCA_CODEL_INTERVAL]) { u32 interval = nla_get_u32(tb[TCA_CODEL_INTERVAL]); @@ -201,7 +208,10 @@ static int codel_dump(struct Qdisc *sch, struct sk_buff *skb) nla_put_u32(skb, TCA_CODEL_ECN, q->params.ecn)) goto nla_put_failure; - + if (q->params.ce_threshold != CODEL_DISABLED_THRESHOLD && + nla_put_u32(skb, TCA_CODEL_CE_THRESHOLD, + codel_time_to_us(q->params.ce_threshold))) + goto nla_put_failure; return nla_nest_end(skb, opts); nla_put_failure: @@ -220,6 +230,7 @@ static int codel_dump_stats(struct Qdisc *sch, struct gnet_dump *d) .ldelay = codel_time_to_us(q->vars.ldelay), .dropping = q->vars.dropping, .ecn_mark = q->stats.ecn_mark, + .ce_mark = q->stats.ce_mark, }; if (q->vars.dropping) { diff --git a/net/sched/sch_fq_codel.c b/net/sched/sch_fq_codel.c index a6fc53d69513..778739786b32 100644 --- a/net/sched/sch_fq_codel.c +++ b/net/sched/sch_fq_codel.c @@ -6,7 +6,7 @@ * as published by the Free Software Foundation; either version * 2 of the License, or (at your option) any later version. 
* - * Copyright (C) 2012 Eric Dumazet + * Copyright (C) 2012,2015 Eric Dumazet */ #include @@ -292,6 +292,7 @@ static const struct nla_policy fq_codel_policy[TCA_FQ_CODEL_MAX + 1] = { [TCA_FQ_CODEL_ECN] = { .type = NLA_U32 }, [TCA_FQ_CODEL_FLOWS] = { .type = NLA_U32 }, [TCA_FQ_CODEL_QUANTUM] = { .type = NLA_U32 }, + [TCA_FQ_CODEL_CE_THRESHOLD] = { .type = NLA_U32 }, }; static int fq_codel_change(struct Qdisc *sch, struct nlattr *opt) @@ -322,6 +323,12 @@ static int fq_codel_change(struct Qdisc *sch, struct nlattr *opt) q->cparams.target = (target * NSEC_PER_USEC) >> CODEL_SHIFT; } + if (tb[TCA_FQ_CODEL_CE_THRESHOLD]) { + u64 val = nla_get_u32(tb[TCA_FQ_CODEL_CE_THRESHOLD]); + + q->cparams.ce_threshold = (val * NSEC_PER_USEC) >> CODEL_SHIFT; + } + if (tb[TCA_FQ_CODEL_INTERVAL]) { u64 interval = nla_get_u32(tb[TCA_FQ_CODEL_INTERVAL]); @@ -441,6 +448,11 @@ static int fq_codel_dump(struct Qdisc *sch, struct sk_buff *skb) q->flows_cnt)) goto nla_put_failure; + if (q->cparams.ce_threshold != CODEL_DISABLED_THRESHOLD && + nla_put_u32(skb, TCA_FQ_CODEL_CE_THRESHOLD, + codel_time_to_us(q->cparams.ce_threshold))) + goto nla_put_failure; + return nla_nest_end(skb, opts); nla_put_failure: @@ -459,6 +471,7 @@ static int fq_codel_dump_stats(struct Qdisc *sch, struct gnet_dump *d) st.qdisc_stats.drop_overlimit = q->drop_overlimit; st.qdisc_stats.ecn_mark = q->cstats.ecn_mark; st.qdisc_stats.new_flow_count = q->new_flow_count; + st.qdisc_stats.ce_mark = q->cstats.ce_mark; list_for_each(pos, &q->new_flows) st.qdisc_stats.new_flows_len++; -- cgit v1.2.3 From d2788d34885d4ce5ba17a8996fd95d28942e574e Mon Sep 17 00:00:00 2001 From: Daniel Borkmann Date: Sat, 9 May 2015 22:51:32 +0200 Subject: net: sched: further simplify handle_ing Ingress qdisc has no other purpose than calling into tc_classify() that executes attached classifier(s) and action(s). It has a 1:1 relationship to dev->ingress_queue. After having commit 087c1a601ad7 ("net: sched: run ingress qdisc without locks") removed the central ingress lock, one major contention point is gone. The extra indirection layers however, are not necessary for calling into ingress qdisc. pktgen calling locally into netif_receive_skb() with a dummy u32, single CPU result on a Supermicro X10SLM-F, Xeon E3-1240: before ~21,1 Mpps, after patch ~22,9 Mpps. We can redirect the private classifier list to the netdev directly, without changing any classifier API bits (!) and execute on that from handle_ing() side. The __QDISC_STATE_DEACTIVATE test can be removed, ingress qdisc doesn't have a queue and thus dev_deactivate_queue() is also not applicable, ingress_cl_list provides similar behaviour. In other words, ingress qdisc acts like TCQ_F_BUILTIN qdisc. One next possible step is the removal of the dev's ingress (dummy) netdev_queue, and to only have the list member in the netdevice itself. Note, the filter chain is RCU protected and individual filter elements are being kfree'd by sched subsystem after RCU grace period. RCU read lock is being held by __netif_receive_skb_core(). Joint work with Alexei Starovoitov. Signed-off-by: Daniel Borkmann Signed-off-by: Alexei Starovoitov Signed-off-by: David S. 
Miller --- include/linux/netdevice.h | 4 ++++ net/core/dev.c | 30 ++++++++++++++---------- net/sched/sch_ingress.c | 58 ++++++++--------------------------------------- 3 files changed, 31 insertions(+), 61 deletions(-) (limited to 'net/sched') diff --git a/include/linux/netdevice.h b/include/linux/netdevice.h index 1899c74a7127..c4e1caf6056f 100644 --- a/include/linux/netdevice.h +++ b/include/linux/netdevice.h @@ -1655,7 +1655,11 @@ struct net_device { rx_handler_func_t __rcu *rx_handler; void __rcu *rx_handler_data; +#if CONFIG_NET_CLS_ACT + struct tcf_proto __rcu *ingress_cl_list; +#endif struct netdev_queue __rcu *ingress_queue; + unsigned char broadcast[MAX_ADDR_LEN]; #ifdef CONFIG_RFS_ACCEL struct cpu_rmap *rx_cpu_rmap; diff --git a/net/core/dev.c b/net/core/dev.c index 8a757464bfa2..e5f77c40bbd1 100644 --- a/net/core/dev.c +++ b/net/core/dev.c @@ -3525,31 +3525,37 @@ static inline struct sk_buff *handle_ing(struct sk_buff *skb, struct packet_type **pt_prev, int *ret, struct net_device *orig_dev) { - struct netdev_queue *rxq = rcu_dereference(skb->dev->ingress_queue); - struct Qdisc *q; + struct tcf_proto *cl = rcu_dereference_bh(skb->dev->ingress_cl_list); + struct tcf_result cl_res; /* If there's at least one ingress present somewhere (so * we get here via enabled static key), remaining devices * that are not configured with an ingress qdisc will bail - * out w/o the rcu_dereference(). + * out here. */ - if (!rxq || (q = rcu_dereference(rxq->qdisc)) == &noop_qdisc) + if (!cl) return skb; - if (*pt_prev) { *ret = deliver_skb(skb, *pt_prev, orig_dev); *pt_prev = NULL; } + qdisc_bstats_update_cpu(cl->q, skb); skb->tc_verd = SET_TC_AT(skb->tc_verd, AT_INGRESS); - if (likely(!test_bit(__QDISC_STATE_DEACTIVATED, &q->state))) { - switch (qdisc_enqueue_root(skb, q)) { - case TC_ACT_SHOT: - case TC_ACT_STOLEN: - kfree_skb(skb); - return NULL; - } + switch (tc_classify(skb, cl, &cl_res)) { + case TC_ACT_OK: + case TC_ACT_RECLASSIFY: + skb->tc_index = TC_H_MIN(cl_res.classid); + break; + case TC_ACT_SHOT: + qdisc_qstats_drop_cpu(cl->q); + case TC_ACT_STOLEN: + case TC_ACT_QUEUED: + kfree_skb(skb); + return NULL; + default: + break; } return skb; diff --git a/net/sched/sch_ingress.c b/net/sched/sch_ingress.c index a89cc3278bfb..e7c648fa9dc3 100644 --- a/net/sched/sch_ingress.c +++ b/net/sched/sch_ingress.c @@ -12,16 +12,10 @@ #include #include #include + #include #include - -struct ingress_qdisc_data { - struct tcf_proto __rcu *filter_list; -}; - -/* ------------------------- Class/flow operations ------------------------- */ - static struct Qdisc *ingress_leaf(struct Qdisc *sch, unsigned long arg) { return NULL; @@ -49,45 +43,11 @@ static void ingress_walk(struct Qdisc *sch, struct qdisc_walker *walker) static struct tcf_proto __rcu **ingress_find_tcf(struct Qdisc *sch, unsigned long cl) { - struct ingress_qdisc_data *p = qdisc_priv(sch); - - return &p->filter_list; -} - -/* --------------------------- Qdisc operations ---------------------------- */ + struct net_device *dev = qdisc_dev(sch); -static int ingress_enqueue(struct sk_buff *skb, struct Qdisc *sch) -{ - struct ingress_qdisc_data *p = qdisc_priv(sch); - struct tcf_result res; - struct tcf_proto *fl = rcu_dereference_bh(p->filter_list); - int result; - - result = tc_classify(skb, fl, &res); - - qdisc_bstats_update_cpu(sch, skb); - switch (result) { - case TC_ACT_SHOT: - result = TC_ACT_SHOT; - qdisc_qstats_drop_cpu(sch); - break; - case TC_ACT_STOLEN: - case TC_ACT_QUEUED: - result = TC_ACT_STOLEN; - break; - case TC_ACT_RECLASSIFY: 
- case TC_ACT_OK: - skb->tc_index = TC_H_MIN(res.classid); - default: - result = TC_ACT_OK; - break; - } - - return result; + return &dev->ingress_cl_list; } -/* ------------------------------------------------------------- */ - static int ingress_init(struct Qdisc *sch, struct nlattr *opt) { net_inc_ingress_queue(); @@ -98,9 +58,9 @@ static int ingress_init(struct Qdisc *sch, struct nlattr *opt) static void ingress_destroy(struct Qdisc *sch) { - struct ingress_qdisc_data *p = qdisc_priv(sch); + struct net_device *dev = qdisc_dev(sch); - tcf_destroy_chain(&p->filter_list); + tcf_destroy_chain(&dev->ingress_cl_list); net_dec_ingress_queue(); } @@ -111,6 +71,7 @@ static int ingress_dump(struct Qdisc *sch, struct sk_buff *skb) nest = nla_nest_start(skb, TCA_OPTIONS); if (nest == NULL) goto nla_put_failure; + return nla_nest_end(skb, nest); nla_put_failure: @@ -131,8 +92,6 @@ static const struct Qdisc_class_ops ingress_class_ops = { static struct Qdisc_ops ingress_qdisc_ops __read_mostly = { .cl_ops = &ingress_class_ops, .id = "ingress", - .priv_size = sizeof(struct ingress_qdisc_data), - .enqueue = ingress_enqueue, .init = ingress_init, .destroy = ingress_destroy, .dump = ingress_dump, @@ -149,6 +108,7 @@ static void __exit ingress_module_exit(void) unregister_qdisc(&ingress_qdisc_ops); } -module_init(ingress_module_init) -module_exit(ingress_module_exit) +module_init(ingress_module_init); +module_exit(ingress_module_exit); + MODULE_LICENSE("GPL"); -- cgit v1.2.3 From b396cca6fafccf16206a5d041d59c9e6b65b6f5a Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Mon, 11 May 2015 09:06:56 -0700 Subject: net: sched: deprecate enqueue_root() Only left enqueue_root() user is netem, and it looks not necessary : qdisc_skb_cb(skb)->pkt_len is preserved after one skb_clone() Signed-off-by: Eric Dumazet Signed-off-by: David S. Miller --- include/net/sch_generic.h | 6 ------ net/sched/sch_netem.c | 4 ++-- 2 files changed, 2 insertions(+), 8 deletions(-) (limited to 'net/sched') diff --git a/include/net/sch_generic.h b/include/net/sch_generic.h index 994b5a092f33..1b0a2e88ed2b 100644 --- a/include/net/sch_generic.h +++ b/include/net/sch_generic.h @@ -501,12 +501,6 @@ static inline int qdisc_enqueue(struct sk_buff *skb, struct Qdisc *sch) return sch->enqueue(skb, sch); } -static inline int qdisc_enqueue_root(struct sk_buff *skb, struct Qdisc *sch) -{ - qdisc_skb_cb(skb)->pkt_len = skb->len; - return qdisc_enqueue(skb, sch) & NET_XMIT_MASK; -} - static inline bool qdisc_is_percpu_stats(const struct Qdisc *q) { return q->flags & TCQ_F_CPUSTATS; diff --git a/net/sched/sch_netem.c b/net/sched/sch_netem.c index 956ead2cab9a..5abd1d9de989 100644 --- a/net/sched/sch_netem.c +++ b/net/sched/sch_netem.c @@ -440,9 +440,9 @@ static int netem_enqueue(struct sk_buff *skb, struct Qdisc *sch) if (count > 1 && (skb2 = skb_clone(skb, GFP_ATOMIC)) != NULL) { struct Qdisc *rootq = qdisc_root(sch); u32 dupsave = q->duplicate; /* prevent duplicating a dup... */ - q->duplicate = 0; - qdisc_enqueue_root(skb2, rootq); + q->duplicate = 0; + rootq->enqueue(skb2, rootq); q->duplicate = dupsave; } -- cgit v1.2.3 From a3eb95f891d6130b1fc03dd07a8b54cf0a5c8ab8 Mon Sep 17 00:00:00 2001 From: David Ward Date: Sat, 9 May 2015 22:01:46 -0400 Subject: net_sched: gred: add TCA_GRED_LIMIT attribute In a GRED qdisc, if the default "virtual queue" (VQ) does not have drop parameters configured, then packets for the default VQ are not subjected to RED and are only dropped if the queue is larger than the net_device's tx_queue_len. 
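As a standalone sketch of the two admission tests for a default VQ without drop parameters, using made-up names rather than the kernel API (the actual change is in the gred_enqueue() hunk further below):

#include <stdbool.h>

struct gred_sketch {
	unsigned int qlen;          /* packets currently queued */
	unsigned int backlog;       /* bytes currently queued */
	unsigned int tx_queue_len;  /* device limit, may be 0 */
	unsigned int limit;         /* new qdisc-wide byte limit */
};

/* Pre-patch: packet-count test against the device tx_queue_len;
 * a tx_queue_len of 0 makes this drop every default-VQ packet. */
static bool default_vq_accepts_pre(const struct gred_sketch *s)
{
	return s->qlen < s->tx_queue_len;
}

/* Post-patch: byte-based test against the configurable GRED limit. */
static bool default_vq_accepts_post(const struct gred_sketch *s,
				    unsigned int pkt_len)
{
	return s->backlog + pkt_len <= s->limit;
}

In either form the default VQ bypasses RED processing entirely.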
This behavior is useful for WRED mode, since these packets will still influence the calculated average queue length and (therefore) the drop probability for all of the other VQs. However, for some drivers tx_queue_len is zero. In other cases the user may wish to make the limit the same for all VQs (including the default VQ with no drop parameters). This change adds a TCA_GRED_LIMIT attribute to set the GRED queue limit, in bytes, during qdisc setup. (This limit is in bytes to be consistent with the drop parameters.) The default limit is the same as for a bfifo queue (tx_queue_len * psched_mtu). If the drop parameters of any VQ are configured with a smaller limit than the GRED queue limit, that VQ will still observe the smaller limit instead. Signed-off-by: David Ward Signed-off-by: David S. Miller --- include/uapi/linux/pkt_sched.h | 3 ++- net/sched/sch_gred.c | 28 ++++++++++++++++++++++++---- 2 files changed, 26 insertions(+), 5 deletions(-) (limited to 'net/sched') diff --git a/include/uapi/linux/pkt_sched.h b/include/uapi/linux/pkt_sched.h index 69d88b309cc7..8d2530daca9f 100644 --- a/include/uapi/linux/pkt_sched.h +++ b/include/uapi/linux/pkt_sched.h @@ -268,7 +268,8 @@ enum { TCA_GRED_STAB, TCA_GRED_DPS, TCA_GRED_MAX_P, - __TCA_GRED_MAX, + TCA_GRED_LIMIT, + __TCA_GRED_MAX, }; #define TCA_GRED_MAX (__TCA_GRED_MAX - 1) diff --git a/net/sched/sch_gred.c b/net/sched/sch_gred.c index a4ca4517cdc8..826e2994152b 100644 --- a/net/sched/sch_gred.c +++ b/net/sched/sch_gred.c @@ -165,7 +165,8 @@ static int gred_enqueue(struct sk_buff *skb, struct Qdisc *sch) * if no default DP has been configured. This * allows for DP flows to be left untouched. */ - if (skb_queue_len(&sch->q) < qdisc_dev(sch)->tx_queue_len) + if (likely(sch->qstats.backlog + qdisc_pkt_len(skb) <= + sch->limit)) return qdisc_enqueue_tail(skb, sch); else goto drop; @@ -397,7 +398,10 @@ static inline int gred_change_vq(struct Qdisc *sch, int dp, q->DP = dp; q->prio = prio; - q->limit = ctl->limit; + if (ctl->limit > sch->limit) + q->limit = sch->limit; + else + q->limit = ctl->limit; if (q->backlog == 0) red_end_of_idle_period(&q->vars); @@ -414,6 +418,7 @@ static const struct nla_policy gred_policy[TCA_GRED_MAX + 1] = { [TCA_GRED_STAB] = { .len = 256 }, [TCA_GRED_DPS] = { .len = sizeof(struct tc_gred_sopt) }, [TCA_GRED_MAX_P] = { .type = NLA_U32 }, + [TCA_GRED_LIMIT] = { .type = NLA_U32 }, }; static int gred_change(struct Qdisc *sch, struct nlattr *opt) @@ -433,11 +438,15 @@ static int gred_change(struct Qdisc *sch, struct nlattr *opt) if (err < 0) return err; - if (tb[TCA_GRED_PARMS] == NULL && tb[TCA_GRED_STAB] == NULL) + if (tb[TCA_GRED_PARMS] == NULL && tb[TCA_GRED_STAB] == NULL) { + if (tb[TCA_GRED_LIMIT] != NULL) + sch->limit = nla_get_u32(tb[TCA_GRED_LIMIT]); return gred_change_table_def(sch, opt); + } if (tb[TCA_GRED_PARMS] == NULL || - tb[TCA_GRED_STAB] == NULL) + tb[TCA_GRED_STAB] == NULL || + tb[TCA_GRED_LIMIT] != NULL) return -EINVAL; max_P = tb[TCA_GRED_MAX_P] ? nla_get_u32(tb[TCA_GRED_MAX_P]) : 0; @@ -501,6 +510,14 @@ static int gred_init(struct Qdisc *sch, struct nlattr *opt) if (tb[TCA_GRED_PARMS] || tb[TCA_GRED_STAB]) return -EINVAL; + if (tb[TCA_GRED_LIMIT]) + sch->limit = nla_get_u32(tb[TCA_GRED_LIMIT]); + else { + u32 qlen = qdisc_dev(sch)->tx_queue_len ? 
: 1; + + sch->limit = qlen * psched_mtu(qdisc_dev(sch)); + } + return gred_change_table_def(sch, tb[TCA_GRED_DPS]); } @@ -531,6 +548,9 @@ static int gred_dump(struct Qdisc *sch, struct sk_buff *skb) if (nla_put(skb, TCA_GRED_MAX_P, sizeof(max_p), max_p)) goto nla_put_failure; + if (nla_put_u32(skb, TCA_GRED_LIMIT, sch->limit)) + goto nla_put_failure; + parms = nla_nest_start(skb, TCA_GRED_PARMS); if (parms == NULL) goto nla_put_failure; -- cgit v1.2.3 From e578d9c02587d57bfa7b560767c698a668a468c6 Mon Sep 17 00:00:00 2001 From: Florian Westphal Date: Mon, 11 May 2015 19:50:41 +0200 Subject: net: sched: use counter to break reclassify loops Seems all we want here is to avoid endless 'goto reclassify' loop. tc_classify_compat even resets this counter when something other than TC_ACT_RECLASSIFY is returned, so this skb-counter doesn't break hypothetical loops induced by something other than perpetual TC_ACT_RECLASSIFY return values. skb_act_clone is now identical to skb_clone, so just use that. Tested with following (bogus) filter: tc filter add dev eth0 parent ffff: \ protocol ip u32 match u32 0 0 police rate 10Kbit burst \ 64000 mtu 1500 action reclassify Acked-by: Daniel Borkmann Signed-off-by: Florian Westphal Acked-by: Alexei Starovoitov Acked-by: Jamal Hadi Salim Signed-off-by: David S. Miller --- Documentation/networking/tc-actions-env-rules.txt | 4 ---- include/net/sch_generic.h | 15 --------------- include/uapi/linux/pkt_cls.h | 2 +- net/sched/act_mirred.c | 2 +- net/sched/sch_api.c | 12 +++--------- 5 files changed, 5 insertions(+), 30 deletions(-) (limited to 'net/sched') diff --git a/Documentation/networking/tc-actions-env-rules.txt b/Documentation/networking/tc-actions-env-rules.txt index 95c71716b2e2..f37814693ad3 100644 --- a/Documentation/networking/tc-actions-env-rules.txt +++ b/Documentation/networking/tc-actions-env-rules.txt @@ -8,10 +8,6 @@ For example if your action queues a packet to be processed later, or intentionally branches by redirecting a packet, then you need to clone the packet. -There are certain fields in the skb tc_verd that need to be reset so we -avoid loops, etc. A few are generic enough that skb_act_clone() -resets them for you, so invoke skb_act_clone() rather than skb_clone(). - 2) If you munge any packet thou shalt call pskb_expand_head in the case someone else is referencing the skb. After that you "own" the skb. diff --git a/include/net/sch_generic.h b/include/net/sch_generic.h index 1b0a2e88ed2b..2738f6f87908 100644 --- a/include/net/sch_generic.h +++ b/include/net/sch_generic.h @@ -739,21 +739,6 @@ static inline u32 qdisc_l2t(struct qdisc_rate_table* rtab, unsigned int pktlen) return rtab->data[slot]; } -#ifdef CONFIG_NET_CLS_ACT -static inline struct sk_buff *skb_act_clone(struct sk_buff *skb, gfp_t gfp_mask, - int action) -{ - struct sk_buff *n; - - n = skb_clone(skb, gfp_mask); - - if (n) { - n->tc_verd = SET_TC_VERD(n->tc_verd, 0); - } - return n; -} -#endif - struct psched_ratecfg { u64 rate_bytes_ps; /* bytes per second */ u32 mult; diff --git a/include/uapi/linux/pkt_cls.h b/include/uapi/linux/pkt_cls.h index 596ffa0c7084..ffc112c8e1c2 100644 --- a/include/uapi/linux/pkt_cls.h +++ b/include/uapi/linux/pkt_cls.h @@ -44,13 +44,13 @@ bits 9,10,11: redirect counter - redirect TTL. 
Loop avoidance #define TC_OK2MUNGE _TC_MAKEMASK1(1) #define SET_TC_OK2MUNGE(v) ( TC_OK2MUNGE | (v & ~TC_OK2MUNGE)) #define CLR_TC_OK2MUNGE(v) ( v & ~TC_OK2MUNGE) -#endif #define S_TC_VERD _TC_MAKE32(2) #define M_TC_VERD _TC_MAKEMASK(4,S_TC_VERD) #define G_TC_VERD(x) _TC_GETVALUE(x,S_TC_VERD,M_TC_VERD) #define V_TC_VERD(x) _TC_MAKEVALUE(x,S_TC_VERD) #define SET_TC_VERD(v,n) ((V_TC_VERD(n)) | (v & ~M_TC_VERD)) +#endif #define S_TC_FROM _TC_MAKE32(6) #define M_TC_FROM _TC_MAKEMASK(2,S_TC_FROM) diff --git a/net/sched/act_mirred.c b/net/sched/act_mirred.c index 3f63ceac8e01..a42a3b257226 100644 --- a/net/sched/act_mirred.c +++ b/net/sched/act_mirred.c @@ -151,7 +151,7 @@ static int tcf_mirred(struct sk_buff *skb, const struct tc_action *a, } at = G_TC_AT(skb->tc_verd); - skb2 = skb_act_clone(skb, GFP_ATOMIC, m->tcf_action); + skb2 = skb_clone(skb, GFP_ATOMIC); if (skb2 == NULL) goto out; diff --git a/net/sched/sch_api.c b/net/sched/sch_api.c index ad9eed70bc8f..0b74dc0ede9c 100644 --- a/net/sched/sch_api.c +++ b/net/sched/sch_api.c @@ -1816,13 +1816,8 @@ int tc_classify_compat(struct sk_buff *skb, const struct tcf_proto *tp, continue; err = tp->classify(skb, tp, res); - if (err >= 0) { -#ifdef CONFIG_NET_CLS_ACT - if (err != TC_ACT_RECLASSIFY && skb->tc_verd) - skb->tc_verd = SET_TC_VERD(skb->tc_verd, 0); -#endif + if (err >= 0) return err; - } } return -1; } @@ -1834,23 +1829,22 @@ int tc_classify(struct sk_buff *skb, const struct tcf_proto *tp, int err = 0; #ifdef CONFIG_NET_CLS_ACT const struct tcf_proto *otp = tp; + int limit = 0; reclassify: #endif err = tc_classify_compat(skb, tp, res); #ifdef CONFIG_NET_CLS_ACT if (err == TC_ACT_RECLASSIFY) { - u32 verd = G_TC_VERD(skb->tc_verd); tp = otp; - if (verd++ >= MAX_REC_LOOP) { + if (unlikely(limit++ >= MAX_REC_LOOP)) { net_notice_ratelimited("%s: packet reclassify loop rule prio %u protocol %02x\n", tp->q->ops->id, tp->prio & 0xffff, ntohs(tp->protocol)); return TC_ACT_SHOT; } - skb->tc_verd = SET_TC_VERD(skb->tc_verd, verd); goto reclassify; } #endif -- cgit v1.2.3 From 1bd758eb1cab2fa5b71a23f9e5d3c8076f4ed650 Mon Sep 17 00:00:00 2001 From: Jiri Pirko Date: Tue, 12 May 2015 14:56:07 +0200 Subject: net: change name of flow_dissector header to match the .c file name add couple of empty lines on the way. Signed-off-by: Jiri Pirko Signed-off-by: David S. 
Miller --- drivers/net/bonding/bond_main.c | 2 +- drivers/net/ethernet/cisco/enic/enic_clsf.c | 2 +- include/linux/skbuff.h | 2 +- include/net/flow_dissector.h | 68 +++++++++++++++++++++++++++++ include/net/flow_keys.h | 61 -------------------------- include/net/ip.h | 2 +- include/net/ipv6.h | 2 +- net/core/flow_dissector.c | 2 +- net/sched/cls_flow.c | 2 +- net/sched/sch_choke.c | 2 +- 10 files changed, 76 insertions(+), 69 deletions(-) create mode 100644 include/net/flow_dissector.h delete mode 100644 include/net/flow_keys.h (limited to 'net/sched') diff --git a/drivers/net/bonding/bond_main.c b/drivers/net/bonding/bond_main.c index a2e25de98bde..fe7c38721775 100644 --- a/drivers/net/bonding/bond_main.c +++ b/drivers/net/bonding/bond_main.c @@ -76,7 +76,7 @@ #include #include #include -#include +#include #include #include #include diff --git a/drivers/net/ethernet/cisco/enic/enic_clsf.c b/drivers/net/ethernet/cisco/enic/enic_clsf.c index 0be6850be8a2..380f04903a36 100644 --- a/drivers/net/ethernet/cisco/enic/enic_clsf.c +++ b/drivers/net/ethernet/cisco/enic/enic_clsf.c @@ -5,7 +5,7 @@ #include #include #include -#include +#include #include "enic_res.h" #include "enic_clsf.h" diff --git a/include/linux/skbuff.h b/include/linux/skbuff.h index c0b574a414e7..e35c2b13d434 100644 --- a/include/linux/skbuff.h +++ b/include/linux/skbuff.h @@ -34,7 +34,7 @@ #include #include #include -#include +#include /* A. Checksumming of received packets by device. * diff --git a/include/net/flow_dissector.h b/include/net/flow_dissector.h new file mode 100644 index 000000000000..5e99a7b3963c --- /dev/null +++ b/include/net/flow_dissector.h @@ -0,0 +1,68 @@ +#ifndef _NET_FLOW_DISSECTOR_H +#define _NET_FLOW_DISSECTOR_H + +/* struct flow_keys: + * @src: source ip address in case of IPv4 + * For IPv6 it contains 32bit hash of src address + * @dst: destination ip address in case of IPv4 + * For IPv6 it contains 32bit hash of dst address + * @ports: port numbers of Transport header + * port16[0]: src port number + * port16[1]: dst port number + * @thoff: Transport header offset + * @n_proto: Network header protocol (eg. IPv4/IPv6) + * @ip_proto: Transport header protocol (eg. TCP/UDP) + * All the members, except thoff, are in network byte order. + */ +struct flow_keys { + /* (src,dst) must be grouped, in the same way than in IP header */ + __be32 src; + __be32 dst; + union { + __be32 ports; + __be16 port16[2]; + }; + u16 thoff; + __be16 n_proto; + u8 ip_proto; +}; + +bool __skb_flow_dissect(const struct sk_buff *skb, struct flow_keys *flow, + void *data, __be16 proto, int nhoff, int hlen); + +static inline bool skb_flow_dissect(const struct sk_buff *skb, + struct flow_keys *flow) +{ + return __skb_flow_dissect(skb, flow, NULL, 0, 0, 0); +} + +__be32 __skb_flow_get_ports(const struct sk_buff *skb, int thoff, u8 ip_proto, + void *data, int hlen_proto); + +static inline __be32 skb_flow_get_ports(const struct sk_buff *skb, + int thoff, u8 ip_proto) +{ + return __skb_flow_get_ports(skb, thoff, ip_proto, NULL, 0); +} + +u32 flow_hash_from_keys(struct flow_keys *keys); + +unsigned int flow_get_hlen(const unsigned char *data, unsigned int max_len, + __be16 protocol); + +/* struct flow_keys_digest: + * + * This structure is used to hold a digest of the full flow keys. This is a + * larger "hash" of a flow to allow definitively matching specific flows where + * the 32 bit skb->hash is not large enough. The size is limited to 16 bytes so + * that it can by used in CB of skb (see sch_choke for an example). 
+ */ +#define FLOW_KEYS_DIGEST_LEN 16 +struct flow_keys_digest { + u8 data[FLOW_KEYS_DIGEST_LEN]; +}; + +void make_flow_keys_digest(struct flow_keys_digest *digest, + const struct flow_keys *flow); + +#endif diff --git a/include/net/flow_keys.h b/include/net/flow_keys.h deleted file mode 100644 index 6d6ef626811a..000000000000 --- a/include/net/flow_keys.h +++ /dev/null @@ -1,61 +0,0 @@ -#ifndef _NET_FLOW_KEYS_H -#define _NET_FLOW_KEYS_H - -/* struct flow_keys: - * @src: source ip address in case of IPv4 - * For IPv6 it contains 32bit hash of src address - * @dst: destination ip address in case of IPv4 - * For IPv6 it contains 32bit hash of dst address - * @ports: port numbers of Transport header - * port16[0]: src port number - * port16[1]: dst port number - * @thoff: Transport header offset - * @n_proto: Network header protocol (eg. IPv4/IPv6) - * @ip_proto: Transport header protocol (eg. TCP/UDP) - * All the members, except thoff, are in network byte order. - */ -struct flow_keys { - /* (src,dst) must be grouped, in the same way than in IP header */ - __be32 src; - __be32 dst; - union { - __be32 ports; - __be16 port16[2]; - }; - u16 thoff; - __be16 n_proto; - u8 ip_proto; -}; - -bool __skb_flow_dissect(const struct sk_buff *skb, struct flow_keys *flow, - void *data, __be16 proto, int nhoff, int hlen); -static inline bool skb_flow_dissect(const struct sk_buff *skb, struct flow_keys *flow) -{ - return __skb_flow_dissect(skb, flow, NULL, 0, 0, 0); -} -__be32 __skb_flow_get_ports(const struct sk_buff *skb, int thoff, u8 ip_proto, - void *data, int hlen_proto); -static inline __be32 skb_flow_get_ports(const struct sk_buff *skb, int thoff, u8 ip_proto) -{ - return __skb_flow_get_ports(skb, thoff, ip_proto, NULL, 0); -} -u32 flow_hash_from_keys(struct flow_keys *keys); -unsigned int flow_get_hlen(const unsigned char *data, unsigned int max_len, - __be16 protocol); - -/* struct flow_keys_digest: - * - * This structure is used to hold a digest of the full flow keys. This is a - * larger "hash" of a flow to allow definitively matching specific flows where - * the 32 bit skb->hash is not large enough. The size is limited to 16 bytes so - * that it can by used in CB of skb (see sch_choke for an example). 
- */ -#define FLOW_KEYS_DIGEST_LEN 16 -struct flow_keys_digest { - u8 data[FLOW_KEYS_DIGEST_LEN]; -}; - -void make_flow_keys_digest(struct flow_keys_digest *digest, - const struct flow_keys *flow); - -#endif diff --git a/include/net/ip.h b/include/net/ip.h index d14af7edd197..562eb653aebc 100644 --- a/include/net/ip.h +++ b/include/net/ip.h @@ -31,7 +31,7 @@ #include #include #include -#include +#include struct sock; diff --git a/include/net/ipv6.h b/include/net/ipv6.h index 53d25ef1699a..9932b86394a9 100644 --- a/include/net/ipv6.h +++ b/include/net/ipv6.h @@ -19,7 +19,7 @@ #include #include #include -#include +#include #include #define SIN6_LEN_RFC2133 24 diff --git a/net/core/flow_dissector.c b/net/core/flow_dissector.c index d3acc4dff4ae..eaa86b539221 100644 --- a/net/core/flow_dissector.c +++ b/net/core/flow_dissector.c @@ -12,7 +12,7 @@ #include #include #include -#include +#include #include /* copy saddr & daddr, possibly using 64bit load/store diff --git a/net/sched/cls_flow.c b/net/sched/cls_flow.c index a620c4e288a5..4c5a92298906 100644 --- a/net/sched/cls_flow.c +++ b/net/sched/cls_flow.c @@ -26,7 +26,7 @@ #include #include #include -#include +#include #if defined(CONFIG_NF_CONNTRACK) || defined(CONFIG_NF_CONNTRACK_MODULE) #include diff --git a/net/sched/sch_choke.c b/net/sched/sch_choke.c index dfe3da75594c..2624e991a2d1 100644 --- a/net/sched/sch_choke.c +++ b/net/sched/sch_choke.c @@ -18,7 +18,7 @@ #include #include #include -#include +#include /* CHOKe stateless AQM for fair bandwidth allocation -- cgit v1.2.3 From 06635a35d13d42b95422bba6633f175245cc644e Mon Sep 17 00:00:00 2001 From: Jiri Pirko Date: Tue, 12 May 2015 14:56:16 +0200 Subject: flow_dissect: use programable dissector in skb_flow_dissect and friends Signed-off-by: Jiri Pirko Signed-off-by: David S. 
Miller --- drivers/net/bonding/bond_main.c | 18 +-- drivers/net/ethernet/cisco/enic/enic_clsf.c | 27 ++-- drivers/net/ethernet/cisco/enic/enic_ethtool.c | 10 +- drivers/net/hyperv/netvsc_drv.c | 8 +- include/linux/skbuff.h | 4 +- include/net/flow_dissector.h | 63 ++++---- include/net/ip.h | 8 +- include/net/ipv6.h | 8 +- net/core/flow_dissector.c | 199 +++++++++++++++++-------- net/ethernet/eth.c | 6 +- net/sched/cls_flow.c | 20 +-- net/sched/sch_choke.c | 4 +- 12 files changed, 229 insertions(+), 146 deletions(-) (limited to 'net/sched') diff --git a/drivers/net/bonding/bond_main.c b/drivers/net/bonding/bond_main.c index fe7c38721775..ef26e0147050 100644 --- a/drivers/net/bonding/bond_main.c +++ b/drivers/net/bonding/bond_main.c @@ -3051,16 +3051,16 @@ static bool bond_flow_dissect(struct bonding *bond, struct sk_buff *skb, int noff, proto = -1; if (bond->params.xmit_policy > BOND_XMIT_POLICY_LAYER23) - return skb_flow_dissect(skb, fk); + return skb_flow_dissect_flow_keys(skb, fk); - fk->ports = 0; + fk->ports.ports = 0; noff = skb_network_offset(skb); if (skb->protocol == htons(ETH_P_IP)) { if (unlikely(!pskb_may_pull(skb, noff + sizeof(*iph)))) return false; iph = ip_hdr(skb); - fk->src = iph->saddr; - fk->dst = iph->daddr; + fk->addrs.src = iph->saddr; + fk->addrs.dst = iph->daddr; noff += iph->ihl << 2; if (!ip_is_fragment(iph)) proto = iph->protocol; @@ -3068,15 +3068,15 @@ static bool bond_flow_dissect(struct bonding *bond, struct sk_buff *skb, if (unlikely(!pskb_may_pull(skb, noff + sizeof(*iph6)))) return false; iph6 = ipv6_hdr(skb); - fk->src = (__force __be32)ipv6_addr_hash(&iph6->saddr); - fk->dst = (__force __be32)ipv6_addr_hash(&iph6->daddr); + fk->addrs.src = (__force __be32)ipv6_addr_hash(&iph6->saddr); + fk->addrs.dst = (__force __be32)ipv6_addr_hash(&iph6->daddr); noff += sizeof(*iph6); proto = iph6->nexthdr; } else { return false; } if (bond->params.xmit_policy == BOND_XMIT_POLICY_LAYER34 && proto >= 0) - fk->ports = skb_flow_get_ports(skb, noff, proto); + fk->ports.ports = skb_flow_get_ports(skb, noff, proto); return true; } @@ -3102,8 +3102,8 @@ u32 bond_xmit_hash(struct bonding *bond, struct sk_buff *skb) bond->params.xmit_policy == BOND_XMIT_POLICY_ENCAP23) hash = bond_eth_hash(skb); else - hash = (__force u32)flow.ports; - hash ^= (__force u32)flow.dst ^ (__force u32)flow.src; + hash = (__force u32)flow.ports.ports; + hash ^= (__force u32)flow.addrs.dst ^ (__force u32)flow.addrs.src; hash ^= (hash >> 16); hash ^= (hash >> 8); diff --git a/drivers/net/ethernet/cisco/enic/enic_clsf.c b/drivers/net/ethernet/cisco/enic/enic_clsf.c index 380f04903a36..d3d25c7e5b96 100644 --- a/drivers/net/ethernet/cisco/enic/enic_clsf.c +++ b/drivers/net/ethernet/cisco/enic/enic_clsf.c @@ -22,7 +22,7 @@ int enic_addfltr_5t(struct enic *enic, struct flow_keys *keys, u16 rq) int res; struct filter data; - switch (keys->ip_proto) { + switch (keys->basic.ip_proto) { case IPPROTO_TCP: data.u.ipv4.protocol = PROTO_TCP; break; @@ -33,10 +33,10 @@ int enic_addfltr_5t(struct enic *enic, struct flow_keys *keys, u16 rq) return -EPROTONOSUPPORT; }; data.type = FILTER_IPV4_5TUPLE; - data.u.ipv4.src_addr = ntohl(keys->src); - data.u.ipv4.dst_addr = ntohl(keys->dst); - data.u.ipv4.src_port = ntohs(keys->port16[0]); - data.u.ipv4.dst_port = ntohs(keys->port16[1]); + data.u.ipv4.src_addr = ntohl(keys->addrs.src); + data.u.ipv4.dst_addr = ntohl(keys->addrs.dst); + data.u.ipv4.src_port = ntohs(keys->ports.port16[0]); + data.u.ipv4.dst_port = ntohs(keys->ports.port16[1]); data.u.ipv4.flags = 
FILTER_FIELDS_IPV4_5TUPLE; spin_lock_bh(&enic->devcmd_lock); @@ -158,11 +158,11 @@ static struct enic_rfs_fltr_node *htbl_key_search(struct hlist_head *h, struct enic_rfs_fltr_node *tpos; hlist_for_each_entry(tpos, h, node) - if (tpos->keys.src == k->src && - tpos->keys.dst == k->dst && - tpos->keys.ports == k->ports && - tpos->keys.ip_proto == k->ip_proto && - tpos->keys.n_proto == k->n_proto) + if (tpos->keys.addrs.src == k->addrs.src && + tpos->keys.addrs.dst == k->addrs.dst && + tpos->keys.ports.ports == k->ports.ports && + tpos->keys.basic.ip_proto == k->basic.ip_proto && + tpos->keys.basic.n_proto == k->basic.n_proto) return tpos; return NULL; } @@ -177,9 +177,10 @@ int enic_rx_flow_steer(struct net_device *dev, const struct sk_buff *skb, int res, i; enic = netdev_priv(dev); - res = skb_flow_dissect(skb, &keys); - if (!res || keys.n_proto != htons(ETH_P_IP) || - (keys.ip_proto != IPPROTO_TCP && keys.ip_proto != IPPROTO_UDP)) + res = skb_flow_dissect_flow_keys(skb, &keys); + if (!res || keys.basic.n_proto != htons(ETH_P_IP) || + (keys.basic.ip_proto != IPPROTO_TCP && + keys.basic.ip_proto != IPPROTO_UDP)) return -EPROTONOSUPPORT; tbl_idx = skb_get_hash_raw(skb) & ENIC_RFS_FLW_MASK; diff --git a/drivers/net/ethernet/cisco/enic/enic_ethtool.c b/drivers/net/ethernet/cisco/enic/enic_ethtool.c index 28d9ca675a27..7588f8dcc890 100644 --- a/drivers/net/ethernet/cisco/enic/enic_ethtool.c +++ b/drivers/net/ethernet/cisco/enic/enic_ethtool.c @@ -334,7 +334,7 @@ static int enic_grxclsrule(struct enic *enic, struct ethtool_rxnfc *cmd) n = htbl_fltr_search(enic, (u16)fsp->location); if (!n) return -EINVAL; - switch (n->keys.ip_proto) { + switch (n->keys.basic.ip_proto) { case IPPROTO_TCP: fsp->flow_type = TCP_V4_FLOW; break; @@ -346,16 +346,16 @@ static int enic_grxclsrule(struct enic *enic, struct ethtool_rxnfc *cmd) break; } - fsp->h_u.tcp_ip4_spec.ip4src = n->keys.src; + fsp->h_u.tcp_ip4_spec.ip4src = n->keys.addrs.src; fsp->m_u.tcp_ip4_spec.ip4src = (__u32)~0; - fsp->h_u.tcp_ip4_spec.ip4dst = n->keys.dst; + fsp->h_u.tcp_ip4_spec.ip4dst = n->keys.addrs.dst; fsp->m_u.tcp_ip4_spec.ip4dst = (__u32)~0; - fsp->h_u.tcp_ip4_spec.psrc = n->keys.port16[0]; + fsp->h_u.tcp_ip4_spec.psrc = n->keys.ports.port16[0]; fsp->m_u.tcp_ip4_spec.psrc = (__u16)~0; - fsp->h_u.tcp_ip4_spec.pdst = n->keys.port16[1]; + fsp->h_u.tcp_ip4_spec.pdst = n->keys.ports.port16[1]; fsp->m_u.tcp_ip4_spec.pdst = (__u16)~0; fsp->ring_cookie = n->rq_id; diff --git a/drivers/net/hyperv/netvsc_drv.c b/drivers/net/hyperv/netvsc_drv.c index 5993c7e2d723..8e5fe888a0ec 100644 --- a/drivers/net/hyperv/netvsc_drv.c +++ b/drivers/net/hyperv/netvsc_drv.c @@ -196,12 +196,12 @@ static bool netvsc_set_hash(u32 *hash, struct sk_buff *skb) struct flow_keys flow; int data_len; - if (!skb_flow_dissect(skb, &flow) || - !(flow.n_proto == htons(ETH_P_IP) || - flow.n_proto == htons(ETH_P_IPV6))) + if (!skb_flow_dissect_flow_keys(skb, &flow) || + !(flow.basic.n_proto == htons(ETH_P_IP) || + flow.basic.n_proto == htons(ETH_P_IPV6))) return false; - if (flow.ip_proto == IPPROTO_TCP) + if (flow.basic.ip_proto == IPPROTO_TCP) data_len = 12; else data_len = 8; diff --git a/include/linux/skbuff.h b/include/linux/skbuff.h index b01c7fba7c17..f83aa6568cbf 100644 --- a/include/linux/skbuff.h +++ b/include/linux/skbuff.h @@ -1935,8 +1935,8 @@ static inline void skb_probe_transport_header(struct sk_buff *skb, if (skb_transport_header_was_set(skb)) return; - else if (skb_flow_dissect(skb, &keys)) - skb_set_transport_header(skb, keys.thoff); + else if 
(skb_flow_dissect_flow_keys(skb, &keys)) + skb_set_transport_header(skb, keys.basic.thoff); else skb_set_transport_header(skb, offset_hint); } diff --git a/include/net/flow_dissector.h b/include/net/flow_dissector.h index 20239e807f77..0c8d406fb730 100644 --- a/include/net/flow_dissector.h +++ b/include/net/flow_dissector.h @@ -59,42 +59,47 @@ struct flow_dissector { unsigned short int offset[FLOW_DISSECTOR_KEY_MAX]; }; -/* struct flow_keys: - * @src: source ip address in case of IPv4 - * For IPv6 it contains 32bit hash of src address - * @dst: destination ip address in case of IPv4 - * For IPv6 it contains 32bit hash of dst address - * @ports: port numbers of Transport header - * port16[0]: src port number - * port16[1]: dst port number - * @thoff: Transport header offset - * @n_proto: Network header protocol (eg. IPv4/IPv6) - * @ip_proto: Transport header protocol (eg. TCP/UDP) - * All the members, except thoff, are in network byte order. - */ -struct flow_keys { - /* (src,dst) must be grouped, in the same way than in IP header */ - __be32 src; - __be32 dst; - union { - __be32 ports; - __be16 port16[2]; - }; - u16 thoff; - __be16 n_proto; - u8 ip_proto; -}; - void skb_flow_dissector_init(struct flow_dissector *flow_dissector, const struct flow_dissector_key *key, unsigned int key_count); -bool __skb_flow_dissect(const struct sk_buff *skb, struct flow_keys *flow, + +bool __skb_flow_dissect(const struct sk_buff *skb, + struct flow_dissector *flow_dissector, + void *target_container, void *data, __be16 proto, int nhoff, int hlen); static inline bool skb_flow_dissect(const struct sk_buff *skb, - struct flow_keys *flow) + struct flow_dissector *flow_dissector, + void *target_container) +{ + return __skb_flow_dissect(skb, flow_dissector, target_container, + NULL, 0, 0, 0); +} + +struct flow_keys { + struct flow_dissector_key_addrs addrs; + struct flow_dissector_key_ports ports; + struct flow_dissector_key_basic basic; +}; + +extern struct flow_dissector flow_keys_dissector; +extern struct flow_dissector flow_keys_buf_dissector; + +static inline bool skb_flow_dissect_flow_keys(const struct sk_buff *skb, + struct flow_keys *flow) +{ + memset(flow, 0, sizeof(*flow)); + return __skb_flow_dissect(skb, &flow_keys_dissector, flow, + NULL, 0, 0, 0); +} + +static inline bool skb_flow_dissect_flow_keys_buf(struct flow_keys *flow, + void *data, __be16 proto, + int nhoff, int hlen) { - return __skb_flow_dissect(skb, flow, NULL, 0, 0, 0); + memset(flow, 0, sizeof(*flow)); + return __skb_flow_dissect(NULL, &flow_keys_buf_dissector, flow, + data, proto, nhoff, hlen); } __be32 __skb_flow_get_ports(const struct sk_buff *skb, int thoff, u8 ip_proto, diff --git a/include/net/ip.h b/include/net/ip.h index 562eb653aebc..b0443d4fe13f 100644 --- a/include/net/ip.h +++ b/include/net/ip.h @@ -360,10 +360,10 @@ static inline void inet_set_txhash(struct sock *sk) struct inet_sock *inet = inet_sk(sk); struct flow_keys keys; - keys.src = inet->inet_saddr; - keys.dst = inet->inet_daddr; - keys.port16[0] = inet->inet_sport; - keys.port16[1] = inet->inet_dport; + keys.addrs.src = inet->inet_saddr; + keys.addrs.dst = inet->inet_daddr; + keys.ports.port16[0] = inet->inet_sport; + keys.ports.port16[1] = inet->inet_dport; sk->sk_txhash = flow_hash_from_keys(&keys); } diff --git a/include/net/ipv6.h b/include/net/ipv6.h index 9932b86394a9..9eed9761dfce 100644 --- a/include/net/ipv6.h +++ b/include/net/ipv6.h @@ -698,10 +698,10 @@ static inline void ip6_set_txhash(struct sock *sk) struct ipv6_pinfo *np = inet6_sk(sk); struct 
flow_keys keys; - keys.src = (__force __be32)ipv6_addr_hash(&np->saddr); - keys.dst = (__force __be32)ipv6_addr_hash(&sk->sk_v6_daddr); - keys.port16[0] = inet->inet_sport; - keys.port16[1] = inet->inet_dport; + keys.addrs.src = (__force __be32)ipv6_addr_hash(&np->saddr); + keys.addrs.dst = (__force __be32)ipv6_addr_hash(&sk->sk_v6_daddr); + keys.ports.port16[0] = inet->inet_sport; + keys.ports.port16[1] = inet->inet_dport; sk->sk_txhash = flow_hash_from_keys(&keys); } diff --git a/net/core/flow_dissector.c b/net/core/flow_dissector.c index 6883e22b8fc4..6a49acaa6651 100644 --- a/net/core/flow_dissector.c +++ b/net/core/flow_dissector.c @@ -13,6 +13,7 @@ #include #include #include +#include #include #include @@ -63,17 +64,6 @@ void skb_flow_dissector_init(struct flow_dissector *flow_dissector, } EXPORT_SYMBOL(skb_flow_dissector_init); -/* copy saddr & daddr, possibly using 64bit load/store - * Equivalent to : flow->src = iph->saddr; - * flow->dst = iph->daddr; - */ -static void iph_to_flow_copy_addrs(struct flow_keys *flow, const struct iphdr *iph) -{ - BUILD_BUG_ON(offsetof(typeof(*flow), dst) != - offsetof(typeof(*flow), src) + sizeof(flow->src)); - memcpy(&flow->src, &iph->saddr, sizeof(flow->src) + sizeof(flow->dst)); -} - /** * __skb_flow_get_ports - extract the upper layer ports and return them * @skb: sk_buff to extract the ports from @@ -111,17 +101,27 @@ EXPORT_SYMBOL(__skb_flow_get_ports); /** * __skb_flow_dissect - extract the flow_keys struct and return it * @skb: sk_buff to extract the flow from, can be NULL if the rest are specified + * @flow_dissector: list of keys to dissect + * @target_container: target structure to put dissected values into * @data: raw buffer pointer to the packet, if NULL use skb->data * @proto: protocol for which to get the flow, if @data is NULL use skb->protocol * @nhoff: network header offset, if @data is NULL use skb_network_offset(skb) * @hlen: packet header length, if @data is NULL use skb_headlen(skb) * - * The function will try to retrieve the struct flow_keys from either the skbuff - * or a raw buffer specified by the rest parameters + * The function will try to retrieve individual keys into target specified + * by flow_dissector from either the skbuff or a raw buffer specified by the + * rest parameters. + * + * Caller must take care of zeroing target container memory. */ -bool __skb_flow_dissect(const struct sk_buff *skb, struct flow_keys *flow, +bool __skb_flow_dissect(const struct sk_buff *skb, + struct flow_dissector *flow_dissector, + void *target_container, void *data, __be16 proto, int nhoff, int hlen) { + struct flow_dissector_key_basic *key_basic; + struct flow_dissector_key_addrs *key_addrs; + struct flow_dissector_key_ports *key_ports; u8 ip_proto; if (!data) { @@ -131,7 +131,12 @@ bool __skb_flow_dissect(const struct sk_buff *skb, struct flow_keys *flow, hlen = skb_headlen(skb); } - memset(flow, 0, sizeof(*flow)); + /* It is ensured by skb_flow_dissector_init() that basic key will + * be always present. + */ + key_basic = skb_flow_dissector_target(flow_dissector, + FLOW_DISSECTOR_KEY_BASIC, + target_container); again: switch (proto) { @@ -148,14 +153,13 @@ ip: if (ip_is_fragment(iph)) ip_proto = 0; - /* skip the address processing if skb is NULL. The assumption - * here is that if there is no skb we are not looking for flow - * info but lengths and protocols. 
- */ - if (!skb) + if (!skb_flow_dissector_uses_key(flow_dissector, + FLOW_DISSECTOR_KEY_IPV4_ADDRS)) break; - - iph_to_flow_copy_addrs(flow, iph); + key_addrs = skb_flow_dissector_target(flow_dissector, + FLOW_DISSECTOR_KEY_IPV4_ADDRS, + target_container); + memcpy(key_addrs, &iph->saddr, sizeof(*key_addrs)); break; } case htons(ETH_P_IPV6): { @@ -171,12 +175,15 @@ ipv6: ip_proto = iph->nexthdr; nhoff += sizeof(struct ipv6hdr); - /* see comment above in IPv4 section */ - if (!skb) + if (!skb_flow_dissector_uses_key(flow_dissector, + FLOW_DISSECTOR_KEY_IPV6_HASH_ADDRS)) break; + key_addrs = skb_flow_dissector_target(flow_dissector, + FLOW_DISSECTOR_KEY_IPV6_HASH_ADDRS, + target_container); - flow->src = (__force __be32)ipv6_addr_hash(&iph->saddr); - flow->dst = (__force __be32)ipv6_addr_hash(&iph->daddr); + key_addrs->src = (__force __be32)ipv6_addr_hash(&iph->saddr); + key_addrs->dst = (__force __be32)ipv6_addr_hash(&iph->daddr); flow_label = ip6_flowlabel(iph); if (flow_label) { @@ -184,10 +191,18 @@ ipv6: * use that to represent the ports without any * further dissection. */ - flow->n_proto = proto; - flow->ip_proto = ip_proto; - flow->ports = flow_label; - flow->thoff = (u16)nhoff; + + key_basic->n_proto = proto; + key_basic->ip_proto = ip_proto; + key_basic->thoff = (u16)nhoff; + + if (!skb_flow_dissector_uses_key(flow_dissector, + FLOW_DISSECTOR_KEY_PORTS)) + break; + key_ports = skb_flow_dissector_target(flow_dissector, + FLOW_DISSECTOR_KEY_PORTS, + target_container); + key_ports->ports = flow_label; return true; } @@ -234,14 +249,22 @@ ipv6: hdr = __skb_header_pointer(skb, nhoff, sizeof(_hdr), data, hlen, &_hdr); if (!hdr) return false; - flow->src = hdr->srcnode; - flow->dst = 0; - flow->n_proto = proto; - flow->thoff = (u16)nhoff; + key_basic->n_proto = proto; + key_basic->thoff = (u16)nhoff; + + if (skb_flow_dissector_uses_key(flow_dissector, + FLOW_DISSECTOR_KEY_IPV6_HASH_ADDRS)) { + key_addrs = skb_flow_dissector_target(flow_dissector, + FLOW_DISSECTOR_KEY_IPV6_HASH_ADDRS, + target_container); + key_addrs->src = hdr->srcnode; + key_addrs->dst = 0; + } return true; } case htons(ETH_P_FCOE): - flow->thoff = (u16)(nhoff + FCOE_HEADER_LEN); + key_basic->thoff = (u16)(nhoff + FCOE_HEADER_LEN); /* fall through */ default: return false; @@ -296,14 +319,24 @@ ipv6: break; } - flow->n_proto = proto; - flow->ip_proto = ip_proto; - flow->thoff = (u16) nhoff; - - /* unless skb is set we don't need to record port info */ - if (skb) - flow->ports = __skb_flow_get_ports(skb, nhoff, ip_proto, - data, hlen); + /* It is ensured by skb_flow_dissector_init() that basic key will + * be always present.
+ */ + key_basic = skb_flow_dissector_target(flow_dissector, + FLOW_DISSECTOR_KEY_BASIC, + target_container); + key_basic->n_proto = proto; + key_basic->ip_proto = ip_proto; + key_basic->thoff = (u16) nhoff; + + if (skb_flow_dissector_uses_key(flow_dissector, + FLOW_DISSECTOR_KEY_PORTS)) { + key_ports = skb_flow_dissector_target(flow_dissector, + FLOW_DISSECTOR_KEY_PORTS, + target_container); + key_ports->ports = __skb_flow_get_ports(skb, nhoff, ip_proto, + data, hlen); + } return true; } @@ -325,16 +358,16 @@ static inline u32 __flow_hash_from_keys(struct flow_keys *keys, u32 keyval) u32 hash; /* get a consistent hash (same value on both flow directions) */ - if (((__force u32)keys->dst < (__force u32)keys->src) || - (((__force u32)keys->dst == (__force u32)keys->src) && - ((__force u16)keys->port16[1] < (__force u16)keys->port16[0]))) { - swap(keys->dst, keys->src); - swap(keys->port16[0], keys->port16[1]); + if (((__force u32)keys->addrs.dst < (__force u32)keys->addrs.src) || + (((__force u32)keys->addrs.dst == (__force u32)keys->addrs.src) && + ((__force u16)keys->ports.port16[1] < (__force u16)keys->ports.port16[0]))) { + swap(keys->addrs.dst, keys->addrs.src); + swap(keys->ports.port16[0], keys->ports.port16[1]); } - hash = __flow_hash_3words((__force u32)keys->dst, - (__force u32)keys->src, - (__force u32)keys->ports, + hash = __flow_hash_3words((__force u32)keys->addrs.dst, + (__force u32)keys->addrs.src, + (__force u32)keys->ports.ports, keyval); if (!hash) hash = 1; @@ -352,7 +385,7 @@ EXPORT_SYMBOL(flow_hash_from_keys); static inline u32 ___skb_get_hash(const struct sk_buff *skb, struct flow_keys *keys, u32 keyval) { - if (!skb_flow_dissect(skb, keys)) + if (!skb_flow_dissect_flow_keys(skb, keys)) return 0; return __flow_hash_from_keys(keys, keyval); @@ -377,11 +410,11 @@ void make_flow_keys_digest(struct flow_keys_digest *digest, memset(digest, 0, sizeof(*digest)); - data->n_proto = flow->n_proto; - data->ip_proto = flow->ip_proto; - data->ports = flow->ports; - data->src = flow->src; - data->dst = flow->dst; + data->n_proto = flow->basic.n_proto; + data->ip_proto = flow->basic.ip_proto; + data->ports = flow->ports.ports; + data->src = flow->addrs.src; + data->dst = flow->addrs.dst; } EXPORT_SYMBOL(make_flow_keys_digest); @@ -404,7 +437,7 @@ void __skb_get_hash(struct sk_buff *skb) hash = ___skb_get_hash(skb, &keys, hashrnd); if (!hash) return; - if (keys.ports) + if (keys.ports.ports) skb->l4_hash = 1; skb->sw_hash = 1; skb->hash = hash; @@ -422,9 +455,9 @@ EXPORT_SYMBOL(skb_get_hash_perturb); u32 __skb_get_poff(const struct sk_buff *skb, void *data, const struct flow_keys *keys, int hlen) { - u32 poff = keys->thoff; + u32 poff = keys->basic.thoff; - switch (keys->ip_proto) { + switch (keys->basic.ip_proto) { case IPPROTO_TCP: { /* access doff as u8 to avoid unaligned access */ const u8 *doff; @@ -478,8 +511,52 @@ u32 skb_get_poff(const struct sk_buff *skb) { struct flow_keys keys; - if (!skb_flow_dissect(skb, &keys)) + if (!skb_flow_dissect_flow_keys(skb, &keys)) return 0; return __skb_get_poff(skb, skb->data, &keys, skb_headlen(skb)); } + +static const struct flow_dissector_key flow_keys_dissector_keys[] = { + { + .key_id = FLOW_DISSECTOR_KEY_BASIC, + .offset = offsetof(struct flow_keys, basic), + }, + { + .key_id = FLOW_DISSECTOR_KEY_IPV4_ADDRS, + .offset = offsetof(struct flow_keys, addrs), + }, + { + .key_id = FLOW_DISSECTOR_KEY_IPV6_HASH_ADDRS, + .offset = offsetof(struct flow_keys, addrs), + }, + { + .key_id = FLOW_DISSECTOR_KEY_PORTS, + .offset = offsetof(struct 
flow_keys, ports), + }, +}; + +static const struct flow_dissector_key flow_keys_buf_dissector_keys[] = { + { + .key_id = FLOW_DISSECTOR_KEY_BASIC, + .offset = offsetof(struct flow_keys, basic), + }, +}; + +struct flow_dissector flow_keys_dissector __read_mostly; +EXPORT_SYMBOL(flow_keys_dissector); + +struct flow_dissector flow_keys_buf_dissector __read_mostly; + +static int __init init_default_flow_dissectors(void) +{ + skb_flow_dissector_init(&flow_keys_dissector, + flow_keys_dissector_keys, + ARRAY_SIZE(flow_keys_dissector_keys)); + skb_flow_dissector_init(&flow_keys_buf_dissector, + flow_keys_buf_dissector_keys, + ARRAY_SIZE(flow_keys_buf_dissector_keys)); + return 0; +} + +late_initcall_sync(init_default_flow_dissectors); diff --git a/net/ethernet/eth.c b/net/ethernet/eth.c index 9332a0ab0698..c3325bd2f3fb 100644 --- a/net/ethernet/eth.c +++ b/net/ethernet/eth.c @@ -131,9 +131,9 @@ u32 eth_get_headlen(void *data, unsigned int len) return len; /* parse any remaining L2/L3 headers, check for L4 */ - if (!__skb_flow_dissect(NULL, &keys, data, - eth->h_proto, sizeof(*eth), len)) - return max_t(u32, keys.thoff, sizeof(*eth)); + if (!skb_flow_dissect_flow_keys_buf(&keys, data, eth->h_proto, + sizeof(*eth), len)) + return max_t(u32, keys.basic.thoff, sizeof(*eth)); /* parse for any L4 headers */ return min_t(u32, __skb_get_poff(NULL, data, &keys, len), len); diff --git a/net/sched/cls_flow.c b/net/sched/cls_flow.c index 4c5a92298906..4b3e3e30bf4d 100644 --- a/net/sched/cls_flow.c +++ b/net/sched/cls_flow.c @@ -68,35 +68,35 @@ static inline u32 addr_fold(void *addr) static u32 flow_get_src(const struct sk_buff *skb, const struct flow_keys *flow) { - if (flow->src) - return ntohl(flow->src); + if (flow->addrs.src) + return ntohl(flow->addrs.src); return addr_fold(skb->sk); } static u32 flow_get_dst(const struct sk_buff *skb, const struct flow_keys *flow) { - if (flow->dst) - return ntohl(flow->dst); + if (flow->addrs.dst) + return ntohl(flow->addrs.dst); return addr_fold(skb_dst(skb)) ^ (__force u16) tc_skb_protocol(skb); } static u32 flow_get_proto(const struct sk_buff *skb, const struct flow_keys *flow) { - return flow->ip_proto; + return flow->basic.ip_proto; } static u32 flow_get_proto_src(const struct sk_buff *skb, const struct flow_keys *flow) { - if (flow->ports) - return ntohs(flow->port16[0]); + if (flow->ports.ports) + return ntohs(flow->ports.port16[0]); return addr_fold(skb->sk); } static u32 flow_get_proto_dst(const struct sk_buff *skb, const struct flow_keys *flow) { - if (flow->ports) - return ntohs(flow->port16[1]); + if (flow->ports.ports) + return ntohs(flow->ports.port16[1]); return addr_fold(skb_dst(skb)) ^ (__force u16) tc_skb_protocol(skb); } @@ -295,7 +295,7 @@ static int flow_classify(struct sk_buff *skb, const struct tcf_proto *tp, keymask = f->keymask; if (keymask & FLOW_KEYS_NEEDED) - skb_flow_dissect(skb, &flow_keys); + skb_flow_dissect_flow_keys(skb, &flow_keys); for (n = 0; n < f->nkeys; n++) { key = ffs(keymask) - 1; diff --git a/net/sched/sch_choke.c b/net/sched/sch_choke.c index 2624e991a2d1..93d5742dc7e0 100644 --- a/net/sched/sch_choke.c +++ b/net/sched/sch_choke.c @@ -170,13 +170,13 @@ static bool choke_match_flow(struct sk_buff *skb1, if (!choke_skb_cb(skb1)->keys_valid) { choke_skb_cb(skb1)->keys_valid = 1; - skb_flow_dissect(skb1, &temp); + skb_flow_dissect_flow_keys(skb1, &temp); make_flow_keys_digest(&choke_skb_cb(skb1)->keys, &temp); } if (!choke_skb_cb(skb2)->keys_valid) { choke_skb_cb(skb2)->keys_valid = 1; - skb_flow_dissect(skb2, &temp); + 
skb_flow_dissect_flow_keys(skb2, &temp); make_flow_keys_digest(&choke_skb_cb(skb2)->keys, &temp); } -- cgit v1.2.3 From 59346afe7a5548ab3e9730aeff33993faa76abbe Mon Sep 17 00:00:00 2001 From: Jiri Pirko Date: Tue, 12 May 2015 14:56:20 +0200 Subject: flow_dissector: change port array into src, dst tuple Signed-off-by: Jiri Pirko Signed-off-by: David S. Miller --- drivers/net/ethernet/cisco/enic/enic_clsf.c | 4 ++-- drivers/net/ethernet/cisco/enic/enic_ethtool.c | 4 ++-- include/net/flow_dissector.h | 9 ++++++--- include/net/ip.h | 4 ++-- include/net/ipv6.h | 4 ++-- net/core/flow_dissector.c | 4 ++-- net/sched/cls_flow.c | 4 ++-- 7 files changed, 18 insertions(+), 15 deletions(-) (limited to 'net/sched') diff --git a/drivers/net/ethernet/cisco/enic/enic_clsf.c b/drivers/net/ethernet/cisco/enic/enic_clsf.c index d3d25c7e5b96..6739ebc08c47 100644 --- a/drivers/net/ethernet/cisco/enic/enic_clsf.c +++ b/drivers/net/ethernet/cisco/enic/enic_clsf.c @@ -35,8 +35,8 @@ int enic_addfltr_5t(struct enic *enic, struct flow_keys *keys, u16 rq) data.type = FILTER_IPV4_5TUPLE; data.u.ipv4.src_addr = ntohl(keys->addrs.src); data.u.ipv4.dst_addr = ntohl(keys->addrs.dst); - data.u.ipv4.src_port = ntohs(keys->ports.port16[0]); - data.u.ipv4.dst_port = ntohs(keys->ports.port16[1]); + data.u.ipv4.src_port = ntohs(keys->ports.src); + data.u.ipv4.dst_port = ntohs(keys->ports.dst); data.u.ipv4.flags = FILTER_FIELDS_IPV4_5TUPLE; spin_lock_bh(&enic->devcmd_lock); diff --git a/drivers/net/ethernet/cisco/enic/enic_ethtool.c b/drivers/net/ethernet/cisco/enic/enic_ethtool.c index 7588f8dcc890..117c0968dd0b 100644 --- a/drivers/net/ethernet/cisco/enic/enic_ethtool.c +++ b/drivers/net/ethernet/cisco/enic/enic_ethtool.c @@ -352,10 +352,10 @@ static int enic_grxclsrule(struct enic *enic, struct ethtool_rxnfc *cmd) fsp->h_u.tcp_ip4_spec.ip4dst = n->keys.addrs.dst; fsp->m_u.tcp_ip4_spec.ip4dst = (__u32)~0; - fsp->h_u.tcp_ip4_spec.psrc = n->keys.ports.port16[0]; + fsp->h_u.tcp_ip4_spec.psrc = n->keys.ports.src; fsp->m_u.tcp_ip4_spec.psrc = (__u16)~0; - fsp->h_u.tcp_ip4_spec.pdst = n->keys.ports.port16[1]; + fsp->h_u.tcp_ip4_spec.pdst = n->keys.ports.dst; fsp->m_u.tcp_ip4_spec.pdst = (__u16)~0; fsp->ring_cookie = n->rq_id; diff --git a/include/net/flow_dissector.h b/include/net/flow_dissector.h index 5eac9870689c..bac9c1421f58 100644 --- a/include/net/flow_dissector.h +++ b/include/net/flow_dissector.h @@ -34,13 +34,16 @@ struct flow_dissector_key_addrs { /** * flow_dissector_key_tp_ports: * @ports: port numbers of Transport header - * port16[0]: src port number - * port16[1]: dst port number + * src: source port number + * dst: destination port number */ struct flow_dissector_key_ports { union { __be32 ports; - __be16 port16[2]; + struct { + __be16 src; + __be16 dst; + }; }; }; diff --git a/include/net/ip.h b/include/net/ip.h index b0443d4fe13f..0ed6d768e606 100644 --- a/include/net/ip.h +++ b/include/net/ip.h @@ -362,8 +362,8 @@ static inline void inet_set_txhash(struct sock *sk) keys.addrs.src = inet->inet_saddr; keys.addrs.dst = inet->inet_daddr; - keys.ports.port16[0] = inet->inet_sport; - keys.ports.port16[1] = inet->inet_dport; + keys.ports.src = inet->inet_sport; + keys.ports.dst = inet->inet_dport; sk->sk_txhash = flow_hash_from_keys(&keys); } diff --git a/include/net/ipv6.h b/include/net/ipv6.h index 9eed9761dfce..aab8190d16e8 100644 --- a/include/net/ipv6.h +++ b/include/net/ipv6.h @@ -700,8 +700,8 @@ static inline void ip6_set_txhash(struct sock *sk) keys.addrs.src = (__force __be32)ipv6_addr_hash(&np->saddr); 
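An aside for illustration: a minimal userspace sketch of the union layout introduced above (plain C, not kernel code; struct key_ports is a hypothetical stand-in for struct flow_dissector_key_ports). The named src/dst fields alias the single 32-bit ports word, so one u32 compare or hash input still covers both port numbers:

	#include <stdint.h>
	#include <stdio.h>
	#include <arpa/inet.h>

	struct key_ports {
		union {
			uint32_t ports;
			struct {
				uint16_t src;	/* kept in network byte order, as in the kernel */
				uint16_t dst;
			};
		};
	};

	int main(void)
	{
		struct key_ports a = { .ports = 0 }, b;

		a.src = htons(12345);
		a.dst = htons(80);
		b.ports = a.ports;	/* one 32-bit copy moves both port numbers */

		printf("equal=%d dst=%u\n", a.ports == b.ports, ntohs(b.dst));
		return 0;
	}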
keys.addrs.dst = (__force __be32)ipv6_addr_hash(&sk->sk_v6_daddr); - keys.ports.port16[0] = inet->inet_sport; - keys.ports.port16[1] = inet->inet_dport; + keys.ports.src = inet->inet_sport; + keys.ports.dst = inet->inet_dport; sk->sk_txhash = flow_hash_from_keys(&keys); } diff --git a/net/core/flow_dissector.c b/net/core/flow_dissector.c index 7a0b391114a5..204d09c42510 100644 --- a/net/core/flow_dissector.c +++ b/net/core/flow_dissector.c @@ -385,9 +385,9 @@ static inline u32 __flow_hash_from_keys(struct flow_keys *keys, u32 keyval) /* get a consistent hash (same value on both flow directions) */ if (((__force u32)keys->addrs.dst < (__force u32)keys->addrs.src) || (((__force u32)keys->addrs.dst == (__force u32)keys->addrs.src) && - ((__force u16)keys->ports.port16[1] < (__force u16)keys->ports.port16[0]))) { + ((__force u16)keys->ports.dst < (__force u16)keys->ports.src))) { swap(keys->addrs.dst, keys->addrs.src); - swap(keys->ports.port16[0], keys->ports.port16[1]); + swap(keys->ports.src, keys->ports.dst); } hash = __flow_hash_3words((__force u32)keys->addrs.dst, diff --git a/net/sched/cls_flow.c b/net/sched/cls_flow.c index 4b3e3e30bf4d..b4359924846c 100644 --- a/net/sched/cls_flow.c +++ b/net/sched/cls_flow.c @@ -88,7 +88,7 @@ static u32 flow_get_proto(const struct sk_buff *skb, const struct flow_keys *flo static u32 flow_get_proto_src(const struct sk_buff *skb, const struct flow_keys *flow) { if (flow->ports.ports) - return ntohs(flow->ports.port16[0]); + return ntohs(flow->ports.src); return addr_fold(skb->sk); } @@ -96,7 +96,7 @@ static u32 flow_get_proto_src(const struct sk_buff *skb, const struct flow_keys static u32 flow_get_proto_dst(const struct sk_buff *skb, const struct flow_keys *flow) { if (flow->ports.ports) - return ntohs(flow->ports.port16[1]); + return ntohs(flow->ports.dst); return addr_fold(skb_dst(skb)) ^ (__force u16) tc_skb_protocol(skb); } -- cgit v1.2.3 From 77b9900ef53ae047e36a37d13a2aa33bb2d60641 Mon Sep 17 00:00:00 2001 From: Jiri Pirko Date: Tue, 12 May 2015 14:56:21 +0200 Subject: tc: introduce Flower classifier This patch introduces a flow-based filter. So far, only the most essential packet fields are supported. This patch is only the first step; there are many potential performance improvements still to implement, and many features are missing for now. They will be addressed in follow-up patches. Signed-off-by: Jiri Pirko Acked-by: Jamal Hadi Salim Signed-off-by: David S.
Miller --- include/uapi/linux/pkt_cls.h | 30 ++ net/sched/Kconfig | 10 + net/sched/Makefile | 1 + net/sched/cls_flower.c | 688 +++++++++++++++++++++++++++++++++++++++++++ 4 files changed, 729 insertions(+) create mode 100644 net/sched/cls_flower.c (limited to 'net/sched') diff --git a/include/uapi/linux/pkt_cls.h b/include/uapi/linux/pkt_cls.h index ffc112c8e1c2..39fb53d67b11 100644 --- a/include/uapi/linux/pkt_cls.h +++ b/include/uapi/linux/pkt_cls.h @@ -409,6 +409,36 @@ enum { #define TCA_BPF_MAX (__TCA_BPF_MAX - 1) +/* Flower classifier */ + +enum { + TCA_FLOWER_UNSPEC, + TCA_FLOWER_CLASSID, + TCA_FLOWER_INDEV, + TCA_FLOWER_ACT, + TCA_FLOWER_KEY_ETH_DST, /* ETH_ALEN */ + TCA_FLOWER_KEY_ETH_DST_MASK, /* ETH_ALEN */ + TCA_FLOWER_KEY_ETH_SRC, /* ETH_ALEN */ + TCA_FLOWER_KEY_ETH_SRC_MASK, /* ETH_ALEN */ + TCA_FLOWER_KEY_ETH_TYPE, /* be16 */ + TCA_FLOWER_KEY_IP_PROTO, /* u8 */ + TCA_FLOWER_KEY_IPV4_SRC, /* be32 */ + TCA_FLOWER_KEY_IPV4_SRC_MASK, /* be32 */ + TCA_FLOWER_KEY_IPV4_DST, /* be32 */ + TCA_FLOWER_KEY_IPV4_DST_MASK, /* be32 */ + TCA_FLOWER_KEY_IPV6_SRC, /* struct in6_addr */ + TCA_FLOWER_KEY_IPV6_SRC_MASK, /* struct in6_addr */ + TCA_FLOWER_KEY_IPV6_DST, /* struct in6_addr */ + TCA_FLOWER_KEY_IPV6_DST_MASK, /* struct in6_addr */ + TCA_FLOWER_KEY_TCP_SRC, /* be16 */ + TCA_FLOWER_KEY_TCP_DST, /* be16 */ + TCA_FLOWER_KEY_UDP_SRC, /* be16 */ + TCA_FLOWER_KEY_UDP_DST, /* be16 */ + __TCA_FLOWER_MAX, +}; + +#define TCA_FLOWER_MAX (__TCA_FLOWER_MAX - 1) + /* Extended Matches */ struct tcf_ematch_tree_hdr { diff --git a/net/sched/Kconfig b/net/sched/Kconfig index 2274e723a3df..5fd1c2f487d2 100644 --- a/net/sched/Kconfig +++ b/net/sched/Kconfig @@ -477,6 +477,16 @@ config NET_CLS_BPF To compile this code as a module, choose M here: the module will be called cls_bpf. +config NET_CLS_FLOWER + tristate "Flower classifier" + select NET_CLS + ---help--- + If you say Y here, you will be able to classify packets based on + a configurable combination of packet keys and masks. + + To compile this code as a module, choose M here: the module will + be called cls_flower. + config NET_EMATCH bool "Extended Matches" select NET_CLS diff --git a/net/sched/Makefile b/net/sched/Makefile index 7ca7f4c1b8c2..690c1689e090 100644 --- a/net/sched/Makefile +++ b/net/sched/Makefile @@ -56,6 +56,7 @@ obj-$(CONFIG_NET_CLS_BASIC) += cls_basic.o obj-$(CONFIG_NET_CLS_FLOW) += cls_flow.o obj-$(CONFIG_NET_CLS_CGROUP) += cls_cgroup.o obj-$(CONFIG_NET_CLS_BPF) += cls_bpf.o +obj-$(CONFIG_NET_CLS_FLOWER) += cls_flower.o obj-$(CONFIG_NET_EMATCH) += ematch.o obj-$(CONFIG_NET_EMATCH_CMP) += em_cmp.o obj-$(CONFIG_NET_EMATCH_NBYTE) += em_nbyte.o diff --git a/net/sched/cls_flower.c b/net/sched/cls_flower.c new file mode 100644 index 000000000000..9bc654c764cd --- /dev/null +++ b/net/sched/cls_flower.c @@ -0,0 +1,688 @@ +/* + * net/sched/cls_flower.c Flower classifier + * + * Copyright (c) 2015 Jiri Pirko + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation; either version 2 of the License, or + * (at your option) any later version. 
+ */ + +#include +#include +#include +#include + +#include +#include +#include + +#include +#include +#include +#include + +struct fl_flow_key { + int indev_ifindex; + struct flow_dissector_key_basic basic; + struct flow_dissector_key_eth_addrs eth; + union { + struct flow_dissector_key_addrs ipv4; + struct flow_dissector_key_ipv6_addrs ipv6; + }; + struct flow_dissector_key_ports tp; +} __aligned(BITS_PER_LONG / 8); /* Ensure that we can do comparisons as longs. */ + +struct fl_flow_mask_range { + unsigned short int start; + unsigned short int end; +}; + +struct fl_flow_mask { + struct fl_flow_key key; + struct fl_flow_mask_range range; + struct rcu_head rcu; +}; + +struct cls_fl_head { + struct rhashtable ht; + struct fl_flow_mask mask; + struct flow_dissector dissector; + u32 hgen; + bool mask_assigned; + struct list_head filters; + struct rhashtable_params ht_params; + struct rcu_head rcu; +}; + +struct cls_fl_filter { + struct rhash_head ht_node; + struct fl_flow_key mkey; + struct tcf_exts exts; + struct tcf_result res; + struct fl_flow_key key; + struct list_head list; + u32 handle; + struct rcu_head rcu; +}; + +static unsigned short int fl_mask_range(const struct fl_flow_mask *mask) +{ + return mask->range.end - mask->range.start; +} + +static void fl_mask_update_range(struct fl_flow_mask *mask) +{ + const u8 *bytes = (const u8 *) &mask->key; + size_t size = sizeof(mask->key); + size_t i, first = 0, last = size - 1; + + for (i = 0; i < sizeof(mask->key); i++) { + if (bytes[i]) { + if (!first && i) + first = i; + last = i; + } + } + mask->range.start = rounddown(first, sizeof(long)); + mask->range.end = roundup(last + 1, sizeof(long)); +} + +static void *fl_key_get_start(struct fl_flow_key *key, + const struct fl_flow_mask *mask) +{ + return (u8 *) key + mask->range.start; +} + +static void fl_set_masked_key(struct fl_flow_key *mkey, struct fl_flow_key *key, + struct fl_flow_mask *mask) +{ + const long *lkey = fl_key_get_start(key, mask); + const long *lmask = fl_key_get_start(&mask->key, mask); + long *lmkey = fl_key_get_start(mkey, mask); + int i; + + for (i = 0; i < fl_mask_range(mask); i += sizeof(long)) + *lmkey++ = *lkey++ & *lmask++; +} + +static void fl_clear_masked_range(struct fl_flow_key *key, + struct fl_flow_mask *mask) +{ + memset(fl_key_get_start(key, mask), 0, fl_mask_range(mask)); +} + +static int fl_classify(struct sk_buff *skb, const struct tcf_proto *tp, + struct tcf_result *res) +{ + struct cls_fl_head *head = rcu_dereference_bh(tp->root); + struct cls_fl_filter *f; + struct fl_flow_key skb_key; + struct fl_flow_key skb_mkey; + + fl_clear_masked_range(&skb_key, &head->mask); + skb_key.indev_ifindex = skb->skb_iif; + /* skb_flow_dissect() does not set n_proto in case an unknown protocol, + * so do it rather here. 
+ */ + skb_key.basic.n_proto = skb->protocol; + skb_flow_dissect(skb, &head->dissector, &skb_key); + + fl_set_masked_key(&skb_mkey, &skb_key, &head->mask); + + f = rhashtable_lookup_fast(&head->ht, + fl_key_get_start(&skb_mkey, &head->mask), + head->ht_params); + if (f) { + *res = f->res; + return tcf_exts_exec(skb, &f->exts, res); + } + return -1; +} + +static int fl_init(struct tcf_proto *tp) +{ + struct cls_fl_head *head; + + head = kzalloc(sizeof(*head), GFP_KERNEL); + if (!head) + return -ENOBUFS; + + INIT_LIST_HEAD_RCU(&head->filters); + rcu_assign_pointer(tp->root, head); + + return 0; +} + +static void fl_destroy_filter(struct rcu_head *head) +{ + struct cls_fl_filter *f = container_of(head, struct cls_fl_filter, rcu); + + tcf_exts_destroy(&f->exts); + kfree(f); +} + +static bool fl_destroy(struct tcf_proto *tp, bool force) +{ + struct cls_fl_head *head = rtnl_dereference(tp->root); + struct cls_fl_filter *f, *next; + + if (!force && !list_empty(&head->filters)) + return false; + + list_for_each_entry_safe(f, next, &head->filters, list) { + list_del_rcu(&f->list); + call_rcu(&f->rcu, fl_destroy_filter); + } + RCU_INIT_POINTER(tp->root, NULL); + if (head->mask_assigned) + rhashtable_destroy(&head->ht); + kfree_rcu(head, rcu); + return true; +} + +static unsigned long fl_get(struct tcf_proto *tp, u32 handle) +{ + struct cls_fl_head *head = rtnl_dereference(tp->root); + struct cls_fl_filter *f; + + list_for_each_entry(f, &head->filters, list) + if (f->handle == handle) + return (unsigned long) f; + return 0; +} + +static const struct nla_policy fl_policy[TCA_FLOWER_MAX + 1] = { + [TCA_FLOWER_UNSPEC] = { .type = NLA_UNSPEC }, + [TCA_FLOWER_CLASSID] = { .type = NLA_U32 }, + [TCA_FLOWER_INDEV] = { .type = NLA_STRING, + .len = IFNAMSIZ }, + [TCA_FLOWER_KEY_ETH_DST] = { .len = ETH_ALEN }, + [TCA_FLOWER_KEY_ETH_DST_MASK] = { .len = ETH_ALEN }, + [TCA_FLOWER_KEY_ETH_SRC] = { .len = ETH_ALEN }, + [TCA_FLOWER_KEY_ETH_SRC_MASK] = { .len = ETH_ALEN }, + [TCA_FLOWER_KEY_ETH_TYPE] = { .type = NLA_U16 }, + [TCA_FLOWER_KEY_IP_PROTO] = { .type = NLA_U8 }, + [TCA_FLOWER_KEY_IPV4_SRC] = { .type = NLA_U32 }, + [TCA_FLOWER_KEY_IPV4_SRC_MASK] = { .type = NLA_U32 }, + [TCA_FLOWER_KEY_IPV4_DST] = { .type = NLA_U32 }, + [TCA_FLOWER_KEY_IPV4_DST_MASK] = { .type = NLA_U32 }, + [TCA_FLOWER_KEY_IPV6_SRC] = { .len = sizeof(struct in6_addr) }, + [TCA_FLOWER_KEY_IPV6_SRC_MASK] = { .len = sizeof(struct in6_addr) }, + [TCA_FLOWER_KEY_IPV6_DST] = { .len = sizeof(struct in6_addr) }, + [TCA_FLOWER_KEY_IPV6_DST_MASK] = { .len = sizeof(struct in6_addr) }, + [TCA_FLOWER_KEY_TCP_SRC] = { .type = NLA_U16 }, + [TCA_FLOWER_KEY_TCP_DST] = { .type = NLA_U16 }, + [TCA_FLOWER_KEY_UDP_SRC] = { .type = NLA_U16 }, + [TCA_FLOWER_KEY_UDP_DST] = { .type = NLA_U16 }, +}; + +static void fl_set_key_val(struct nlattr **tb, + void *val, int val_type, + void *mask, int mask_type, int len) +{ + if (!tb[val_type]) + return; + memcpy(val, nla_data(tb[val_type]), len); + if (mask_type == TCA_FLOWER_UNSPEC || !tb[mask_type]) + memset(mask, 0xff, len); + else + memcpy(mask, nla_data(tb[mask_type]), len); +} + +static int fl_set_key(struct net *net, struct nlattr **tb, + struct fl_flow_key *key, struct fl_flow_key *mask) +{ + int err; + + if (tb[TCA_FLOWER_INDEV]) { + err = tcf_change_indev(net, tb[TCA_FLOWER_INDEV]); + if (err < 0) + return err; + key->indev_ifindex = err; + mask->indev_ifindex = 0xffffffff; + } + + fl_set_key_val(tb, key->eth.dst, TCA_FLOWER_KEY_ETH_DST, + mask->eth.dst, TCA_FLOWER_KEY_ETH_DST_MASK, + sizeof(key->eth.dst)); + 
fl_set_key_val(tb, key->eth.src, TCA_FLOWER_KEY_ETH_SRC, + mask->eth.src, TCA_FLOWER_KEY_ETH_SRC_MASK, + sizeof(key->eth.src)); + fl_set_key_val(tb, &key->basic.n_proto, TCA_FLOWER_KEY_ETH_TYPE, + &mask->basic.n_proto, TCA_FLOWER_UNSPEC, + sizeof(key->basic.n_proto)); + if (key->basic.n_proto == htons(ETH_P_IP) || + key->basic.n_proto == htons(ETH_P_IPV6)) { + fl_set_key_val(tb, &key->basic.ip_proto, TCA_FLOWER_KEY_IP_PROTO, + &mask->basic.ip_proto, TCA_FLOWER_UNSPEC, + sizeof(key->basic.ip_proto)); + } + if (key->basic.n_proto == htons(ETH_P_IP)) { + fl_set_key_val(tb, &key->ipv4.src, TCA_FLOWER_KEY_IPV4_SRC, + &mask->ipv4.src, TCA_FLOWER_KEY_IPV4_SRC_MASK, + sizeof(key->ipv4.src)); + fl_set_key_val(tb, &key->ipv4.dst, TCA_FLOWER_KEY_IPV4_DST, + &mask->ipv4.dst, TCA_FLOWER_KEY_IPV4_DST_MASK, + sizeof(key->ipv4.dst)); + } else if (key->basic.n_proto == htons(ETH_P_IPV6)) { + fl_set_key_val(tb, &key->ipv6.src, TCA_FLOWER_KEY_IPV6_SRC, + &mask->ipv6.src, TCA_FLOWER_KEY_IPV6_SRC_MASK, + sizeof(key->ipv6.src)); + fl_set_key_val(tb, &key->ipv6.dst, TCA_FLOWER_KEY_IPV6_DST, + &mask->ipv6.dst, TCA_FLOWER_KEY_IPV6_DST_MASK, + sizeof(key->ipv6.dst)); + } + if (key->basic.ip_proto == IPPROTO_TCP) { + fl_set_key_val(tb, &key->tp.src, TCA_FLOWER_KEY_TCP_SRC, + &mask->tp.src, TCA_FLOWER_UNSPEC, + sizeof(key->tp.src)); + fl_set_key_val(tb, &key->tp.dst, TCA_FLOWER_KEY_TCP_DST, + &mask->tp.dst, TCA_FLOWER_UNSPEC, + sizeof(key->tp.dst)); + } else if (key->basic.ip_proto == IPPROTO_UDP) { + fl_set_key_val(tb, &key->tp.src, TCA_FLOWER_KEY_UDP_SRC, + &mask->tp.src, TCA_FLOWER_UNSPEC, + sizeof(key->tp.src)); + fl_set_key_val(tb, &key->tp.dst, TCA_FLOWER_KEY_UDP_DST, + &mask->tp.dst, TCA_FLOWER_UNSPEC, + sizeof(key->tp.dst)); + } + + return 0; +} + +static bool fl_mask_eq(struct fl_flow_mask *mask1, + struct fl_flow_mask *mask2) +{ + const long *lmask1 = fl_key_get_start(&mask1->key, mask1); + const long *lmask2 = fl_key_get_start(&mask2->key, mask2); + + return !memcmp(&mask1->range, &mask2->range, sizeof(mask1->range)) && + !memcmp(lmask1, lmask2, fl_mask_range(mask1)); +} + +static const struct rhashtable_params fl_ht_params = { + .key_offset = offsetof(struct cls_fl_filter, mkey), /* base offset */ + .head_offset = offsetof(struct cls_fl_filter, ht_node), + .automatic_shrinking = true, +}; + +static int fl_init_hashtable(struct cls_fl_head *head, + struct fl_flow_mask *mask) +{ + head->ht_params = fl_ht_params; + head->ht_params.key_len = fl_mask_range(mask); + head->ht_params.key_offset += mask->range.start; + + return rhashtable_init(&head->ht, &head->ht_params); +} + +#define FL_KEY_MEMBER_OFFSET(member) offsetof(struct fl_flow_key, member) +#define FL_KEY_MEMBER_SIZE(member) (sizeof(((struct fl_flow_key *) 0)->member)) +#define FL_KEY_MEMBER_END_OFFSET(member) \ + (FL_KEY_MEMBER_OFFSET(member) + FL_KEY_MEMBER_SIZE(member)) + +#define FL_KEY_IN_RANGE(mask, member) \ + (FL_KEY_MEMBER_OFFSET(member) <= (mask)->range.end && \ + FL_KEY_MEMBER_END_OFFSET(member) >= (mask)->range.start) + +#define FL_KEY_SET(keys, cnt, id, member) \ + do { \ + keys[cnt].key_id = id; \ + keys[cnt].offset = FL_KEY_MEMBER_OFFSET(member); \ + cnt++; \ + } while(0); + +#define FL_KEY_SET_IF_IN_RANGE(mask, keys, cnt, id, member) \ + do { \ + if (FL_KEY_IN_RANGE(mask, member)) \ + FL_KEY_SET(keys, cnt, id, member); \ + } while(0); + +static void fl_init_dissector(struct cls_fl_head *head, + struct fl_flow_mask *mask) +{ + struct flow_dissector_key keys[FLOW_DISSECTOR_KEY_MAX]; + size_t cnt = 0; + + FL_KEY_SET(keys, cnt, 
FLOW_DISSECTOR_KEY_BASIC, basic); + FL_KEY_SET_IF_IN_RANGE(mask, keys, cnt, + FLOW_DISSECTOR_KEY_ETH_ADDRS, eth); + FL_KEY_SET_IF_IN_RANGE(mask, keys, cnt, + FLOW_DISSECTOR_KEY_IPV4_ADDRS, ipv4); + FL_KEY_SET_IF_IN_RANGE(mask, keys, cnt, + FLOW_DISSECTOR_KEY_IPV6_ADDRS, ipv6); + FL_KEY_SET_IF_IN_RANGE(mask, keys, cnt, + FLOW_DISSECTOR_KEY_PORTS, tp); + + skb_flow_dissector_init(&head->dissector, keys, cnt); +} + +static int fl_check_assign_mask(struct cls_fl_head *head, + struct fl_flow_mask *mask) +{ + int err; + + if (head->mask_assigned) { + if (!fl_mask_eq(&head->mask, mask)) + return -EINVAL; + else + return 0; + } + + /* Mask is not assigned yet. So assign it and init hashtable + * according to that. + */ + err = fl_init_hashtable(head, mask); + if (err) + return err; + memcpy(&head->mask, mask, sizeof(head->mask)); + head->mask_assigned = true; + + fl_init_dissector(head, mask); + + return 0; +} + +static int fl_set_parms(struct net *net, struct tcf_proto *tp, + struct cls_fl_filter *f, struct fl_flow_mask *mask, + unsigned long base, struct nlattr **tb, + struct nlattr *est, bool ovr) +{ + struct tcf_exts e; + int err; + + tcf_exts_init(&e, TCA_FLOWER_ACT, 0); + err = tcf_exts_validate(net, tp, tb, est, &e, ovr); + if (err < 0) + return err; + + if (tb[TCA_FLOWER_CLASSID]) { + f->res.classid = nla_get_u32(tb[TCA_FLOWER_CLASSID]); + tcf_bind_filter(tp, &f->res, base); + } + + err = fl_set_key(net, tb, &f->key, &mask->key); + if (err) + goto errout; + + fl_mask_update_range(mask); + fl_set_masked_key(&f->mkey, &f->key, mask); + + tcf_exts_change(tp, &f->exts, &e); + + return 0; +errout: + tcf_exts_destroy(&e); + return err; +} + +static u32 fl_grab_new_handle(struct tcf_proto *tp, + struct cls_fl_head *head) +{ + unsigned int i = 0x80000000; + u32 handle; + + do { + if (++head->hgen == 0x7FFFFFFF) + head->hgen = 1; + } while (--i > 0 && fl_get(tp, head->hgen)); + + if (unlikely(i == 0)) { + pr_err("Insufficient number of handles\n"); + handle = 0; + } else { + handle = head->hgen; + } + + return handle; +} + +static int fl_change(struct net *net, struct sk_buff *in_skb, + struct tcf_proto *tp, unsigned long base, + u32 handle, struct nlattr **tca, + unsigned long *arg, bool ovr) +{ + struct cls_fl_head *head = rtnl_dereference(tp->root); + struct cls_fl_filter *fold = (struct cls_fl_filter *) *arg; + struct cls_fl_filter *fnew; + struct nlattr *tb[TCA_FLOWER_MAX + 1]; + struct fl_flow_mask mask = {}; + int err; + + if (!tca[TCA_OPTIONS]) + return -EINVAL; + + err = nla_parse_nested(tb, TCA_FLOWER_MAX, tca[TCA_OPTIONS], fl_policy); + if (err < 0) + return err; + + if (fold && handle && fold->handle != handle) + return -EINVAL; + + fnew = kzalloc(sizeof(*fnew), GFP_KERNEL); + if (!fnew) + return -ENOBUFS; + + tcf_exts_init(&fnew->exts, TCA_FLOWER_ACT, 0); + + if (!handle) { + handle = fl_grab_new_handle(tp, head); + if (!handle) { + err = -EINVAL; + goto errout; + } + } + fnew->handle = handle; + + err = fl_set_parms(net, tp, fnew, &mask, base, tb, tca[TCA_RATE], ovr); + if (err) + goto errout; + + err = fl_check_assign_mask(head, &mask); + if (err) + goto errout; + + err = rhashtable_insert_fast(&head->ht, &fnew->ht_node, + head->ht_params); + if (err) + goto errout; + if (fold) + rhashtable_remove_fast(&head->ht, &fold->ht_node, + head->ht_params); + + *arg = (unsigned long) fnew; + + if (fold) { + list_replace_rcu(&fold->list, &fnew->list); + tcf_unbind_filter(tp, &fold->res); + call_rcu(&fold->rcu, fl_destroy_filter); + } else { + list_add_tail_rcu(&fnew->list, &head->filters); + }
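To illustrate the masked-key scheme used by the code above (fl_set_masked_key() plus a hash table keyed only on the masked byte range), here is a minimal userspace sketch; struct key is a hypothetical stand-in for fl_flow_key, and the real code walks only the byte range the mask actually covers:

	#include <stddef.h>
	#include <stdio.h>

	struct key { unsigned long w[4]; };	/* stand-in for fl_flow_key */

	static void set_masked_key(struct key *mkey, const struct key *key,
				   const struct key *mask)
	{
		size_t i;

		/* AND key and mask long-at-a-time; since lookup hashes and
		 * compares only the masked bytes, one table lookup matches
		 * a packet against the single installed mask. */
		for (i = 0; i < sizeof(key->w) / sizeof(key->w[0]); i++)
			mkey->w[i] = key->w[i] & mask->w[i];
	}

	int main(void)
	{
		struct key k = { { 0x12345678UL, 0xdeadbeefUL, 0, 0 } };
		struct key m = { { 0xffff0000UL, 0xffffffffUL, 0, 0 } };
		struct key mk;

		set_masked_key(&mk, &k, &m);
		printf("%lx %lx\n", mk.w[0], mk.w[1]);	/* 12340000 deadbeef */
		return 0;
	}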
+ + return 0; + +errout: + kfree(fnew); + return err; +} + +static int fl_delete(struct tcf_proto *tp, unsigned long arg) +{ + struct cls_fl_head *head = rtnl_dereference(tp->root); + struct cls_fl_filter *f = (struct cls_fl_filter *) arg; + + rhashtable_remove_fast(&head->ht, &f->ht_node, + head->ht_params); + list_del_rcu(&f->list); + tcf_unbind_filter(tp, &f->res); + call_rcu(&f->rcu, fl_destroy_filter); + return 0; +} + +static void fl_walk(struct tcf_proto *tp, struct tcf_walker *arg) +{ + struct cls_fl_head *head = rtnl_dereference(tp->root); + struct cls_fl_filter *f; + + list_for_each_entry_rcu(f, &head->filters, list) { + if (arg->count < arg->skip) + goto skip; + if (arg->fn(tp, (unsigned long) f, arg) < 0) { + arg->stop = 1; + break; + } +skip: + arg->count++; + } +} + +static int fl_dump_key_val(struct sk_buff *skb, + void *val, int val_type, + void *mask, int mask_type, int len) +{ + int err; + + if (!memchr_inv(mask, 0, len)) + return 0; + err = nla_put(skb, val_type, len, val); + if (err) + return err; + if (mask_type != TCA_FLOWER_UNSPEC) { + err = nla_put(skb, mask_type, len, mask); + if (err) + return err; + } + return 0; +} + +static int fl_dump(struct net *net, struct tcf_proto *tp, unsigned long fh, + struct sk_buff *skb, struct tcmsg *t) +{ + struct cls_fl_head *head = rtnl_dereference(tp->root); + struct cls_fl_filter *f = (struct cls_fl_filter *) fh; + struct nlattr *nest; + struct fl_flow_key *key, *mask; + + if (!f) + return skb->len; + + t->tcm_handle = f->handle; + + nest = nla_nest_start(skb, TCA_OPTIONS); + if (!nest) + goto nla_put_failure; + + if (f->res.classid && + nla_put_u32(skb, TCA_FLOWER_CLASSID, f->res.classid)) + goto nla_put_failure; + + key = &f->key; + mask = &head->mask.key; + + if (mask->indev_ifindex) { + struct net_device *dev; + + dev = __dev_get_by_index(net, key->indev_ifindex); + if (dev && nla_put_string(skb, TCA_FLOWER_INDEV, dev->name)) + goto nla_put_failure; + } + + if (fl_dump_key_val(skb, key->eth.dst, TCA_FLOWER_KEY_ETH_DST, + mask->eth.dst, TCA_FLOWER_KEY_ETH_DST_MASK, + sizeof(key->eth.dst)) || + fl_dump_key_val(skb, key->eth.src, TCA_FLOWER_KEY_ETH_SRC, + mask->eth.src, TCA_FLOWER_KEY_ETH_SRC_MASK, + sizeof(key->eth.src)) || + fl_dump_key_val(skb, &key->basic.n_proto, TCA_FLOWER_KEY_ETH_TYPE, + &mask->basic.n_proto, TCA_FLOWER_UNSPEC, + sizeof(key->basic.n_proto))) + goto nla_put_failure; + if ((key->basic.n_proto == htons(ETH_P_IP) || + key->basic.n_proto == htons(ETH_P_IPV6)) && + fl_dump_key_val(skb, &key->basic.ip_proto, TCA_FLOWER_KEY_IP_PROTO, + &mask->basic.ip_proto, TCA_FLOWER_UNSPEC, + sizeof(key->basic.ip_proto))) + goto nla_put_failure; + + if (key->basic.n_proto == htons(ETH_P_IP) && + (fl_dump_key_val(skb, &key->ipv4.src, TCA_FLOWER_KEY_IPV4_SRC, + &mask->ipv4.src, TCA_FLOWER_KEY_IPV4_SRC_MASK, + sizeof(key->ipv4.src)) || + fl_dump_key_val(skb, &key->ipv4.dst, TCA_FLOWER_KEY_IPV4_DST, + &mask->ipv4.dst, TCA_FLOWER_KEY_IPV4_DST_MASK, + sizeof(key->ipv4.dst)))) + goto nla_put_failure; + else if (key->basic.n_proto == htons(ETH_P_IPV6) && + (fl_dump_key_val(skb, &key->ipv6.src, TCA_FLOWER_KEY_IPV6_SRC, + &mask->ipv6.src, TCA_FLOWER_KEY_IPV6_SRC_MASK, + sizeof(key->ipv6.src)) || + fl_dump_key_val(skb, &key->ipv6.dst, TCA_FLOWER_KEY_IPV6_DST, + &mask->ipv6.dst, TCA_FLOWER_KEY_IPV6_DST_MASK, + sizeof(key->ipv6.dst)))) + goto nla_put_failure; + + if (key->basic.ip_proto == IPPROTO_TCP && + (fl_dump_key_val(skb, &key->tp.src, TCA_FLOWER_KEY_TCP_SRC, + &mask->tp.src, TCA_FLOWER_UNSPEC, + sizeof(key->tp.src)) || + 
fl_dump_key_val(skb, &key->tp.dst, TCA_FLOWER_KEY_TCP_DST, + &mask->tp.dst, TCA_FLOWER_UNSPEC, + sizeof(key->tp.dst)))) + goto nla_put_failure; + else if (key->basic.ip_proto == IPPROTO_UDP && + (fl_dump_key_val(skb, &key->tp.src, TCA_FLOWER_KEY_UDP_SRC, + &mask->tp.src, TCA_FLOWER_UNSPEC, + sizeof(key->tp.src)) || + fl_dump_key_val(skb, &key->tp.dst, TCA_FLOWER_KEY_UDP_DST, + &mask->tp.dst, TCA_FLOWER_UNSPEC, + sizeof(key->tp.dst)))) + goto nla_put_failure; + + if (tcf_exts_dump(skb, &f->exts)) + goto nla_put_failure; + + nla_nest_end(skb, nest); + + if (tcf_exts_dump_stats(skb, &f->exts) < 0) + goto nla_put_failure; + + return skb->len; + +nla_put_failure: + nla_nest_cancel(skb, nest); + return -1; +} + +static struct tcf_proto_ops cls_fl_ops __read_mostly = { + .kind = "flower", + .classify = fl_classify, + .init = fl_init, + .destroy = fl_destroy, + .get = fl_get, + .change = fl_change, + .delete = fl_delete, + .walk = fl_walk, + .dump = fl_dump, + .owner = THIS_MODULE, +}; + +static int __init cls_fl_init(void) +{ + return register_tcf_proto_ops(&cls_fl_ops); +} + +static void __exit cls_fl_exit(void) +{ + unregister_tcf_proto_ops(&cls_fl_ops); +} + +module_init(cls_fl_init); +module_exit(cls_fl_exit); + +MODULE_AUTHOR("Jiri Pirko "); +MODULE_DESCRIPTION("Flower classifier"); +MODULE_LICENSE("GPL v2"); -- cgit v1.2.3 From 1cf51900f8545b358b5deaacfda348d990f671db Mon Sep 17 00:00:00 2001 From: Pablo Neira Date: Wed, 13 May 2015 18:19:37 +0200 Subject: net: add CONFIG_NET_INGRESS to enable ingress filtering This new config switch enables the ingress filtering infrastructure that is controlled through the ingress_needed static key. This prepares the introduction of the Netfilter ingress hook that resides under this unique static key. Note that CONFIG_SCH_INGRESS automatically selects this, which should be no problem since it also depends on CONFIG_NET_CLS_ACT. Signed-off-by: Pablo Neira Ayuso Acked-by: Alexei Starovoitov Signed-off-by: David S. Miller --- include/linux/rtnetlink.h | 2 +- net/Kconfig | 3 +++ net/core/dev.c | 7 ++++--- net/sched/Kconfig | 1 + 4 files changed, 9 insertions(+), 4 deletions(-) (limited to 'net/sched') diff --git a/include/linux/rtnetlink.h b/include/linux/rtnetlink.h index bd29ab4b0941..a2324fb45cf4 100644 --- a/include/linux/rtnetlink.h +++ b/include/linux/rtnetlink.h @@ -79,7 +79,7 @@ static inline struct netdev_queue *dev_ingress_queue(struct net_device *dev) struct netdev_queue *dev_ingress_queue_create(struct net_device *dev); -#ifdef CONFIG_NET_CLS_ACT +#ifdef CONFIG_NET_INGRESS void net_inc_ingress_queue(void); void net_dec_ingress_queue(void); #endif diff --git a/net/Kconfig b/net/Kconfig index 44dd5786ee91..57a7c5af3175 100644 --- a/net/Kconfig +++ b/net/Kconfig @@ -45,6 +45,9 @@ config COMPAT_NETLINK_MESSAGES Newly written code should NEVER need this option but do compat-independent messages instead!
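As a rough userspace analogy for the ingress_needed pattern this commit builds on (an assumption-laden sketch, not the kernel API; the kernel uses struct static_key, whose static_key_false() test is patched at runtime rather than reading a counter): consumers increment a refcount when they need the hook, and the hot receive path tests one cheap condition.

	#include <stdio.h>

	static int ingress_needed;	/* stand-in for the kernel's static key */

	static void net_inc_ingress_queue(void) { ingress_needed++; }
	static void net_dec_ingress_queue(void) { ingress_needed--; }

	static void rx_path(const char *pkt)
	{
		/* kernel: if (static_key_false(&ingress_needed)) ... */
		if (ingress_needed)
			printf("ingress filtering sees: %s\n", pkt);
		printf("deliver: %s\n", pkt);
	}

	int main(void)
	{
		rx_path("pkt1");		/* no ingress user: hook skipped */
		net_inc_ingress_queue();	/* e.g. an ingress qdisc attached */
		rx_path("pkt2");
		net_dec_ingress_queue();
		return 0;
	}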
+config NET_INGRESS + bool + menu "Networking options" source "net/packet/Kconfig" diff --git a/net/core/dev.c b/net/core/dev.c index af549062ae8e..a5ef90016ce7 100644 --- a/net/core/dev.c +++ b/net/core/dev.c @@ -1630,7 +1630,7 @@ int call_netdevice_notifiers(unsigned long val, struct net_device *dev) } EXPORT_SYMBOL(call_netdevice_notifiers); -#ifdef CONFIG_NET_CLS_ACT +#ifdef CONFIG_NET_INGRESS static struct static_key ingress_needed __read_mostly; void net_inc_ingress_queue(void) @@ -3798,13 +3798,14 @@ another_round: } skip_taps: -#ifdef CONFIG_NET_CLS_ACT +#ifdef CONFIG_NET_INGRESS if (static_key_false(&ingress_needed)) { skb = handle_ing(skb, &pt_prev, &ret, orig_dev); if (!skb) goto unlock; } - +#endif +#ifdef CONFIG_NET_CLS_ACT skb->tc_verd = 0; ncls: #endif diff --git a/net/sched/Kconfig b/net/sched/Kconfig index 5fd1c2f487d2..daa33432b716 100644 --- a/net/sched/Kconfig +++ b/net/sched/Kconfig @@ -312,6 +312,7 @@ config NET_SCH_PIE config NET_SCH_INGRESS tristate "Ingress Qdisc" depends on NET_CLS_ACT + select NET_INGRESS ---help--- Say Y here if you want to use classifiers for incoming packets. If unsure, say Y. -- cgit v1.2.3 From dd3aa3b5fbf61adb0ef5d44a6753646b767b9c77 Mon Sep 17 00:00:00 2001 From: Brian Haley Date: Thu, 14 May 2015 13:20:15 -0400 Subject: cls_flower: Fix compile error MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Fix compile error in net/sched/cls_flower.c net/sched/cls_flower.c: In function ‘fl_set_key’: net/sched/cls_flower.c:240:3: error: implicit declaration of function ‘tcf_change_indev’ [-Werror=implicit-function-declaration] err = tcf_change_indev(net, tb[TCA_FLOWER_INDEV]); Introduced in 77b9900ef53ae Fixes: 77b9900ef53ae ("tc: introduce Flower classifier") Signed-off-by: Brian Haley Signed-off-by: David S. Miller --- net/sched/cls_flower.c | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) (limited to 'net/sched') diff --git a/net/sched/cls_flower.c b/net/sched/cls_flower.c index 9bc654c764cd..8c8f34ef6980 100644 --- a/net/sched/cls_flower.c +++ b/net/sched/cls_flower.c @@ -234,15 +234,15 @@ static void fl_set_key_val(struct nlattr **tb, static int fl_set_key(struct net *net, struct nlattr **tb, struct fl_flow_key *key, struct fl_flow_key *mask) { - int err; - +#ifdef CONFIG_NET_CLS_IND if (tb[TCA_FLOWER_INDEV]) { - err = tcf_change_indev(net, tb[TCA_FLOWER_INDEV]); + int err = tcf_change_indev(net, tb[TCA_FLOWER_INDEV]); if (err < 0) return err; key->indev_ifindex = err; mask->indev_ifindex = 0xffffffff; } +#endif fl_set_key_val(tb, key->eth.dst, TCA_FLOWER_KEY_ETH_DST, mask->eth.dst, TCA_FLOWER_KEY_ETH_DST_MASK, -- cgit v1.2.3 From 42aecaa9bb2bd57eb8d61b4565cee5d3640863fb Mon Sep 17 00:00:00 2001 From: Tom Herbert Date: Thu, 4 Jun 2015 09:16:39 -0700 Subject: net: Get skb hash over flow_keys structure This patch changes flow hashing to use jhash2 over the flow_keys structure instead of just doing jhash_3words over src, dst, and ports. This method will allow us to take more input into the hashing function so that we can include full IPv6 addresses, VLAN, flow labels etc. without needing to resort to xor'ing which makes for a poor hash. Acked-by: Jiri Pirko Signed-off-by: Tom Herbert Signed-off-by: David S.
Miller --- include/linux/skbuff.h | 2 +- include/net/flow_dissector.h | 21 ++++++++++++++--- include/net/ip.h | 2 ++ include/net/ipv6.h | 2 ++ net/core/flow_dissector.c | 54 +++++++++++++++++++++++++++++++++----------- net/sched/cls_flower.c | 2 ++ 6 files changed, 66 insertions(+), 17 deletions(-) (limited to 'net/sched') diff --git a/include/linux/skbuff.h b/include/linux/skbuff.h index 6b41c15efa27..cc612fc0a894 100644 --- a/include/linux/skbuff.h +++ b/include/linux/skbuff.h @@ -1943,7 +1943,7 @@ static inline void skb_probe_transport_header(struct sk_buff *skb, if (skb_transport_header_was_set(skb)) return; else if (skb_flow_dissect_flow_keys(skb, &keys)) - skb_set_transport_header(skb, keys.basic.thoff); + skb_set_transport_header(skb, keys.control.thoff); else skb_set_transport_header(skb, offset_hint); } diff --git a/include/net/flow_dissector.h b/include/net/flow_dissector.h index bac9c1421f58..cba6a10b214a 100644 --- a/include/net/flow_dissector.h +++ b/include/net/flow_dissector.h @@ -6,6 +6,15 @@ #include #include +/** + * struct flow_dissector_key_control: + * @thoff: Transport header offset + */ +struct flow_dissector_key_control { + u16 thoff; + u16 padding; +}; + /** * struct flow_dissector_key_basic: * @thoff: Transport header offset @@ -13,9 +22,9 @@ * @ip_proto: Transport header protocol (eg. TCP/UDP) */ struct flow_dissector_key_basic { - u16 thoff; __be16 n_proto; u8 ip_proto; + u8 padding; }; /** @@ -70,6 +79,7 @@ struct flow_dissector_key_eth_addrs { }; enum flow_dissector_key_id { + FLOW_DISSECTOR_KEY_CONTROL, /* struct flow_dissector_key_control */ FLOW_DISSECTOR_KEY_BASIC, /* struct flow_dissector_key_basic */ FLOW_DISSECTOR_KEY_IPV4_ADDRS, /* struct flow_dissector_key_addrs */ FLOW_DISSECTOR_KEY_IPV6_HASH_ADDRS, /* struct flow_dissector_key_addrs */ @@ -109,11 +119,16 @@ static inline bool skb_flow_dissect(const struct sk_buff *skb, } struct flow_keys { - struct flow_dissector_key_addrs addrs; - struct flow_dissector_key_ports ports; + struct flow_dissector_key_control control; +#define FLOW_KEYS_HASH_START_FIELD basic struct flow_dissector_key_basic basic; + struct flow_dissector_key_ports ports; + struct flow_dissector_key_addrs addrs; }; +#define FLOW_KEYS_HASH_OFFSET \ + offsetof(struct flow_keys, FLOW_KEYS_HASH_START_FIELD) + extern struct flow_dissector flow_keys_dissector; extern struct flow_dissector flow_keys_buf_dissector; diff --git a/include/net/ip.h b/include/net/ip.h index 9b976cf99122..16cfc87fed6c 100644 --- a/include/net/ip.h +++ b/include/net/ip.h @@ -360,6 +360,8 @@ static inline void inet_set_txhash(struct sock *sk) struct inet_sock *inet = inet_sk(sk); struct flow_keys keys; + memset(&keys, 0, sizeof(keys)); + keys.addrs.src = inet->inet_saddr; keys.addrs.dst = inet->inet_daddr; keys.ports.src = inet->inet_sport; diff --git a/include/net/ipv6.h b/include/net/ipv6.h index 35d485c78080..474ca466a091 100644 --- a/include/net/ipv6.h +++ b/include/net/ipv6.h @@ -699,6 +699,8 @@ static inline void ip6_set_txhash(struct sock *sk) struct ipv6_pinfo *np = inet6_sk(sk); struct flow_keys keys; + memset(&keys, 0, sizeof(keys)); + keys.addrs.src = (__force __be32)ipv6_addr_hash(&np->saddr); keys.addrs.dst = (__force __be32)ipv6_addr_hash(&sk->sk_v6_daddr); keys.ports.src = inet->inet_sport; diff --git a/net/core/flow_dissector.c b/net/core/flow_dissector.c index 0763795bea8f..55b5f2962afa 100644 --- a/net/core/flow_dissector.c +++ b/net/core/flow_dissector.c @@ -57,9 +57,11 @@ void skb_flow_dissector_init(struct flow_dissector *flow_dissector, 
flow_dissector->offset[key->key_id] = key->offset; } - /* Ensure that the dissector always includes basic key. That way - * we are able to avoid handling lack of it in fast path. + /* Ensure that the dissector always includes control and basic key. + * That way we are able to avoid handling lack of these in fast path. */ + BUG_ON(!skb_flow_dissector_uses_key(flow_dissector, + FLOW_DISSECTOR_KEY_CONTROL)); BUG_ON(!skb_flow_dissector_uses_key(flow_dissector, FLOW_DISSECTOR_KEY_BASIC)); } @@ -120,6 +122,7 @@ bool __skb_flow_dissect(const struct sk_buff *skb, void *target_container, void *data, __be16 proto, int nhoff, int hlen) { + struct flow_dissector_key_control *key_control; struct flow_dissector_key_basic *key_basic; struct flow_dissector_key_addrs *key_addrs; struct flow_dissector_key_ports *key_ports; @@ -132,6 +135,13 @@ bool __skb_flow_dissect(const struct sk_buff *skb, hlen = skb_headlen(skb); } + /* It is ensured by skb_flow_dissector_init() that control key will + * be always present. + */ + key_control = skb_flow_dissector_target(flow_dissector, + FLOW_DISSECTOR_KEY_CONTROL, + target_container); + /* It is ensured by skb_flow_dissector_init() that basic key will * be always present. */ @@ -219,7 +229,7 @@ flow_label: key_basic->n_proto = proto; key_basic->ip_proto = ip_proto; - key_basic->thoff = (u16)nhoff; + key_control->thoff = (u16)nhoff; if (skb_flow_dissector_uses_key(flow_dissector, FLOW_DISSECTOR_KEY_PORTS)) { @@ -275,7 +285,7 @@ flow_label: if (!hdr) return false; key_basic->n_proto = proto; - key_basic->thoff = (u16)nhoff; + key_control->thoff = (u16)nhoff; if (skb_flow_dissector_uses_key(flow_dissector, FLOW_DISSECTOR_KEY_IPV6_HASH_ADDRS)) { @@ -288,7 +298,7 @@ flow_label: return true; } case htons(ETH_P_FCOE): - key_basic->thoff = (u16)(nhoff + FCOE_HEADER_LEN); + key_control->thoff = (u16)(nhoff + FCOE_HEADER_LEN); /* fall through */ default: return false; @@ -345,7 +355,7 @@ flow_label: key_basic->n_proto = proto; key_basic->ip_proto = ip_proto; - key_basic->thoff = (u16) nhoff; + key_control->thoff = (u16)nhoff; if (skb_flow_dissector_uses_key(flow_dissector, FLOW_DISSECTOR_KEY_PORTS)) { @@ -366,9 +376,21 @@ static __always_inline void __flow_hash_secret_init(void) net_get_random_once(&hashrnd, sizeof(hashrnd)); } -static __always_inline u32 __flow_hash_3words(u32 a, u32 b, u32 c, u32 keyval) +static __always_inline u32 __flow_hash_words(u32 *words, u32 length, u32 keyval) +{ + return jhash2(words, length, keyval); +} + +static inline void *flow_keys_hash_start(struct flow_keys *flow) { - return jhash_3words(a, b, c, keyval); + BUILD_BUG_ON(FLOW_KEYS_HASH_OFFSET % sizeof(u32)); + return (void *)flow + FLOW_KEYS_HASH_OFFSET; +} + +static inline size_t flow_keys_hash_length(struct flow_keys *flow) +{ + BUILD_BUG_ON((sizeof(*flow) - FLOW_KEYS_HASH_OFFSET) % sizeof(u32)); + return (sizeof(*flow) - FLOW_KEYS_HASH_OFFSET) / sizeof(u32); } static inline u32 __flow_hash_from_keys(struct flow_keys *keys, u32 keyval) @@ -383,10 +405,8 @@ static inline u32 __flow_hash_from_keys(struct flow_keys *keys, u32 keyval) swap(keys->ports.src, keys->ports.dst); } - hash = __flow_hash_3words((__force u32)keys->addrs.dst, - (__force u32)keys->addrs.src, - (__force u32)keys->ports.ports, - keyval); + hash = __flow_hash_words((u32 *)flow_keys_hash_start(keys), + flow_keys_hash_length(keys), keyval); if (!hash) hash = 1; @@ -473,7 +493,7 @@ EXPORT_SYMBOL(skb_get_hash_perturb); u32 __skb_get_poff(const struct sk_buff *skb, void *data, const struct flow_keys *keys, int hlen) { - u32 poff = 
keys->basic.thoff; + u32 poff = keys->control.thoff; switch (keys->basic.ip_proto) { case IPPROTO_TCP: { /* access doff as u8 to avoid unaligned access */ const u8 *doff; @@ -536,6 +556,10 @@ u32 skb_get_poff(const struct sk_buff *skb) } static const struct flow_dissector_key flow_keys_dissector_keys[] = { + { + .key_id = FLOW_DISSECTOR_KEY_CONTROL, + .offset = offsetof(struct flow_keys, control), + }, { .key_id = FLOW_DISSECTOR_KEY_BASIC, .offset = offsetof(struct flow_keys, basic), @@ -555,6 +579,10 @@ static const struct flow_dissector_key flow_keys_dissector_keys[] = { }; static const struct flow_dissector_key flow_keys_buf_dissector_keys[] = { + { + .key_id = FLOW_DISSECTOR_KEY_CONTROL, + .offset = offsetof(struct flow_keys, control), + }, { .key_id = FLOW_DISSECTOR_KEY_BASIC, .offset = offsetof(struct flow_keys, basic), diff --git a/net/sched/cls_flower.c b/net/sched/cls_flower.c index 8c8f34ef6980..5a7d66c59684 100644 --- a/net/sched/cls_flower.c +++ b/net/sched/cls_flower.c @@ -25,6 +25,7 @@ struct fl_flow_key { int indev_ifindex; + struct flow_dissector_key_control control; struct flow_dissector_key_basic basic; struct flow_dissector_key_eth_addrs eth; union { @@ -347,6 +348,7 @@ static void fl_init_dissector(struct cls_fl_head *head, struct flow_dissector_key keys[FLOW_DISSECTOR_KEY_MAX]; size_t cnt = 0; + FL_KEY_SET(keys, cnt, FLOW_DISSECTOR_KEY_CONTROL, control); FL_KEY_SET(keys, cnt, FLOW_DISSECTOR_KEY_BASIC, basic); FL_KEY_SET_IF_IN_RANGE(mask, keys, cnt, FLOW_DISSECTOR_KEY_ETH_ADDRS, eth); -- cgit v1.2.3 From c3f8324188fa80178f20c8209b492ca6191177e8 Mon Sep 17 00:00:00 2001 From: Tom Herbert Date: Thu, 4 Jun 2015 09:16:40 -0700 Subject: net: Add full IPv6 addresses to flow_keys This patch adds full IPv6 addresses into flow_keys and uses them as input to the flow hash function. The implementation supports either IPv4 or IPv6 addresses in a union, and a selector is used to determine how many words to input to jhash2. We also add flow_get_u32_dst and flow_get_u32_src functions which are used to get a u32 representation of the source and destination addresses. For IPv6, ipv6_addr_hash is called. These functions preserve the legacy u32 values of src and dst from flow_keys. With this patch, Ethertype and IP protocol are now included in the flow hash input. Signed-off-by: Tom Herbert Signed-off-by: David S.
Miller --- drivers/net/bonding/bond_main.c | 9 +- drivers/net/ethernet/cisco/enic/enic_clsf.c | 8 +- drivers/net/ethernet/cisco/enic/enic_ethtool.c | 4 +- include/net/flow_dissector.h | 52 +++++++---- include/net/ip.h | 19 +++- include/net/ipv6.h | 21 ++++- net/core/flow_dissector.c | 116 +++++++++++++++++++++---- net/ethernet/eth.c | 2 +- net/sched/cls_flow.c | 14 ++- net/sched/cls_flower.c | 11 +-- 10 files changed, 193 insertions(+), 63 deletions(-) (limited to 'net/sched') diff --git a/drivers/net/bonding/bond_main.c b/drivers/net/bonding/bond_main.c index 2268438f3f63..19eb990d398c 100644 --- a/drivers/net/bonding/bond_main.c +++ b/drivers/net/bonding/bond_main.c @@ -3059,8 +3059,7 @@ static bool bond_flow_dissect(struct bonding *bond, struct sk_buff *skb, if (unlikely(!pskb_may_pull(skb, noff + sizeof(*iph)))) return false; iph = ip_hdr(skb); - fk->addrs.src = iph->saddr; - fk->addrs.dst = iph->daddr; + iph_to_flow_copy_v4addrs(fk, iph); noff += iph->ihl << 2; if (!ip_is_fragment(iph)) proto = iph->protocol; @@ -3068,8 +3067,7 @@ static bool bond_flow_dissect(struct bonding *bond, struct sk_buff *skb, if (unlikely(!pskb_may_pull(skb, noff + sizeof(*iph6)))) return false; iph6 = ipv6_hdr(skb); - fk->addrs.src = (__force __be32)ipv6_addr_hash(&iph6->saddr); - fk->addrs.dst = (__force __be32)ipv6_addr_hash(&iph6->daddr); + iph_to_flow_copy_v6addrs(fk, iph6); noff += sizeof(*iph6); proto = iph6->nexthdr; } else { @@ -3103,7 +3101,8 @@ u32 bond_xmit_hash(struct bonding *bond, struct sk_buff *skb) hash = bond_eth_hash(skb); else hash = (__force u32)flow.ports.ports; - hash ^= (__force u32)flow.addrs.dst ^ (__force u32)flow.addrs.src; + hash ^= (__force u32)flow_get_u32_dst(&flow) ^ + (__force u32)flow_get_u32_src(&flow); hash ^= (hash >> 16); hash ^= (hash >> 8); diff --git a/drivers/net/ethernet/cisco/enic/enic_clsf.c b/drivers/net/ethernet/cisco/enic/enic_clsf.c index a31b57adcb37..d106186f4f4a 100644 --- a/drivers/net/ethernet/cisco/enic/enic_clsf.c +++ b/drivers/net/ethernet/cisco/enic/enic_clsf.c @@ -33,8 +33,8 @@ int enic_addfltr_5t(struct enic *enic, struct flow_keys *keys, u16 rq) return -EPROTONOSUPPORT; }; data.type = FILTER_IPV4_5TUPLE; - data.u.ipv4.src_addr = ntohl(keys->addrs.src); - data.u.ipv4.dst_addr = ntohl(keys->addrs.dst); + data.u.ipv4.src_addr = ntohl(keys->addrs.v4addrs.src); + data.u.ipv4.dst_addr = ntohl(keys->addrs.v4addrs.dst); data.u.ipv4.src_port = ntohs(keys->ports.src); data.u.ipv4.dst_port = ntohs(keys->ports.dst); data.u.ipv4.flags = FILTER_FIELDS_IPV4_5TUPLE; @@ -158,8 +158,8 @@ static struct enic_rfs_fltr_node *htbl_key_search(struct hlist_head *h, struct enic_rfs_fltr_node *tpos; hlist_for_each_entry(tpos, h, node) - if (tpos->keys.addrs.src == k->addrs.src && - tpos->keys.addrs.dst == k->addrs.dst && + if (tpos->keys.addrs.v4addrs.src == k->addrs.v4addrs.src && + tpos->keys.addrs.v4addrs.dst == k->addrs.v4addrs.dst && tpos->keys.ports.ports == k->ports.ports && tpos->keys.basic.ip_proto == k->basic.ip_proto && tpos->keys.basic.n_proto == k->basic.n_proto) diff --git a/drivers/net/ethernet/cisco/enic/enic_ethtool.c b/drivers/net/ethernet/cisco/enic/enic_ethtool.c index 117c0968dd0b..73874b2575bf 100644 --- a/drivers/net/ethernet/cisco/enic/enic_ethtool.c +++ b/drivers/net/ethernet/cisco/enic/enic_ethtool.c @@ -346,10 +346,10 @@ static int enic_grxclsrule(struct enic *enic, struct ethtool_rxnfc *cmd) break; } - fsp->h_u.tcp_ip4_spec.ip4src = n->keys.addrs.src; + fsp->h_u.tcp_ip4_spec.ip4src = flow_get_u32_src(&n->keys); fsp->m_u.tcp_ip4_spec.ip4src = 
(__u32)~0; - fsp->h_u.tcp_ip4_spec.ip4dst = n->keys.addrs.dst; + fsp->h_u.tcp_ip4_spec.ip4dst = flow_get_u32_dst(&n->keys); fsp->m_u.tcp_ip4_spec.ip4dst = (__u32)~0; fsp->h_u.tcp_ip4_spec.psrc = n->keys.ports.src; diff --git a/include/net/flow_dissector.h b/include/net/flow_dissector.h index cba6a10b214a..306d4613abbb 100644 --- a/include/net/flow_dissector.h +++ b/include/net/flow_dissector.h @@ -12,7 +12,7 @@ */ struct flow_dissector_key_control { u16 thoff; - u16 padding; + u16 addr_type; }; /** @@ -28,18 +28,39 @@ struct flow_dissector_key_basic { }; /** - * struct flow_dissector_key_addrs: - * @src: source ip address in case of IPv4 - * For IPv6 it contains 32bit hash of src address - * @dst: destination ip address in case of IPv4 - * For IPv6 it contains 32bit hash of dst address + * struct flow_dissector_key_ipv4_addrs: + * @src: source ip address + * @dst: destination ip address */ -struct flow_dissector_key_addrs { +struct flow_dissector_key_ipv4_addrs { /* (src,dst) must be grouped, in the same way than in IP header */ __be32 src; __be32 dst; }; +/** + * struct flow_dissector_key_ipv6_addrs: + * @src: source ip address + * @dst: destination ip address + */ +struct flow_dissector_key_ipv6_addrs { + /* (src,dst) must be grouped, in the same way than in IP header */ + struct in6_addr src; + struct in6_addr dst; +}; + +/** + * struct flow_dissector_key_addrs: + * @v4addrs: IPv4 addresses + * @v6addrs: IPv6 addresses + */ +struct flow_dissector_key_addrs { + union { + struct flow_dissector_key_ipv4_addrs v4addrs; + struct flow_dissector_key_ipv6_addrs v6addrs; + }; +}; + /** * flow_dissector_key_tp_ports: * @ports: port numbers of Transport header @@ -56,16 +77,6 @@ struct flow_dissector_key_ports { }; }; -/** - * struct flow_dissector_key_ipv6_addrs: - * @src: source ip address - * @dst: destination ip address - */ -struct flow_dissector_key_ipv6_addrs { - /* (src,dst) must be grouped, in the same way than in IP header */ - struct in6_addr src; - struct in6_addr dst; -}; /** * struct flow_dissector_key_eth_addrs: @@ -81,10 +92,10 @@ struct flow_dissector_key_eth_addrs { enum flow_dissector_key_id { FLOW_DISSECTOR_KEY_CONTROL, /* struct flow_dissector_key_control */ FLOW_DISSECTOR_KEY_BASIC, /* struct flow_dissector_key_basic */ - FLOW_DISSECTOR_KEY_IPV4_ADDRS, /* struct flow_dissector_key_addrs */ + FLOW_DISSECTOR_KEY_IPV4_ADDRS, /* struct flow_dissector_key_ipv4_addrs */ + FLOW_DISSECTOR_KEY_IPV6_ADDRS, /* struct flow_dissector_key_ipv6_addrs */ FLOW_DISSECTOR_KEY_IPV6_HASH_ADDRS, /* struct flow_dissector_key_addrs */ FLOW_DISSECTOR_KEY_PORTS, /* struct flow_dissector_key_ports */ - FLOW_DISSECTOR_KEY_IPV6_ADDRS, /* struct flow_dissector_key_ipv6_addrs */ FLOW_DISSECTOR_KEY_ETH_ADDRS, /* struct flow_dissector_key_eth_addrs */ FLOW_DISSECTOR_KEY_MAX, @@ -129,6 +140,9 @@ struct flow_keys { #define FLOW_KEYS_HASH_OFFSET \ offsetof(struct flow_keys, FLOW_KEYS_HASH_START_FIELD) +__be32 flow_get_u32_src(const struct flow_keys *flow); +__be32 flow_get_u32_dst(const struct flow_keys *flow); + extern struct flow_dissector flow_keys_dissector; extern struct flow_dissector flow_keys_buf_dissector; diff --git a/include/net/ip.h b/include/net/ip.h index 16cfc87fed6c..0750a186ea63 100644 --- a/include/net/ip.h +++ b/include/net/ip.h @@ -355,6 +355,20 @@ static inline __wsum inet_compute_pseudo(struct sk_buff *skb, int proto) skb->len, proto, 0); } +/* copy IPv4 saddr & daddr to flow_keys, possibly using 64bit load/store + * Equivalent to : flow->v4addrs.src = iph->saddr; + * flow->v4addrs.dst = 
iph->daddr; + */ +static inline void iph_to_flow_copy_v4addrs(struct flow_keys *flow, + const struct iphdr *iph) +{ + BUILD_BUG_ON(offsetof(typeof(flow->addrs), v4addrs.dst) != + offsetof(typeof(flow->addrs), v4addrs.src) + + sizeof(flow->addrs.v4addrs.src)); + memcpy(&flow->addrs.v4addrs, &iph->saddr, sizeof(flow->addrs.v4addrs)); + flow->control.addr_type = FLOW_DISSECTOR_KEY_IPV4_ADDRS; +} + static inline void inet_set_txhash(struct sock *sk) { struct inet_sock *inet = inet_sk(sk); @@ -362,8 +376,9 @@ static inline void inet_set_txhash(struct sock *sk) memset(&keys, 0, sizeof(keys)); - keys.addrs.src = inet->inet_saddr; - keys.addrs.dst = inet->inet_daddr; + keys.addrs.v4addrs.src = inet->inet_saddr; + keys.addrs.v4addrs.dst = inet->inet_daddr; + keys.control.addr_type = FLOW_DISSECTOR_KEY_IPV4_ADDRS; keys.ports.src = inet->inet_sport; keys.ports.dst = inet->inet_dport; diff --git a/include/net/ipv6.h b/include/net/ipv6.h index 474ca466a091..82dbdb092a5d 100644 --- a/include/net/ipv6.h +++ b/include/net/ipv6.h @@ -692,6 +692,20 @@ static inline int ip6_sk_dst_hoplimit(struct ipv6_pinfo *np, struct flowi6 *fl6, return hlimit; } +/* copy IPv6 saddr & daddr to flow_keys, possibly using 64bit load/store + * Equivalent to : flow->v6addrs.src = iph->saddr; + * flow->v6addrs.dst = iph->daddr; + */ +static inline void iph_to_flow_copy_v6addrs(struct flow_keys *flow, + const struct ipv6hdr *iph) +{ + BUILD_BUG_ON(offsetof(typeof(flow->addrs), v6addrs.dst) != + offsetof(typeof(flow->addrs), v6addrs.src) + + sizeof(flow->addrs.v6addrs.src)); + memcpy(&flow->addrs.v6addrs, &iph->saddr, sizeof(flow->addrs.v6addrs)); + flow->control.addr_type = FLOW_DISSECTOR_KEY_IPV6_ADDRS; +} + #if IS_ENABLED(CONFIG_IPV6) static inline void ip6_set_txhash(struct sock *sk) { @@ -701,8 +715,11 @@ static inline void ip6_set_txhash(struct sock *sk) memset(&keys, 0, sizeof(keys)); - keys.addrs.src = (__force __be32)ipv6_addr_hash(&np->saddr); - keys.addrs.dst = (__force __be32)ipv6_addr_hash(&sk->sk_v6_daddr); + memcpy(&keys.addrs.v6addrs.src, &np->saddr, + sizeof(keys.addrs.v6addrs.src)); + memcpy(&keys.addrs.v6addrs.dst, &sk->sk_v6_daddr, + sizeof(keys.addrs.v6addrs.dst)); + keys.control.addr_type = FLOW_DISSECTOR_KEY_IPV6_ADDRS; keys.ports.src = inet->inet_sport; keys.ports.dst = inet->inet_dport; diff --git a/net/core/flow_dissector.c b/net/core/flow_dissector.c index 55b5f2962afa..ca9d22488cfb 100644 --- a/net/core/flow_dissector.c +++ b/net/core/flow_dissector.c @@ -178,10 +178,12 @@ ip: if (!skb_flow_dissector_uses_key(flow_dissector, FLOW_DISSECTOR_KEY_IPV4_ADDRS)) break; + key_addrs = skb_flow_dissector_target(flow_dissector, - FLOW_DISSECTOR_KEY_IPV4_ADDRS, - target_container); - memcpy(key_addrs, &iph->saddr, sizeof(*key_addrs)); + FLOW_DISSECTOR_KEY_IPV4_ADDRS, target_container); + memcpy(&key_addrs->v4addrs, &iph->saddr, + sizeof(key_addrs->v4addrs)); + key_control->addr_type = FLOW_DISSECTOR_KEY_IPV4_ADDRS; break; } case htons(ETH_P_IPV6): { @@ -203,8 +205,11 @@ ipv6: FLOW_DISSECTOR_KEY_IPV6_HASH_ADDRS, target_container); - key_addrs->src = (__force __be32)ipv6_addr_hash(&iph->saddr); - key_addrs->dst = (__force __be32)ipv6_addr_hash(&iph->daddr); + key_addrs->v4addrs.src = + (__force __be32)ipv6_addr_hash(&iph->saddr); + key_addrs->v4addrs.dst = + (__force __be32)ipv6_addr_hash(&iph->daddr); + key_control->addr_type = FLOW_DISSECTOR_KEY_IPV4_ADDRS; goto flow_label; } if (skb_flow_dissector_uses_key(flow_dissector, @@ -216,6 +221,7 @@ ipv6: target_container); memcpy(key_ipv6_addrs, &iph->saddr, 
sizeof(*key_ipv6_addrs)); + key_control->addr_type = FLOW_DISSECTOR_KEY_IPV6_ADDRS; goto flow_label; } break; @@ -292,8 +298,9 @@ flow_label: key_addrs = skb_flow_dissector_target(flow_dissector, FLOW_DISSECTOR_KEY_IPV6_HASH_ADDRS, target_container); - key_addrs->src = hdr->srcnode; - key_addrs->dst = 0; + key_addrs->v4addrs.src = hdr->srcnode; + key_addrs->v4addrs.dst = 0; + key_control->addr_type = FLOW_DISSECTOR_KEY_IPV4_ADDRS; } return true; } @@ -389,21 +396,88 @@ static inline void *flow_keys_hash_start(struct flow_keys *flow) static inline size_t flow_keys_hash_length(struct flow_keys *flow) { + size_t diff = FLOW_KEYS_HASH_OFFSET + sizeof(flow->addrs); BUILD_BUG_ON((sizeof(*flow) - FLOW_KEYS_HASH_OFFSET) % sizeof(u32)); - return (sizeof(*flow) - FLOW_KEYS_HASH_OFFSET) / sizeof(u32); + BUILD_BUG_ON(offsetof(typeof(*flow), addrs) != + sizeof(*flow) - sizeof(flow->addrs)); + + switch (flow->control.addr_type) { + case FLOW_DISSECTOR_KEY_IPV4_ADDRS: + diff -= sizeof(flow->addrs.v4addrs); + break; + case FLOW_DISSECTOR_KEY_IPV6_ADDRS: + diff -= sizeof(flow->addrs.v6addrs); + break; + } + return (sizeof(*flow) - diff) / sizeof(u32); +} + +__be32 flow_get_u32_src(const struct flow_keys *flow) +{ + switch (flow->control.addr_type) { + case FLOW_DISSECTOR_KEY_IPV4_ADDRS: + return flow->addrs.v4addrs.src; + case FLOW_DISSECTOR_KEY_IPV6_ADDRS: + return (__force __be32)ipv6_addr_hash( + &flow->addrs.v6addrs.src); + default: + return 0; + } +} +EXPORT_SYMBOL(flow_get_u32_src); + +__be32 flow_get_u32_dst(const struct flow_keys *flow) +{ + switch (flow->control.addr_type) { + case FLOW_DISSECTOR_KEY_IPV4_ADDRS: + return flow->addrs.v4addrs.dst; + case FLOW_DISSECTOR_KEY_IPV6_ADDRS: + return (__force __be32)ipv6_addr_hash( + &flow->addrs.v6addrs.dst); + default: + return 0; + } +} +EXPORT_SYMBOL(flow_get_u32_dst); + +static inline void __flow_hash_consistentify(struct flow_keys *keys) +{ + int addr_diff, i; + + switch (keys->control.addr_type) { + case FLOW_DISSECTOR_KEY_IPV4_ADDRS: + addr_diff = (__force u32)keys->addrs.v4addrs.dst - + (__force u32)keys->addrs.v4addrs.src; + if ((addr_diff < 0) || + (addr_diff == 0 && + ((__force u16)keys->ports.dst < + (__force u16)keys->ports.src))) { + swap(keys->addrs.v4addrs.src, keys->addrs.v4addrs.dst); + swap(keys->ports.src, keys->ports.dst); + } + break; + case FLOW_DISSECTOR_KEY_IPV6_ADDRS: + addr_diff = memcmp(&keys->addrs.v6addrs.dst, + &keys->addrs.v6addrs.src, + sizeof(keys->addrs.v6addrs.dst)); + if ((addr_diff < 0) || + (addr_diff == 0 && + ((__force u16)keys->ports.dst < + (__force u16)keys->ports.src))) { + for (i = 0; i < 4; i++) + swap(keys->addrs.v6addrs.src.s6_addr32[i], + keys->addrs.v6addrs.dst.s6_addr32[i]); + swap(keys->ports.src, keys->ports.dst); + } + break; + } } static inline u32 __flow_hash_from_keys(struct flow_keys *keys, u32 keyval) { u32 hash; - /* get a consistent hash (same value on both flow directions) */ - if (((__force u32)keys->addrs.dst < (__force u32)keys->addrs.src) || - (((__force u32)keys->addrs.dst == (__force u32)keys->addrs.src) && - ((__force u16)keys->ports.dst < (__force u16)keys->ports.src))) { - swap(keys->addrs.dst, keys->addrs.src); - swap(keys->ports.src, keys->ports.dst); - } + __flow_hash_consistentify(keys); hash = __flow_hash_words((u32 *)flow_keys_hash_start(keys), flow_keys_hash_length(keys), keyval); @@ -451,8 +525,8 @@ void make_flow_keys_digest(struct flow_keys_digest *digest, data->n_proto = flow->basic.n_proto; data->ip_proto = flow->basic.ip_proto; data->ports = flow->ports.ports; - data->src 
= flow->addrs.src; - data->dst = flow->addrs.dst; + data->src = flow->addrs.v4addrs.src; + data->dst = flow->addrs.v4addrs.dst; } EXPORT_SYMBOL(make_flow_keys_digest); @@ -566,11 +640,15 @@ static const struct flow_dissector_key flow_keys_dissector_keys[] = { }, { .key_id = FLOW_DISSECTOR_KEY_IPV4_ADDRS, - .offset = offsetof(struct flow_keys, addrs), + .offset = offsetof(struct flow_keys, addrs.v4addrs), + }, + { + .key_id = FLOW_DISSECTOR_KEY_IPV6_ADDRS, + .offset = offsetof(struct flow_keys, addrs.v6addrs), }, { .key_id = FLOW_DISSECTOR_KEY_IPV6_HASH_ADDRS, - .offset = offsetof(struct flow_keys, addrs), + .offset = offsetof(struct flow_keys, addrs.v4addrs), }, { .key_id = FLOW_DISSECTOR_KEY_PORTS, diff --git a/net/ethernet/eth.c b/net/ethernet/eth.c index 7d0e239a6755..77e0f0e7a88e 100644 --- a/net/ethernet/eth.c +++ b/net/ethernet/eth.c @@ -133,7 +133,7 @@ u32 eth_get_headlen(void *data, unsigned int len) /* parse any remaining L2/L3 headers, check for L4 */ if (!skb_flow_dissect_flow_keys_buf(&keys, data, eth->h_proto, sizeof(*eth), len)) - return max_t(u32, keys.basic.thoff, sizeof(*eth)); + return max_t(u32, keys.control.thoff, sizeof(*eth)); /* parse for any L4 headers */ return min_t(u32, __skb_get_poff(NULL, data, &keys, len), len); diff --git a/net/sched/cls_flow.c b/net/sched/cls_flow.c index b4359924846c..76bc3a20ffdb 100644 --- a/net/sched/cls_flow.c +++ b/net/sched/cls_flow.c @@ -68,15 +68,21 @@ static inline u32 addr_fold(void *addr) static u32 flow_get_src(const struct sk_buff *skb, const struct flow_keys *flow) { - if (flow->addrs.src) - return ntohl(flow->addrs.src); + __be32 src = flow_get_u32_src(flow); + + if (src) + return ntohl(src); + return addr_fold(skb->sk); } static u32 flow_get_dst(const struct sk_buff *skb, const struct flow_keys *flow) { - if (flow->addrs.dst) - return ntohl(flow->addrs.dst); + __be32 dst = flow_get_u32_dst(flow); + + if (dst) + return ntohl(dst); + return addr_fold(skb_dst(skb)) ^ (__force u16) tc_skb_protocol(skb); } diff --git a/net/sched/cls_flower.c b/net/sched/cls_flower.c index 5a7d66c59684..b92d3f49c23e 100644 --- a/net/sched/cls_flower.c +++ b/net/sched/cls_flower.c @@ -28,8 +28,9 @@ struct fl_flow_key { struct flow_dissector_key_control control; struct flow_dissector_key_basic basic; struct flow_dissector_key_eth_addrs eth; + struct flow_dissector_key_addrs ipaddrs; union { - struct flow_dissector_key_addrs ipv4; + struct flow_dissector_key_ipv4_addrs ipv4; struct flow_dissector_key_ipv6_addrs ipv6; }; struct flow_dissector_key_ports tp; @@ -260,14 +261,14 @@ static int fl_set_key(struct net *net, struct nlattr **tb, &mask->basic.ip_proto, TCA_FLOWER_UNSPEC, sizeof(key->basic.ip_proto)); } - if (key->basic.n_proto == htons(ETH_P_IP)) { + if (key->control.addr_type == FLOW_DISSECTOR_KEY_IPV4_ADDRS) { fl_set_key_val(tb, &key->ipv4.src, TCA_FLOWER_KEY_IPV4_SRC, &mask->ipv4.src, TCA_FLOWER_KEY_IPV4_SRC_MASK, sizeof(key->ipv4.src)); fl_set_key_val(tb, &key->ipv4.dst, TCA_FLOWER_KEY_IPV4_DST, &mask->ipv4.dst, TCA_FLOWER_KEY_IPV4_DST_MASK, sizeof(key->ipv4.dst)); - } else if (key->basic.n_proto == htons(ETH_P_IPV6)) { + } else if (key->control.addr_type == FLOW_DISSECTOR_KEY_IPV6_ADDRS) { fl_set_key_val(tb, &key->ipv6.src, TCA_FLOWER_KEY_IPV6_SRC, &mask->ipv6.src, TCA_FLOWER_KEY_IPV6_SRC_MASK, sizeof(key->ipv6.src)); @@ -610,7 +611,7 @@ static int fl_dump(struct net *net, struct tcf_proto *tp, unsigned long fh, sizeof(key->basic.ip_proto))) goto nla_put_failure; - if (key->basic.n_proto == htons(ETH_P_IP) && + if (key->control.addr_type == 
FLOW_DISSECTOR_KEY_IPV4_ADDRS && (fl_dump_key_val(skb, &key->ipv4.src, TCA_FLOWER_KEY_IPV4_SRC, &mask->ipv4.src, TCA_FLOWER_KEY_IPV4_SRC_MASK, sizeof(key->ipv4.src)) || @@ -618,7 +619,7 @@ static int fl_dump(struct net *net, struct tcf_proto *tp, unsigned long fh, &mask->ipv4.dst, TCA_FLOWER_KEY_IPV4_DST_MASK, sizeof(key->ipv4.dst)))) goto nla_put_failure; - else if (key->basic.n_proto == htons(ETH_P_IPV6) && + else if (key->control.addr_type == FLOW_DISSECTOR_KEY_IPV6_ADDRS && (fl_dump_key_val(skb, &key->ipv6.src, TCA_FLOWER_KEY_IPV6_SRC, &mask->ipv6.src, TCA_FLOWER_KEY_IPV6_SRC_MASK, sizeof(key->ipv6.src)) || -- cgit v1.2.3 From 3431205e03977aaf32bce6d4b16fb8244b510056 Mon Sep 17 00:00:00 2001 From: Alexei Starovoitov Date: Thu, 4 Jun 2015 10:11:53 -0700 Subject: bpf: make programs see skb->data == L2 for ingress and egress eBPF programs attached to ingress and egress qdiscs see inconsistent skb->data: for ingress, the L2 header is already pulled, whereas for egress it is still present. This is known to program writers, who are currently forced to use the BPF_LL_OFF workaround. Since programs don't change skb internal pointers, it is safe to do the pull/push right around the invocation of the program; earlier taps and the later pt->func() will not be affected. Multiple taps via packet_rcv() and tpacket_rcv() already do the same trick around run_filter/BPF_PROG_RUN, even if skb_shared(). This fix finally allows programs to use the optimized LD_ABS/IND instructions without BPF_LL_OFF for higher performance:

tc ingress + cls_bpf + samples/bpf/tcbpf1_kern.o
        w/o JIT   w/JIT
before   20.5      23.6 Mpps
after    21.8      26.6 Mpps

Old programs with BPF_LL_OFF will still work as-is. We can now undo most of the earlier workaround commit: a166151cbe33 ("bpf: fix bpf helpers to use skb->mac_header relative offsets") Signed-off-by: Alexei Starovoitov Acked-by: Jamal Hadi Salim Signed-off-by: David S. Miller --- net/core/filter.c | 26 +++----------------------- net/sched/act_bpf.c | 9 ++++++++- net/sched/cls_bpf.c | 16 +++++++++++++++- samples/bpf/tcbpf1_kern.c | 8 ++++---- 4 files changed, 30 insertions(+), 29 deletions(-) (limited to 'net/sched') diff --git a/net/core/filter.c b/net/core/filter.c index 09b2062eb5b8..36a69e33d76b 100644 --- a/net/core/filter.c +++ b/net/core/filter.c @@ -1238,21 +1238,6 @@ int sk_attach_bpf(u32 ufd, struct sock *sk) return 0; } -/** - * bpf_skb_clone_not_writable - is the header of a clone not writable - * @skb: buffer to check - * @len: length up to which to write, can be negative - * - * Returns true if modifying the header part of the cloned buffer - * does require the data to be copied. I.e. this version works with - * negative lengths needed for eBPF case! 
- */ -static bool bpf_skb_clone_unwritable(const struct sk_buff *skb, int len) -{ - return skb_header_cloned(skb) || - (int) skb_headroom(skb) + len > skb->hdr_len; -} - #define BPF_RECOMPUTE_CSUM(flags) ((flags) & 1) static u64 bpf_skb_store_bytes(u64 r1, u64 r2, u64 r3, u64 r4, u64 flags) @@ -1275,9 +1260,8 @@ static u64 bpf_skb_store_bytes(u64 r1, u64 r2, u64 r3, u64 r4, u64 flags) if (unlikely((u32) offset > 0xffff || len > sizeof(buf))) return -EFAULT; - offset -= skb->data - skb_mac_header(skb); if (unlikely(skb_cloned(skb) && - bpf_skb_clone_unwritable(skb, offset + len))) + !skb_clone_writable(skb, offset + len))) return -EFAULT; ptr = skb_header_pointer(skb, offset, len, buf); @@ -1321,9 +1305,8 @@ static u64 bpf_l3_csum_replace(u64 r1, u64 r2, u64 from, u64 to, u64 flags) if (unlikely((u32) offset > 0xffff)) return -EFAULT; - offset -= skb->data - skb_mac_header(skb); if (unlikely(skb_cloned(skb) && - bpf_skb_clone_unwritable(skb, offset + sizeof(sum)))) + !skb_clone_writable(skb, offset + sizeof(sum)))) return -EFAULT; ptr = skb_header_pointer(skb, offset, sizeof(sum), &sum); @@ -1369,9 +1352,8 @@ static u64 bpf_l4_csum_replace(u64 r1, u64 r2, u64 from, u64 to, u64 flags) if (unlikely((u32) offset > 0xffff)) return -EFAULT; - offset -= skb->data - skb_mac_header(skb); if (unlikely(skb_cloned(skb) && - bpf_skb_clone_unwritable(skb, offset + sizeof(sum)))) + !skb_clone_writable(skb, offset + sizeof(sum)))) return -EFAULT; ptr = skb_header_pointer(skb, offset, sizeof(sum), &sum); @@ -1425,8 +1407,6 @@ static u64 bpf_clone_redirect(u64 r1, u64 ifindex, u64 flags, u64 r4, u64 r5) if (unlikely(!skb2)) return -ENOMEM; - skb_push(skb2, skb2->data - skb_mac_header(skb2)); - if (BPF_IS_REDIRECT_INGRESS(flags)) return dev_forward_skb(dev, skb2); diff --git a/net/sched/act_bpf.c b/net/sched/act_bpf.c index dc6a2d324bd8..1d56903fd4c7 100644 --- a/net/sched/act_bpf.c +++ b/net/sched/act_bpf.c @@ -37,6 +37,7 @@ static int tcf_bpf(struct sk_buff *skb, const struct tc_action *act, { struct tcf_bpf *prog = act->priv; int action, filter_res; + bool at_ingress = G_TC_AT(skb->tc_verd) & AT_INGRESS; if (unlikely(!skb_mac_header_was_set(skb))) return TC_ACT_UNSPEC; @@ -48,7 +49,13 @@ static int tcf_bpf(struct sk_buff *skb, const struct tc_action *act, /* Needed here for accessing maps. */ rcu_read_lock(); - filter_res = BPF_PROG_RUN(prog->filter, skb); + if (at_ingress) { + __skb_push(skb, skb->mac_len); + filter_res = BPF_PROG_RUN(prog->filter, skb); + __skb_pull(skb, skb->mac_len); + } else { + filter_res = BPF_PROG_RUN(prog->filter, skb); + } rcu_read_unlock(); /* A BPF program may overwrite the default action opcode. diff --git a/net/sched/cls_bpf.c b/net/sched/cls_bpf.c index 91bd9c19471d..c79ecfd36e0f 100644 --- a/net/sched/cls_bpf.c +++ b/net/sched/cls_bpf.c @@ -64,6 +64,11 @@ static int cls_bpf_classify(struct sk_buff *skb, const struct tcf_proto *tp, { struct cls_bpf_head *head = rcu_dereference_bh(tp->root); struct cls_bpf_prog *prog; +#ifdef CONFIG_NET_CLS_ACT + bool at_ingress = G_TC_AT(skb->tc_verd) & AT_INGRESS; +#else + bool at_ingress = false; +#endif int ret = -1; if (unlikely(!skb_mac_header_was_set(skb))) @@ -72,7 +77,16 @@ static int cls_bpf_classify(struct sk_buff *skb, const struct tcf_proto *tp, /* Needed here for accessing maps. 
*/ rcu_read_lock(); list_for_each_entry_rcu(prog, &head->plist, link) { - int filter_res = BPF_PROG_RUN(prog->filter, skb); + int filter_res; + + if (at_ingress) { + /* It is safe to push/pull even if skb_shared() */ + __skb_push(skb, skb->mac_len); + filter_res = BPF_PROG_RUN(prog->filter, skb); + __skb_pull(skb, skb->mac_len); + } else { + filter_res = BPF_PROG_RUN(prog->filter, skb); + } if (filter_res == 0) continue; diff --git a/samples/bpf/tcbpf1_kern.c b/samples/bpf/tcbpf1_kern.c index 7c27710f8296..9bfb2eb34563 100644 --- a/samples/bpf/tcbpf1_kern.c +++ b/samples/bpf/tcbpf1_kern.c @@ -21,7 +21,7 @@ static inline void set_dst_mac(struct __sk_buff *skb, char *mac) static inline void set_ip_tos(struct __sk_buff *skb, __u8 new_tos) { - __u8 old_tos = load_byte(skb, BPF_LL_OFF + TOS_OFF); + __u8 old_tos = load_byte(skb, TOS_OFF); bpf_l3_csum_replace(skb, IP_CSUM_OFF, htons(old_tos), htons(new_tos), 2); bpf_skb_store_bytes(skb, TOS_OFF, &new_tos, sizeof(new_tos), 0); @@ -34,7 +34,7 @@ static inline void set_ip_tos(struct __sk_buff *skb, __u8 new_tos) static inline void set_tcp_ip_src(struct __sk_buff *skb, __u32 new_ip) { - __u32 old_ip = _htonl(load_word(skb, BPF_LL_OFF + IP_SRC_OFF)); + __u32 old_ip = _htonl(load_word(skb, IP_SRC_OFF)); bpf_l4_csum_replace(skb, TCP_CSUM_OFF, old_ip, new_ip, IS_PSEUDO | sizeof(new_ip)); bpf_l3_csum_replace(skb, IP_CSUM_OFF, old_ip, new_ip, sizeof(new_ip)); @@ -44,7 +44,7 @@ static inline void set_tcp_ip_src(struct __sk_buff *skb, __u32 new_ip) #define TCP_DPORT_OFF (ETH_HLEN + sizeof(struct iphdr) + offsetof(struct tcphdr, dest)) static inline void set_tcp_dest_port(struct __sk_buff *skb, __u16 new_port) { - __u16 old_port = htons(load_half(skb, BPF_LL_OFF + TCP_DPORT_OFF)); + __u16 old_port = htons(load_half(skb, TCP_DPORT_OFF)); bpf_l4_csum_replace(skb, TCP_CSUM_OFF, old_port, new_port, sizeof(new_port)); bpf_skb_store_bytes(skb, TCP_DPORT_OFF, &new_port, sizeof(new_port), 0); @@ -53,7 +53,7 @@ static inline void set_tcp_dest_port(struct __sk_buff *skb, __u16 new_port) SEC("classifier") int bpf_prog1(struct __sk_buff *skb) { - __u8 proto = load_byte(skb, BPF_LL_OFF + ETH_HLEN + offsetof(struct iphdr, protocol)); + __u8 proto = load_byte(skb, ETH_HLEN + offsetof(struct iphdr, protocol)); long *value; if (proto == IPPROTO_TCP) { -- cgit v1.2.3
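To illustrate what this buys program authors: with skb->data now at L2 in both directions, a classifier reads packet fields at plain Ethernet-relative offsets. Below is a minimal sketch in the style of samples/bpf/tcbpf1_kern.c; the program itself is hypothetical, while SEC() and load_byte() are the helpers that file already uses.

#include <uapi/linux/bpf.h>
#include <uapi/linux/if_ether.h>
#include <uapi/linux/ip.h>
#include <uapi/linux/in.h>
#include "bpf_helpers.h"

SEC("classifier")
int tc_l2_prog(struct __sk_buff *skb)
{
	/* No BPF_LL_OFF compensation needed: skb->data points at the
	 * Ethernet header on both ingress and egress after this change.
	 */
	__u8 proto = load_byte(skb, ETH_HLEN + offsetof(struct iphdr, protocol));

	/* In cls_bpf, 0 means no match (try the next prog in the list),
	 * -1 means match with the default classid.
	 */
	return proto == IPPROTO_TCP ? -1 : 0;
}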
From 17cebfd097fe8a21902602db5196f48e5a9d34e8 Mon Sep 17 00:00:00 2001 From: "Eric W. Biederman" Date: Wed, 17 Jun 2015 10:28:17 -0500 Subject: net: sched: Simplify em_ipset_match em->net is always set and always available; use it in preference to dev_net(skb->dev). Signed-off-by: "Eric W. Biederman" Signed-off-by: Pablo Neira Ayuso --- net/sched/em_ipset.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'net/sched') diff --git a/net/sched/em_ipset.c b/net/sched/em_ipset.c index a3d79c8bf3b8..df0328ba6a48 100644 --- a/net/sched/em_ipset.c +++ b/net/sched/em_ipset.c @@ -92,8 +92,8 @@ static int em_ipset_match(struct sk_buff *skb, struct tcf_ematch *em, rcu_read_lock(); - if (dev && skb->skb_iif) - indev = dev_get_by_index_rcu(dev_net(dev), skb->skb_iif); + if (skb->skb_iif) + indev = dev_get_by_index_rcu(em->net, skb->skb_iif); acpar.in = indev ? indev : dev; acpar.out = dev; -- cgit v1.2.3 From a55e1c5c2640549ff3beb843f7ecaa242d69c351 Mon Sep 17 00:00:00 2001 From: Andrea Parri Date: Wed, 17 Jun 2015 00:16:59 +0200 Subject: pkt_sched: sch_qfq: remove redundant -if- control statement The !hlist_unhashed() check in qfq_destroy_agg() is unnecessary because it is already performed by hlist_del_init(), so remove it. Signed-off-by: Andrea Parri Signed-off-by: David S. Miller --- net/sched/sch_qfq.c | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) (limited to 'net/sched') diff --git a/net/sched/sch_qfq.c b/net/sched/sch_qfq.c index 3ec7e88a43ca..b8d73bca683c 100644 --- a/net/sched/sch_qfq.c +++ b/net/sched/sch_qfq.c @@ -339,8 +339,7 @@ static struct qfq_aggregate *qfq_choose_next_agg(struct qfq_sched *); static void qfq_destroy_agg(struct qfq_sched *q, struct qfq_aggregate *agg) { - if (!hlist_unhashed(&agg->nonfull_next)) - hlist_del_init(&agg->nonfull_next); + hlist_del_init(&agg->nonfull_next); q->wsum -= agg->class_weight; if (q->wsum != 0) q->iwsum = ONE_FP / q->wsum; -- cgit v1.2.3
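For reference, the redundancy removed by the last patch is visible in the hlist helpers themselves; an abridged sketch of the include/linux/list.h definitions of this era (__hlist_del() is the low-level unlink):

static inline int hlist_unhashed(const struct hlist_node *h)
{
	/* pprev is NULL once the node is off any list */
	return !h->pprev;
}

static inline void hlist_del_init(struct hlist_node *n)
{
	/* the guard qfq_destroy_agg() used to duplicate lives here */
	if (!hlist_unhashed(n)) {
		__hlist_del(n);
		INIT_HLIST_NODE(n);
	}
}

Calling hlist_del_init() on an already-unhashed node is therefore a no-op, and the caller-side test added nothing.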