diff options
Diffstat (limited to 'net/sched')
-rw-r--r-- | net/sched/Kconfig | 2 | ||||
-rw-r--r-- | net/sched/act_api.c | 43 | ||||
-rw-r--r-- | net/sched/act_bpf.c | 3 | ||||
-rw-r--r-- | net/sched/act_ct.c | 569 | ||||
-rw-r--r-- | net/sched/act_pedit.c | 11 | ||||
-rw-r--r-- | net/sched/act_sample.c | 2 | ||||
-rw-r--r-- | net/sched/act_skbedit.c | 11 | ||||
-rw-r--r-- | net/sched/cls_api.c | 168 | ||||
-rw-r--r-- | net/sched/cls_flower.c | 70 | ||||
-rw-r--r-- | net/sched/cls_matchall.c | 8 | ||||
-rw-r--r-- | net/sched/em_ipt.c | 2 | ||||
-rw-r--r-- | net/sched/em_nbyte.c | 2 | ||||
-rw-r--r-- | net/sched/sch_api.c | 21 | ||||
-rw-r--r-- | net/sched/sch_atm.c | 2 | ||||
-rw-r--r-- | net/sched/sch_fifo.c | 97 | ||||
-rw-r--r-- | net/sched/sch_fq.c | 21 | ||||
-rw-r--r-- | net/sched/sch_fq_pie.c | 1 | ||||
-rw-r--r-- | net/sched/sch_generic.c | 8 | ||||
-rw-r--r-- | net/sched/sch_ingress.c | 11 | ||||
-rw-r--r-- | net/sched/sch_netem.c | 2 | ||||
-rw-r--r-- | net/sched/sch_pie.c | 49 | ||||
-rw-r--r-- | net/sched/sch_red.c | 69 |
22 files changed, 1045 insertions, 127 deletions
diff --git a/net/sched/Kconfig b/net/sched/Kconfig index edde0e519438..bfbefb7bff9d 100644 --- a/net/sched/Kconfig +++ b/net/sched/Kconfig @@ -972,7 +972,7 @@ config NET_ACT_TUNNEL_KEY config NET_ACT_CT tristate "connection tracking tc action" - depends on NET_CLS_ACT && NF_CONNTRACK && NF_NAT + depends on NET_CLS_ACT && NF_CONNTRACK && NF_NAT && NF_FLOW_TABLE help Say Y here to allow sending the packets to conntrack module. diff --git a/net/sched/act_api.c b/net/sched/act_api.c index 8c466a712cda..df4560909157 100644 --- a/net/sched/act_api.c +++ b/net/sched/act_api.c @@ -185,6 +185,7 @@ static size_t tcf_action_shared_attrs_size(const struct tc_action *act) return nla_total_size(0) /* action number nested */ + nla_total_size(IFNAMSIZ) /* TCA_ACT_KIND */ + cookie_len /* TCA_ACT_COOKIE */ + + nla_total_size(sizeof(struct nla_bitfield32)) /* TCA_ACT_HW_STATS */ + nla_total_size(0) /* TCA_ACT_STATS nested */ + nla_total_size(sizeof(struct nla_bitfield32)) /* TCA_ACT_FLAGS */ /* TCA_STATS_BASIC */ @@ -788,13 +789,20 @@ tcf_action_dump_1(struct sk_buff *skb, struct tc_action *a, int bind, int ref) } rcu_read_unlock(); - if (a->tcfa_flags) { - struct nla_bitfield32 flags = { a->tcfa_flags, - a->tcfa_flags, }; + if (a->hw_stats != TCA_ACT_HW_STATS_ANY && + nla_put_bitfield32(skb, TCA_ACT_HW_STATS, + a->hw_stats, TCA_ACT_HW_STATS_ANY)) + goto nla_put_failure; - if (nla_put(skb, TCA_ACT_FLAGS, sizeof(flags), &flags)) - goto nla_put_failure; - } + if (a->used_hw_stats_valid && + nla_put_bitfield32(skb, TCA_ACT_USED_HW_STATS, + a->used_hw_stats, TCA_ACT_HW_STATS_ANY)) + goto nla_put_failure; + + if (a->tcfa_flags && + nla_put_bitfield32(skb, TCA_ACT_FLAGS, + a->tcfa_flags, a->tcfa_flags)) + goto nla_put_failure; nest = nla_nest_start_noflag(skb, TCA_OPTIONS); if (nest == NULL) @@ -854,7 +862,23 @@ static struct tc_cookie *nla_memdup_cookie(struct nlattr **tb) return c; } +static u8 tcf_action_hw_stats_get(struct nlattr *hw_stats_attr) +{ + struct nla_bitfield32 hw_stats_bf; + + /* If the user did not pass the attr, that means he does + * not care about the type. Return "any" in that case + * which is setting on all supported types. + */ + if (!hw_stats_attr) + return TCA_ACT_HW_STATS_ANY; + hw_stats_bf = nla_get_bitfield32(hw_stats_attr); + return hw_stats_bf.value; +} + static const u32 tca_act_flags_allowed = TCA_ACT_FLAGS_NO_PERCPU_STATS; +static const u32 tca_act_hw_stats_allowed = TCA_ACT_HW_STATS_ANY; + static const struct nla_policy tcf_action_policy[TCA_ACT_MAX + 1] = { [TCA_ACT_KIND] = { .type = NLA_STRING }, [TCA_ACT_INDEX] = { .type = NLA_U32 }, @@ -863,6 +887,8 @@ static const struct nla_policy tcf_action_policy[TCA_ACT_MAX + 1] = { [TCA_ACT_OPTIONS] = { .type = NLA_NESTED }, [TCA_ACT_FLAGS] = { .type = NLA_BITFIELD32, .validation_data = &tca_act_flags_allowed }, + [TCA_ACT_HW_STATS] = { .type = NLA_BITFIELD32, + .validation_data = &tca_act_hw_stats_allowed }, }; struct tc_action *tcf_action_init_1(struct net *net, struct tcf_proto *tp, @@ -872,6 +898,7 @@ struct tc_action *tcf_action_init_1(struct net *net, struct tcf_proto *tp, struct netlink_ext_ack *extack) { struct nla_bitfield32 flags = { 0, 0 }; + u8 hw_stats = TCA_ACT_HW_STATS_ANY; struct tc_action *a; struct tc_action_ops *a_o; struct tc_cookie *cookie = NULL; @@ -903,6 +930,7 @@ struct tc_action *tcf_action_init_1(struct net *net, struct tcf_proto *tp, goto err_out; } } + hw_stats = tcf_action_hw_stats_get(tb[TCA_ACT_HW_STATS]); if (tb[TCA_ACT_FLAGS]) flags = nla_get_bitfield32(tb[TCA_ACT_FLAGS]); } else { @@ -953,6 +981,9 @@ struct tc_action *tcf_action_init_1(struct net *net, struct tcf_proto *tp, if (!name && tb[TCA_ACT_COOKIE]) tcf_set_action_cookie(&a->act_cookie, cookie); + if (!name) + a->hw_stats = hw_stats; + /* module count goes up only when brand new policy is created * if it exists and is only bound to in a_o->init() then * ACT_P_CREATED is not returned (a zero is). diff --git a/net/sched/act_bpf.c b/net/sched/act_bpf.c index 46f47e58b3be..54d5652cfe6c 100644 --- a/net/sched/act_bpf.c +++ b/net/sched/act_bpf.c @@ -12,6 +12,7 @@ #include <linux/bpf.h> #include <net/netlink.h> +#include <net/sock.h> #include <net/pkt_sched.h> #include <net/pkt_cls.h> @@ -53,6 +54,8 @@ static int tcf_bpf_act(struct sk_buff *skb, const struct tc_action *act, bpf_compute_data_pointers(skb); filter_res = BPF_PROG_RUN(filter, skb); } + if (skb_sk_is_prefetched(skb) && filter_res != TC_ACT_OK) + skb_orphan(skb); rcu_read_unlock(); /* A BPF program may overwrite the default action opcode. diff --git a/net/sched/act_ct.c b/net/sched/act_ct.c index 41114b463161..1a766393be62 100644 --- a/net/sched/act_ct.c +++ b/net/sched/act_ct.c @@ -15,6 +15,7 @@ #include <linux/pkt_cls.h> #include <linux/ip.h> #include <linux/ipv6.h> +#include <linux/rhashtable.h> #include <net/netlink.h> #include <net/pkt_sched.h> #include <net/pkt_cls.h> @@ -24,6 +25,7 @@ #include <uapi/linux/tc_act/tc_ct.h> #include <net/tc_act/tc_ct.h> +#include <net/netfilter/nf_flow_table.h> #include <net/netfilter/nf_conntrack.h> #include <net/netfilter/nf_conntrack_core.h> #include <net/netfilter/nf_conntrack_zones.h> @@ -31,6 +33,523 @@ #include <net/netfilter/ipv6/nf_defrag_ipv6.h> #include <uapi/linux/netfilter/nf_nat.h> +static struct workqueue_struct *act_ct_wq; +static struct rhashtable zones_ht; +static DEFINE_MUTEX(zones_mutex); + +struct tcf_ct_flow_table { + struct rhash_head node; /* In zones tables */ + + struct rcu_work rwork; + struct nf_flowtable nf_ft; + refcount_t ref; + u16 zone; + + bool dying; +}; + +static const struct rhashtable_params zones_params = { + .head_offset = offsetof(struct tcf_ct_flow_table, node), + .key_offset = offsetof(struct tcf_ct_flow_table, zone), + .key_len = sizeof_field(struct tcf_ct_flow_table, zone), + .automatic_shrinking = true, +}; + +static struct flow_action_entry * +tcf_ct_flow_table_flow_action_get_next(struct flow_action *flow_action) +{ + int i = flow_action->num_entries++; + + return &flow_action->entries[i]; +} + +static void tcf_ct_add_mangle_action(struct flow_action *action, + enum flow_action_mangle_base htype, + u32 offset, + u32 mask, + u32 val) +{ + struct flow_action_entry *entry; + + entry = tcf_ct_flow_table_flow_action_get_next(action); + entry->id = FLOW_ACTION_MANGLE; + entry->mangle.htype = htype; + entry->mangle.mask = ~mask; + entry->mangle.offset = offset; + entry->mangle.val = val; +} + +/* The following nat helper functions check if the inverted reverse tuple + * (target) is different then the current dir tuple - meaning nat for ports + * and/or ip is needed, and add the relevant mangle actions. + */ +static void +tcf_ct_flow_table_add_action_nat_ipv4(const struct nf_conntrack_tuple *tuple, + struct nf_conntrack_tuple target, + struct flow_action *action) +{ + if (memcmp(&target.src.u3, &tuple->src.u3, sizeof(target.src.u3))) + tcf_ct_add_mangle_action(action, FLOW_ACT_MANGLE_HDR_TYPE_IP4, + offsetof(struct iphdr, saddr), + 0xFFFFFFFF, + be32_to_cpu(target.src.u3.ip)); + if (memcmp(&target.dst.u3, &tuple->dst.u3, sizeof(target.dst.u3))) + tcf_ct_add_mangle_action(action, FLOW_ACT_MANGLE_HDR_TYPE_IP4, + offsetof(struct iphdr, daddr), + 0xFFFFFFFF, + be32_to_cpu(target.dst.u3.ip)); +} + +static void +tcf_ct_add_ipv6_addr_mangle_action(struct flow_action *action, + union nf_inet_addr *addr, + u32 offset) +{ + int i; + + for (i = 0; i < sizeof(struct in6_addr) / sizeof(u32); i++) + tcf_ct_add_mangle_action(action, FLOW_ACT_MANGLE_HDR_TYPE_IP6, + i * sizeof(u32) + offset, + 0xFFFFFFFF, be32_to_cpu(addr->ip6[i])); +} + +static void +tcf_ct_flow_table_add_action_nat_ipv6(const struct nf_conntrack_tuple *tuple, + struct nf_conntrack_tuple target, + struct flow_action *action) +{ + if (memcmp(&target.src.u3, &tuple->src.u3, sizeof(target.src.u3))) + tcf_ct_add_ipv6_addr_mangle_action(action, &target.src.u3, + offsetof(struct ipv6hdr, + saddr)); + if (memcmp(&target.dst.u3, &tuple->dst.u3, sizeof(target.dst.u3))) + tcf_ct_add_ipv6_addr_mangle_action(action, &target.dst.u3, + offsetof(struct ipv6hdr, + daddr)); +} + +static void +tcf_ct_flow_table_add_action_nat_tcp(const struct nf_conntrack_tuple *tuple, + struct nf_conntrack_tuple target, + struct flow_action *action) +{ + __be16 target_src = target.src.u.tcp.port; + __be16 target_dst = target.dst.u.tcp.port; + + if (target_src != tuple->src.u.tcp.port) + tcf_ct_add_mangle_action(action, FLOW_ACT_MANGLE_HDR_TYPE_TCP, + offsetof(struct tcphdr, source), + 0xFFFF, be16_to_cpu(target_src)); + if (target_dst != tuple->dst.u.tcp.port) + tcf_ct_add_mangle_action(action, FLOW_ACT_MANGLE_HDR_TYPE_TCP, + offsetof(struct tcphdr, dest), + 0xFFFF, be16_to_cpu(target_dst)); +} + +static void +tcf_ct_flow_table_add_action_nat_udp(const struct nf_conntrack_tuple *tuple, + struct nf_conntrack_tuple target, + struct flow_action *action) +{ + __be16 target_src = target.src.u.udp.port; + __be16 target_dst = target.dst.u.udp.port; + + if (target_src != tuple->src.u.udp.port) + tcf_ct_add_mangle_action(action, FLOW_ACT_MANGLE_HDR_TYPE_TCP, + offsetof(struct udphdr, source), + 0xFFFF, be16_to_cpu(target_src)); + if (target_dst != tuple->dst.u.udp.port) + tcf_ct_add_mangle_action(action, FLOW_ACT_MANGLE_HDR_TYPE_TCP, + offsetof(struct udphdr, dest), + 0xFFFF, be16_to_cpu(target_dst)); +} + +static void tcf_ct_flow_table_add_action_meta(struct nf_conn *ct, + enum ip_conntrack_dir dir, + struct flow_action *action) +{ + struct nf_conn_labels *ct_labels; + struct flow_action_entry *entry; + enum ip_conntrack_info ctinfo; + u32 *act_ct_labels; + + entry = tcf_ct_flow_table_flow_action_get_next(action); + entry->id = FLOW_ACTION_CT_METADATA; +#if IS_ENABLED(CONFIG_NF_CONNTRACK_MARK) + entry->ct_metadata.mark = ct->mark; +#endif + ctinfo = dir == IP_CT_DIR_ORIGINAL ? IP_CT_ESTABLISHED : + IP_CT_ESTABLISHED_REPLY; + /* aligns with the CT reference on the SKB nf_ct_set */ + entry->ct_metadata.cookie = (unsigned long)ct | ctinfo; + + act_ct_labels = entry->ct_metadata.labels; + ct_labels = nf_ct_labels_find(ct); + if (ct_labels) + memcpy(act_ct_labels, ct_labels->bits, NF_CT_LABELS_MAX_SIZE); + else + memset(act_ct_labels, 0, NF_CT_LABELS_MAX_SIZE); +} + +static int tcf_ct_flow_table_add_action_nat(struct net *net, + struct nf_conn *ct, + enum ip_conntrack_dir dir, + struct flow_action *action) +{ + const struct nf_conntrack_tuple *tuple = &ct->tuplehash[dir].tuple; + struct nf_conntrack_tuple target; + + nf_ct_invert_tuple(&target, &ct->tuplehash[!dir].tuple); + + switch (tuple->src.l3num) { + case NFPROTO_IPV4: + tcf_ct_flow_table_add_action_nat_ipv4(tuple, target, + action); + break; + case NFPROTO_IPV6: + tcf_ct_flow_table_add_action_nat_ipv6(tuple, target, + action); + break; + default: + return -EOPNOTSUPP; + } + + switch (nf_ct_protonum(ct)) { + case IPPROTO_TCP: + tcf_ct_flow_table_add_action_nat_tcp(tuple, target, action); + break; + case IPPROTO_UDP: + tcf_ct_flow_table_add_action_nat_udp(tuple, target, action); + break; + default: + return -EOPNOTSUPP; + } + + return 0; +} + +static int tcf_ct_flow_table_fill_actions(struct net *net, + const struct flow_offload *flow, + enum flow_offload_tuple_dir tdir, + struct nf_flow_rule *flow_rule) +{ + struct flow_action *action = &flow_rule->rule->action; + int num_entries = action->num_entries; + struct nf_conn *ct = flow->ct; + enum ip_conntrack_dir dir; + int i, err; + + switch (tdir) { + case FLOW_OFFLOAD_DIR_ORIGINAL: + dir = IP_CT_DIR_ORIGINAL; + break; + case FLOW_OFFLOAD_DIR_REPLY: + dir = IP_CT_DIR_REPLY; + break; + default: + return -EOPNOTSUPP; + } + + err = tcf_ct_flow_table_add_action_nat(net, ct, dir, action); + if (err) + goto err_nat; + + tcf_ct_flow_table_add_action_meta(ct, dir, action); + return 0; + +err_nat: + /* Clear filled actions */ + for (i = num_entries; i < action->num_entries; i++) + memset(&action->entries[i], 0, sizeof(action->entries[i])); + action->num_entries = num_entries; + + return err; +} + +static struct nf_flowtable_type flowtable_ct = { + .action = tcf_ct_flow_table_fill_actions, + .owner = THIS_MODULE, +}; + +static int tcf_ct_flow_table_get(struct tcf_ct_params *params) +{ + struct tcf_ct_flow_table *ct_ft; + int err = -ENOMEM; + + mutex_lock(&zones_mutex); + ct_ft = rhashtable_lookup_fast(&zones_ht, ¶ms->zone, zones_params); + if (ct_ft && refcount_inc_not_zero(&ct_ft->ref)) + goto out_unlock; + + ct_ft = kzalloc(sizeof(*ct_ft), GFP_KERNEL); + if (!ct_ft) + goto err_alloc; + refcount_set(&ct_ft->ref, 1); + + ct_ft->zone = params->zone; + err = rhashtable_insert_fast(&zones_ht, &ct_ft->node, zones_params); + if (err) + goto err_insert; + + ct_ft->nf_ft.type = &flowtable_ct; + ct_ft->nf_ft.flags |= NF_FLOWTABLE_HW_OFFLOAD; + err = nf_flow_table_init(&ct_ft->nf_ft); + if (err) + goto err_init; + + __module_get(THIS_MODULE); +out_unlock: + params->ct_ft = ct_ft; + params->nf_ft = &ct_ft->nf_ft; + mutex_unlock(&zones_mutex); + + return 0; + +err_init: + rhashtable_remove_fast(&zones_ht, &ct_ft->node, zones_params); +err_insert: + kfree(ct_ft); +err_alloc: + mutex_unlock(&zones_mutex); + return err; +} + +static void tcf_ct_flow_table_cleanup_work(struct work_struct *work) +{ + struct tcf_ct_flow_table *ct_ft; + + ct_ft = container_of(to_rcu_work(work), struct tcf_ct_flow_table, + rwork); + nf_flow_table_free(&ct_ft->nf_ft); + kfree(ct_ft); + + module_put(THIS_MODULE); +} + +static void tcf_ct_flow_table_put(struct tcf_ct_params *params) +{ + struct tcf_ct_flow_table *ct_ft = params->ct_ft; + + if (refcount_dec_and_test(¶ms->ct_ft->ref)) { + rhashtable_remove_fast(&zones_ht, &ct_ft->node, zones_params); + INIT_RCU_WORK(&ct_ft->rwork, tcf_ct_flow_table_cleanup_work); + queue_rcu_work(act_ct_wq, &ct_ft->rwork); + } +} + +static void tcf_ct_flow_table_add(struct tcf_ct_flow_table *ct_ft, + struct nf_conn *ct, + bool tcp) +{ + struct flow_offload *entry; + int err; + + if (test_and_set_bit(IPS_OFFLOAD_BIT, &ct->status)) + return; + + entry = flow_offload_alloc(ct); + if (!entry) { + WARN_ON_ONCE(1); + goto err_alloc; + } + + if (tcp) { + ct->proto.tcp.seen[0].flags |= IP_CT_TCP_FLAG_BE_LIBERAL; + ct->proto.tcp.seen[1].flags |= IP_CT_TCP_FLAG_BE_LIBERAL; + } + + err = flow_offload_add(&ct_ft->nf_ft, entry); + if (err) + goto err_add; + + return; + +err_add: + flow_offload_free(entry); +err_alloc: + clear_bit(IPS_OFFLOAD_BIT, &ct->status); +} + +static void tcf_ct_flow_table_process_conn(struct tcf_ct_flow_table *ct_ft, + struct nf_conn *ct, + enum ip_conntrack_info ctinfo) +{ + bool tcp = false; + + if (ctinfo != IP_CT_ESTABLISHED && ctinfo != IP_CT_ESTABLISHED_REPLY) + return; + + switch (nf_ct_protonum(ct)) { + case IPPROTO_TCP: + tcp = true; + if (ct->proto.tcp.state != TCP_CONNTRACK_ESTABLISHED) + return; + break; + case IPPROTO_UDP: + break; + default: + return; + } + + if (nf_ct_ext_exist(ct, NF_CT_EXT_HELPER) || + ct->status & IPS_SEQ_ADJUST) + return; + + tcf_ct_flow_table_add(ct_ft, ct, tcp); +} + +static bool +tcf_ct_flow_table_fill_tuple_ipv4(struct sk_buff *skb, + struct flow_offload_tuple *tuple, + struct tcphdr **tcph) +{ + struct flow_ports *ports; + unsigned int thoff; + struct iphdr *iph; + + if (!pskb_network_may_pull(skb, sizeof(*iph))) + return false; + + iph = ip_hdr(skb); + thoff = iph->ihl * 4; + + if (ip_is_fragment(iph) || + unlikely(thoff != sizeof(struct iphdr))) + return false; + + if (iph->protocol != IPPROTO_TCP && + iph->protocol != IPPROTO_UDP) + return false; + + if (iph->ttl <= 1) + return false; + + if (!pskb_network_may_pull(skb, iph->protocol == IPPROTO_TCP ? + thoff + sizeof(struct tcphdr) : + thoff + sizeof(*ports))) + return false; + + iph = ip_hdr(skb); + if (iph->protocol == IPPROTO_TCP) + *tcph = (void *)(skb_network_header(skb) + thoff); + + ports = (struct flow_ports *)(skb_network_header(skb) + thoff); + tuple->src_v4.s_addr = iph->saddr; + tuple->dst_v4.s_addr = iph->daddr; + tuple->src_port = ports->source; + tuple->dst_port = ports->dest; + tuple->l3proto = AF_INET; + tuple->l4proto = iph->protocol; + + return true; +} + +static bool +tcf_ct_flow_table_fill_tuple_ipv6(struct sk_buff *skb, + struct flow_offload_tuple *tuple, + struct tcphdr **tcph) +{ + struct flow_ports *ports; + struct ipv6hdr *ip6h; + unsigned int thoff; + + if (!pskb_network_may_pull(skb, sizeof(*ip6h))) + return false; + + ip6h = ipv6_hdr(skb); + + if (ip6h->nexthdr != IPPROTO_TCP && + ip6h->nexthdr != IPPROTO_UDP) + return false; + + if (ip6h->hop_limit <= 1) + return false; + + thoff = sizeof(*ip6h); + if (!pskb_network_may_pull(skb, ip6h->nexthdr == IPPROTO_TCP ? + thoff + sizeof(struct tcphdr) : + thoff + sizeof(*ports))) + return false; + + ip6h = ipv6_hdr(skb); + if (ip6h->nexthdr == IPPROTO_TCP) + *tcph = (void *)(skb_network_header(skb) + thoff); + + ports = (struct flow_ports *)(skb_network_header(skb) + thoff); + tuple->src_v6 = ip6h->saddr; + tuple->dst_v6 = ip6h->daddr; + tuple->src_port = ports->source; + tuple->dst_port = ports->dest; + tuple->l3proto = AF_INET6; + tuple->l4proto = ip6h->nexthdr; + + return true; +} + +static bool tcf_ct_flow_table_lookup(struct tcf_ct_params *p, + struct sk_buff *skb, + u8 family) +{ + struct nf_flowtable *nf_ft = &p->ct_ft->nf_ft; + struct flow_offload_tuple_rhash *tuplehash; + struct flow_offload_tuple tuple = {}; + enum ip_conntrack_info ctinfo; + struct tcphdr *tcph = NULL; + struct flow_offload *flow; + struct nf_conn *ct; + u8 dir; + + /* Previously seen or loopback */ + ct = nf_ct_get(skb, &ctinfo); + if ((ct && !nf_ct_is_template(ct)) || ctinfo == IP_CT_UNTRACKED) + return false; + + switch (family) { + case NFPROTO_IPV4: + if (!tcf_ct_flow_table_fill_tuple_ipv4(skb, &tuple, &tcph)) + return false; + break; + case NFPROTO_IPV6: + if (!tcf_ct_flow_table_fill_tuple_ipv6(skb, &tuple, &tcph)) + return false; + break; + default: + return false; + } + + tuplehash = flow_offload_lookup(nf_ft, &tuple); + if (!tuplehash) + return false; + + dir = tuplehash->tuple.dir; + flow = container_of(tuplehash, struct flow_offload, tuplehash[dir]); + ct = flow->ct; + + if (tcph && (unlikely(tcph->fin || tcph->rst))) { + flow_offload_teardown(flow); + return false; + } + + ctinfo = dir == FLOW_OFFLOAD_DIR_ORIGINAL ? IP_CT_ESTABLISHED : + IP_CT_ESTABLISHED_REPLY; + + flow_offload_refresh(nf_ft, flow); + nf_conntrack_get(&ct->ct_general); + nf_ct_set(skb, ct, ctinfo); + + return true; +} + +static int tcf_ct_flow_tables_init(void) +{ + return rhashtable_init(&zones_ht, &zones_params); +} + +static void tcf_ct_flow_tables_uninit(void) +{ + rhashtable_destroy(&zones_ht); +} + static struct tc_action_ops act_ct_ops; static unsigned int ct_net_id; @@ -207,6 +726,8 @@ static void tcf_ct_params_free(struct rcu_head *head) struct tcf_ct_params *params = container_of(head, struct tcf_ct_params, rcu); + tcf_ct_flow_table_put(params); + if (params->tmpl) nf_conntrack_put(¶ms->tmpl->ct_general); kfree(params); @@ -387,6 +908,7 @@ static int tcf_ct_act(struct sk_buff *skb, const struct tc_action *a, struct nf_hook_state state; int nh_ofs, err, retval; struct tcf_ct_params *p; + bool skip_add = false; struct nf_conn *ct; u8 family; @@ -436,6 +958,11 @@ static int tcf_ct_act(struct sk_buff *skb, const struct tc_action *a, */ cached = tcf_ct_skb_nfct_cached(net, skb, p->zone, force); if (!cached) { + if (!commit && tcf_ct_flow_table_lookup(p, skb, family)) { + skip_add = true; + goto do_nat; + } + /* Associate skb with specified zone. */ if (tmpl) { ct = nf_ct_get(skb, &ctinfo); @@ -453,6 +980,7 @@ static int tcf_ct_act(struct sk_buff *skb, const struct tc_action *a, goto out_push; } +do_nat: ct = nf_ct_get(skb, &ctinfo); if (!ct) goto out_push; @@ -470,6 +998,8 @@ static int tcf_ct_act(struct sk_buff *skb, const struct tc_action *a, * even if the connection is already confirmed. */ nf_conntrack_confirm(skb); + } else if (!skip_add) { + tcf_ct_flow_table_process_conn(p->ct_ft, ct, ctinfo); } out_push: @@ -730,6 +1260,10 @@ static int tcf_ct_init(struct net *net, struct nlattr *nla, if (err) goto cleanup; + err = tcf_ct_flow_table_get(params); + if (err) + goto cleanup; + spin_lock_bh(&c->tcf_lock); goto_ch = tcf_action_set_ctrlact(*a, parm->action, goto_ch); params = rcu_replace_pointer(c->params, params, @@ -974,13 +1508,46 @@ static struct pernet_operations ct_net_ops = { static int __init ct_init_module(void) { - return tcf_register_action(&act_ct_ops, &ct_net_ops); + int err; + + act_ct_wq = alloc_ordered_workqueue("act_ct_workqueue", 0); + if (!act_ct_wq) + return -ENOMEM; + + err = tcf_ct_flow_tables_init(); + if (err) + goto err_tbl_init; + + err = tcf_register_action(&act_ct_ops, &ct_net_ops); + if (err) + goto err_register; + + return 0; + +err_tbl_init: + destroy_workqueue(act_ct_wq); +err_register: + tcf_ct_flow_tables_uninit(); + return err; } static void __exit ct_cleanup_module(void) { tcf_unregister_action(&act_ct_ops, &ct_net_ops); + tcf_ct_flow_tables_uninit(); + destroy_workqueue(act_ct_wq); +} + +void tcf_ct_flow_table_restore_skb(struct sk_buff *skb, unsigned long cookie) +{ + enum ip_conntrack_info ctinfo = cookie & NFCT_INFOMASK; + struct nf_conn *ct; + + ct = (struct nf_conn *)(cookie & NFCT_PTRMASK); + nf_conntrack_get(&ct->ct_general); + nf_ct_set(skb, ct, ctinfo); } +EXPORT_SYMBOL_GPL(tcf_ct_flow_table_restore_skb); module_init(ct_init_module); module_exit(ct_cleanup_module); diff --git a/net/sched/act_pedit.c b/net/sched/act_pedit.c index 3ad718576304..d41d6200d9de 100644 --- a/net/sched/act_pedit.c +++ b/net/sched/act_pedit.c @@ -409,6 +409,16 @@ done: return p->tcf_action; } +static void tcf_pedit_stats_update(struct tc_action *a, u64 bytes, u32 packets, + u64 lastuse, bool hw) +{ + struct tcf_pedit *d = to_pedit(a); + struct tcf_t *tm = &d->tcf_tm; + + tcf_action_update_stats(a, bytes, packets, false, hw); + tm->lastuse = max_t(u64, tm->lastuse, lastuse); +} + static int tcf_pedit_dump(struct sk_buff *skb, struct tc_action *a, int bind, int ref) { @@ -485,6 +495,7 @@ static struct tc_action_ops act_pedit_ops = { .id = TCA_ID_PEDIT, .owner = THIS_MODULE, .act = tcf_pedit_act, + .stats_update = tcf_pedit_stats_update, .dump = tcf_pedit_dump, .cleanup = tcf_pedit_cleanup, .init = tcf_pedit_init, diff --git a/net/sched/act_sample.c b/net/sched/act_sample.c index ce948c1e24dc..5e2df590bb58 100644 --- a/net/sched/act_sample.c +++ b/net/sched/act_sample.c @@ -267,14 +267,12 @@ tcf_sample_get_group(const struct tc_action *a, struct tcf_sample *s = to_sample(a); struct psample_group *group; - spin_lock_bh(&s->tcf_lock); group = rcu_dereference_protected(s->psample_group, lockdep_is_held(&s->tcf_lock)); if (group) { psample_group_take(group); *destructor = tcf_psample_group_put; } - spin_unlock_bh(&s->tcf_lock); return group; } diff --git a/net/sched/act_skbedit.c b/net/sched/act_skbedit.c index e857424c387c..b125b2be4467 100644 --- a/net/sched/act_skbedit.c +++ b/net/sched/act_skbedit.c @@ -73,6 +73,16 @@ err: return TC_ACT_SHOT; } +static void tcf_skbedit_stats_update(struct tc_action *a, u64 bytes, + u32 packets, u64 lastuse, bool hw) +{ + struct tcf_skbedit *d = to_skbedit(a); + struct tcf_t *tm = &d->tcf_tm; + + tcf_action_update_stats(a, bytes, packets, false, hw); + tm->lastuse = max_t(u64, tm->lastuse, lastuse); +} + static const struct nla_policy skbedit_policy[TCA_SKBEDIT_MAX + 1] = { [TCA_SKBEDIT_PARMS] = { .len = sizeof(struct tc_skbedit) }, [TCA_SKBEDIT_PRIORITY] = { .len = sizeof(u32) }, @@ -323,6 +333,7 @@ static struct tc_action_ops act_skbedit_ops = { .id = TCA_ID_SKBEDIT, .owner = THIS_MODULE, .act = tcf_skbedit_act, + .stats_update = tcf_skbedit_stats_update, .dump = tcf_skbedit_dump, .init = tcf_skbedit_init, .cleanup = tcf_skbedit_cleanup, diff --git a/net/sched/cls_api.c b/net/sched/cls_api.c index c2cdd0fc2e70..f6a3b969ead0 100644 --- a/net/sched/cls_api.c +++ b/net/sched/cls_api.c @@ -22,6 +22,7 @@ #include <linux/idr.h> #include <linux/rhashtable.h> #include <linux/jhash.h> +#include <linux/rculist.h> #include <net/net_namespace.h> #include <net/sock.h> #include <net/netlink.h> @@ -354,7 +355,7 @@ static struct tcf_chain *tcf_chain_create(struct tcf_block *block, chain = kzalloc(sizeof(*chain), GFP_KERNEL); if (!chain) return NULL; - list_add_tail(&chain->list, &block->chain_list); + list_add_tail_rcu(&chain->list, &block->chain_list); mutex_init(&chain->filter_chain_lock); chain->block = block; chain->index = chain_index; @@ -394,7 +395,7 @@ static bool tcf_chain_detach(struct tcf_chain *chain) ASSERT_BLOCK_LOCKED(block); - list_del(&chain->list); + list_del_rcu(&chain->list); if (!chain->index) block->chain0.chain = NULL; @@ -453,6 +454,20 @@ static struct tcf_chain *tcf_chain_lookup(struct tcf_block *block, return NULL; } +#if IS_ENABLED(CONFIG_NET_TC_SKB_EXT) +static struct tcf_chain *tcf_chain_lookup_rcu(const struct tcf_block *block, + u32 chain_index) +{ + struct tcf_chain *chain; + + list_for_each_entry_rcu(chain, &block->chain_list, list) { + if (chain->index == chain_index) + return chain; + } + return NULL; +} +#endif + static int tc_chain_notify(struct tcf_chain *chain, struct sk_buff *oskb, u32 seq, u16 flags, int event, bool unicast); @@ -693,7 +708,7 @@ static void tc_indr_block_call(struct tcf_block *block, }; INIT_LIST_HEAD(&bo.cb_list); - flow_indr_block_call(dev, &bo, command); + flow_indr_block_call(dev, &bo, command, TC_SETUP_BLOCK); tcf_block_setup(block, &bo); } @@ -1559,12 +1574,15 @@ static int tcf_block_setup(struct tcf_block *block, * to this qdisc, (optionally) tests for protocol and asks * specific classifiers. */ -int tcf_classify(struct sk_buff *skb, const struct tcf_proto *tp, - struct tcf_result *res, bool compat_mode) +static inline int __tcf_classify(struct sk_buff *skb, + const struct tcf_proto *tp, + const struct tcf_proto *orig_tp, + struct tcf_result *res, + bool compat_mode, + u32 *last_executed_chain) { #ifdef CONFIG_NET_CLS_ACT const int max_reclassify_loop = 4; - const struct tcf_proto *orig_tp = tp; const struct tcf_proto *first_tp; int limit = 0; @@ -1582,21 +1600,11 @@ reclassify: #ifdef CONFIG_NET_CLS_ACT if (unlikely(err == TC_ACT_RECLASSIFY && !compat_mode)) { first_tp = orig_tp; + *last_executed_chain = first_tp->chain->index; goto reset; } else if (unlikely(TC_ACT_EXT_CMP(err, TC_ACT_GOTO_CHAIN))) { first_tp = res->goto_tp; - -#if IS_ENABLED(CONFIG_NET_TC_SKB_EXT) - { - struct tc_skb_ext *ext; - - ext = skb_ext_add(skb, TC_SKB_EXT); - if (WARN_ON_ONCE(!ext)) - return TC_ACT_SHOT; - - ext->chain = err & TC_ACT_EXT_VAL_MASK; - } -#endif + *last_executed_chain = err & TC_ACT_EXT_VAL_MASK; goto reset; } #endif @@ -1619,8 +1627,64 @@ reset: goto reclassify; #endif } + +int tcf_classify(struct sk_buff *skb, const struct tcf_proto *tp, + struct tcf_result *res, bool compat_mode) +{ + u32 last_executed_chain = 0; + + return __tcf_classify(skb, tp, tp, res, compat_mode, + &last_executed_chain); +} EXPORT_SYMBOL(tcf_classify); +int tcf_classify_ingress(struct sk_buff *skb, + const struct tcf_block *ingress_block, + const struct tcf_proto *tp, + struct tcf_result *res, bool compat_mode) +{ +#if !IS_ENABLED(CONFIG_NET_TC_SKB_EXT) + u32 last_executed_chain = 0; + + return __tcf_classify(skb, tp, tp, res, compat_mode, + &last_executed_chain); +#else + u32 last_executed_chain = tp ? tp->chain->index : 0; + const struct tcf_proto *orig_tp = tp; + struct tc_skb_ext *ext; + int ret; + + ext = skb_ext_find(skb, TC_SKB_EXT); + + if (ext && ext->chain) { + struct tcf_chain *fchain; + + fchain = tcf_chain_lookup_rcu(ingress_block, ext->chain); + if (!fchain) + return TC_ACT_SHOT; + + /* Consume, so cloned/redirect skbs won't inherit ext */ + skb_ext_del(skb, TC_SKB_EXT); + + tp = rcu_dereference_bh(fchain->filter_chain); + } + + ret = __tcf_classify(skb, tp, orig_tp, res, compat_mode, + &last_executed_chain); + + /* If we missed on some chain */ + if (ret == TC_ACT_UNSPEC && last_executed_chain) { + ext = skb_ext_add(skb, TC_SKB_EXT); + if (WARN_ON_ONCE(!ext)) + return TC_ACT_SHOT; + ext->chain = last_executed_chain; + } + + return ret; +#endif +} +EXPORT_SYMBOL(tcf_classify_ingress); + struct tcf_chain_info { struct tcf_proto __rcu **pprev; struct tcf_proto __rcu *next; @@ -3382,14 +3446,40 @@ int tc_setup_cb_reoffload(struct tcf_block *block, struct tcf_proto *tp, } EXPORT_SYMBOL(tc_setup_cb_reoffload); +static int tcf_act_get_cookie(struct flow_action_entry *entry, + const struct tc_action *act) +{ + struct tc_cookie *cookie; + int err = 0; + + rcu_read_lock(); + cookie = rcu_dereference(act->act_cookie); + if (cookie) { + entry->cookie = flow_action_cookie_create(cookie->data, + cookie->len, + GFP_ATOMIC); + if (!entry->cookie) + err = -ENOMEM; + } + rcu_read_unlock(); + return err; +} + +static void tcf_act_put_cookie(struct flow_action_entry *entry) +{ + flow_action_cookie_destroy(entry->cookie); +} + void tc_cleanup_flow_action(struct flow_action *flow_action) { struct flow_action_entry *entry; int i; - flow_action_for_each(i, entry, flow_action) + flow_action_for_each(i, entry, flow_action) { + tcf_act_put_cookie(entry); if (entry->destructor) entry->destructor(entry->destructor_priv); + } } EXPORT_SYMBOL(tc_cleanup_flow_action); @@ -3433,22 +3523,30 @@ static void tcf_sample_get_group(struct flow_action_entry *entry, } int tc_setup_flow_action(struct flow_action *flow_action, - const struct tcf_exts *exts, bool rtnl_held) + const struct tcf_exts *exts) { - const struct tc_action *act; + struct tc_action *act; int i, j, k, err = 0; + BUILD_BUG_ON(TCA_ACT_HW_STATS_ANY != FLOW_ACTION_HW_STATS_ANY); + BUILD_BUG_ON(TCA_ACT_HW_STATS_IMMEDIATE != FLOW_ACTION_HW_STATS_IMMEDIATE); + BUILD_BUG_ON(TCA_ACT_HW_STATS_DELAYED != FLOW_ACTION_HW_STATS_DELAYED); + if (!exts) return 0; - if (!rtnl_held) - rtnl_lock(); - j = 0; tcf_exts_for_each_action(i, act, exts) { struct flow_action_entry *entry; entry = &flow_action->entries[j]; + spin_lock_bh(&act->tcfa_lock); + err = tcf_act_get_cookie(entry, act); + if (err) + goto err_out_locked; + + entry->hw_stats = act->hw_stats; + if (is_tcf_gact_ok(act)) { entry->id = FLOW_ACTION_ACCEPT; } else if (is_tcf_gact_shot(act)) { @@ -3489,13 +3587,13 @@ int tc_setup_flow_action(struct flow_action *flow_action, break; default: err = -EOPNOTSUPP; - goto err_out; + goto err_out_locked; } } else if (is_tcf_tunnel_set(act)) { entry->id = FLOW_ACTION_TUNNEL_ENCAP; err = tcf_tunnel_encap_get_tunnel(entry, act); if (err) - goto err_out; + goto err_out_locked; } else if (is_tcf_tunnel_release(act)) { entry->id = FLOW_ACTION_TUNNEL_DECAP; } else if (is_tcf_pedit(act)) { @@ -3509,12 +3607,13 @@ int tc_setup_flow_action(struct flow_action *flow_action, break; default: err = -EOPNOTSUPP; - goto err_out; + goto err_out_locked; } entry->mangle.htype = tcf_pedit_htype(act, k); entry->mangle.mask = tcf_pedit_mask(act, k); entry->mangle.val = tcf_pedit_val(act, k); entry->mangle.offset = tcf_pedit_offset(act, k); + entry->hw_stats = act->hw_stats; entry = &flow_action->entries[++j]; } } else if (is_tcf_csum(act)) { @@ -3538,6 +3637,7 @@ int tc_setup_flow_action(struct flow_action *flow_action, entry->id = FLOW_ACTION_CT; entry->ct.action = tcf_ct_action(act); entry->ct.zone = tcf_ct_zone(act); + entry->ct.flow_table = tcf_ct_ft(act); } else if (is_tcf_mpls(act)) { switch (tcf_mpls_action(act)) { case TCA_MPLS_ACT_PUSH: @@ -3560,28 +3660,32 @@ int tc_setup_flow_action(struct flow_action *flow_action, entry->mpls_mangle.ttl = tcf_mpls_ttl(act); break; default: - goto err_out; + goto err_out_locked; } } else if (is_tcf_skbedit_ptype(act)) { entry->id = FLOW_ACTION_PTYPE; entry->ptype = tcf_skbedit_ptype(act); + } else if (is_tcf_skbedit_priority(act)) { + entry->id = FLOW_ACTION_PRIORITY; + entry->priority = tcf_skbedit_priority(act); } else { err = -EOPNOTSUPP; - goto err_out; + goto err_out_locked; } + spin_unlock_bh(&act->tcfa_lock); if (!is_tcf_pedit(act)) j++; } err_out: - if (!rtnl_held) - rtnl_unlock(); - if (err) tc_cleanup_flow_action(flow_action); return err; +err_out_locked: + spin_unlock_bh(&act->tcfa_lock); + goto err_out; } EXPORT_SYMBOL(tc_setup_flow_action); diff --git a/net/sched/cls_flower.c b/net/sched/cls_flower.c index d32d4233d337..74a0febcafb8 100644 --- a/net/sched/cls_flower.c +++ b/net/sched/cls_flower.c @@ -450,8 +450,7 @@ static int fl_hw_replace_filter(struct tcf_proto *tp, cls_flower.rule->match.key = &f->mkey; cls_flower.classid = f->res.classid; - err = tc_setup_flow_action(&cls_flower.rule->action, &f->exts, - rtnl_held); + err = tc_setup_flow_action(&cls_flower.rule->action, &f->exts); if (err) { kfree(cls_flower.rule); if (skip_sw) { @@ -493,7 +492,9 @@ static void fl_hw_update_stats(struct tcf_proto *tp, struct cls_fl_filter *f, tcf_exts_stats_update(&f->exts, cls_flower.stats.bytes, cls_flower.stats.pkts, - cls_flower.stats.lastused); + cls_flower.stats.lastused, + cls_flower.stats.used_hw_stats, + cls_flower.stats.used_hw_stats_valid); } static void __fl_put(struct cls_fl_filter *f) @@ -739,7 +740,8 @@ static void fl_set_key_val(struct nlattr **tb, } static int fl_set_key_port_range(struct nlattr **tb, struct fl_flow_key *key, - struct fl_flow_key *mask) + struct fl_flow_key *mask, + struct netlink_ext_ack *extack) { fl_set_key_val(tb, &key->tp_range.tp_min.dst, TCA_FLOWER_KEY_PORT_DST_MIN, &mask->tp_range.tp_min.dst, @@ -754,20 +756,30 @@ static int fl_set_key_port_range(struct nlattr **tb, struct fl_flow_key *key, TCA_FLOWER_KEY_PORT_SRC_MAX, &mask->tp_range.tp_max.src, TCA_FLOWER_UNSPEC, sizeof(key->tp_range.tp_max.src)); - if ((mask->tp_range.tp_min.dst && mask->tp_range.tp_max.dst && - htons(key->tp_range.tp_max.dst) <= - htons(key->tp_range.tp_min.dst)) || - (mask->tp_range.tp_min.src && mask->tp_range.tp_max.src && - htons(key->tp_range.tp_max.src) <= - htons(key->tp_range.tp_min.src))) + if (mask->tp_range.tp_min.dst && mask->tp_range.tp_max.dst && + htons(key->tp_range.tp_max.dst) <= + htons(key->tp_range.tp_min.dst)) { + NL_SET_ERR_MSG_ATTR(extack, + tb[TCA_FLOWER_KEY_PORT_DST_MIN], + "Invalid destination port range (min must be strictly smaller than max)"); return -EINVAL; + } + if (mask->tp_range.tp_min.src && mask->tp_range.tp_max.src && + htons(key->tp_range.tp_max.src) <= + htons(key->tp_range.tp_min.src)) { + NL_SET_ERR_MSG_ATTR(extack, + tb[TCA_FLOWER_KEY_PORT_SRC_MIN], + "Invalid source port range (min must be strictly smaller than max)"); + return -EINVAL; + } return 0; } static int fl_set_key_mpls(struct nlattr **tb, struct flow_dissector_key_mpls *key_val, - struct flow_dissector_key_mpls *key_mask) + struct flow_dissector_key_mpls *key_mask, + struct netlink_ext_ack *extack) { if (tb[TCA_FLOWER_KEY_MPLS_TTL]) { key_val->mpls_ttl = nla_get_u8(tb[TCA_FLOWER_KEY_MPLS_TTL]); @@ -776,24 +788,36 @@ static int fl_set_key_mpls(struct nlattr **tb, if (tb[TCA_FLOWER_KEY_MPLS_BOS]) { u8 bos = nla_get_u8(tb[TCA_FLOWER_KEY_MPLS_BOS]); - if (bos & ~MPLS_BOS_MASK) + if (bos & ~MPLS_BOS_MASK) { + NL_SET_ERR_MSG_ATTR(extack, + tb[TCA_FLOWER_KEY_MPLS_BOS], + "Bottom Of Stack (BOS) must be 0 or 1"); return -EINVAL; + } key_val->mpls_bos = bos; key_mask->mpls_bos = MPLS_BOS_MASK; } if (tb[TCA_FLOWER_KEY_MPLS_TC]) { u8 tc = nla_get_u8(tb[TCA_FLOWER_KEY_MPLS_TC]); - if (tc & ~MPLS_TC_MASK) + if (tc & ~MPLS_TC_MASK) { + NL_SET_ERR_MSG_ATTR(extack, + tb[TCA_FLOWER_KEY_MPLS_TC], + "Traffic Class (TC) must be between 0 and 7"); return -EINVAL; + } key_val->mpls_tc = tc; key_mask->mpls_tc = MPLS_TC_MASK; } if (tb[TCA_FLOWER_KEY_MPLS_LABEL]) { u32 label = nla_get_u32(tb[TCA_FLOWER_KEY_MPLS_LABEL]); - if (label & ~MPLS_LABEL_MASK) + if (label & ~MPLS_LABEL_MASK) { + NL_SET_ERR_MSG_ATTR(extack, + tb[TCA_FLOWER_KEY_MPLS_LABEL], + "Label must be between 0 and 1048575"); return -EINVAL; + } key_val->mpls_label = label; key_mask->mpls_label = MPLS_LABEL_MASK; } @@ -834,14 +858,16 @@ static void fl_set_key_flag(u32 flower_key, u32 flower_mask, } } -static int fl_set_key_flags(struct nlattr **tb, - u32 *flags_key, u32 *flags_mask) +static int fl_set_key_flags(struct nlattr **tb, u32 *flags_key, + u32 *flags_mask, struct netlink_ext_ack *extack) { u32 key, mask; /* mask is mandatory for flags */ - if (!tb[TCA_FLOWER_KEY_FLAGS_MASK]) + if (!tb[TCA_FLOWER_KEY_FLAGS_MASK]) { + NL_SET_ERR_MSG(extack, "Missing flags mask"); return -EINVAL; + } key = be32_to_cpu(nla_get_u32(tb[TCA_FLOWER_KEY_FLAGS])); mask = be32_to_cpu(nla_get_u32(tb[TCA_FLOWER_KEY_FLAGS_MASK])); @@ -1365,7 +1391,7 @@ static int fl_set_key(struct net *net, struct nlattr **tb, sizeof(key->icmp.code)); } else if (key->basic.n_proto == htons(ETH_P_MPLS_UC) || key->basic.n_proto == htons(ETH_P_MPLS_MC)) { - ret = fl_set_key_mpls(tb, &key->mpls, &mask->mpls); + ret = fl_set_key_mpls(tb, &key->mpls, &mask->mpls, extack); if (ret) return ret; } else if (key->basic.n_proto == htons(ETH_P_ARP) || @@ -1390,7 +1416,7 @@ static int fl_set_key(struct net *net, struct nlattr **tb, if (key->basic.ip_proto == IPPROTO_TCP || key->basic.ip_proto == IPPROTO_UDP || key->basic.ip_proto == IPPROTO_SCTP) { - ret = fl_set_key_port_range(tb, key, mask); + ret = fl_set_key_port_range(tb, key, mask, extack); if (ret) return ret; } @@ -1452,7 +1478,8 @@ static int fl_set_key(struct net *net, struct nlattr **tb, return ret; if (tb[TCA_FLOWER_KEY_FLAGS]) - ret = fl_set_key_flags(tb, &key->control.flags, &mask->control.flags); + ret = fl_set_key_flags(tb, &key->control.flags, + &mask->control.flags, extack); return ret; } @@ -2001,8 +2028,7 @@ static int fl_reoffload(struct tcf_proto *tp, bool add, flow_setup_cb_t *cb, cls_flower.rule->match.mask = &f->mask->key; cls_flower.rule->match.key = &f->mkey; - err = tc_setup_flow_action(&cls_flower.rule->action, &f->exts, - true); + err = tc_setup_flow_action(&cls_flower.rule->action, &f->exts); if (err) { kfree(cls_flower.rule); if (tc_skip_sw(f->flags)) { diff --git a/net/sched/cls_matchall.c b/net/sched/cls_matchall.c index 610a0b728161..8d39dbcf1746 100644 --- a/net/sched/cls_matchall.c +++ b/net/sched/cls_matchall.c @@ -97,7 +97,7 @@ static int mall_replace_hw_filter(struct tcf_proto *tp, cls_mall.command = TC_CLSMATCHALL_REPLACE; cls_mall.cookie = cookie; - err = tc_setup_flow_action(&cls_mall.rule->action, &head->exts, true); + err = tc_setup_flow_action(&cls_mall.rule->action, &head->exts); if (err) { kfree(cls_mall.rule); mall_destroy_hw_filter(tp, head, cookie, NULL); @@ -302,7 +302,7 @@ static int mall_reoffload(struct tcf_proto *tp, bool add, flow_setup_cb_t *cb, TC_CLSMATCHALL_REPLACE : TC_CLSMATCHALL_DESTROY; cls_mall.cookie = (unsigned long)head; - err = tc_setup_flow_action(&cls_mall.rule->action, &head->exts, true); + err = tc_setup_flow_action(&cls_mall.rule->action, &head->exts); if (err) { kfree(cls_mall.rule); if (add && tc_skip_sw(head->flags)) { @@ -338,7 +338,9 @@ static void mall_stats_hw_filter(struct tcf_proto *tp, tc_setup_cb_call(block, TC_SETUP_CLSMATCHALL, &cls_mall, false, true); tcf_exts_stats_update(&head->exts, cls_mall.stats.bytes, - cls_mall.stats.pkts, cls_mall.stats.lastused); + cls_mall.stats.pkts, cls_mall.stats.lastused, + cls_mall.stats.used_hw_stats, + cls_mall.stats.used_hw_stats_valid); } static int mall_dump(struct net *net, struct tcf_proto *tp, void *fh, diff --git a/net/sched/em_ipt.c b/net/sched/em_ipt.c index 9fff6480acc6..eecfe072c508 100644 --- a/net/sched/em_ipt.c +++ b/net/sched/em_ipt.c @@ -22,7 +22,7 @@ struct em_ipt_match { const struct xt_match *match; u32 hook; u8 nfproto; - u8 match_data[0] __aligned(8); + u8 match_data[] __aligned(8); }; struct em_ipt_xt_match { diff --git a/net/sched/em_nbyte.c b/net/sched/em_nbyte.c index 88c7ce42df7e..2c1192a2ee5e 100644 --- a/net/sched/em_nbyte.c +++ b/net/sched/em_nbyte.c @@ -16,7 +16,7 @@ struct nbyte_data { struct tcf_em_nbyte hdr; - char pattern[0]; + char pattern[]; }; static int em_nbyte_change(struct net *net, void *data, int data_len, diff --git a/net/sched/sch_api.c b/net/sched/sch_api.c index 50794125bf02..0d99df1e764d 100644 --- a/net/sched/sch_api.c +++ b/net/sched/sch_api.c @@ -618,21 +618,28 @@ void qdisc_watchdog_init(struct qdisc_watchdog *wd, struct Qdisc *qdisc) } EXPORT_SYMBOL(qdisc_watchdog_init); -void qdisc_watchdog_schedule_ns(struct qdisc_watchdog *wd, u64 expires) +void qdisc_watchdog_schedule_range_ns(struct qdisc_watchdog *wd, u64 expires, + u64 delta_ns) { if (test_bit(__QDISC_STATE_DEACTIVATED, &qdisc_root_sleeping(wd->qdisc)->state)) return; - if (wd->last_expires == expires) - return; + if (hrtimer_is_queued(&wd->timer)) { + /* If timer is already set in [expires, expires + delta_ns], + * do not reprogram it. + */ + if (wd->last_expires - expires <= delta_ns) + return; + } wd->last_expires = expires; - hrtimer_start(&wd->timer, - ns_to_ktime(expires), - HRTIMER_MODE_ABS_PINNED); + hrtimer_start_range_ns(&wd->timer, + ns_to_ktime(expires), + delta_ns, + HRTIMER_MODE_ABS_PINNED); } -EXPORT_SYMBOL(qdisc_watchdog_schedule_ns); +EXPORT_SYMBOL(qdisc_watchdog_schedule_range_ns); void qdisc_watchdog_cancel(struct qdisc_watchdog *wd) { diff --git a/net/sched/sch_atm.c b/net/sched/sch_atm.c index f4f9b8cdbffb..ee12ca9f55b4 100644 --- a/net/sched/sch_atm.c +++ b/net/sched/sch_atm.c @@ -58,7 +58,7 @@ struct atm_flow_data { struct atm_flow_data *excess; /* flow for excess traffic; NULL to set CLP instead */ int hdr_len; - unsigned char hdr[0]; /* header data; MUST BE LAST */ + unsigned char hdr[]; /* header data; MUST BE LAST */ }; struct atm_qdisc_data { diff --git a/net/sched/sch_fifo.c b/net/sched/sch_fifo.c index 37c8aa75d70c..a579a4131d22 100644 --- a/net/sched/sch_fifo.c +++ b/net/sched/sch_fifo.c @@ -12,6 +12,7 @@ #include <linux/errno.h> #include <linux/skbuff.h> #include <net/pkt_sched.h> +#include <net/pkt_cls.h> /* 1 band FIFO pseudo-"scheduler" */ @@ -51,8 +52,49 @@ static int pfifo_tail_enqueue(struct sk_buff *skb, struct Qdisc *sch, return NET_XMIT_CN; } -static int fifo_init(struct Qdisc *sch, struct nlattr *opt, - struct netlink_ext_ack *extack) +static void fifo_offload_init(struct Qdisc *sch) +{ + struct net_device *dev = qdisc_dev(sch); + struct tc_fifo_qopt_offload qopt; + + if (!tc_can_offload(dev) || !dev->netdev_ops->ndo_setup_tc) + return; + + qopt.command = TC_FIFO_REPLACE; + qopt.handle = sch->handle; + qopt.parent = sch->parent; + dev->netdev_ops->ndo_setup_tc(dev, TC_SETUP_QDISC_FIFO, &qopt); +} + +static void fifo_offload_destroy(struct Qdisc *sch) +{ + struct net_device *dev = qdisc_dev(sch); + struct tc_fifo_qopt_offload qopt; + + if (!tc_can_offload(dev) || !dev->netdev_ops->ndo_setup_tc) + return; + + qopt.command = TC_FIFO_DESTROY; + qopt.handle = sch->handle; + qopt.parent = sch->parent; + dev->netdev_ops->ndo_setup_tc(dev, TC_SETUP_QDISC_FIFO, &qopt); +} + +static int fifo_offload_dump(struct Qdisc *sch) +{ + struct tc_fifo_qopt_offload qopt; + + qopt.command = TC_FIFO_STATS; + qopt.handle = sch->handle; + qopt.parent = sch->parent; + qopt.stats.bstats = &sch->bstats; + qopt.stats.qstats = &sch->qstats; + + return qdisc_offload_dump_helper(sch, TC_SETUP_QDISC_FIFO, &qopt); +} + +static int __fifo_init(struct Qdisc *sch, struct nlattr *opt, + struct netlink_ext_ack *extack) { bool bypass; bool is_bfifo = sch->ops == &bfifo_qdisc_ops; @@ -82,10 +124,35 @@ static int fifo_init(struct Qdisc *sch, struct nlattr *opt, sch->flags |= TCQ_F_CAN_BYPASS; else sch->flags &= ~TCQ_F_CAN_BYPASS; + return 0; } -static int fifo_dump(struct Qdisc *sch, struct sk_buff *skb) +static int fifo_init(struct Qdisc *sch, struct nlattr *opt, + struct netlink_ext_ack *extack) +{ + int err; + + err = __fifo_init(sch, opt, extack); + if (err) + return err; + + fifo_offload_init(sch); + return 0; +} + +static int fifo_hd_init(struct Qdisc *sch, struct nlattr *opt, + struct netlink_ext_ack *extack) +{ + return __fifo_init(sch, opt, extack); +} + +static void fifo_destroy(struct Qdisc *sch) +{ + fifo_offload_destroy(sch); +} + +static int __fifo_dump(struct Qdisc *sch, struct sk_buff *skb) { struct tc_fifo_qopt opt = { .limit = sch->limit }; @@ -97,6 +164,22 @@ nla_put_failure: return -1; } +static int fifo_dump(struct Qdisc *sch, struct sk_buff *skb) +{ + int err; + + err = fifo_offload_dump(sch); + if (err) + return err; + + return __fifo_dump(sch, skb); +} + +static int fifo_hd_dump(struct Qdisc *sch, struct sk_buff *skb) +{ + return __fifo_dump(sch, skb); +} + struct Qdisc_ops pfifo_qdisc_ops __read_mostly = { .id = "pfifo", .priv_size = 0, @@ -104,6 +187,7 @@ struct Qdisc_ops pfifo_qdisc_ops __read_mostly = { .dequeue = qdisc_dequeue_head, .peek = qdisc_peek_head, .init = fifo_init, + .destroy = fifo_destroy, .reset = qdisc_reset_queue, .change = fifo_init, .dump = fifo_dump, @@ -118,6 +202,7 @@ struct Qdisc_ops bfifo_qdisc_ops __read_mostly = { .dequeue = qdisc_dequeue_head, .peek = qdisc_peek_head, .init = fifo_init, + .destroy = fifo_destroy, .reset = qdisc_reset_queue, .change = fifo_init, .dump = fifo_dump, @@ -131,10 +216,10 @@ struct Qdisc_ops pfifo_head_drop_qdisc_ops __read_mostly = { .enqueue = pfifo_tail_enqueue, .dequeue = qdisc_dequeue_head, .peek = qdisc_peek_head, - .init = fifo_init, + .init = fifo_hd_init, .reset = qdisc_reset_queue, - .change = fifo_init, - .dump = fifo_dump, + .change = fifo_hd_init, + .dump = fifo_hd_dump, .owner = THIS_MODULE, }; diff --git a/net/sched/sch_fq.c b/net/sched/sch_fq.c index 371ad84def3b..4c060134c736 100644 --- a/net/sched/sch_fq.c +++ b/net/sched/sch_fq.c @@ -121,6 +121,8 @@ struct fq_sched_data { u64 stat_flows_plimit; u64 stat_pkts_too_long; u64 stat_allocation_errors; + + u32 timer_slack; /* hrtimer slack in ns */ struct qdisc_watchdog watchdog; }; @@ -504,8 +506,9 @@ begin: head = &q->old_flows; if (!head->first) { if (q->time_next_delayed_flow != ~0ULL) - qdisc_watchdog_schedule_ns(&q->watchdog, - q->time_next_delayed_flow); + qdisc_watchdog_schedule_range_ns(&q->watchdog, + q->time_next_delayed_flow, + q->timer_slack); return NULL; } } @@ -735,6 +738,8 @@ static int fq_resize(struct Qdisc *sch, u32 log) } static const struct nla_policy fq_policy[TCA_FQ_MAX + 1] = { + [TCA_FQ_UNSPEC] = { .strict_start_type = TCA_FQ_TIMER_SLACK }, + [TCA_FQ_PLIMIT] = { .type = NLA_U32 }, [TCA_FQ_FLOW_PLIMIT] = { .type = NLA_U32 }, [TCA_FQ_QUANTUM] = { .type = NLA_U32 }, @@ -747,6 +752,7 @@ static const struct nla_policy fq_policy[TCA_FQ_MAX + 1] = { [TCA_FQ_ORPHAN_MASK] = { .type = NLA_U32 }, [TCA_FQ_LOW_RATE_THRESHOLD] = { .type = NLA_U32 }, [TCA_FQ_CE_THRESHOLD] = { .type = NLA_U32 }, + [TCA_FQ_TIMER_SLACK] = { .type = NLA_U32 }, }; static int fq_change(struct Qdisc *sch, struct nlattr *opt, @@ -833,6 +839,9 @@ static int fq_change(struct Qdisc *sch, struct nlattr *opt, q->ce_threshold = (u64)NSEC_PER_USEC * nla_get_u32(tb[TCA_FQ_CE_THRESHOLD]); + if (tb[TCA_FQ_TIMER_SLACK]) + q->timer_slack = nla_get_u32(tb[TCA_FQ_TIMER_SLACK]); + if (!err) { sch_tree_unlock(sch); err = fq_resize(sch, fq_log); @@ -884,6 +893,8 @@ static int fq_init(struct Qdisc *sch, struct nlattr *opt, q->orphan_mask = 1024 - 1; q->low_rate_threshold = 550000 / 8; + q->timer_slack = 10 * NSEC_PER_USEC; /* 10 usec of hrtimer slack */ + /* Default ce_threshold of 4294 seconds */ q->ce_threshold = (u64)NSEC_PER_USEC * ~0U; @@ -924,7 +935,8 @@ static int fq_dump(struct Qdisc *sch, struct sk_buff *skb) nla_put_u32(skb, TCA_FQ_LOW_RATE_THRESHOLD, q->low_rate_threshold) || nla_put_u32(skb, TCA_FQ_CE_THRESHOLD, (u32)ce_threshold) || - nla_put_u32(skb, TCA_FQ_BUCKETS_LOG, q->fq_trees_log)) + nla_put_u32(skb, TCA_FQ_BUCKETS_LOG, q->fq_trees_log) || + nla_put_u32(skb, TCA_FQ_TIMER_SLACK, q->timer_slack)) goto nla_put_failure; return nla_nest_end(skb, opts); @@ -947,7 +959,8 @@ static int fq_dump_stats(struct Qdisc *sch, struct gnet_dump *d) st.flows_plimit = q->stat_flows_plimit; st.pkts_too_long = q->stat_pkts_too_long; st.allocation_errors = q->stat_allocation_errors; - st.time_next_delayed_flow = q->time_next_delayed_flow - ktime_get_ns(); + st.time_next_delayed_flow = q->time_next_delayed_flow + q->timer_slack - + ktime_get_ns(); st.flows = q->flows; st.inactive_flows = q->inactive_flows; st.throttled_flows = q->throttled_flows; diff --git a/net/sched/sch_fq_pie.c b/net/sched/sch_fq_pie.c index 214657eb3dfd..a9da8776bf5b 100644 --- a/net/sched/sch_fq_pie.c +++ b/net/sched/sch_fq_pie.c @@ -189,7 +189,6 @@ static int fq_pie_qdisc_enqueue(struct sk_buff *skb, struct Qdisc *sch, out: q->stats.dropped++; sel_flow->vars.accu_prob = 0; - sel_flow->vars.accu_prob_overflows = 0; __qdisc_drop(skb, to_free); qdisc_qstats_drop(sch); return NET_XMIT_CN; diff --git a/net/sched/sch_generic.c b/net/sched/sch_generic.c index 6c9595f1048a..2efd5b61acef 100644 --- a/net/sched/sch_generic.c +++ b/net/sched/sch_generic.c @@ -1391,6 +1391,14 @@ void mini_qdisc_pair_swap(struct mini_Qdisc_pair *miniqp, } EXPORT_SYMBOL(mini_qdisc_pair_swap); +void mini_qdisc_pair_block_init(struct mini_Qdisc_pair *miniqp, + struct tcf_block *block) +{ + miniqp->miniq1.block = block; + miniqp->miniq2.block = block; +} +EXPORT_SYMBOL(mini_qdisc_pair_block_init); + void mini_qdisc_pair_init(struct mini_Qdisc_pair *miniqp, struct Qdisc *qdisc, struct mini_Qdisc __rcu **p_miniq) { diff --git a/net/sched/sch_ingress.c b/net/sched/sch_ingress.c index bf56aa519797..84838128b9c5 100644 --- a/net/sched/sch_ingress.c +++ b/net/sched/sch_ingress.c @@ -78,6 +78,7 @@ static int ingress_init(struct Qdisc *sch, struct nlattr *opt, { struct ingress_sched_data *q = qdisc_priv(sch); struct net_device *dev = qdisc_dev(sch); + int err; net_inc_ingress_queue(); @@ -87,7 +88,13 @@ static int ingress_init(struct Qdisc *sch, struct nlattr *opt, q->block_info.chain_head_change = clsact_chain_head_change; q->block_info.chain_head_change_priv = &q->miniqp; - return tcf_block_get_ext(&q->block, sch, &q->block_info, extack); + err = tcf_block_get_ext(&q->block, sch, &q->block_info, extack); + if (err) + return err; + + mini_qdisc_pair_block_init(&q->miniqp, q->block); + + return 0; } static void ingress_destroy(struct Qdisc *sch) @@ -226,6 +233,8 @@ static int clsact_init(struct Qdisc *sch, struct nlattr *opt, if (err) return err; + mini_qdisc_pair_block_init(&q->miniqp_ingress, q->ingress_block); + mini_qdisc_pair_init(&q->miniqp_egress, sch, &dev->miniq_egress); q->egress_block_info.binder_type = FLOW_BLOCK_BINDER_TYPE_CLSACT_EGRESS; diff --git a/net/sched/sch_netem.c b/net/sched/sch_netem.c index 42e557d48e4e..84f82771cdf5 100644 --- a/net/sched/sch_netem.c +++ b/net/sched/sch_netem.c @@ -66,7 +66,7 @@ struct disttable { u32 size; - s16 table[0]; + s16 table[]; }; struct netem_sched_data { diff --git a/net/sched/sch_pie.c b/net/sched/sch_pie.c index 915bcdb59a9f..c65077f0c0f3 100644 --- a/net/sched/sch_pie.c +++ b/net/sched/sch_pie.c @@ -31,7 +31,7 @@ struct pie_sched_data { }; bool pie_drop_early(struct Qdisc *sch, struct pie_params *params, - struct pie_vars *vars, u32 qlen, u32 packet_size) + struct pie_vars *vars, u32 backlog, u32 packet_size) { u64 rnd; u64 local_prob = vars->prob; @@ -51,7 +51,7 @@ bool pie_drop_early(struct Qdisc *sch, struct pie_params *params, /* If we have fewer than 2 mtu-sized packets, disable pie_drop_early, * similar to min_th in RED */ - if (qlen < 2 * mtu) + if (backlog < 2 * mtu) return false; /* If bytemode is turned on, use packet size to compute new @@ -62,27 +62,19 @@ bool pie_drop_early(struct Qdisc *sch, struct pie_params *params, else local_prob = vars->prob; - if (local_prob == 0) { + if (local_prob == 0) vars->accu_prob = 0; - vars->accu_prob_overflows = 0; - } - - if (local_prob > MAX_PROB - vars->accu_prob) - vars->accu_prob_overflows++; - - vars->accu_prob += local_prob; + else + vars->accu_prob += local_prob; - if (vars->accu_prob_overflows == 0 && - vars->accu_prob < (MAX_PROB / 100) * 85) + if (vars->accu_prob < (MAX_PROB / 100) * 85) return false; - if (vars->accu_prob_overflows == 8 && - vars->accu_prob >= MAX_PROB / 2) + if (vars->accu_prob >= (MAX_PROB / 2) * 17) return true; prandom_bytes(&rnd, 8); - if (rnd < local_prob) { + if ((rnd >> BITS_PER_BYTE) < local_prob) { vars->accu_prob = 0; - vars->accu_prob_overflows = 0; return true; } @@ -129,7 +121,6 @@ static int pie_qdisc_enqueue(struct sk_buff *skb, struct Qdisc *sch, out: q->stats.dropped++; q->vars.accu_prob = 0; - q->vars.accu_prob_overflows = 0; return qdisc_drop(skb, sch, to_free); } @@ -215,7 +206,7 @@ static int pie_change(struct Qdisc *sch, struct nlattr *opt, } void pie_process_dequeue(struct sk_buff *skb, struct pie_params *params, - struct pie_vars *vars, u32 qlen) + struct pie_vars *vars, u32 backlog) { psched_time_t now = psched_get_time(); u32 dtime = 0; @@ -231,7 +222,7 @@ void pie_process_dequeue(struct sk_buff *skb, struct pie_params *params, vars->dq_tstamp = now; - if (qlen == 0) + if (backlog == 0) vars->qdelay = 0; if (dtime == 0) @@ -244,7 +235,7 @@ void pie_process_dequeue(struct sk_buff *skb, struct pie_params *params, * we have enough packets to calculate the drain rate. Save * current time as dq_tstamp and start measurement cycle. */ - if (qlen >= QUEUE_THRESHOLD && vars->dq_count == DQCOUNT_INVALID) { + if (backlog >= QUEUE_THRESHOLD && vars->dq_count == DQCOUNT_INVALID) { vars->dq_tstamp = psched_get_time(); vars->dq_count = 0; } @@ -283,7 +274,7 @@ void pie_process_dequeue(struct sk_buff *skb, struct pie_params *params, * dq_count to 0 to re-enter the if block when the next * packet is dequeued */ - if (qlen < QUEUE_THRESHOLD) { + if (backlog < QUEUE_THRESHOLD) { vars->dq_count = DQCOUNT_INVALID; } else { vars->dq_count = 0; @@ -307,7 +298,7 @@ burst_allowance_reduction: EXPORT_SYMBOL_GPL(pie_process_dequeue); void pie_calculate_probability(struct pie_params *params, struct pie_vars *vars, - u32 qlen) + u32 backlog) { psched_time_t qdelay = 0; /* in pschedtime */ psched_time_t qdelay_old = 0; /* in pschedtime */ @@ -322,7 +313,7 @@ void pie_calculate_probability(struct pie_params *params, struct pie_vars *vars, vars->qdelay_old = vars->qdelay; if (vars->avg_dq_rate > 0) - qdelay = (qlen << PIE_SCALE) / vars->avg_dq_rate; + qdelay = (backlog << PIE_SCALE) / vars->avg_dq_rate; else qdelay = 0; } else { @@ -330,10 +321,10 @@ void pie_calculate_probability(struct pie_params *params, struct pie_vars *vars, qdelay_old = vars->qdelay_old; } - /* If qdelay is zero and qlen is not, it means qlen is very small, + /* If qdelay is zero and backlog is not, it means backlog is very small, * so we do not update probabilty in this round. */ - if (qdelay == 0 && qlen != 0) + if (qdelay == 0 && backlog != 0) update_prob = false; /* In the algorithm, alpha and beta are between 0 and 2 with typical @@ -363,8 +354,8 @@ void pie_calculate_probability(struct pie_params *params, struct pie_vars *vars, } /* alpha and beta should be between 0 and 32, in multiples of 1/16 */ - delta += alpha * (u64)(qdelay - params->target); - delta += beta * (u64)(qdelay - qdelay_old); + delta += alpha * (qdelay - params->target); + delta += beta * (qdelay - qdelay_old); oldprob = vars->prob; @@ -409,7 +400,7 @@ void pie_calculate_probability(struct pie_params *params, struct pie_vars *vars, vars->prob -= vars->prob / 64; vars->qdelay = qdelay; - vars->qlen_old = qlen; + vars->backlog_old = backlog; /* We restart the measurement cycle if the following conditions are met * 1. If the delay has been low for 2 consecutive Tupdate periods @@ -502,7 +493,7 @@ static int pie_dump_stats(struct Qdisc *sch, struct gnet_dump *d) { struct pie_sched_data *q = qdisc_priv(sch); struct tc_pie_xstats st = { - .prob = q->vars.prob, + .prob = q->vars.prob << BITS_PER_BYTE, .delay = ((u32)PSCHED_TICKS2NS(q->vars.qdelay)) / NSEC_PER_USEC, .packets_in = q->stats.packets_in, diff --git a/net/sched/sch_red.c b/net/sched/sch_red.c index 1695421333e3..c7de47c942e3 100644 --- a/net/sched/sch_red.c +++ b/net/sched/sch_red.c @@ -35,7 +35,11 @@ struct red_sched_data { u32 limit; /* HARD maximal queue length */ + unsigned char flags; + /* Non-flags in tc_red_qopt.flags. */ + unsigned char userbits; + struct timer_list adapt_timer; struct Qdisc *sch; struct red_parms parms; @@ -44,6 +48,8 @@ struct red_sched_data { struct Qdisc *qdisc; }; +static const u32 red_supported_flags = TC_RED_HISTORIC_FLAGS | TC_RED_NODROP; + static inline int red_use_ecn(struct red_sched_data *q) { return q->flags & TC_RED_ECN; @@ -54,6 +60,11 @@ static inline int red_use_harddrop(struct red_sched_data *q) return q->flags & TC_RED_HARDDROP; } +static int red_use_nodrop(struct red_sched_data *q) +{ + return q->flags & TC_RED_NODROP; +} + static int red_enqueue(struct sk_buff *skb, struct Qdisc *sch, struct sk_buff **to_free) { @@ -74,23 +85,36 @@ static int red_enqueue(struct sk_buff *skb, struct Qdisc *sch, case RED_PROB_MARK: qdisc_qstats_overlimit(sch); - if (!red_use_ecn(q) || !INET_ECN_set_ce(skb)) { + if (!red_use_ecn(q)) { q->stats.prob_drop++; goto congestion_drop; } - q->stats.prob_mark++; + if (INET_ECN_set_ce(skb)) { + q->stats.prob_mark++; + } else if (!red_use_nodrop(q)) { + q->stats.prob_drop++; + goto congestion_drop; + } + + /* Non-ECT packet in ECN nodrop mode: queue it. */ break; case RED_HARD_MARK: qdisc_qstats_overlimit(sch); - if (red_use_harddrop(q) || !red_use_ecn(q) || - !INET_ECN_set_ce(skb)) { + if (red_use_harddrop(q) || !red_use_ecn(q)) { + q->stats.forced_drop++; + goto congestion_drop; + } + + if (INET_ECN_set_ce(skb)) { + q->stats.forced_mark++; + } else if (!red_use_nodrop(q)) { q->stats.forced_drop++; goto congestion_drop; } - q->stats.forced_mark++; + /* Non-ECT packet in ECN nodrop mode: queue it. */ break; } @@ -165,6 +189,7 @@ static int red_offload(struct Qdisc *sch, bool enable) opt.set.limit = q->limit; opt.set.is_ecn = red_use_ecn(q); opt.set.is_harddrop = red_use_harddrop(q); + opt.set.is_nodrop = red_use_nodrop(q); opt.set.qstats = &sch->qstats; } else { opt.command = TC_RED_DESTROY; @@ -183,9 +208,12 @@ static void red_destroy(struct Qdisc *sch) } static const struct nla_policy red_policy[TCA_RED_MAX + 1] = { + [TCA_RED_UNSPEC] = { .strict_start_type = TCA_RED_FLAGS }, [TCA_RED_PARMS] = { .len = sizeof(struct tc_red_qopt) }, [TCA_RED_STAB] = { .len = RED_STAB_SIZE }, [TCA_RED_MAX_P] = { .type = NLA_U32 }, + [TCA_RED_FLAGS] = { .type = NLA_BITFIELD32, + .validation_data = &red_supported_flags }, }; static int red_change(struct Qdisc *sch, struct nlattr *opt, @@ -194,7 +222,10 @@ static int red_change(struct Qdisc *sch, struct nlattr *opt, struct Qdisc *old_child = NULL, *child = NULL; struct red_sched_data *q = qdisc_priv(sch); struct nlattr *tb[TCA_RED_MAX + 1]; + struct nla_bitfield32 flags_bf; struct tc_red_qopt *ctl; + unsigned char userbits; + unsigned char flags; int err; u32 max_P; @@ -216,6 +247,12 @@ static int red_change(struct Qdisc *sch, struct nlattr *opt, if (!red_check_params(ctl->qth_min, ctl->qth_max, ctl->Wlog)) return -EINVAL; + err = red_get_flags(ctl->flags, TC_RED_HISTORIC_FLAGS, + tb[TCA_RED_FLAGS], red_supported_flags, + &flags_bf, &userbits, extack); + if (err) + return err; + if (ctl->limit > 0) { child = fifo_create_dflt(sch, &bfifo_qdisc_ops, ctl->limit, extack); @@ -227,7 +264,14 @@ static int red_change(struct Qdisc *sch, struct nlattr *opt, } sch_tree_lock(sch); - q->flags = ctl->flags; + + flags = (q->flags & ~flags_bf.selector) | flags_bf.value; + err = red_validate_flags(flags, extack); + if (err) + goto unlock_out; + + q->flags = flags; + q->userbits = userbits; q->limit = ctl->limit; if (child) { qdisc_tree_flush_backlog(q->qdisc); @@ -256,6 +300,12 @@ static int red_change(struct Qdisc *sch, struct nlattr *opt, if (old_child) qdisc_put(old_child); return 0; + +unlock_out: + sch_tree_unlock(sch); + if (child) + qdisc_put(child); + return err; } static inline void red_adaptative_timer(struct timer_list *t) @@ -302,7 +352,8 @@ static int red_dump(struct Qdisc *sch, struct sk_buff *skb) struct nlattr *opts = NULL; struct tc_red_qopt opt = { .limit = q->limit, - .flags = q->flags, + .flags = (q->flags & TC_RED_HISTORIC_FLAGS) | + q->userbits, .qth_min = q->parms.qth_min >> q->parms.Wlog, .qth_max = q->parms.qth_max >> q->parms.Wlog, .Wlog = q->parms.Wlog, @@ -319,7 +370,9 @@ static int red_dump(struct Qdisc *sch, struct sk_buff *skb) if (opts == NULL) goto nla_put_failure; if (nla_put(skb, TCA_RED_PARMS, sizeof(opt), &opt) || - nla_put_u32(skb, TCA_RED_MAX_P, q->parms.max_P)) + nla_put_u32(skb, TCA_RED_MAX_P, q->parms.max_P) || + nla_put_bitfield32(skb, TCA_RED_FLAGS, + q->flags, red_supported_flags)) goto nla_put_failure; return nla_nest_end(skb, opts); |