From e298e505700604c97e6a9edb21cebb080bdb91f6 Mon Sep 17 00:00:00 2001 From: Pravin B Shelar Date: Tue, 29 Oct 2013 17:22:21 -0700 Subject: openvswitch: Per cpu flow stats. With mega flow implementation ovs flow can be shared between multiple CPUs which makes stats updates highly contended operation. This patch uses per-CPU stats in cases where a flow is likely to be shared (if there is a wildcard in the 5-tuple and therefore likely to be spread by RSS). In other situations, it uses the current strategy, saving memory and allocation time. Signed-off-by: Pravin B Shelar Signed-off-by: Jesse Gross --- net/openvswitch/flow.c | 96 ++++++++++++++++++++++++++++++++++++++++++++++---- 1 file changed, 89 insertions(+), 7 deletions(-) (limited to 'net/openvswitch/flow.c') diff --git a/net/openvswitch/flow.c b/net/openvswitch/flow.c index b409f5279601..16f4b46161d4 100644 --- a/net/openvswitch/flow.c +++ b/net/openvswitch/flow.c @@ -35,6 +35,7 @@ #include #include #include +#include #include #include #include @@ -60,10 +61,16 @@ u64 ovs_flow_used_time(unsigned long flow_jiffies) #define TCP_FLAGS_BE16(tp) (*(__be16 *)&tcp_flag_word(tp) & htons(0x0FFF)) -void ovs_flow_used(struct sw_flow *flow, struct sk_buff *skb) +void ovs_flow_stats_update(struct sw_flow *flow, struct sk_buff *skb) { + struct flow_stats *stats; __be16 tcp_flags = 0; + if (!flow->stats.is_percpu) + stats = flow->stats.stat; + else + stats = this_cpu_ptr(flow->stats.cpu_stats); + if ((flow->key.eth.type == htons(ETH_P_IP) || flow->key.eth.type == htons(ETH_P_IPV6)) && flow->key.ip.proto == IPPROTO_TCP && @@ -71,12 +78,87 @@ void ovs_flow_used(struct sw_flow *flow, struct sk_buff *skb) tcp_flags = TCP_FLAGS_BE16(tcp_hdr(skb)); } - spin_lock(&flow->lock); - flow->used = jiffies; - flow->packet_count++; - flow->byte_count += skb->len; - flow->tcp_flags |= tcp_flags; - spin_unlock(&flow->lock); + spin_lock(&stats->lock); + stats->used = jiffies; + stats->packet_count++; + stats->byte_count += skb->len; + stats->tcp_flags |= tcp_flags; + spin_unlock(&stats->lock); +} + +static void stats_read(struct flow_stats *stats, + struct ovs_flow_stats *ovs_stats, + unsigned long *used, __be16 *tcp_flags) +{ + spin_lock(&stats->lock); + if (time_after(stats->used, *used)) + *used = stats->used; + *tcp_flags |= stats->tcp_flags; + ovs_stats->n_packets += stats->packet_count; + ovs_stats->n_bytes += stats->byte_count; + spin_unlock(&stats->lock); +} + +void ovs_flow_stats_get(struct sw_flow *flow, struct ovs_flow_stats *ovs_stats, + unsigned long *used, __be16 *tcp_flags) +{ + int cpu, cur_cpu; + + *used = 0; + *tcp_flags = 0; + memset(ovs_stats, 0, sizeof(*ovs_stats)); + + if (!flow->stats.is_percpu) { + stats_read(flow->stats.stat, ovs_stats, used, tcp_flags); + } else { + cur_cpu = get_cpu(); + for_each_possible_cpu(cpu) { + struct flow_stats *stats; + + if (cpu == cur_cpu) + local_bh_disable(); + + stats = per_cpu_ptr(flow->stats.cpu_stats, cpu); + stats_read(stats, ovs_stats, used, tcp_flags); + + if (cpu == cur_cpu) + local_bh_enable(); + } + put_cpu(); + } +} + +static void stats_reset(struct flow_stats *stats) +{ + spin_lock(&stats->lock); + stats->used = 0; + stats->packet_count = 0; + stats->byte_count = 0; + stats->tcp_flags = 0; + spin_unlock(&stats->lock); +} + +void ovs_flow_stats_clear(struct sw_flow *flow) +{ + int cpu, cur_cpu; + + if (!flow->stats.is_percpu) { + stats_reset(flow->stats.stat); + } else { + cur_cpu = get_cpu(); + + for_each_possible_cpu(cpu) { + + if (cpu == cur_cpu) + local_bh_disable(); + + stats_reset(per_cpu_ptr(flow->stats.cpu_stats, cpu)); + + if (cpu == cur_cpu) + local_bh_enable(); + } + put_cpu(); + } } static int check_header(struct sk_buff *skb, int len) -- cgit v1.2.3 From 04382a3303c22b0c536fbd0c94c1f012f2b8ed60 Mon Sep 17 00:00:00 2001 From: Jarno Rajahalme Date: Sat, 15 Feb 2014 17:37:45 -0800 Subject: openvswitch: Read tcp flags only then the tranport header is present. Only the first IP fragment can have a TCP header, check for this. Signed-off-by: Jarno Rajahalme Signed-off-by: Jesse Gross --- net/openvswitch/flow.c | 1 + 1 file changed, 1 insertion(+) (limited to 'net/openvswitch/flow.c') diff --git a/net/openvswitch/flow.c b/net/openvswitch/flow.c index 16f4b46161d4..d71e60fa28cb 100644 --- a/net/openvswitch/flow.c +++ b/net/openvswitch/flow.c @@ -73,6 +73,7 @@ void ovs_flow_stats_update(struct sw_flow *flow, struct sk_buff *skb) if ((flow->key.eth.type == htons(ETH_P_IP) || flow->key.eth.type == htons(ETH_P_IPV6)) && + flow->key.ip.frag != OVS_FRAG_TYPE_LATER && flow->key.ip.proto == IPPROTO_TCP && likely(skb->len >= skb_transport_offset(skb) + sizeof(struct tcphdr))) { tcp_flags = TCP_FLAGS_BE16(tcp_hdr(skb)); -- cgit v1.2.3 From f9b8c4c8baded129535d82d74df8e87a7a369f54 Mon Sep 17 00:00:00 2001 From: Ben Pfaff Date: Thu, 20 Mar 2014 10:45:21 -0700 Subject: openvswitch: Correctly report flow used times for first 5 minutes after boot. The kernel starts out its "jiffies" timer as 5 minutes below zero, as shown in include/linux/jiffies.h: /* * Have the 32 bit jiffies value wrap 5 minutes after boot * so jiffies wrap bugs show up earlier. */ #define INITIAL_JIFFIES ((unsigned long)(unsigned int) (-300*HZ)) The loop in ovs_flow_stats_get() starts out with 'used' set to 0, then takes any "later" time. This means that for the first five minutes after boot, flows will always be reported as never used, since 0 is greater than any time already seen. Signed-off-by: Ben Pfaff Acked-by: Pravin B Shelar Signed-off-by: Jesse Gross --- net/openvswitch/flow.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'net/openvswitch/flow.c') diff --git a/net/openvswitch/flow.c b/net/openvswitch/flow.c index d71e60fa28cb..dda451f4429c 100644 --- a/net/openvswitch/flow.c +++ b/net/openvswitch/flow.c @@ -92,7 +92,7 @@ static void stats_read(struct flow_stats *stats, unsigned long *used, __be16 *tcp_flags) { spin_lock(&stats->lock); - if (time_after(stats->used, *used)) + if (!*used || time_after(stats->used, *used)) *used = stats->used; *tcp_flags |= stats->tcp_flags; ovs_stats->n_packets += stats->packet_count; -- cgit v1.2.3 From 4f647e0a3c37b8d5086214128614a136064110c3 Mon Sep 17 00:00:00 2001 From: Flavio Leitner Date: Thu, 27 Mar 2014 11:05:34 -0300 Subject: openvswitch: fix a possible deadlock and lockdep warning There are two problematic situations. A deadlock can happen when is_percpu is false because it can get interrupted while holding the spinlock. Then it executes ovs_flow_stats_update() in softirq context which tries to get the same lock. The second sitation is that when is_percpu is true, the code correctly disables BH but only for the local CPU, so the following can happen when locking the remote CPU without disabling BH: CPU#0 CPU#1 ovs_flow_stats_get() stats_read() +->spin_lock remote CPU#1 ovs_flow_stats_get() | stats_read() | ... +--> spin_lock remote CPU#0 | | | ovs_flow_stats_update() | ... | spin_lock local CPU#0 <--+ ovs_flow_stats_update() +---------------------------------- spin_lock local CPU#1 This patch disables BH for both cases fixing the deadlocks. Acked-by: Jesse Gross ================================= [ INFO: inconsistent lock state ] 3.14.0-rc8-00007-g632b06a #1 Tainted: G I --------------------------------- inconsistent {SOFTIRQ-ON-W} -> {IN-SOFTIRQ-W} usage. swapper/0/0 [HC0[0]:SC1[5]:HE1:SE0] takes: (&(&cpu_stats->lock)->rlock){+.?...}, at: [] ovs_flow_stats_update+0x51/0xd0 [openvswitch] {SOFTIRQ-ON-W} state was registered at: [] __lock_acquire+0x68f/0x1c40 [] lock_acquire+0xa2/0x1d0 [] _raw_spin_lock+0x3e/0x80 [] ovs_flow_stats_get+0xc4/0x1e0 [openvswitch] [] ovs_flow_cmd_fill_info+0x185/0x360 [openvswitch] [] ovs_flow_cmd_build_info.constprop.27+0x55/0x90 [openvswitch] [] ovs_flow_cmd_new_or_set+0x4dd/0x570 [openvswitch] [] genl_family_rcv_msg+0x1cd/0x3f0 [] genl_rcv_msg+0x8e/0xd0 [] netlink_rcv_skb+0xa9/0xc0 [] genl_rcv+0x28/0x40 [] netlink_unicast+0x100/0x1e0 [] netlink_sendmsg+0x347/0x770 [] sock_sendmsg+0x9c/0xe0 [] ___sys_sendmsg+0x3a9/0x3c0 [] __sys_sendmsg+0x51/0x90 [] SyS_sendmsg+0x12/0x20 [] system_call_fastpath+0x16/0x1b irq event stamp: 1740726 hardirqs last enabled at (1740726): [] ip6_finish_output2+0x4f0/0x840 hardirqs last disabled at (1740725): [] ip6_finish_output2+0x4ab/0x840 softirqs last enabled at (1740674): [] _local_bh_enable+0x22/0x50 softirqs last disabled at (1740675): [] irq_exit+0xc5/0xd0 other info that might help us debug this: Possible unsafe locking scenario: CPU0 ---- lock(&(&cpu_stats->lock)->rlock); lock(&(&cpu_stats->lock)->rlock); *** DEADLOCK *** 5 locks held by swapper/0/0: #0: (((&ifa->dad_timer))){+.-...}, at: [] call_timer_fn+0x5/0x320 #1: (rcu_read_lock){.+.+..}, at: [] mld_sendpack+0x5/0x4a0 #2: (rcu_read_lock_bh){.+....}, at: [] ip6_finish_output2+0x59/0x840 #3: (rcu_read_lock_bh){.+....}, at: [] __dev_queue_xmit+0x5/0x9b0 #4: (rcu_read_lock){.+.+..}, at: [] internal_dev_xmit+0x5/0x110 [openvswitch] stack backtrace: CPU: 0 PID: 0 Comm: swapper/0 Tainted: G I 3.14.0-rc8-00007-g632b06a #1 Hardware name: /DX58SO, BIOS SOX5810J.86A.5599.2012.0529.2218 05/29/2012 0000000000000000 0fcf20709903df0c ffff88042d603808 ffffffff817cfe3c ffffffff81c134c0 ffff88042d603858 ffffffff817cb6da 0000000000000005 ffffffff00000001 ffff880400000000 0000000000000006 ffffffff81c134c0 Call Trace: [] dump_stack+0x4d/0x66 [] print_usage_bug+0x1f4/0x205 [] ? check_usage_backwards+0x180/0x180 [] mark_lock+0x223/0x2b0 [] __lock_acquire+0x623/0x1c40 [] ? __lock_is_held+0x57/0x80 [] ? masked_flow_lookup+0x236/0x250 [openvswitch] [] lock_acquire+0xa2/0x1d0 [] ? ovs_flow_stats_update+0x51/0xd0 [openvswitch] [] _raw_spin_lock+0x3e/0x80 [] ? ovs_flow_stats_update+0x51/0xd0 [openvswitch] [] ovs_flow_stats_update+0x51/0xd0 [openvswitch] [] ovs_dp_process_received_packet+0x84/0x120 [openvswitch] [] ? __lock_acquire+0x347/0x1c40 [] ovs_vport_receive+0x2a/0x30 [openvswitch] [] internal_dev_xmit+0x68/0x110 [openvswitch] [] ? internal_dev_xmit+0x5/0x110 [openvswitch] [] dev_hard_start_xmit+0x2e6/0x8b0 [] __dev_queue_xmit+0x417/0x9b0 [] ? __dev_queue_xmit+0x5/0x9b0 [] ? ip6_finish_output2+0x4f0/0x840 [] dev_queue_xmit+0x10/0x20 [] ip6_finish_output2+0x551/0x840 [] ? ip6_finish_output+0x9a/0x220 [] ip6_finish_output+0x9a/0x220 [] ip6_output+0x4f/0x1f0 [] mld_sendpack+0x1d9/0x4a0 [] mld_send_initial_cr.part.32+0x88/0xa0 [] ? addrconf_dad_completed+0x220/0x220 [] ipv6_mc_dad_complete+0x31/0x50 [] addrconf_dad_completed+0x147/0x220 [] ? addrconf_dad_completed+0x220/0x220 [] addrconf_dad_timer+0x19f/0x1c0 [] call_timer_fn+0x99/0x320 [] ? call_timer_fn+0x5/0x320 [] ? addrconf_dad_completed+0x220/0x220 [] run_timer_softirq+0x254/0x3b0 [] __do_softirq+0x12d/0x480 Signed-off-by: Flavio Leitner Signed-off-by: David S. Miller --- net/openvswitch/flow.c | 26 ++++++-------------------- 1 file changed, 6 insertions(+), 20 deletions(-) (limited to 'net/openvswitch/flow.c') diff --git a/net/openvswitch/flow.c b/net/openvswitch/flow.c index dda451f4429c..2998989e76db 100644 --- a/net/openvswitch/flow.c +++ b/net/openvswitch/flow.c @@ -103,30 +103,24 @@ static void stats_read(struct flow_stats *stats, void ovs_flow_stats_get(struct sw_flow *flow, struct ovs_flow_stats *ovs_stats, unsigned long *used, __be16 *tcp_flags) { - int cpu, cur_cpu; + int cpu; *used = 0; *tcp_flags = 0; memset(ovs_stats, 0, sizeof(*ovs_stats)); + local_bh_disable(); if (!flow->stats.is_percpu) { stats_read(flow->stats.stat, ovs_stats, used, tcp_flags); } else { - cur_cpu = get_cpu(); for_each_possible_cpu(cpu) { struct flow_stats *stats; - if (cpu == cur_cpu) - local_bh_disable(); - stats = per_cpu_ptr(flow->stats.cpu_stats, cpu); stats_read(stats, ovs_stats, used, tcp_flags); - - if (cpu == cur_cpu) - local_bh_enable(); } - put_cpu(); } + local_bh_enable(); } static void stats_reset(struct flow_stats *stats) @@ -141,25 +135,17 @@ static void stats_reset(struct flow_stats *stats) void ovs_flow_stats_clear(struct sw_flow *flow) { - int cpu, cur_cpu; + int cpu; + local_bh_disable(); if (!flow->stats.is_percpu) { stats_reset(flow->stats.stat); } else { - cur_cpu = get_cpu(); - for_each_possible_cpu(cpu) { - - if (cpu == cur_cpu) - local_bh_disable(); - stats_reset(per_cpu_ptr(flow->stats.cpu_stats, cpu)); - - if (cpu == cur_cpu) - local_bh_enable(); } - put_cpu(); } + local_bh_enable(); } static int check_header(struct sk_buff *skb, int len) -- cgit v1.2.3