From 03f1eccc7a69c965351e6bee41c62afa2844752f Mon Sep 17 00:00:00 2001 From: Stephen Suryaputra Date: Tue, 19 Mar 2019 12:37:12 -0400 Subject: ipv6: Add icmp_echo_ignore_multicast support for ICMPv6 IPv4 has icmp_echo_ignore_broadcast to prevent responding to broadcast pings. IPv6 needs a similar mechanism. v1->v2: - Remove NET_IPV6_ICMP_ECHO_IGNORE_MULTICAST. Signed-off-by: Stephen Suryaputra Signed-off-by: David S. Miller --- Documentation/networking/ip-sysctl.txt | 5 +++++ 1 file changed, 5 insertions(+) (limited to 'Documentation') diff --git a/Documentation/networking/ip-sysctl.txt b/Documentation/networking/ip-sysctl.txt index acdfb5d2bcaa..55ea7def46be 100644 --- a/Documentation/networking/ip-sysctl.txt +++ b/Documentation/networking/ip-sysctl.txt @@ -1918,6 +1918,11 @@ echo_ignore_all - BOOLEAN requests sent to it over the IPv6 protocol. Default: 0 +echo_ignore_multicast - BOOLEAN + If set non-zero, then the kernel will ignore all ICMP ECHO + requests sent to it over the IPv6 protocol via multicast. + Default: 0 + xfrm6_gc_thresh - INTEGER The threshold at which we will start garbage collecting for IPv6 destination cache entries. At twice this value the system will -- cgit v1.2.3 From 0b03a5ca8b14321366eec4a903922d2b46d585ff Mon Sep 17 00:00:00 2001 From: Stephen Suryaputra Date: Wed, 20 Mar 2019 10:29:27 -0400 Subject: ipv6: Add icmp_echo_ignore_anycast for ICMPv6 In addition to icmp_echo_ignore_multicast, there is a need to also prevent responding to pings to anycast addresses for security. Signed-off-by: Stephen Suryaputra Signed-off-by: David S. 
Miller --- Documentation/networking/ip-sysctl.txt | 5 +++++ include/net/netns/ipv6.h | 1 + net/ipv6/af_inet6.c | 1 + net/ipv6/icmp.c | 16 ++++++++++++++-- 4 files changed, 21 insertions(+), 2 deletions(-) (limited to 'Documentation') diff --git a/Documentation/networking/ip-sysctl.txt b/Documentation/networking/ip-sysctl.txt index 55ea7def46be..bd029fc55ccb 100644 --- a/Documentation/networking/ip-sysctl.txt +++ b/Documentation/networking/ip-sysctl.txt @@ -1923,6 +1923,11 @@ echo_ignore_multicast - BOOLEAN requests sent to it over the IPv6 protocol via multicast. Default: 0 +echo_ignore_anycast - BOOLEAN + If set non-zero, then the kernel will ignore all ICMP ECHO + requests sent to it over the IPv6 protocol destined to anycast address. + Default: 0 + xfrm6_gc_thresh - INTEGER The threshold at which we will start garbage collecting for IPv6 destination cache entries. At twice this value the system will diff --git a/include/net/netns/ipv6.h b/include/net/netns/ipv6.h index e29aff15acc9..64e29b58bb5e 100644 --- a/include/net/netns/ipv6.h +++ b/include/net/netns/ipv6.h @@ -34,6 +34,7 @@ struct netns_sysctl_ipv6 { int icmpv6_time; int icmpv6_echo_ignore_all; int icmpv6_echo_ignore_multicast; + int icmpv6_echo_ignore_anycast; int anycast_src_echo_reply; int ip_nonlocal_bind; int fwmark_reflect; diff --git a/net/ipv6/af_inet6.c b/net/ipv6/af_inet6.c index fdc117de849c..fa6b404cbd10 100644 --- a/net/ipv6/af_inet6.c +++ b/net/ipv6/af_inet6.c @@ -848,6 +848,7 @@ static int __net_init inet6_net_init(struct net *net) net->ipv6.sysctl.icmpv6_time = 1*HZ; net->ipv6.sysctl.icmpv6_echo_ignore_all = 0; net->ipv6.sysctl.icmpv6_echo_ignore_multicast = 0; + net->ipv6.sysctl.icmpv6_echo_ignore_anycast = 0; net->ipv6.sysctl.flowlabel_consistency = 1; net->ipv6.sysctl.auto_flowlabels = IP6_DEFAULT_AUTO_FLOW_LABELS; net->ipv6.sysctl.idgen_retries = 3; diff --git a/net/ipv6/icmp.c b/net/ipv6/icmp.c index 0907bcede5e5..cc14b9998941 100644 --- a/net/ipv6/icmp.c +++ b/net/ipv6/icmp.c @@ 
-683,6 +683,7 @@ static void icmpv6_echo_reply(struct sk_buff *skb) struct dst_entry *dst; struct ipcm6_cookie ipc6; u32 mark = IP6_REPLY_MARK(net, skb->mark); + bool acast; if (ipv6_addr_is_multicast(&ipv6_hdr(skb)->daddr) && net->ipv6.sysctl.icmpv6_echo_ignore_multicast) @@ -690,9 +691,12 @@ static void icmpv6_echo_reply(struct sk_buff *skb) saddr = &ipv6_hdr(skb)->daddr; + acast = ipv6_anycast_destination(skb_dst(skb), saddr); + if (acast && net->ipv6.sysctl.icmpv6_echo_ignore_anycast) + return; + if (!ipv6_unicast_destination(skb) && - !(net->ipv6.sysctl.anycast_src_echo_reply && - ipv6_anycast_destination(skb_dst(skb), saddr))) + !(net->ipv6.sysctl.anycast_src_echo_reply && acast)) saddr = NULL; memcpy(&tmp_hdr, icmph, sizeof(tmp_hdr)); @@ -1126,6 +1130,13 @@ static struct ctl_table ipv6_icmp_table_template[] = { .mode = 0644, .proc_handler = proc_dointvec, }, + { + .procname = "echo_ignore_anycast", + .data = &init_net.ipv6.sysctl.icmpv6_echo_ignore_anycast, + .maxlen = sizeof(int), + .mode = 0644, + .proc_handler = proc_dointvec, + }, { }, }; @@ -1141,6 +1152,7 @@ struct ctl_table * __net_init ipv6_icmp_sysctl_init(struct net *net) table[0].data = &net->ipv6.sysctl.icmpv6_time; table[1].data = &net->ipv6.sysctl.icmpv6_echo_ignore_all; table[2].data = &net->ipv6.sysctl.icmpv6_echo_ignore_multicast; + table[3].data = &net->ipv6.sysctl.icmpv6_echo_ignore_anycast; } return table; } -- cgit v1.2.3 From 9ab948a91b2c2abc8e82845c0e61f4b1683e3a4f Mon Sep 17 00:00:00 2001 From: David Ahern Date: Wed, 20 Mar 2019 09:18:59 -0700 Subject: ipv4: Allow amount of dirty memory from fib resizing to be controllable fib_trie implementation calls synchronize_rcu when a certain amount of pages are dirty from freed entries. The number of pages was determined experimentally in 2009 (commit c3059477fce2d). At the current setting, synchronize_rcu is called often -- 51 times in a second in one test with an average of an 8 msec delay adding a fib entry. 
The total impact is a lot of slow down modifying the fib. This is seen in the output of 'time' - the difference between real time and sys+user. For example, using 720,022 single path routes and 'ip -batch'[1]: $ time ./ip -batch ipv4/routes-1-hops real 0m14.214s user 0m2.513s sys 0m6.783s So roughly 35% of the actual time to install the routes is from the ip command getting scheduled out, most notably due to synchronize_rcu (this is observed using 'perf sched timehist'). This patch makes the amount of dirty memory configurable between 64k where the synchronize_rcu is called often (small, low end systems that are memory sensitive) to 64M where synchronize_rcu is called rarely during a large FIB change (for high end systems with lots of memory). The default is 512kB which corresponds to the current setting of 128 pages with a 4kB page size. As an example, at 16MB the worst interval shows 4 calls to synchronize_rcu in a second blocking for up to 30 msec in a single instance, and a total of almost 100 msec across the 4 calls in the second. The trade off is allowing FIB entries to consume more memory in a given time window but with much better fib insertion rates (~30% increase in prefixes/sec). With this patch and net.ipv4.fib_sync_mem set to 16MB, the same batch file runs in: $ time ./ip -batch ipv4/routes-1-hops real 0m9.692s user 0m2.491s sys 0m6.769s So the dead time is reduced to about 1/2 second or <5% of the real time. [1] 'ip' modified to not request ACK messages which improves route insertion times by about 20% Signed-off-by: David Ahern Signed-off-by: David S. 
Miller --- Documentation/networking/ip-sysctl.txt | 5 +++++ include/net/ip.h | 4 ++++ net/ipv4/fib_trie.c | 14 ++++++++------ net/ipv4/sysctl_net_ipv4.c | 9 +++++++++ 4 files changed, 26 insertions(+), 6 deletions(-) (limited to 'Documentation') diff --git a/Documentation/networking/ip-sysctl.txt b/Documentation/networking/ip-sysctl.txt index bd029fc55ccb..5eedc6941ce5 100644 --- a/Documentation/networking/ip-sysctl.txt +++ b/Documentation/networking/ip-sysctl.txt @@ -81,6 +81,11 @@ fib_multipath_hash_policy - INTEGER 0 - Layer 3 1 - Layer 4 +fib_sync_mem - UNSIGNED INTEGER + Amount of dirty memory from fib entries that can be backlogged before + synchronize_rcu is forced. + Default: 512kB Minimum: 64kB Maximum: 64MB + ip_forward_update_priority - INTEGER Whether to update SKB priority from "TOS" field in IPv4 header after it is forwarded. The new SKB priority is mapped from TOS field value diff --git a/include/net/ip.h b/include/net/ip.h index be3cad9c2e4c..aa09ae5f01a5 100644 --- a/include/net/ip.h +++ b/include/net/ip.h @@ -38,6 +38,10 @@ #define IPV4_MAX_PMTU 65535U /* RFC 2675, Section 5.1 */ #define IPV4_MIN_MTU 68 /* RFC 791 */ +extern unsigned int sysctl_fib_sync_mem; +extern unsigned int sysctl_fib_sync_mem_min; +extern unsigned int sysctl_fib_sync_mem_max; + struct sock; struct inet_skb_parm { diff --git a/net/ipv4/fib_trie.c b/net/ipv4/fib_trie.c index a573e37e0615..1704f432de1f 100644 --- a/net/ipv4/fib_trie.c +++ b/net/ipv4/fib_trie.c @@ -183,14 +183,16 @@ struct trie { }; static struct key_vector *resize(struct trie *t, struct key_vector *tn); -static size_t tnode_free_size; +static unsigned int tnode_free_size; /* - * synchronize_rcu after call_rcu for that many pages; it should be especially - * useful before resizing the root node with PREEMPT_NONE configs; the value was - * obtained experimentally, aiming to avoid visible slowdown. 
+ * synchronize_rcu after call_rcu for outstanding dirty memory; it should be + * especially useful before resizing the root node with PREEMPT_NONE configs; + * the value was obtained experimentally, aiming to avoid visible slowdown. */ -static const int sync_pages = 128; +unsigned int sysctl_fib_sync_mem = 512 * 1024; +unsigned int sysctl_fib_sync_mem_min = 64 * 1024; +unsigned int sysctl_fib_sync_mem_max = 64 * 1024 * 1024; static struct kmem_cache *fn_alias_kmem __ro_after_init; static struct kmem_cache *trie_leaf_kmem __ro_after_init; @@ -504,7 +506,7 @@ static void tnode_free(struct key_vector *tn) tn = container_of(head, struct tnode, rcu)->kv; } - if (tnode_free_size >= PAGE_SIZE * sync_pages) { + if (tnode_free_size >= sysctl_fib_sync_mem) { tnode_free_size = 0; synchronize_rcu(); } diff --git a/net/ipv4/sysctl_net_ipv4.c b/net/ipv4/sysctl_net_ipv4.c index ba0fc4b18465..2316c08e9591 100644 --- a/net/ipv4/sysctl_net_ipv4.c +++ b/net/ipv4/sysctl_net_ipv4.c @@ -549,6 +549,15 @@ static struct ctl_table ipv4_table[] = { .mode = 0644, .proc_handler = proc_doulongvec_minmax, }, + { + .procname = "fib_sync_mem", + .data = &sysctl_fib_sync_mem, + .maxlen = sizeof(sysctl_fib_sync_mem), + .mode = 0644, + .proc_handler = proc_douintvec_minmax, + .extra1 = &sysctl_fib_sync_mem_min, + .extra2 = &sysctl_fib_sync_mem_max, + }, { } }; -- cgit v1.2.3 From 4c35e15a8311fbf5dd95b598a43bb28985876ee2 Mon Sep 17 00:00:00 2001 From: Sven Eckelmann Date: Sun, 3 Mar 2019 18:02:55 +0100 Subject: batman-adv: Drop documentation about debugfs files The debugfs files were marked as deprecated by commit 00caf6a2b318 ("batman-adv: Mark debugfs functionality as deprecated"). The documentation should not advertise its usage anymore and instead promote the generic netlink family and a userspace tool to access it. 
Signed-off-by: Sven Eckelmann Signed-off-by: Simon Wunderlich --- Documentation/networking/batman-adv.rst | 29 +++++++++-------------------- 1 file changed, 9 insertions(+), 20 deletions(-) (limited to 'Documentation') diff --git a/Documentation/networking/batman-adv.rst b/Documentation/networking/batman-adv.rst index 245fb6c0ab6f..1b9ff47c0976 100644 --- a/Documentation/networking/batman-adv.rst +++ b/Documentation/networking/batman-adv.rst @@ -74,23 +74,9 @@ All mesh wide settings can be found in batman's own interface folder:: bridge_loop_avoidance gw_sel_class network_coding distributed_arp_table hop_penalty orig_interval -There is a special folder for debugging information:: - - $ ls /sys/kernel/debug/batman_adv/bat0/ - bla_backbone_table log neighbors transtable_local - bla_claim_table mcast_flags originators - dat_cache nc socket - gateways nc_nodes transtable_global - -Some of the files contain all sort of status information regarding the mesh -network. For example, you can view the table of originators (mesh -participants) with:: - - $ cat /sys/kernel/debug/batman_adv/bat0/originators - -Other files allow to change batman's behaviour to better fit your requirements. -For instance, you can check the current originator interval (value in -milliseconds which determines how often batman sends its broadcast packets):: +Some files allow to change batman-adv's behaviour to better fit your +requirements. For instance, you can check the current originator interval (value +in milliseconds which determines how often batman sends its broadcast packets):: $ cat /sys/class/net/bat0/mesh/orig_interval 1000 @@ -103,6 +89,10 @@ In very mobile scenarios, you might want to adjust the originator interval to a lower value. This will make the mesh more responsive to topology changes, but will also increase the overhead. +Information about the current state can be accessed via the batadv generic +netlink family. 
batctl provides human readable version via its debug tables +subcommands. + Usage ===== @@ -147,10 +137,9 @@ batman-adv module. When building batman-adv as part of kernel, use "make menuconfig" and enable the option ``B.A.T.M.A.N. debugging`` (``CONFIG_BATMAN_ADV_DEBUG=y``). -Those additional debug messages can be accessed using a special file in -debugfs:: +Those additional debug messages can be accessed using the perf infrastructure:: - $ cat /sys/kernel/debug/batman_adv/bat0/log + $ trace-cmd stream -e batadv:batadv_dbg The additional debug output is by default disabled. It can be enabled during run time. Following log_levels are defined: -- cgit v1.2.3 From 52735a6f0bd2593d681001ded2f7fbbee168a235 Mon Sep 17 00:00:00 2001 From: Sven Eckelmann Date: Sun, 3 Mar 2019 18:02:56 +0100 Subject: batman-adv: Drop documentation about sysfs files The sysfs files will be marked as deprecated in the near future. They are already replaced by the batadv generic netlink family. The documentation should not advertise its usage anymore and instead promote the generic netlink family and a userspace tool to access it. Signed-off-by: Sven Eckelmann Signed-off-by: Simon Wunderlich --- Documentation/networking/batman-adv.rst | 91 +++++++++------------------------ 1 file changed, 24 insertions(+), 67 deletions(-) (limited to 'Documentation') diff --git a/Documentation/networking/batman-adv.rst b/Documentation/networking/batman-adv.rst index 1b9ff47c0976..18020943ba25 100644 --- a/Documentation/networking/batman-adv.rst +++ b/Documentation/networking/batman-adv.rst @@ -27,24 +27,8 @@ Load the batman-adv module into your kernel:: $ insmod batman-adv.ko The module is now waiting for activation. You must add some interfaces on which -batman can operate. After loading the module batman advanced will scan your -systems interfaces to search for compatible interfaces. 
Once found, it will -create subfolders in the ``/sys`` directories of each supported interface, -e.g.:: - - $ ls /sys/class/net/eth0/batman_adv/ - elp_interval iface_status mesh_iface throughput_override - -If an interface does not have the ``batman_adv`` subfolder, it probably is not -supported. Not supported interfaces are: loopback, non-ethernet and batman's -own interfaces. - -Note: After the module was loaded it will continuously watch for new -interfaces to verify the compatibility. There is no need to reload the module -if you plug your USB wifi adapter into your machine after batman advanced was -initially loaded. - -The batman-adv soft-interface can be created using the iproute2 tool ``ip``:: +batman-adv can operate. The batman-adv soft-interface can be created using the +iproute2 tool ``ip``:: $ ip link add name bat0 type batadv @@ -52,38 +36,37 @@ To activate a given interface simply attach it to the ``bat0`` interface:: $ ip link set dev eth0 master bat0 -Repeat this step for all interfaces you wish to add. Now batman starts +Repeat this step for all interfaces you wish to add. Now batman-adv starts using/broadcasting on this/these interface(s). 
-By reading the "iface_status" file you can check its status:: - - $ cat /sys/class/net/eth0/batman_adv/iface_status - active - To deactivate an interface you have to detach it from the "bat0" interface:: $ ip link set dev eth0 nomaster +The same can also be done using the batctl interface subcommand:: + + batctl -m bat0 interface create + batctl -m bat0 interface add -M eth0 + +To detach eth0 and destroy bat0:: -All mesh wide settings can be found in batman's own interface folder:: + batctl -m bat0 interface del -M eth0 + batctl -m bat0 interface destroy - $ ls /sys/class/net/bat0/mesh/ - aggregated_ogms fragmentation isolation_mark routing_algo - ap_isolation gw_bandwidth log_level vlan0 - bonding gw_mode multicast_mode - bridge_loop_avoidance gw_sel_class network_coding - distributed_arp_table hop_penalty orig_interval +There are additional settings for each batadv mesh interface, vlan and hardif +which can be modified using batctl. Detailed information about this can be found +in its manual. -Some files allow to change batman-adv's behaviour to better fit your -requirements. For instance, you can check the current originator interval (value -in milliseconds which determines how often batman sends its broadcast packets):: +For instance, you can check the current originator interval (value +in milliseconds which determines how often batman-adv sends its broadcast +packets):: - $ cat /sys/class/net/bat0/mesh/orig_interval + $ batctl -M bat0 orig_interval 1000 and also change its value:: - $ echo 3000 > /sys/class/net/bat0/mesh/orig_interval + $ batctl -M bat0 orig_interval 3000 In very mobile scenarios, you might want to adjust the originator interval to a lower value. This will make the mesh more responsive to topology changes, but @@ -142,37 +125,11 @@ Those additional debug messages can be accessed using the perf infrastructure:: $ trace-cmd stream -e batadv:batadv_dbg The additional debug output is by default disabled. It can be enabled during -run time. 
Following log_levels are defined: - -.. flat-table:: - - * - 0 - - All debug output disabled - * - 1 - - Enable messages related to routing / flooding / broadcasting - * - 2 - - Enable messages related to route added / changed / deleted - * - 4 - - Enable messages related to translation table operations - * - 8 - - Enable messages related to bridge loop avoidance - * - 16 - - Enable messages related to DAT, ARP snooping and parsing - * - 32 - - Enable messages related to network coding - * - 64 - - Enable messages related to multicast - * - 128 - - Enable messages related to throughput meter - * - 255 - - Enable all messages - -The debug output can be changed at runtime using the file -``/sys/class/net/bat0/mesh/log_level``. e.g.:: - - $ echo 6 > /sys/class/net/bat0/mesh/log_level - -will enable debug messages for when routes change. +run time:: + + $ batctl -m bat0 loglevel routes tt + +will enable debug messages for when routes and translation table entries change. Counters for different types of packets entering and leaving the batman-adv module are available through ethtool:: -- cgit v1.2.3 From 42cdd521487f6509f52096fa08590f275073e81b Mon Sep 17 00:00:00 2001 From: Sven Eckelmann Date: Sun, 3 Mar 2019 18:02:58 +0100 Subject: batman-adv: ABI: Mark sysfs files as deprecated The sysfs files are replaced by the batadv generic netlink family. The old sysfs configuration interface was frowned upon by other kernel developers. But the files cannot be removed immediately because userspace tools might still depend on it. Instead schedule for its removal in 2021. 
Signed-off-by: Sven Eckelmann Signed-off-by: Simon Wunderlich --- .../ABI/obsolete/sysfs-class-net-batman-adv | 32 ++++++ Documentation/ABI/obsolete/sysfs-class-net-mesh | 110 +++++++++++++++++++++ .../ABI/testing/sysfs-class-net-batman-adv | 30 ------ Documentation/ABI/testing/sysfs-class-net-mesh | 108 -------------------- MAINTAINERS | 4 +- 5 files changed, 144 insertions(+), 140 deletions(-) create mode 100644 Documentation/ABI/obsolete/sysfs-class-net-batman-adv create mode 100644 Documentation/ABI/obsolete/sysfs-class-net-mesh delete mode 100644 Documentation/ABI/testing/sysfs-class-net-batman-adv delete mode 100644 Documentation/ABI/testing/sysfs-class-net-mesh (limited to 'Documentation') diff --git a/Documentation/ABI/obsolete/sysfs-class-net-batman-adv b/Documentation/ABI/obsolete/sysfs-class-net-batman-adv new file mode 100644 index 000000000000..5bdbc8d40256 --- /dev/null +++ b/Documentation/ABI/obsolete/sysfs-class-net-batman-adv @@ -0,0 +1,32 @@ +This ABI is deprecated and will be removed after 2021. It is +replaced with the batadv generic netlink family. + +What: /sys/class/net//batman-adv/elp_interval +Date: Feb 2014 +Contact: Linus Lüssing +Description: + Defines the interval in milliseconds in which batman + emits probing packets for neighbor sensing (ELP). + +What: /sys/class/net//batman-adv/iface_status +Date: May 2010 +Contact: Marek Lindner +Description: + Indicates the status of as it is seen by batman. + +What: /sys/class/net//batman-adv/mesh_iface +Date: May 2010 +Contact: Marek Lindner +Description: + The /sys/class/net//batman-adv/mesh_iface file + displays the batman mesh interface this + currently is associated with. + +What: /sys/class/net//batman-adv/throughput_override +Date: Feb 2014 +Contact: Antonio Quartulli +description: + Defines the throughput value to be used by B.A.T.M.A.N. V + when estimating the link throughput using this interface. + If the value is set to 0 then batman-adv will try to + estimate the throughput by itself. 
diff --git a/Documentation/ABI/obsolete/sysfs-class-net-mesh b/Documentation/ABI/obsolete/sysfs-class-net-mesh new file mode 100644 index 000000000000..04c1a2932507 --- /dev/null +++ b/Documentation/ABI/obsolete/sysfs-class-net-mesh @@ -0,0 +1,110 @@ +This ABI is deprecated and will be removed after 2021. It is +replaced with the batadv generic netlink family. + +What: /sys/class/net//mesh/aggregated_ogms +Date: May 2010 +Contact: Marek Lindner +Description: + Indicates whether the batman protocol messages of the + mesh shall be aggregated or not. + +What: /sys/class/net//mesh//ap_isolation +Date: May 2011 +Contact: Antonio Quartulli +Description: + Indicates whether the data traffic going from a + wireless client to another wireless client will be + silently dropped. is empty when referring + to the untagged lan. + +What: /sys/class/net//mesh/bonding +Date: June 2010 +Contact: Simon Wunderlich +Description: + Indicates whether the data traffic going through the + mesh will be sent using multiple interfaces at the + same time (if available). + +What: /sys/class/net//mesh/bridge_loop_avoidance +Date: November 2011 +Contact: Simon Wunderlich +Description: + Indicates whether the bridge loop avoidance feature + is enabled. This feature detects and avoids loops + between the mesh and devices bridged with the soft + interface . + +What: /sys/class/net//mesh/fragmentation +Date: October 2010 +Contact: Andreas Langer +Description: + Indicates whether the data traffic going through the + mesh will be fragmented or silently discarded if the + packet size exceeds the outgoing interface MTU. + +What: /sys/class/net//mesh/gw_bandwidth +Date: October 2010 +Contact: Marek Lindner +Description: + Defines the bandwidth which is propagated by this + node if gw_mode was set to 'server'. + +What: /sys/class/net//mesh/gw_mode +Date: October 2010 +Contact: Marek Lindner +Description: + Defines the state of the gateway features. Can be + either 'off', 'client' or 'server'. 
+ +What: /sys/class/net//mesh/gw_sel_class +Date: October 2010 +Contact: Marek Lindner +Description: + Defines the selection criteria this node will use + to choose a gateway if gw_mode was set to 'client'. + +What: /sys/class/net//mesh/hop_penalty +Date: Oct 2010 +Contact: Linus Lüssing +Description: + Defines the penalty which will be applied to an + originator message's tq-field on every hop. + +What: /sys/class/net//mesh/isolation_mark +Date: Nov 2013 +Contact: Antonio Quartulli +Description: + Defines the isolation mark (and its bitmask) which + is used to classify clients as "isolated" by the + Extended Isolation feature. + +What: /sys/class/net//mesh/multicast_mode +Date: Feb 2014 +Contact: Linus Lüssing +Description: + Indicates whether multicast optimizations are enabled + or disabled. If set to zero then all nodes in the + mesh are going to use classic flooding for any + multicast packet with no optimizations. + +What: /sys/class/net//mesh/network_coding +Date: Nov 2012 +Contact: Martin Hundeboll +Description: + Controls whether Network Coding (using some magic + to send fewer wifi packets but still the same + content) is enabled or not. + +What: /sys/class/net//mesh/orig_interval +Date: May 2010 +Contact: Marek Lindner +Description: + Defines the interval in milliseconds in which batman + sends its protocol messages. + +What: /sys/class/net//mesh/routing_algo +Date: Dec 2011 +Contact: Marek Lindner +Description: + Defines the routing procotol this mesh instance + uses to find the optimal paths through the mesh. 
diff --git a/Documentation/ABI/testing/sysfs-class-net-batman-adv b/Documentation/ABI/testing/sysfs-class-net-batman-adv deleted file mode 100644 index 898106849e27..000000000000 --- a/Documentation/ABI/testing/sysfs-class-net-batman-adv +++ /dev/null @@ -1,30 +0,0 @@ - -What: /sys/class/net//batman-adv/elp_interval -Date: Feb 2014 -Contact: Linus Lüssing -Description: - Defines the interval in milliseconds in which batman - emits probing packets for neighbor sensing (ELP). - -What: /sys/class/net//batman-adv/iface_status -Date: May 2010 -Contact: Marek Lindner -Description: - Indicates the status of as it is seen by batman. - -What: /sys/class/net//batman-adv/mesh_iface -Date: May 2010 -Contact: Marek Lindner -Description: - The /sys/class/net//batman-adv/mesh_iface file - displays the batman mesh interface this - currently is associated with. - -What: /sys/class/net//batman-adv/throughput_override -Date: Feb 2014 -Contact: Antonio Quartulli -description: - Defines the throughput value to be used by B.A.T.M.A.N. V - when estimating the link throughput using this interface. - If the value is set to 0 then batman-adv will try to - estimate the throughput by itself. diff --git a/Documentation/ABI/testing/sysfs-class-net-mesh b/Documentation/ABI/testing/sysfs-class-net-mesh deleted file mode 100644 index c2b956d44a95..000000000000 --- a/Documentation/ABI/testing/sysfs-class-net-mesh +++ /dev/null @@ -1,108 +0,0 @@ - -What: /sys/class/net//mesh/aggregated_ogms -Date: May 2010 -Contact: Marek Lindner -Description: - Indicates whether the batman protocol messages of the - mesh shall be aggregated or not. - -What: /sys/class/net//mesh//ap_isolation -Date: May 2011 -Contact: Antonio Quartulli -Description: - Indicates whether the data traffic going from a - wireless client to another wireless client will be - silently dropped. is empty when referring - to the untagged lan. 
- -What: /sys/class/net//mesh/bonding -Date: June 2010 -Contact: Simon Wunderlich -Description: - Indicates whether the data traffic going through the - mesh will be sent using multiple interfaces at the - same time (if available). - -What: /sys/class/net//mesh/bridge_loop_avoidance -Date: November 2011 -Contact: Simon Wunderlich -Description: - Indicates whether the bridge loop avoidance feature - is enabled. This feature detects and avoids loops - between the mesh and devices bridged with the soft - interface . - -What: /sys/class/net//mesh/fragmentation -Date: October 2010 -Contact: Andreas Langer -Description: - Indicates whether the data traffic going through the - mesh will be fragmented or silently discarded if the - packet size exceeds the outgoing interface MTU. - -What: /sys/class/net//mesh/gw_bandwidth -Date: October 2010 -Contact: Marek Lindner -Description: - Defines the bandwidth which is propagated by this - node if gw_mode was set to 'server'. - -What: /sys/class/net//mesh/gw_mode -Date: October 2010 -Contact: Marek Lindner -Description: - Defines the state of the gateway features. Can be - either 'off', 'client' or 'server'. - -What: /sys/class/net//mesh/gw_sel_class -Date: October 2010 -Contact: Marek Lindner -Description: - Defines the selection criteria this node will use - to choose a gateway if gw_mode was set to 'client'. - -What: /sys/class/net//mesh/hop_penalty -Date: Oct 2010 -Contact: Linus Lüssing -Description: - Defines the penalty which will be applied to an - originator message's tq-field on every hop. - -What: /sys/class/net//mesh/isolation_mark -Date: Nov 2013 -Contact: Antonio Quartulli -Description: - Defines the isolation mark (and its bitmask) which - is used to classify clients as "isolated" by the - Extended Isolation feature. - -What: /sys/class/net//mesh/multicast_mode -Date: Feb 2014 -Contact: Linus Lüssing -Description: - Indicates whether multicast optimizations are enabled - or disabled. 
If set to zero then all nodes in the - mesh are going to use classic flooding for any - multicast packet with no optimizations. - -What: /sys/class/net//mesh/network_coding -Date: Nov 2012 -Contact: Martin Hundeboll -Description: - Controls whether Network Coding (using some magic - to send fewer wifi packets but still the same - content) is enabled or not. - -What: /sys/class/net//mesh/orig_interval -Date: May 2010 -Contact: Marek Lindner -Description: - Defines the interval in milliseconds in which batman - sends its protocol messages. - -What: /sys/class/net//mesh/routing_algo -Date: Dec 2011 -Contact: Marek Lindner -Description: - Defines the routing procotol this mesh instance - uses to find the optimal paths through the mesh. diff --git a/MAINTAINERS b/MAINTAINERS index f8ff9ae52c21..fce2919582fb 100644 --- a/MAINTAINERS +++ b/MAINTAINERS @@ -2795,8 +2795,8 @@ L: b.a.t.m.a.n@lists.open-mesh.org (moderated for non-subscribers) W: https://www.open-mesh.org/ Q: https://patchwork.open-mesh.org/project/batman/list/ S: Maintained -F: Documentation/ABI/testing/sysfs-class-net-batman-adv -F: Documentation/ABI/testing/sysfs-class-net-mesh +F: Documentation/ABI/obsolete/sysfs-class-net-batman-adv +F: Documentation/ABI/obsolete/sysfs-class-net-mesh F: Documentation/networking/batman-adv.rst F: include/uapi/linux/batadv_packet.h F: include/uapi/linux/batman_adv.h -- cgit v1.2.3 From 867934e9c9babe9192797726f6910554bbdc28ce Mon Sep 17 00:00:00 2001 From: Jerome Brunet Date: Thu, 4 Apr 2019 15:11:44 +0200 Subject: dt-bindings: net: phy: add g12a mdio mux documentation Add documentation for the device tree bindings of the MDIO mux of Amlogic g12a SoC family Reviewed-by: Rob Herring Signed-off-by: Jerome Brunet Signed-off-by: David S. 
Miller --- .../bindings/net/mdio-mux-meson-g12a.txt | 48 ++++++++++++++++++++++ 1 file changed, 48 insertions(+) create mode 100644 Documentation/devicetree/bindings/net/mdio-mux-meson-g12a.txt (limited to 'Documentation') diff --git a/Documentation/devicetree/bindings/net/mdio-mux-meson-g12a.txt b/Documentation/devicetree/bindings/net/mdio-mux-meson-g12a.txt new file mode 100644 index 000000000000..3a96cbed9294 --- /dev/null +++ b/Documentation/devicetree/bindings/net/mdio-mux-meson-g12a.txt @@ -0,0 +1,48 @@ +Properties for the MDIO bus multiplexer/glue of Amlogic G12a SoC family. + +This is a special case of a MDIO bus multiplexer. It allows to choose between +the internal mdio bus leading to the embedded 10/100 PHY or the external +MDIO bus. + +Required properties in addition to the generic multiplexer properties: +- compatible : amlogic,g12a-mdio-mux +- reg: physical address and length of the multiplexer/glue registers +- clocks: list of clock phandle, one for each entry clock-names. +- clock-names: should contain the following: + * "pclk" : peripheral clock. 
+ * "clkin0" : platform crystal + * "clkin1" : SoC 50MHz MPLL + +Example : + +mdio_mux: mdio-multiplexer@4c000 { + compatible = "amlogic,g12a-mdio-mux"; + reg = <0x0 0x4c000 0x0 0xa4>; + clocks = <&clkc CLKID_ETH_PHY>, + <&xtal>, + <&clkc CLKID_MPLL_50M>; + clock-names = "pclk", "clkin0", "clkin1"; + mdio-parent-bus = <&mdio0>; + #address-cells = <1>; + #size-cells = <0>; + + ext_mdio: mdio@0 { + reg = <0>; + #address-cells = <1>; + #size-cells = <0>; + }; + + int_mdio: mdio@1 { + reg = <1>; + #address-cells = <1>; + #size-cells = <0>; + + internal_ephy: ethernet-phy@8 { + compatible = "ethernet-phy-id0180.3301", + "ethernet-phy-ieee802.3-c22"; + interrupts = ; + reg = <8>; + max-speed = <100>; + }; + }; +}; -- cgit v1.2.3 From be0faac952e1643237931956ab8ec5ceaeda41b0 Mon Sep 17 00:00:00 2001 From: Shalom Toledo Date: Mon, 8 Apr 2019 06:59:36 +0000 Subject: Documentation: networking: devlink-info-versions: Add fw.psid Add firmware parameter id (fw.psid). Signed-off-by: Shalom Toledo Acked-by: Jiri Pirko Signed-off-by: Ido Schimmel Acked-by: Jakub Kicinski Signed-off-by: David S. Miller --- Documentation/networking/devlink-info-versions.rst | 5 +++++ 1 file changed, 5 insertions(+) (limited to 'Documentation') diff --git a/Documentation/networking/devlink-info-versions.rst b/Documentation/networking/devlink-info-versions.rst index c79ad8593383..4316342b7746 100644 --- a/Documentation/networking/devlink-info-versions.rst +++ b/Documentation/networking/devlink-info-versions.rst @@ -41,3 +41,8 @@ fw.ncsi Version of the software responsible for supporting/handling the Network Controller Sideband Interface. + +fw.psid +======= + +Unique identifier of the firmware parameter set. -- cgit v1.2.3 From f063c889c9458354a92b235a51cbb60d30321070 Mon Sep 17 00:00:00 2001 From: Daniel Borkmann Date: Tue, 9 Apr 2019 23:20:08 +0200 Subject: bpf: add specification for BTF Var and DataSec kinds This adds the BTF specification and UAPI bits for supporting BTF Var and DataSec kinds. 
This is following LLVM upstream commit ac4082b77e07 ("[BPF] Add BTF Var and DataSec Support") which has been merged recently. Var itself is for describing a global variable and DataSec to describe ELF sections e.g. data/bss/rodata sections that hold one or multiple global variables. Signed-off-by: Daniel Borkmann Acked-by: Martin KaFai Lau Signed-off-by: Alexei Starovoitov --- Documentation/bpf/btf.rst | 57 +++++++++++++++++++++++++++++++++++++++++++++++ include/uapi/linux/btf.h | 32 ++++++++++++++++++++++---- 2 files changed, 85 insertions(+), 4 deletions(-) (limited to 'Documentation') diff --git a/Documentation/bpf/btf.rst b/Documentation/bpf/btf.rst index 9a60a5d60e38..60d87d7363ec 100644 --- a/Documentation/bpf/btf.rst +++ b/Documentation/bpf/btf.rst @@ -82,6 +82,8 @@ sequentially and type id is assigned to each recognized type starting from id #define BTF_KIND_RESTRICT 11 /* Restrict */ #define BTF_KIND_FUNC 12 /* Function */ #define BTF_KIND_FUNC_PROTO 13 /* Function Proto */ + #define BTF_KIND_VAR 14 /* Variable */ + #define BTF_KIND_DATASEC 15 /* Section */ Note that the type section encodes debug info, not just pure types. ``BTF_KIND_FUNC`` is not a type, and it represents a defined subprogram. @@ -393,6 +395,61 @@ refers to parameter type. If the function has variable arguments, the last parameter is encoded with ``name_off = 0`` and ``type = 0``. +2.2.14 BTF_KIND_VAR +~~~~~~~~~~~~~~~~~~~ + +``struct btf_type`` encoding requirement: + * ``name_off``: offset to a valid C identifier + * ``info.kind_flag``: 0 + * ``info.kind``: BTF_KIND_VAR + * ``info.vlen``: 0 + * ``type``: the type of the variable + +``btf_type`` is followed by a single ``struct btf_var`` with the +following data:: + + struct btf_var { + __u32 linkage; + }; + +``struct btf_var`` encoding: + * ``linkage``: currently only static variable 0, or globally allocated + variable in ELF sections 1 + +Not all types of global variables are supported by LLVM at this point.
+The following is currently available: + + * static variables with or without section attributes + * global variables with section attributes + +The latter is for future extraction of map key/value type id's from a +map definition. + +2.2.15 BTF_KIND_DATASEC +~~~~~~~~~~~~~~~~~~~~~~~ + +``struct btf_type`` encoding requirement: + * ``name_off``: offset to a valid name associated with a variable or + one of .data/.bss/.rodata + * ``info.kind_flag``: 0 + * ``info.kind``: BTF_KIND_DATASEC + * ``info.vlen``: # of variables + * ``size``: total section size in bytes (0 at compilation time, patched + to actual size by BPF loaders such as libbpf) + +``btf_type`` is followed by ``info.vlen`` number of ``struct btf_var_secinfo``.:: + + struct btf_var_secinfo { + __u32 type; + __u32 offset; + __u32 size; + }; + +``struct btf_var_secinfo`` encoding: + * ``type``: the type of the BTF_KIND_VAR variable + * ``offset``: the in-section offset of the variable + * ``size``: the size of the variable in bytes + 3. BTF Kernel API ***************** diff --git a/include/uapi/linux/btf.h b/include/uapi/linux/btf.h index 7b7475ef2f17..9310652ca4f9 100644 --- a/include/uapi/linux/btf.h +++ b/include/uapi/linux/btf.h @@ -39,11 +39,11 @@ struct btf_type { * struct, union and fwd */ __u32 info; - /* "size" is used by INT, ENUM, STRUCT and UNION. + /* "size" is used by INT, ENUM, STRUCT, UNION and DATASEC. * "size" tells the size of the type it is describing. * * "type" is used by PTR, TYPEDEF, VOLATILE, CONST, RESTRICT, - * FUNC and FUNC_PROTO. + * FUNC, FUNC_PROTO and VAR. * "type" is a type_id referring to another type. 
*/ union { @@ -70,8 +70,10 @@ struct btf_type { #define BTF_KIND_RESTRICT 11 /* Restrict */ #define BTF_KIND_FUNC 12 /* Function */ #define BTF_KIND_FUNC_PROTO 13 /* Function Proto */ -#define BTF_KIND_MAX 13 -#define NR_BTF_KINDS 14 +#define BTF_KIND_VAR 14 /* Variable */ +#define BTF_KIND_DATASEC 15 /* Section */ +#define BTF_KIND_MAX BTF_KIND_DATASEC +#define NR_BTF_KINDS (BTF_KIND_MAX + 1) /* For some specific BTF_KIND, "struct btf_type" is immediately * followed by extra data. @@ -138,4 +140,26 @@ struct btf_param { __u32 type; }; +enum { + BTF_VAR_STATIC = 0, + BTF_VAR_GLOBAL_ALLOCATED, +}; + +/* BTF_KIND_VAR is followed by a single "struct btf_var" to describe + * additional information related to the variable such as its linkage. + */ +struct btf_var { + __u32 linkage; +}; + +/* BTF_KIND_DATASEC is followed by multiple "struct btf_var_secinfo" + * to describe all BTF_KIND_VAR types it contains along with its + * in-section offset as well as size. + */ +struct btf_var_secinfo { + __u32 type; + __u32 offset; + __u32 size; +}; + #endif /* _UAPI__LINUX_BTF_H__ */ -- cgit v1.2.3 From 3c91d114832027b75a6518734081a2507c5d7a87 Mon Sep 17 00:00:00 2001 From: Ioana Ciornei Date: Fri, 12 Apr 2019 14:55:18 +0300 Subject: Documentation: net: dsa: transition to the rst format This patch also performs some minor adjustments such as numbering for the receive path sequence, conversion of keywords to inline literals and adding an index page so it looks better in the output of 'make htmldocs'. Signed-off-by: Ioana Ciornei Signed-off-by: David S.
Miller --- Documentation/networking/dsa/bcm_sf2.rst | 115 ++++++ Documentation/networking/dsa/bcm_sf2.txt | 114 ------ Documentation/networking/dsa/dsa.rst | 587 +++++++++++++++++++++++++++++++ Documentation/networking/dsa/dsa.txt | 584 ------------------------------ Documentation/networking/dsa/index.rst | 10 + Documentation/networking/dsa/lan9303.rst | 37 ++ Documentation/networking/dsa/lan9303.txt | 37 -- Documentation/networking/index.rst | 1 + 8 files changed, 750 insertions(+), 735 deletions(-) create mode 100644 Documentation/networking/dsa/bcm_sf2.rst delete mode 100644 Documentation/networking/dsa/bcm_sf2.txt create mode 100644 Documentation/networking/dsa/dsa.rst delete mode 100644 Documentation/networking/dsa/dsa.txt create mode 100644 Documentation/networking/dsa/index.rst create mode 100644 Documentation/networking/dsa/lan9303.rst delete mode 100644 Documentation/networking/dsa/lan9303.txt (limited to 'Documentation') diff --git a/Documentation/networking/dsa/bcm_sf2.rst b/Documentation/networking/dsa/bcm_sf2.rst new file mode 100644 index 000000000000..dee234039e1e --- /dev/null +++ b/Documentation/networking/dsa/bcm_sf2.rst @@ -0,0 +1,115 @@ +============================================= +Broadcom Starfighter 2 Ethernet switch driver +============================================= + +Broadcom's Starfighter 2 Ethernet switch hardware block is commonly found and +deployed in the following products: + +- xDSL gateways such as BCM63138 +- streaming/multimedia Set Top Box such as BCM7445 +- Cable Modem/residential gateways such as BCM7145/BCM3390 + +The switch is typically deployed in a configuration involving between 5 to 13 +ports, offering a range of built-in and customizable interfaces: + +- single integrated Gigabit PHY +- quad integrated Gigabit PHY +- quad external Gigabit PHY w/ MDIO multiplexer +- integrated MoCA PHY +- several external MII/RevMII/GMII/RGMII interfaces + +The switch also supports specific congestion control features which allow 
MoCA +fail-over not to lose packets during a MoCA role re-election, as well as out of +band back-pressure to the host CPU network interface when downstream interfaces +are connected at a lower speed. + +The switch hardware block is typically interfaced using MMIO accesses and +contains a bunch of sub-blocks/registers: + +- ``SWITCH_CORE``: common switch registers +- ``SWITCH_REG``: external interfaces switch register +- ``SWITCH_MDIO``: external MDIO bus controller (there is another one in SWITCH_CORE, + which is used for indirect PHY accesses) +- ``SWITCH_INDIR_RW``: 64-bits wide register helper block +- ``SWITCH_INTRL2_0/1``: Level-2 interrupt controllers +- ``SWITCH_ACB``: Admission control block +- ``SWITCH_FCB``: Fail-over control block + +Implementation details +====================== + +The driver is located in ``drivers/net/dsa/bcm_sf2.c`` and is implemented as a DSA +driver; see ``Documentation/networking/dsa/dsa.rst`` for details on the subsystem +and what it provides. + +The SF2 switch is configured to enable a Broadcom specific 4-bytes switch tag +which gets inserted by the switch for every packet forwarded to the CPU +interface, conversely, the CPU network interface should insert a similar tag for +packets entering the CPU port. The tag format is described in +``net/dsa/tag_brcm.c``. + +Overall, the SF2 driver is a fairly regular DSA driver; there are a few +specifics covered below. + +Device Tree probing +------------------- + +The DSA platform device driver is probed using a specific compatible string +provided in ``net/dsa/dsa.c``. The reason for that is because the DSA subsystem gets +registered as a platform device driver currently. DSA will provide the needed +device_node pointers which are then accessible by the switch driver setup +function to setup resources such as register ranges and interrupts. 
This +currently works very well because none of the of_* functions utilized by the +driver require a struct device to be bound to a struct device_node, but things +may change in the future. + +MDIO indirect accesses +---------------------- + +Due to a limitation in how Broadcom switches have been designed, external +Broadcom switches connected to a SF2 require the use of the DSA slave MDIO bus +in order to properly configure them. By default, the SF2 pseudo-PHY address, and +an external switch pseudo-PHY address will both be snooping for incoming MDIO +transactions, since they are at the same address (30), resulting in some kind of +"double" programming. Using DSA, and setting ``ds->phys_mii_mask`` accordingly, we +selectively divert reads and writes towards external Broadcom switches +pseudo-PHY addresses. Newer revisions of the SF2 hardware have introduced a +configurable pseudo-PHY address which circumvents the initial design limitation. + +Multimedia over CoAxial (MoCA) interfaces +----------------------------------------- + +MoCA interfaces are fairly specific and require the use of a firmware blob which +gets loaded onto the MoCA processor(s) for packet processing. The switch +hardware contains logic which will assert/de-assert link states accordingly for +the MoCA interface whenever the MoCA coaxial cable gets disconnected or the +firmware gets reloaded. The SF2 driver relies on such events to properly set its +MoCA interface carrier state and properly report this to the networking stack. + +The MoCA interfaces are supported using the PHY library's fixed PHY/emulated PHY +device and the switch driver registers a ``fixed_link_update`` callback for such +PHYs which reflects the link state obtained from the interrupt handler. 
+ + +Power Management +---------------- + +Whenever possible, the SF2 driver tries to minimize the overall switch power +consumption by applying a combination of: + +- turning off internal buffers/memories +- disabling packet processing logic +- putting integrated PHYs in IDDQ/low-power +- reducing the switch core clock based on the active port count +- enabling and advertising EEE +- turning off RGMII data processing logic when the link goes down + +Wake-on-LAN +----------- + +Wake-on-LAN is currently implemented by utilizing the host processor Ethernet +MAC controller wake-on logic. Whenever Wake-on-LAN is requested, an intersection +between the user request and the supported host Ethernet interface WoL +capabilities is done and the intersection result gets configured. During +system-wide suspend/resume, only ports not participating in Wake-on-LAN are +disabled. diff --git a/Documentation/networking/dsa/bcm_sf2.txt b/Documentation/networking/dsa/bcm_sf2.txt deleted file mode 100644 index eba3a2431e91..000000000000 --- a/Documentation/networking/dsa/bcm_sf2.txt +++ /dev/null @@ -1,114 +0,0 @@ -Broadcom Starfighter 2 Ethernet switch driver -============================================= - -Broadcom's Starfighter 2 Ethernet switch hardware block is commonly found and -deployed in the following products: - -- xDSL gateways such as BCM63138 -- streaming/multimedia Set Top Box such as BCM7445 -- Cable Modem/residential gateways such as BCM7145/BCM3390 - -The switch is typically deployed in a configuration involving between 5 to 13 -ports, offering a range of built-in and customizable interfaces: - -- single integrated Gigabit PHY -- quad integrated Gigabit PHY -- quad external Gigabit PHY w/ MDIO multiplexer -- integrated MoCA PHY -- several external MII/RevMII/GMII/RGMII interfaces - -The switch also supports specific congestion control features which allow MoCA -fail-over not to lose packets during a MoCA role re-election, as well as out of -band back-pressure to the 
host CPU network interface when downstream interfaces -are connected at a lower speed. - -The switch hardware block is typically interfaced using MMIO accesses and -contains a bunch of sub-blocks/registers: - -* SWITCH_CORE: common switch registers -* SWITCH_REG: external interfaces switch register -* SWITCH_MDIO: external MDIO bus controller (there is another one in SWITCH_CORE, - which is used for indirect PHY accesses) -* SWITCH_INDIR_RW: 64-bits wide register helper block -* SWITCH_INTRL2_0/1: Level-2 interrupt controllers -* SWITCH_ACB: Admission control block -* SWITCH_FCB: Fail-over control block - -Implementation details -====================== - -The driver is located in drivers/net/dsa/bcm_sf2.c and is implemented as a DSA -driver; see Documentation/networking/dsa/dsa.txt for details on the subsystem -and what it provides. - -The SF2 switch is configured to enable a Broadcom specific 4-bytes switch tag -which gets inserted by the switch for every packet forwarded to the CPU -interface, conversely, the CPU network interface should insert a similar tag for -packets entering the CPU port. The tag format is described in -net/dsa/tag_brcm.c. - -Overall, the SF2 driver is a fairly regular DSA driver; there are a few -specifics covered below. - -Device Tree probing -------------------- - -The DSA platform device driver is probed using a specific compatible string -provided in net/dsa/dsa.c. The reason for that is because the DSA subsystem gets -registered as a platform device driver currently. DSA will provide the needed -device_node pointers which are then accessible by the switch driver setup -function to setup resources such as register ranges and interrupts. This -currently works very well because none of the of_* functions utilized by the -driver require a struct device to be bound to a struct device_node, but things -may change in the future. 
- -MDIO indirect accesses ----------------------- - -Due to a limitation in how Broadcom switches have been designed, external -Broadcom switches connected to a SF2 require the use of the DSA slave MDIO bus -in order to properly configure them. By default, the SF2 pseudo-PHY address, and -an external switch pseudo-PHY address will both be snooping for incoming MDIO -transactions, since they are at the same address (30), resulting in some kind of -"double" programming. Using DSA, and setting ds->phys_mii_mask accordingly, we -selectively divert reads and writes towards external Broadcom switches -pseudo-PHY addresses. Newer revisions of the SF2 hardware have introduced a -configurable pseudo-PHY address which circumvents the initial design limitation. - -Multimedia over CoAxial (MoCA) interfaces ------------------------------------------ - -MoCA interfaces are fairly specific and require the use of a firmware blob which -gets loaded onto the MoCA processor(s) for packet processing. The switch -hardware contains logic which will assert/de-assert link states accordingly for -the MoCA interface whenever the MoCA coaxial cable gets disconnected or the -firmware gets reloaded. The SF2 driver relies on such events to properly set its -MoCA interface carrier state and properly report this to the networking stack. - -The MoCA interfaces are supported using the PHY library's fixed PHY/emulated PHY -device and the switch driver registers a fixed_link_update callback for such -PHYs which reflects the link state obtained from the interrupt handler. 
- - -Power Management ----------------- - -Whenever possible, the SF2 driver tries to minimize the overall switch power -consumption by applying a combination of: - -- turning off internal buffers/memories -- disabling packet processing logic -- putting integrated PHYs in IDDQ/low-power -- reducing the switch core clock based on the active port count -- enabling and advertising EEE -- turning off RGMII data processing logic when the link goes down - -Wake-on-LAN ------------ - -Wake-on-LAN is currently implemented by utilizing the host processor Ethernet -MAC controller wake-on logic. Whenever Wake-on-LAN is requested, an intersection -between the user request and the supported host Ethernet interface WoL -capabilities is done and the intersection result gets configured. During -system-wide suspend/resume, only ports not participating in Wake-on-LAN are -disabled. diff --git a/Documentation/networking/dsa/dsa.rst b/Documentation/networking/dsa/dsa.rst new file mode 100644 index 000000000000..ca87068b9ab9 --- /dev/null +++ b/Documentation/networking/dsa/dsa.rst @@ -0,0 +1,587 @@ +============ +Architecture +============ + +This document describes the **Distributed Switch Architecture (DSA)** subsystem +design principles, limitations, interactions with other subsystems, and how to +develop drivers for this subsystem as well as a TODO for developers interested +in joining the effort. + +Design principles +================= + +The Distributed Switch Architecture is a subsystem which was primarily designed +to support Marvell Ethernet switches (MV88E6xxx, a.k.a Linkstreet product line) +using Linux, but has since evolved to support other vendors as well. + +The original philosophy behind this design was to be able to use unmodified +Linux tools such as bridge, iproute2, ifconfig to work transparently whether +they configured/queried a switch port network device or a regular network +device. 
+ +An Ethernet switch is typically comprised of multiple front-panel ports, and one +or more CPU or management ports. The DSA subsystem currently relies on the +presence of a management port connected to an Ethernet controller capable of +receiving Ethernet frames from the switch. This is a very common setup for all +kinds of Ethernet switches found in Small Home and Office products: routers, +gateways, or even top-of-the-rack switches. This host Ethernet controller will +be later referred to as "master" and "cpu" in DSA terminology and code. + +The D in DSA stands for Distributed, because the subsystem has been designed +with the ability to configure and manage cascaded switches on top of each other +using upstream and downstream Ethernet links between switches. These specific +ports are referred to as "dsa" ports in DSA terminology and code. A collection +of multiple switches connected to each other is called a "switch tree". + +For each front-panel port, DSA will create specialized network devices which are +used as controlling and data-flowing endpoints for use by the Linux networking +stack. These specialized network interfaces are referred to as "slave" network +interfaces in DSA terminology and code. + +The ideal case for using DSA is when an Ethernet switch supports a "switch tag" +which is a hardware feature making the switch insert a specific tag for each +Ethernet frame it received to/from specific ports to help the management +interface figure out: + +- what port is this frame coming from +- what was the reason why this frame got forwarded +- how to send CPU originated traffic to specific ports + +The subsystem does support switches not capable of inserting/stripping tags, but +the features might be slightly limited in that case (traffic separation relies +on Port-based VLAN IDs).
+ +Note that DSA does not currently create network interfaces for the "cpu" and +"dsa" ports because: + +- the "cpu" port is the Ethernet switch facing side of the management + controller, and as such, would create a duplication of feature, since you + would get two interfaces for the same conduit: master netdev, and "cpu" netdev + +- the "dsa" port(s) are just conduits between two or more switches, and as such + cannot really be used as proper network interfaces either, only the + downstream, or the top-most upstream interface makes sense with that model + +Switch tagging protocols +------------------------ + +DSA currently supports 5 different tagging protocols, and a tag-less mode as +well. The different protocols are implemented in: + +- ``net/dsa/tag_trailer.c``: Marvell's 4 trailer tag mode (legacy) +- ``net/dsa/tag_dsa.c``: Marvell's original DSA tag +- ``net/dsa/tag_edsa.c``: Marvell's enhanced DSA tag +- ``net/dsa/tag_brcm.c``: Broadcom's 4 bytes tag +- ``net/dsa/tag_qca.c``: Qualcomm's 2 bytes tag + +The exact format of the tag protocol is vendor specific, but in general, they +all contain something which: + +- identifies which port the Ethernet frame came from/should be sent to +- provides a reason why this frame was forwarded to the management interface + +Master network devices +---------------------- + +Master network devices are regular, unmodified Linux network device drivers for +the CPU/management Ethernet interface. Such a driver might occasionally need to +know whether DSA is enabled (e.g.: to enable/disable specific offload features), +but the DSA subsystem has been proven to work with industry standard drivers: +``e1000e``, ``mv643xx_eth`` etc. without having to introduce modifications to these +drivers. Such network devices are also often referred to as conduit network +devices since they act as a pipe between the host processor and the hardware +Ethernet switch.
+ +Networking stack hooks +---------------------- + +When a master netdev is used with DSA, a small hook is placed in the +networking stack in order to have the DSA subsystem process the Ethernet +switch specific tagging protocol. DSA accomplishes this by registering a +specific (and fake) Ethernet type (later becoming ``skb->protocol``) with the +networking stack, this is also known as a ``ptype`` or ``packet_type``. A typical +Ethernet Frame receive sequence looks like this: + +Master network device (e.g.: e1000e): + +1. Receive interrupt fires: + + - receive function is invoked + - basic packet processing is done: getting length, status etc. + - packet is prepared to be processed by the Ethernet layer by calling + ``eth_type_trans`` + +2. net/ethernet/eth.c:: + + eth_type_trans(skb, dev) + if (dev->dsa_ptr != NULL) + -> skb->protocol = ETH_P_XDSA + +3. drivers/net/ethernet/\*:: + + netif_receive_skb(skb) + -> iterate over registered packet_type + -> invoke handler for ETH_P_XDSA, calls dsa_switch_rcv() + +4. net/dsa/dsa.c:: + + -> dsa_switch_rcv() + -> invoke switch tag specific protocol handler in 'net/dsa/tag_*.c' + +5. net/dsa/tag_*.c: + + - inspect and strip switch tag protocol to determine originating port + - locate per-port network device + - invoke ``eth_type_trans()`` with the DSA slave network device + - invoke ``netif_receive_skb()`` + +Past this point, the DSA slave network devices get delivered regular Ethernet +frames that can be processed by the networking stack. + +Slave network devices +--------------------- + +Slave network devices created by DSA are stacked on top of their master network +device, each of these network interfaces will be responsible for being a +controlling and data-flowing end-point for each front-panel port of the switch.
+These interfaces are specialized in order to: + +- insert/remove the switch tag protocol (if it exists) when sending traffic + to/from specific switch ports +- query the switch for ethtool operations: statistics, link state, + Wake-on-LAN, register dumps... +- external/internal PHY management: link, auto-negotiation etc. + +These slave network devices have custom net_device_ops and ethtool_ops function +pointers which allow DSA to introduce a level of layering between the networking +stack/ethtool, and the switch driver implementation. + +Upon frame transmission from these slave network devices, DSA will look up which +switch tagging protocol is currently registered with these network devices, and +invoke a specific transmit routine which takes care of adding the relevant +switch tag in the Ethernet frames. + +These frames are then queued for transmission using the master network device +``ndo_start_xmit()`` function, since they contain the appropriate switch tag, the +Ethernet switch will be able to process these incoming frames from the +management interface and delivers these frames to the physical switch port. + +Graphical representation +------------------------ + +Summarized, this is basically how DSA looks like from a network device +perspective:: + + + |--------------------------- + | CPU network device (eth0)| + ---------------------------- + | | + |--------------------------------------------| + | Switch driver | + |--------------------------------------------| + || || || + |-------| |-------| |-------| + | sw0p0 | | sw0p1 | | sw0p2 | + |-------| |-------| |-------| + + + +Slave MDIO bus +-------------- + +In order to be able to read to/from a switch PHY built into it, DSA creates a +slave MDIO bus which allows a specific switch driver to divert and intercept +MDIO reads/writes towards specific PHY addresses. 
In most MDIO-connected +switches, these functions would utilize direct or indirect PHY addressing mode +to return standard MII registers from the switch builtin PHYs, allowing the PHY +library to return link status, link partner pages, auto-negotiation +results etc.. + +For Ethernet switches which have both external and internal MDIO busses, the +slave MII bus can be utilized to mux/demux MDIO reads and writes towards either +internal or external MDIO devices this switch might be connected to: internal +PHYs, external PHYs, or even external switches. + +Data structures +--------------- + +DSA data structures are defined in ``include/net/dsa.h`` as well as +``net/dsa/dsa_priv.h``: + +- ``dsa_chip_data``: platform data configuration for a given switch device, + this structure describes a switch device's parent device, its address, as + well as various properties of its ports: names/labels, and finally a routing + table indication (when cascading switches) + +- ``dsa_platform_data``: platform device configuration data which can reference + a collection of dsa_chip_data structure if multiple switches are cascaded, + the master network device this switch tree is attached to needs to be + referenced + +- ``dsa_switch_tree``: structure assigned to the master network device under + ``dsa_ptr``, this structure references a dsa_platform_data structure as well as + the tagging protocol supported by the switch tree, and which receive/transmit + function hooks should be invoked, information about the directly attached + switch is also provided: CPU port. Finally, a collection of dsa_switch are + referenced to address individual switches in the tree. + +- ``dsa_switch``: structure describing a switch device in the tree, referencing + a ``dsa_switch_tree`` as a backpointer, slave network devices, master network + device, and a reference to the backing ``dsa_switch_ops`` + +- ``dsa_switch_ops``: structure referencing function pointers, see below for a + full description.
+ +Design limitations +================== + +Limits on the number of devices and ports +----------------------------------------- + +DSA currently limits the number of maximum switches within a tree to 4 +(``DSA_MAX_SWITCHES``), and the number of ports per switch to 12 (``DSA_MAX_PORTS``). +These limits could be extended to support larger configurations would this need +arise. + +Lack of CPU/DSA network devices +------------------------------- + +DSA does not currently create slave network devices for the CPU or DSA ports, as +described before. This might be an issue in the following cases: + +- inability to fetch switch CPU port statistics counters using ethtool, which + can make it harder to debug MDIO switch connected using xMII interfaces + +- inability to configure the CPU port link parameters based on the Ethernet + controller capabilities attached to it: http://patchwork.ozlabs.org/patch/509806/ + +- inability to configure specific VLAN IDs / trunking VLANs between switches + when using a cascaded setup + +Common pitfalls using DSA setups +-------------------------------- + +Once a master network device is configured to use DSA (dev->dsa_ptr becomes +non-NULL), and the switch behind it expects a tagging protocol, this network +interface can only exclusively be used as a conduit interface. Sending packets +directly through this interface (e.g.: opening a socket using this interface) +will not make us go through the switch tagging protocol transmit function, so +the Ethernet switch on the other end, expecting a tag will typically drop this +frame. + +Slave network devices check that the master network device is UP before allowing +you to administratively bring UP these slave network devices. A common +configuration mistake is forgetting to bring UP the master network device first. 
+ +Interactions with other subsystems +================================== + +DSA currently leverages the following subsystems: + +- MDIO/PHY library: ``drivers/net/phy/phy.c``, ``mdio_bus.c`` +- Switchdev: ``net/switchdev/*`` +- Device Tree for various of_* functions + +MDIO/PHY library +---------------- + +Slave network devices exposed by DSA may or may not be interfacing with PHY +devices (``struct phy_device`` as defined in ``include/linux/phy.h``), but the DSA +subsystem deals with all possible combinations: + +- internal PHY devices, built into the Ethernet switch hardware +- external PHY devices, connected via an internal or external MDIO bus +- internal PHY devices, connected via an internal MDIO bus +- special, non-autonegotiated or non MDIO-managed PHY devices: SFPs, MoCA; a.k.a + fixed PHYs + +The PHY configuration is done by the ``dsa_slave_phy_setup()`` function and the +logic basically looks like this: + +- if Device Tree is used, the PHY device is looked up using the standard + "phy-handle" property, if found, this PHY device is created and registered + using ``of_phy_connect()`` + +- if Device Tree is used, and the PHY device is "fixed", that is, conforms to + the definition of a non-MDIO managed PHY as defined in + ``Documentation/devicetree/bindings/net/fixed-link.txt``, the PHY is registered + and connected transparently using the special fixed MDIO bus driver + +- finally, if the PHY is built into the switch, as is very common with + standalone switch packages, the PHY is probed using the slave MII bus created + by DSA + + +SWITCHDEV +--------- + +DSA directly utilizes SWITCHDEV when interfacing with the bridge layer, and +more specifically with its VLAN filtering portion when configuring VLANs on top +of per-port slave network devices.
Since DSA primarily deals with +MDIO-connected switches, although not exclusively, SWITCHDEV's +prepare/abort/commit phases are often simplified into a prepare phase which +checks whether the operation is supported by the DSA switch driver, and a commit +phase which applies the changes. + +As of today, the only SWITCHDEV objects supported by DSA are the FDB and VLAN +objects. + +Device Tree +----------- + +DSA features a standardized binding which is documented in +``Documentation/devicetree/bindings/net/dsa/dsa.txt``. PHY/MDIO library helper +functions such as ``of_get_phy_mode()``, ``of_phy_connect()`` are also used to query +per-port PHY specific details: interface connection, MDIO bus location etc.. + +Driver development +================== + +DSA switch drivers need to implement a dsa_switch_ops structure which will +contain the various members described below. + +``register_switch_driver()`` registers this dsa_switch_ops in its internal list +of drivers to probe for. ``unregister_switch_driver()`` does the exact opposite. + +Unless requested differently by setting the priv_size member accordingly, DSA +does not allocate any driver private context space. + +Switch configuration +-------------------- + +- ``tag_protocol``: this is to indicate what kind of tagging protocol is supported, + should be a valid value from the ``dsa_tag_protocol`` enum + +- ``probe``: probe routine which will be invoked by the DSA platform device upon + registration to test for the presence/absence of a switch device. For MDIO + devices, it is recommended to issue a read towards internal registers using + the switch pseudo-PHY and return whether this is a supported device. For other + buses, return a non-NULL string + +- ``setup``: setup function for the switch, this function is responsible for setting + up the ``dsa_switch_ops`` private structure with all it needs: register maps, + interrupts, mutexes, locks etc.. 
This function is also expected to properly + configure the switch to separate all network interfaces from each other, that + is, they should be isolated by the switch hardware itself, typically by creating + a Port-based VLAN ID for each port and allowing only the CPU port and the + specific port to be in the forwarding vector. Ports that are unused by the + platform should be disabled. Past this function, the switch is expected to be + fully configured and ready to serve any kind of request. It is recommended + to issue a software reset of the switch during this setup function in order to + avoid relying on what a previous software agent such as a bootloader/firmware + may have previously configured. + +PHY devices and link management +------------------------------- + +- ``get_phy_flags``: Some switches are interfaced to various kinds of Ethernet PHYs, + if the PHY library PHY driver needs to know about information it cannot obtain + on its own (e.g.: coming from switch memory mapped registers), this function + should return a 32-bits bitmask of "flags", that is private between the switch + driver and the Ethernet PHY driver in ``drivers/net/phy/\*``. + +- ``phy_read``: Function invoked by the DSA slave MDIO bus when attempting to read + the switch port MDIO registers. If unavailable, return 0xffff for each read. + For builtin switch Ethernet PHYs, this function should allow reading the link + status, auto-negotiation results, link partner pages etc.. + +- ``phy_write``: Function invoked by the DSA slave MDIO bus when attempting to write + to the switch port MDIO registers. If unavailable return a negative error + code. + +- ``adjust_link``: Function invoked by the PHY library when a slave network device + is attached to a PHY device. This function is responsible for appropriately + configuring the switch port link parameters: speed, duplex, pause based on + what the ``phy_device`` is providing. 
+ +- ``fixed_link_update``: Function invoked by the PHY library, and specifically by + the fixed PHY driver asking the switch driver for link parameters that could + not be auto-negotiated, or obtained by reading the PHY registers through MDIO. + This is particularly useful for specific kinds of hardware such as QSGMII, + MoCA or other kinds of non-MDIO managed PHYs where out of band link + information is obtained + +Ethtool operations +------------------ + +- ``get_strings``: ethtool function used to query the driver's strings, will + typically return statistics strings, private flags strings etc. + +- ``get_ethtool_stats``: ethtool function used to query per-port statistics and + return their values. DSA overlays slave network devices general statistics: + RX/TX counters from the network device, with switch driver specific statistics + per port + +- ``get_sset_count``: ethtool function used to query the number of statistics items + +- ``get_wol``: ethtool function used to obtain Wake-on-LAN settings per-port, this + function may, for certain implementations also query the master network device + Wake-on-LAN settings if this interface needs to participate in Wake-on-LAN + +- ``set_wol``: ethtool function used to configure Wake-on-LAN settings per-port, + direct counterpart to get_wol with similar restrictions + +- ``set_eee``: ethtool function which is used to configure a switch port EEE (Green + Ethernet) settings, can optionally invoke the PHY library to enable EEE at the + PHY level if relevant. 
This function should enable EEE at the switch port MAC + controller and data-processing logic + +- ``get_eee``: ethtool function which is used to query a switch port EEE settings, + this function should return the EEE state of the switch port MAC controller + and data-processing logic as well as query the PHY for its currently configured + EEE settings + +- ``get_eeprom_len``: ethtool function returning for a given switch the EEPROM + length/size in bytes + +- ``get_eeprom``: ethtool function returning for a given switch the EEPROM contents + +- ``set_eeprom``: ethtool function writing specified data to a given switch EEPROM + +- ``get_regs_len``: ethtool function returning the register length for a given + switch + +- ``get_regs``: ethtool function returning the Ethernet switch internal register + contents. This function might require user-land code in ethtool to + pretty-print register values and registers + +Power management +---------------- + +- ``suspend``: function invoked by the DSA platform device when the system goes to + suspend, should quiesce all Ethernet switch activities, but keep ports + participating in Wake-on-LAN active as well as additional wake-up logic if + supported + +- ``resume``: function invoked by the DSA platform device when the system resumes, + should resume all Ethernet switch activities and re-configure the switch to be + in a fully active state + +- ``port_enable``: function invoked by the DSA slave network device ndo_open + function when a port is administratively brought up, this function should be + fully enabling a given switch port. DSA takes care of marking the port with + ``BR_STATE_BLOCKING`` if the port is a bridge member, or ``BR_STATE_FORWARDING`` if it + was not, and propagating these changes down to the hardware + +- ``port_disable``: function invoked by the DSA slave network device ndo_close + function when a port is administratively brought down, this function should be + fully disabling a given switch port. 
DSA takes care of marking the port with + ``BR_STATE_DISABLED`` and propagating changes to the hardware if this port is + disabled while being a bridge member + +Bridge layer +------------ + +- ``port_bridge_join``: bridge layer function invoked when a given switch port is + added to a bridge, this function should be doing the necessary at the switch + level to permit the joining port from being added to the relevant logical + domain for it to ingress/egress traffic with other members of the bridge. + +- ``port_bridge_leave``: bridge layer function invoked when a given switch port is + removed from a bridge, this function should be doing the necessary at the + switch level to deny the leaving port from ingress/egress traffic from the + remaining bridge members. When the port leaves the bridge, it should be aged + out at the switch hardware for the switch to (re) learn MAC addresses behind + this port. + +- ``port_stp_state_set``: bridge layer function invoked when a given switch port STP + state is computed by the bridge layer and should be propagated to switch + hardware to forward/block/learn traffic. The switch driver is responsible for + computing a STP state change based on current and asked parameters and perform + the relevant ageing based on the intersection results + +Bridge VLAN filtering +--------------------- + +- ``port_vlan_filtering``: bridge layer function invoked when the bridge gets + configured for turning on or off VLAN filtering. If nothing specific needs to + be done at the hardware level, this callback does not need to be implemented. + When VLAN filtering is turned on, the hardware must be programmed with + rejecting 802.1Q frames which have VLAN IDs outside of the programmed allowed + VLAN ID map/rules. If there is no PVID programmed into the switch port, + untagged frames must be rejected as well. When turned off the switch must + accept any 802.1Q frames irrespective of their VLAN ID, and untagged frames are + allowed. 
+ +- ``port_vlan_prepare``: bridge layer function invoked when the bridge prepares the + configuration of a VLAN on the given port. If the operation is not supported + by the hardware, this function should return ``-EOPNOTSUPP`` to inform the bridge + code to fall back to a software implementation. No hardware setup must be done + in this function. See ``port_vlan_add`` for this and details. + +- ``port_vlan_add``: bridge layer function invoked when a VLAN is configured + (tagged or untagged) for the given switch port + +- ``port_vlan_del``: bridge layer function invoked when a VLAN is removed from the + given switch port + +- ``port_vlan_dump``: bridge layer function invoked with a switchdev callback + function that the driver has to call for each VLAN the given port is a member + of. A switchdev object is used to carry the VID and bridge flags. + +- ``port_fdb_add``: bridge layer function invoked when the bridge wants to install a + Forwarding Database entry, the switch hardware should be programmed with the + specified address in the specified VLAN ID in the forwarding database + associated with this VLAN ID. If the operation is not supported, this + function should return ``-EOPNOTSUPP`` to inform the bridge code to fall back to + a software implementation. + +.. note:: VLAN ID 0 corresponds to the port private database, which, in the context + of DSA, would be its port-based VLAN, used by the associated bridge device. + +- ``port_fdb_del``: bridge layer function invoked when the bridge wants to remove a + Forwarding Database entry, the switch hardware should be programmed to delete + the specified MAC address from the specified VLAN ID if it was mapped into + this port forwarding database + +- ``port_fdb_dump``: bridge layer function invoked with a switchdev callback + function that the driver has to call for each MAC address known to be behind + the given port. A switchdev object is used to carry the VID and FDB info. 
+ +- ``port_mdb_prepare``: bridge layer function invoked when the bridge prepares the + installation of a multicast database entry. If the operation is not supported, + this function should return ``-EOPNOTSUPP`` to inform the bridge code to fall back + to a software implementation. No hardware setup must be done in this function. + See ``port_mdb_add`` for this and details. + +- ``port_mdb_add``: bridge layer function invoked when the bridge wants to install + a multicast database entry, the switch hardware should be programmed with the + specified address in the specified VLAN ID in the forwarding database + associated with this VLAN ID. + +.. note:: VLAN ID 0 corresponds to the port private database, which, in the context + of DSA, would be its port-based VLAN, used by the associated bridge device. + +- ``port_mdb_del``: bridge layer function invoked when the bridge wants to remove a + multicast database entry, the switch hardware should be programmed to delete + the specified MAC address from the specified VLAN ID if it was mapped into + this port forwarding database. + +- ``port_mdb_dump``: bridge layer function invoked with a switchdev callback + function that the driver has to call for each MAC address known to be behind + the given port. A switchdev object is used to carry the VID and MDB info. + +TODO +==== + +Making SWITCHDEV and DSA converge towards a unified codebase +------------------------------------------------------------- + +SWITCHDEV properly takes care of abstracting the networking stack with offload +capable hardware, but does not enforce a strict switch device driver model. On +the other hand, DSA enforces a fairly strict device driver model, and deals with most +of the switch specifics. At some point we should envision a merger between these +two subsystems and get the best of both worlds. 
+ +Other hanging fruits +-------------------- + +- making the number of ports fully dynamic and not dependent on ``DSA_MAX_PORTS`` +- allowing more than one CPU/management interface: + http://comments.gmane.org/gmane.linux.network/365657 +- porting more drivers from other vendors: + http://comments.gmane.org/gmane.linux.network/365510 diff --git a/Documentation/networking/dsa/dsa.txt b/Documentation/networking/dsa/dsa.txt deleted file mode 100644 index 43ef767bc440..000000000000 --- a/Documentation/networking/dsa/dsa.txt +++ /dev/null @@ -1,584 +0,0 @@ -Distributed Switch Architecture -=============================== - -Introduction -============ - -This document describes the Distributed Switch Architecture (DSA) subsystem -design principles, limitations, interactions with other subsystems, and how to -develop drivers for this subsystem as well as a TODO for developers interested -in joining the effort. - -Design principles -================= - -The Distributed Switch Architecture is a subsystem which was primarily designed -to support Marvell Ethernet switches (MV88E6xxx, a.k.a Linkstreet product line) -using Linux, but has since evolved to support other vendors as well. - -The original philosophy behind this design was to be able to use unmodified -Linux tools such as bridge, iproute2, ifconfig to work transparently whether -they configured/queried a switch port network device or a regular network -device. - -An Ethernet switch is typically comprised of multiple front-panel ports, and one -or more CPU or management port. The DSA subsystem currently relies on the -presence of a management port connected to an Ethernet controller capable of -receiving Ethernet frames from the switch. This is a very common setup for all -kinds of Ethernet switches found in Small Home and Office products: routers, -gateways, or even top-of-the rack switches. This host Ethernet controller will -be later referred to as "master" and "cpu" in DSA terminology and code. 
- -The D in DSA stands for Distributed, because the subsystem has been designed -with the ability to configure and manage cascaded switches on top of each other -using upstream and downstream Ethernet links between switches. These specific -ports are referred to as "dsa" ports in DSA terminology and code. A collection -of multiple switches connected to each other is called a "switch tree". - -For each front-panel port, DSA will create specialized network devices which are -used as controlling and data-flowing endpoints for use by the Linux networking -stack. These specialized network interfaces are referred to as "slave" network -interfaces in DSA terminology and code. - -The ideal case for using DSA is when an Ethernet switch supports a "switch tag" -which is a hardware feature making the switch insert a specific tag for each -Ethernet frames it received to/from specific ports to help the management -interface figure out: - -- what port is this frame coming from -- what was the reason why this frame got forwarded -- how to send CPU originated traffic to specific ports - -The subsystem does support switches not capable of inserting/stripping tags, but -the features might be slightly limited in that case (traffic separation relies -on Port-based VLAN IDs). - -Note that DSA does not currently create network interfaces for the "cpu" and -"dsa" ports because: - -- the "cpu" port is the Ethernet switch facing side of the management - controller, and as such, would create a duplication of feature, since you - would get two interfaces for the same conduit: master netdev, and "cpu" netdev - -- the "dsa" port(s) are just conduits between two or more switches, and as such - cannot really be used as proper network interfaces either, only the - downstream, or the top-most upstream interface makes sense with that model - -Switch tagging protocols ------------------------- - -DSA currently supports 5 different tagging protocols, and a tag-less mode as -well. 
The different protocols are implemented in: - -net/dsa/tag_trailer.c: Marvell's 4 trailer tag mode (legacy) -net/dsa/tag_dsa.c: Marvell's original DSA tag -net/dsa/tag_edsa.c: Marvell's enhanced DSA tag -net/dsa/tag_brcm.c: Broadcom's 4 bytes tag -net/dsa/tag_qca.c: Qualcomm's 2 bytes tag - -The exact format of the tag protocol is vendor specific, but in general, they -all contain something which: - -- identifies which port the Ethernet frame came from/should be sent to -- provides a reason why this frame was forwarded to the management interface - -Master network devices ----------------------- - -Master network devices are regular, unmodified Linux network device drivers for -the CPU/management Ethernet interface. Such a driver might occasionally need to -know whether DSA is enabled (e.g.: to enable/disable specific offload features), -but the DSA subsystem has been proven to work with industry standard drivers: -e1000e, mv643xx_eth etc. without having to introduce modifications to these -drivers. Such network devices are also often referred to as conduit network -devices since they act as a pipe between the host processor and the hardware -Ethernet switch. - -Networking stack hooks ----------------------- - -When a master netdev is used with DSA, a small hook is placed in in the -networking stack is in order to have the DSA subsystem process the Ethernet -switch specific tagging protocol. DSA accomplishes this by registering a -specific (and fake) Ethernet type (later becoming skb->protocol) with the -networking stack, this is also known as a ptype or packet_type. A typical -Ethernet Frame receive sequence looks like this: - -Master network device (e.g.: e1000e): - -Receive interrupt fires: -- receive function is invoked -- basic packet processing is done: getting length, status etc. 
-- packet is prepared to be processed by the Ethernet layer by calling - eth_type_trans - -net/ethernet/eth.c: - -eth_type_trans(skb, dev) - if (dev->dsa_ptr != NULL) - -> skb->protocol = ETH_P_XDSA - -drivers/net/ethernet/*: - -netif_receive_skb(skb) - -> iterate over registered packet_type - -> invoke handler for ETH_P_XDSA, calls dsa_switch_rcv() - -net/dsa/dsa.c: - -> dsa_switch_rcv() - -> invoke switch tag specific protocol handler in - net/dsa/tag_*.c - -net/dsa/tag_*.c: - -> inspect and strip switch tag protocol to determine originating port - -> locate per-port network device - -> invoke eth_type_trans() with the DSA slave network device - -> invoked netif_receive_skb() - -Past this point, the DSA slave network devices get delivered regular Ethernet -frames that can be processed by the networking stack. - -Slave network devices ---------------------- - -Slave network devices created by DSA are stacked on top of their master network -device, each of these network interfaces will be responsible for being a -controlling and data-flowing end-point for each front-panel port of the switch. -These interfaces are specialized in order to: - -- insert/remove the switch tag protocol (if it exists) when sending traffic - to/from specific switch ports -- query the switch for ethtool operations: statistics, link state, - Wake-on-LAN, register dumps... -- external/internal PHY management: link, auto-negotiation etc. - -These slave network devices have custom net_device_ops and ethtool_ops function -pointers which allow DSA to introduce a level of layering between the networking -stack/ethtool, and the switch driver implementation. - -Upon frame transmission from these slave network devices, DSA will look up which -switch tagging protocol is currently registered with these network devices, and -invoke a specific transmit routine which takes care of adding the relevant -switch tag in the Ethernet frames. 
- -These frames are then queued for transmission using the master network device -ndo_start_xmit() function, since they contain the appropriate switch tag, the -Ethernet switch will be able to process these incoming frames from the -management interface and delivers these frames to the physical switch port. - -Graphical representation ------------------------- - -Summarized, this is basically how DSA looks like from a network device -perspective: - - - |--------------------------- - | CPU network device (eth0)| - ---------------------------- - | | - |--------------------------------------------| - | Switch driver | - |--------------------------------------------| - || || || - |-------| |-------| |-------| - | sw0p0 | | sw0p1 | | sw0p2 | - |-------| |-------| |-------| - -Slave MDIO bus --------------- - -In order to be able to read to/from a switch PHY built into it, DSA creates a -slave MDIO bus which allows a specific switch driver to divert and intercept -MDIO reads/writes towards specific PHY addresses. In most MDIO-connected -switches, these functions would utilize direct or indirect PHY addressing mode -to return standard MII registers from the switch builtin PHYs, allowing the PHY -library and/or to return link status, link partner pages, auto-negotiation -results etc.. - -For Ethernet switches which have both external and internal MDIO busses, the -slave MII bus can be utilized to mux/demux MDIO reads and writes towards either -internal or external MDIO devices this switch might be connected to: internal -PHYs, external PHYs, or even external switches. - -Data structures ---------------- - -DSA data structures are defined in include/net/dsa.h as well as -net/dsa/dsa_priv.h. 
- -dsa_chip_data: platform data configuration for a given switch device, this -structure describes a switch device's parent device, its address, as well as -various properties of its ports: names/labels, and finally a routing table -indication (when cascading switches) - -dsa_platform_data: platform device configuration data which can reference a -collection of dsa_chip_data structure if multiples switches are cascaded, the -master network device this switch tree is attached to needs to be referenced - -dsa_switch_tree: structure assigned to the master network device under -"dsa_ptr", this structure references a dsa_platform_data structure as well as -the tagging protocol supported by the switch tree, and which receive/transmit -function hooks should be invoked, information about the directly attached switch -is also provided: CPU port. Finally, a collection of dsa_switch are referenced -to address individual switches in the tree. - -dsa_switch: structure describing a switch device in the tree, referencing a -dsa_switch_tree as a backpointer, slave network devices, master network device, -and a reference to the backing dsa_switch_ops - -dsa_switch_ops: structure referencing function pointers, see below for a full -description. - -Design limitations -================== - -Limits on the number of devices and ports ------------------------------------------ - -DSA currently limits the number of maximum switches within a tree to 4 -(DSA_MAX_SWITCHES), and the number of ports per switch to 12 (DSA_MAX_PORTS). -These limits could be extended to support larger configurations would this need -arise. - -Lack of CPU/DSA network devices -------------------------------- - -DSA does not currently create slave network devices for the CPU or DSA ports, as -described before. 
This might be an issue in the following cases: - -- inability to fetch switch CPU port statistics counters using ethtool, which - can make it harder to debug MDIO switch connected using xMII interfaces - -- inability to configure the CPU port link parameters based on the Ethernet - controller capabilities attached to it: http://patchwork.ozlabs.org/patch/509806/ - -- inability to configure specific VLAN IDs / trunking VLANs between switches - when using a cascaded setup - -Common pitfalls using DSA setups --------------------------------- - -Once a master network device is configured to use DSA (dev->dsa_ptr becomes -non-NULL), and the switch behind it expects a tagging protocol, this network -interface can only exclusively be used as a conduit interface. Sending packets -directly through this interface (e.g.: opening a socket using this interface) -will not make us go through the switch tagging protocol transmit function, so -the Ethernet switch on the other end, expecting a tag will typically drop this -frame. - -Slave network devices check that the master network device is UP before allowing -you to administratively bring UP these slave network devices. A common -configuration mistake is forgetting to bring UP the master network device first. 
- -Interactions with other subsystems -================================== - -DSA currently leverages the following subsystems: - -- MDIO/PHY library: drivers/net/phy/phy.c, mdio_bus.c -- Switchdev: net/switchdev/* -- Device Tree for various of_* functions - -MDIO/PHY library ----------------- - -Slave network devices exposed by DSA may or may not be interfacing with PHY -devices (struct phy_device as defined in include/linux/phy.h), but the DSA -subsystem deals with all possible combinations: - -- internal PHY devices, built into the Ethernet switch hardware -- external PHY devices, connected via an internal or external MDIO bus -- internal PHY devices, connected via an internal MDIO bus -- special, non-autonegotiated or non MDIO-managed PHY devices: SFPs, MoCA; a.k.a - fixed PHYs - -The PHY configuration is done by the dsa_slave_phy_setup() function and the -logic basically looks like this: - -- if Device Tree is used, the PHY device is looked up using the standard - "phy-handle" property, if found, this PHY device is created and registered - using of_phy_connect() - -- if Device Tree is used, and the PHY device is "fixed", that is, conforms to - the definition of a non-MDIO managed PHY as defined in - Documentation/devicetree/bindings/net/fixed-link.txt, the PHY is registered - and connected transparently using the special fixed MDIO bus driver - -- finally, if the PHY is built into the switch, as is very common with - standalone switch packages, the PHY is probed using the slave MII bus created - by DSA - - -SWITCHDEV ---------- - -DSA directly utilizes SWITCHDEV when interfacing with the bridge layer, and -more specifically with its VLAN filtering portion when configuring VLANs on top -of per-port slave network devices. 
Since DSA primarily deals with -MDIO-connected switches, although not exclusively, SWITCHDEV's -prepare/abort/commit phases are often simplified into a prepare phase which -checks whether the operation is supported by the DSA switch driver, and a commit -phase which applies the changes. - -As of today, the only SWITCHDEV objects supported by DSA are the FDB and VLAN -objects. - -Device Tree ------------ - -DSA features a standardized binding which is documented in -Documentation/devicetree/bindings/net/dsa/dsa.txt. PHY/MDIO library helper -functions such as of_get_phy_mode(), of_phy_connect() are also used to query -per-port PHY specific details: interface connection, MDIO bus location etc.. - -Driver development -================== - -DSA switch drivers need to implement a dsa_switch_ops structure which will -contain the various members described below. - -register_switch_driver() registers this dsa_switch_ops in its internal list -of drivers to probe for. unregister_switch_driver() does the exact opposite. - -Unless requested differently by setting the priv_size member accordingly, DSA -does not allocate any driver private context space. - -Switch configuration --------------------- - -- tag_protocol: this is to indicate what kind of tagging protocol is supported, - should be a valid value from the dsa_tag_protocol enum - -- probe: probe routine which will be invoked by the DSA platform device upon - registration to test for the presence/absence of a switch device. For MDIO - devices, it is recommended to issue a read towards internal registers using - the switch pseudo-PHY and return whether this is a supported device. For other - buses, return a non-NULL string - -- setup: setup function for the switch, this function is responsible for setting - up the dsa_switch_ops private structure with all it needs: register maps, - interrupts, mutexes, locks etc.. 
This function is also expected to properly - configure the switch to separate all network interfaces from each other, that - is, they should be isolated by the switch hardware itself, typically by creating - a Port-based VLAN ID for each port and allowing only the CPU port and the - specific port to be in the forwarding vector. Ports that are unused by the - platform should be disabled. Past this function, the switch is expected to be - fully configured and ready to serve any kind of request. It is recommended - to issue a software reset of the switch during this setup function in order to - avoid relying on what a previous software agent such as a bootloader/firmware - may have previously configured. - -PHY devices and link management -------------------------------- - -- get_phy_flags: Some switches are interfaced to various kinds of Ethernet PHYs, - if the PHY library PHY driver needs to know about information it cannot obtain - on its own (e.g.: coming from switch memory mapped registers), this function - should return a 32-bits bitmask of "flags", that is private between the switch - driver and the Ethernet PHY driver in drivers/net/phy/*. - -- phy_read: Function invoked by the DSA slave MDIO bus when attempting to read - the switch port MDIO registers. If unavailable, return 0xffff for each read. - For builtin switch Ethernet PHYs, this function should allow reading the link - status, auto-negotiation results, link partner pages etc.. - -- phy_write: Function invoked by the DSA slave MDIO bus when attempting to write - to the switch port MDIO registers. If unavailable return a negative error - code. - -- adjust_link: Function invoked by the PHY library when a slave network device - is attached to a PHY device. This function is responsible for appropriately - configuring the switch port link parameters: speed, duplex, pause based on - what the phy_device is providing. 
- -- fixed_link_update: Function invoked by the PHY library, and specifically by - the fixed PHY driver asking the switch driver for link parameters that could - not be auto-negotiated, or obtained by reading the PHY registers through MDIO. - This is particularly useful for specific kinds of hardware such as QSGMII, - MoCA or other kinds of non-MDIO managed PHYs where out of band link - information is obtained - -Ethtool operations ------------------- - -- get_strings: ethtool function used to query the driver's strings, will - typically return statistics strings, private flags strings etc. - -- get_ethtool_stats: ethtool function used to query per-port statistics and - return their values. DSA overlays slave network devices general statistics: - RX/TX counters from the network device, with switch driver specific statistics - per port - -- get_sset_count: ethtool function used to query the number of statistics items - -- get_wol: ethtool function used to obtain Wake-on-LAN settings per-port, this - function may, for certain implementations also query the master network device - Wake-on-LAN settings if this interface needs to participate in Wake-on-LAN - -- set_wol: ethtool function used to configure Wake-on-LAN settings per-port, - direct counterpart to set_wol with similar restrictions - -- set_eee: ethtool function which is used to configure a switch port EEE (Green - Ethernet) settings, can optionally invoke the PHY library to enable EEE at the - PHY level if relevant. 
This function should enable EEE at the switch port MAC - controller and data-processing logic - -- get_eee: ethtool function which is used to query a switch port EEE settings, - this function should return the EEE state of the switch port MAC controller - and data-processing logic as well as query the PHY for its currently configured - EEE settings - -- get_eeprom_len: ethtool function returning for a given switch the EEPROM - length/size in bytes - -- get_eeprom: ethtool function returning for a given switch the EEPROM contents - -- set_eeprom: ethtool function writing specified data to a given switch EEPROM - -- get_regs_len: ethtool function returning the register length for a given - switch - -- get_regs: ethtool function returning the Ethernet switch internal register - contents. This function might require user-land code in ethtool to - pretty-print register values and registers - -Power management ----------------- - -- suspend: function invoked by the DSA platform device when the system goes to - suspend, should quiesce all Ethernet switch activities, but keep ports - participating in Wake-on-LAN active as well as additional wake-up logic if - supported - -- resume: function invoked by the DSA platform device when the system resumes, - should resume all Ethernet switch activities and re-configure the switch to be - in a fully active state - -- port_enable: function invoked by the DSA slave network device ndo_open - function when a port is administratively brought up, this function should be - fully enabling a given switch port. DSA takes care of marking the port with - BR_STATE_BLOCKING if the port is a bridge member, or BR_STATE_FORWARDING if it - was not, and propagating these changes down to the hardware - -- port_disable: function invoked by the DSA slave network device ndo_close - function when a port is administratively brought down, this function should be - fully disabling a given switch port. 
DSA takes care of marking the port with - BR_STATE_DISABLED and propagating changes to the hardware if this port is - disabled while being a bridge member - -Bridge layer ------------- - -- port_bridge_join: bridge layer function invoked when a given switch port is - added to a bridge, this function should be doing the necessary at the switch - level to permit the joining port from being added to the relevant logical - domain for it to ingress/egress traffic with other members of the bridge. - -- port_bridge_leave: bridge layer function invoked when a given switch port is - removed from a bridge, this function should be doing the necessary at the - switch level to deny the leaving port from ingress/egress traffic from the - remaining bridge members. When the port leaves the bridge, it should be aged - out at the switch hardware for the switch to (re) learn MAC addresses behind - this port. - -- port_stp_state_set: bridge layer function invoked when a given switch port STP - state is computed by the bridge layer and should be propagated to switch - hardware to forward/block/learn traffic. The switch driver is responsible for - computing a STP state change based on current and asked parameters and perform - the relevant ageing based on the intersection results - -Bridge VLAN filtering ---------------------- - -- port_vlan_filtering: bridge layer function invoked when the bridge gets - configured for turning on or off VLAN filtering. If nothing specific needs to - be done at the hardware level, this callback does not need to be implemented. - When VLAN filtering is turned on, the hardware must be programmed with - rejecting 802.1Q frames which have VLAN IDs outside of the programmed allowed - VLAN ID map/rules. If there is no PVID programmed into the switch port, - untagged frames must be rejected as well. When turned off the switch must - accept any 802.1Q frames irrespective of their VLAN ID, and untagged frames are - allowed. 
- -- port_vlan_prepare: bridge layer function invoked when the bridge prepares the - configuration of a VLAN on the given port. If the operation is not supported - by the hardware, this function should return -EOPNOTSUPP to inform the bridge - code to fallback to a software implementation. No hardware setup must be done - in this function. See port_vlan_add for this and details. - -- port_vlan_add: bridge layer function invoked when a VLAN is configured - (tagged or untagged) for the given switch port - -- port_vlan_del: bridge layer function invoked when a VLAN is removed from the - given switch port - -- port_vlan_dump: bridge layer function invoked with a switchdev callback - function that the driver has to call for each VLAN the given port is a member - of. A switchdev object is used to carry the VID and bridge flags. - -- port_fdb_add: bridge layer function invoked when the bridge wants to install a - Forwarding Database entry, the switch hardware should be programmed with the - specified address in the specified VLAN Id in the forwarding database - associated with this VLAN ID. If the operation is not supported, this - function should return -EOPNOTSUPP to inform the bridge code to fallback to - a software implementation. - -Note: VLAN ID 0 corresponds to the port private database, which, in the context -of DSA, would be the its port-based VLAN, used by the associated bridge device. - -- port_fdb_del: bridge layer function invoked when the bridge wants to remove a - Forwarding Database entry, the switch hardware should be programmed to delete - the specified MAC address from the specified VLAN ID if it was mapped into - this port forwarding database - -- port_fdb_dump: bridge layer function invoked with a switchdev callback - function that the driver has to call for each MAC address known to be behind - the given port. A switchdev object is used to carry the VID and FDB info. 
- -- port_mdb_prepare: bridge layer function invoked when the bridge prepares the - installation of a multicast database entry. If the operation is not supported, - this function should return -EOPNOTSUPP to inform the bridge code to fallback - to a software implementation. No hardware setup must be done in this function. - See port_fdb_add for this and details. - -- port_mdb_add: bridge layer function invoked when the bridge wants to install - a multicast database entry, the switch hardware should be programmed with the - specified address in the specified VLAN ID in the forwarding database - associated with this VLAN ID. - -Note: VLAN ID 0 corresponds to the port private database, which, in the context -of DSA, would be the its port-based VLAN, used by the associated bridge device. - -- port_mdb_del: bridge layer function invoked when the bridge wants to remove a - multicast database entry, the switch hardware should be programmed to delete - the specified MAC address from the specified VLAN ID if it was mapped into - this port forwarding database. - -- port_mdb_dump: bridge layer function invoked with a switchdev callback - function that the driver has to call for each MAC address known to be behind - the given port. A switchdev object is used to carry the VID and MDB info. - -TODO -==== - -Making SWITCHDEV and DSA converge towards an unified codebase -------------------------------------------------------------- - -SWITCHDEV properly takes care of abstracting the networking stack with offload -capable hardware, but does not enforce a strict switch device driver model. On -the other DSA enforces a fairly strict device driver model, and deals with most -of the switch specific. At some point we should envision a merger between these -two subsystems and get the best of both worlds. 
- -Other hanging fruits --------------------- - -- making the number of ports fully dynamic and not dependent on DSA_MAX_PORTS -- allowing more than one CPU/management interface: - http://comments.gmane.org/gmane.linux.network/365657 -- porting more drivers from other vendors: - http://comments.gmane.org/gmane.linux.network/365510 diff --git a/Documentation/networking/dsa/index.rst b/Documentation/networking/dsa/index.rst new file mode 100644 index 000000000000..5c488d345a1e --- /dev/null +++ b/Documentation/networking/dsa/index.rst @@ -0,0 +1,10 @@ +=============================== +Distributed Switch Architecture +=============================== + +.. toctree:: + :maxdepth: 1 + + dsa + bcm_sf2 + lan9303 diff --git a/Documentation/networking/dsa/lan9303.rst b/Documentation/networking/dsa/lan9303.rst new file mode 100644 index 000000000000..e3c820db28ad --- /dev/null +++ b/Documentation/networking/dsa/lan9303.rst @@ -0,0 +1,37 @@ +============================== +LAN9303 Ethernet switch driver +============================== + +The LAN9303 is a three port 10/100 Mbps ethernet switch with integrated phys for +the two external ethernet ports. The third port is an RMII/MII interface to a +host master network interface (e.g. fixed link). + + +Driver details +============== + +The driver is implemented as a DSA driver, see ``Documentation/networking/dsa/dsa.rst``. + +See ``Documentation/devicetree/bindings/net/dsa/lan9303.txt`` for device tree +binding. + +The LAN9303 can be managed both via MDIO and I2C, both supported by this driver. + +At startup the driver configures the device to provide two separate network +interfaces (which is the default state of a DSA device). Due to HW limitations, +no HW MAC learning takes place in this mode. + +When both user ports are joined to the same bridge, the normal HW MAC learning +is enabled. This means that unicast traffic is forwarded in HW. Broadcast and +multicast is flooded in HW. STP is also supported in this mode. 
The driver +supports fdb/mdb operations as well, meaning IGMP snooping is supported. + +If one of the user ports leaves the bridge, the ports go back to the initial +separated operation. + + +Driver limitations +================== + + - Support for VLAN filtering is not implemented + - The HW does not support VLAN-specific fdb entries diff --git a/Documentation/networking/dsa/lan9303.txt b/Documentation/networking/dsa/lan9303.txt deleted file mode 100644 index 144b02b95207..000000000000 --- a/Documentation/networking/dsa/lan9303.txt +++ /dev/null @@ -1,37 +0,0 @@ -LAN9303 Ethernet switch driver -============================== - -The LAN9303 is a three port 10/100 Mbps ethernet switch with integrated phys for -the two external ethernet ports. The third port is an RMII/MII interface to a -host master network interface (e.g. fixed link). - - -Driver details -============== - -The driver is implemented as a DSA driver, see -Documentation/networking/dsa/dsa.txt. - -See Documentation/devicetree/bindings/net/dsa/lan9303.txt for device tree -binding. - -The LAN9303 can be managed both via MDIO and I2C, both supported by this driver. - -At startup the driver configures the device to provide two separate network -interfaces (which is the default state of a DSA device). Due to HW limitations, -no HW MAC learning takes place in this mode. - -When both user ports are joined to the same bridge, the normal HW MAC learning -is enabled. This means that unicast traffic is forwarded in HW. Broadcast and -multicast is flooded in HW. STP is also supported in this mode. The driver -support fdb/mdb operations as well, meaning IGMP snooping is supported. - -If one of the user ports leave the bridge, the ports goes back to the initial -separated operation.
- - -Driver limitations -================== - - - Support for VLAN filtering is not implemented - - The HW does not support VLAN-specific fdb entries diff --git a/Documentation/networking/index.rst b/Documentation/networking/index.rst index 984e68f9e026..269d6f2661d5 100644 --- a/Documentation/networking/index.rst +++ b/Documentation/networking/index.rst @@ -25,6 +25,7 @@ Contents: device_drivers/intel/i40e device_drivers/intel/iavf device_drivers/intel/ice + dsa/index devlink-info-versions ieee802154 kapi -- cgit v1.2.3 From da70314917862d4da4a8d7601cd47339df8b3c23 Mon Sep 17 00:00:00 2001 From: Andrey Ignatov Date: Wed, 17 Apr 2019 22:28:57 -0700 Subject: bpf: Document BPF_PROG_TYPE_CGROUP_SYSCTL Add documentation for BPF_PROG_TYPE_CGROUP_SYSCTL, including general info, attach type, context, return code, helpers, example and usage considerations. A separate file prog_cgroup_sysctl.rst is added to Documentation/bpf/. In the future more program types can be documented in their own prog_.rst files. Another way to place program type specific documentation would be to group program types somehow (e.g. cgroup.rst for all cgroup-bpf programs), but it may not scale well since some program types may belong to different groups, e.g. BPF_PROG_TYPE_CGROUP_SKB can be documented together with either cgroup-bpf programs or programs that access skb. The new file is added to the index and verified by `make htmldocs` / sanity-check by lynx. 
Signed-off-by: Andrey Ignatov Acked-by: Yonghong Song Signed-off-by: Alexei Starovoitov --- Documentation/bpf/index.rst | 9 +++ Documentation/bpf/prog_cgroup_sysctl.rst | 125 +++++++++++++++++++++++++++++++ 2 files changed, 134 insertions(+) create mode 100644 Documentation/bpf/prog_cgroup_sysctl.rst (limited to 'Documentation') diff --git a/Documentation/bpf/index.rst b/Documentation/bpf/index.rst index 4e77932959cc..dadcaa9a9f5f 100644 --- a/Documentation/bpf/index.rst +++ b/Documentation/bpf/index.rst @@ -36,6 +36,15 @@ Two sets of Questions and Answers (Q&A) are maintained. bpf_devel_QA +Program types +============= + +.. toctree:: + :maxdepth: 1 + + prog_cgroup_sysctl + + .. Links: .. _Documentation/networking/filter.txt: ../networking/filter.txt .. _man-pages: https://www.kernel.org/doc/man-pages/ diff --git a/Documentation/bpf/prog_cgroup_sysctl.rst b/Documentation/bpf/prog_cgroup_sysctl.rst new file mode 100644 index 000000000000..677d6c637cf3 --- /dev/null +++ b/Documentation/bpf/prog_cgroup_sysctl.rst @@ -0,0 +1,125 @@ +.. SPDX-License-Identifier: (LGPL-2.1 OR BSD-2-Clause) + +=========================== +BPF_PROG_TYPE_CGROUP_SYSCTL +=========================== + +This document describes ``BPF_PROG_TYPE_CGROUP_SYSCTL`` program type that +provides cgroup-bpf hook for sysctl. + +The hook has to be attached to a cgroup and will be called every time a +process inside that cgroup tries to read from or write to sysctl knob in proc. + +1. Attach type +************** + +``BPF_CGROUP_SYSCTL`` attach type has to be used to attach +``BPF_PROG_TYPE_CGROUP_SYSCTL`` program to a cgroup. + +2. Context +********** + +``BPF_PROG_TYPE_CGROUP_SYSCTL`` provides access to the following context from +BPF program:: + + struct bpf_sysctl { + __u32 write; + __u32 file_pos; + }; + +* ``write`` indicates whether sysctl value is being read (``0``) or written + (``1``). This field is read-only. + +* ``file_pos`` indicates file position sysctl is being accessed at, read + or written. 
This field is read-write. Writing to the field sets the starting + position in sysctl proc file ``read(2)`` will be reading from or ``write(2)`` + will be writing to. Writing zero to the field can be used e.g. to override + whole sysctl value by ``bpf_sysctl_set_new_value()`` on ``write(2)`` even + when it's called by user space on ``file_pos > 0``. Writing non-zero + value to the field can be used to access part of sysctl value starting from + specified ``file_pos``. Not all sysctl support access with ``file_pos != + 0``, e.g. writes to numeric sysctl entries must always be at file position + ``0``. See also ``kernel.sysctl_writes_strict`` sysctl. + +See `linux/bpf.h`_ for more details on how context field can be accessed. + +3. Return code +************** + +``BPF_PROG_TYPE_CGROUP_SYSCTL`` program must return one of the following +return codes: + +* ``0`` means "reject access to sysctl"; +* ``1`` means "proceed with access". + +If program returns ``0`` user space will get ``-1`` from ``read(2)`` or +``write(2)`` and ``errno`` will be set to ``EPERM``. + +4. Helpers +********** + +Since sysctl knob is represented by a name and a value, sysctl specific BPF +helpers focus on providing access to these properties: + +* ``bpf_sysctl_get_name()`` to get sysctl name as it is visible in + ``/proc/sys`` into provided by BPF program buffer; + +* ``bpf_sysctl_get_current_value()`` to get string value currently held by + sysctl into provided by BPF program buffer. This helper is available on both + ``read(2)`` from and ``write(2)`` to sysctl; + +* ``bpf_sysctl_get_new_value()`` to get new string value currently being + written to sysctl before actual write happens. This helper can be used only + on ``ctx->write == 1``; + +* ``bpf_sysctl_set_new_value()`` to override new string value currently being + written to sysctl before actual write happens. Sysctl value will be + overridden starting from the current ``ctx->file_pos``. 
If the whole value + has to be overridden BPF program can set ``file_pos`` to zero before calling + to the helper. This helper can be used only on ``ctx->write == 1``. New + string value set by the helper is treated and verified by kernel same way as + an equivalent string passed by user space. + +BPF program sees sysctl value same way as user space does in proc filesystem, +i.e. as a string. Since many sysctl values represent an integer or a vector +of integers, the following helpers can be used to get numeric value from the +string: + +* ``bpf_strtol()`` to convert initial part of the string to long integer + similar to user space `strtol(3)`_; +* ``bpf_strtoul()`` to convert initial part of the string to unsigned long + integer similar to user space `strtoul(3)`_; + +See `linux/bpf.h`_ for more details on helpers described here. + +5. Examples +*********** + +See `test_sysctl_prog.c`_ for an example of BPF program in C that accesses +sysctl name and value, parses string value to get vector of integers and uses +the result to make decision whether to allow or deny access to sysctl. + +6. Notes +******** + +``BPF_PROG_TYPE_CGROUP_SYSCTL`` is intended to be used in **trusted** root +environment, for example to monitor sysctl usage or catch unreasonable values +an application, running as root in a separate cgroup, is trying to set. + +Since `task_dfl_cgroup(current)` is called at `sys_read` / `sys_write` time it +may return results different from that at `sys_open` time, i.e. process that +opened sysctl file in proc filesystem may differ from process that is trying +to read from / write to it and two such processes may run in different +cgroups, which means ``BPF_PROG_TYPE_CGROUP_SYSCTL`` should not be used as a +security mechanism to limit sysctl usage. + +As with any cgroup-bpf program additional care should be taken if an +application running as root in a cgroup should not be allowed to +detach/replace BPF program attached by administrator. + +.. Links +..
_linux/bpf.h: ../../include/uapi/linux/bpf.h +.. _strtol(3): http://man7.org/linux/man-pages/man3/strtol.3p.html +.. _strtoul(3): http://man7.org/linux/man-pages/man3/strtoul.3p.html +.. _test_sysctl_prog.c: + ../../tools/testing/selftests/bpf/progs/test_sysctl_prog.c -- cgit v1.2.3 From 0bc199854405543b0debe67c735c0aae94f1d319 Mon Sep 17 00:00:00 2001 From: Stephen Suryaputra Date: Wed, 17 Apr 2019 16:35:49 -0400 Subject: ipv6: Add rate limit mask for ICMPv6 messages To make ICMPv6 closer to ICMPv4, add ratemask parameter. Since the ICMP message types use larger numeric values, a simple bitmask doesn't fit. I use a large bitmap. The input and output are in the form of a list of ranges. Set the default to rate limit all error messages but Packet Too Big. For Packet Too Big, use ratemask instead of a hard-coded check. There are functions where icmpv6_xrlim_allow() and icmpv6_global_allow() aren't called. This patch only adds them to icmpv6_echo_reply(). Rate limiting error messages is mandated by RFC 4443 but RFC 4890 says that it is also acceptable to rate limit informational messages. Thus, I removed the current hard-coded behavior of icmpv6_mask_allow() that doesn't rate limit informational messages. v2: Add dummy function proc_do_large_bitmap() if CONFIG_PROC_SYSCTL isn't defined, expand the description in ip-sysctl.txt and remove unnecessary conditional before kfree(). v3: Inline the bitmap instead of dynamically allocating it. A pointer to it is still needed because of the way proc_do_large_bitmap works. Signed-off-by: Stephen Suryaputra Signed-off-by: David S.
Miller --- Documentation/networking/ip-sysctl.txt | 17 ++++++++++++++++- include/net/netns/ipv6.h | 3 +++ include/uapi/linux/icmpv6.h | 4 ++++ kernel/sysctl.c | 6 ++++++ net/ipv6/af_inet6.c | 9 +++++++++ net/ipv6/icmp.c | 31 ++++++++++++++++++++++--------- 6 files changed, 60 insertions(+), 10 deletions(-) (limited to 'Documentation') diff --git a/Documentation/networking/ip-sysctl.txt b/Documentation/networking/ip-sysctl.txt index 5eedc6941ce5..8a5e59ba223f 100644 --- a/Documentation/networking/ip-sysctl.txt +++ b/Documentation/networking/ip-sysctl.txt @@ -1913,11 +1913,26 @@ enhanced_dad - BOOLEAN icmp/*: ratelimit - INTEGER - Limit the maximal rates for sending ICMPv6 packets. + Limit the maximal rates for sending ICMPv6 messages. 0 to disable any limiting, otherwise the minimal space between responses in milliseconds. Default: 1000 +ratemask - list of comma separated ranges + For ICMPv6 message types matching the ranges in the ratemask, limit + the sending of the message according to ratelimit parameter. + + The format used for both input and output is a comma separated + list of ranges (e.g. "0-127,129" for ICMPv6 message type 0 to 127 and + 129). Writing to the file will clear all previous ranges of ICMPv6 + message types and update the current list with the input. + + Refer to: https://www.iana.org/assignments/icmpv6-parameters/icmpv6-parameters.xhtml + for numerical values of ICMPv6 message types, e.g. echo request is 128 + and echo reply is 129. + + Default: 0-1,3-127 (rate limit ICMPv6 errors except Packet Too Big) + echo_ignore_all - BOOLEAN If set non-zero, then the kernel will ignore all ICMP ECHO requests sent to it over the IPv6 protocol. 
diff --git a/include/net/netns/ipv6.h b/include/net/netns/ipv6.h index 64e29b58bb5e..5e61b5a8635d 100644 --- a/include/net/netns/ipv6.h +++ b/include/net/netns/ipv6.h @@ -8,6 +8,7 @@ #ifndef __NETNS_IPV6_H__ #define __NETNS_IPV6_H__ #include +#include struct ctl_table_header; @@ -35,6 +36,8 @@ struct netns_sysctl_ipv6 { int icmpv6_echo_ignore_all; int icmpv6_echo_ignore_multicast; int icmpv6_echo_ignore_anycast; + DECLARE_BITMAP(icmpv6_ratemask, ICMPV6_MSG_MAX + 1); + unsigned long *icmpv6_ratemask_ptr; int anycast_src_echo_reply; int ip_nonlocal_bind; int fwmark_reflect; diff --git a/include/uapi/linux/icmpv6.h b/include/uapi/linux/icmpv6.h index 325395f56bfa..2622b5a3e616 100644 --- a/include/uapi/linux/icmpv6.h +++ b/include/uapi/linux/icmpv6.h @@ -90,6 +90,8 @@ struct icmp6hdr { #define ICMPV6_TIME_EXCEED 3 #define ICMPV6_PARAMPROB 4 +#define ICMPV6_ERRMSG_MAX 127 + #define ICMPV6_INFOMSG_MASK 0x80 #define ICMPV6_ECHO_REQUEST 128 @@ -110,6 +112,8 @@ struct icmp6hdr { #define ICMPV6_MRDISC_ADV 151 +#define ICMPV6_MSG_MAX 255 + /* * Codes for Destination Unreachable */ diff --git a/kernel/sysctl.c b/kernel/sysctl.c index c9ec050bcf46..599510a3355e 100644 --- a/kernel/sysctl.c +++ b/kernel/sysctl.c @@ -3326,6 +3326,11 @@ int proc_doulongvec_ms_jiffies_minmax(struct ctl_table *table, int write, return -ENOSYS; } +int proc_do_large_bitmap(struct ctl_table *table, int write, + void __user *buffer, size_t *lenp, loff_t *ppos) +{ + return -ENOSYS; +} #endif /* CONFIG_PROC_SYSCTL */ @@ -3366,3 +3371,4 @@ EXPORT_SYMBOL(proc_dointvec_ms_jiffies); EXPORT_SYMBOL(proc_dostring); EXPORT_SYMBOL(proc_doulongvec_minmax); EXPORT_SYMBOL(proc_doulongvec_ms_jiffies_minmax); +EXPORT_SYMBOL(proc_do_large_bitmap); diff --git a/net/ipv6/af_inet6.c b/net/ipv6/af_inet6.c index d8587ca4fbeb..3d1de28aaa9e 100644 --- a/net/ipv6/af_inet6.c +++ b/net/ipv6/af_inet6.c @@ -850,6 +850,15 @@ static int __net_init inet6_net_init(struct net *net) net->ipv6.sysctl.icmpv6_echo_ignore_all = 0; 
net->ipv6.sysctl.icmpv6_echo_ignore_multicast = 0; net->ipv6.sysctl.icmpv6_echo_ignore_anycast = 0; + + /* By default, rate limit error messages. + * Except for pmtu discovery, it would break it. + * proc_do_large_bitmap needs pointer to the bitmap. + */ + bitmap_set(net->ipv6.sysctl.icmpv6_ratemask, 0, ICMPV6_ERRMSG_MAX + 1); + bitmap_clear(net->ipv6.sysctl.icmpv6_ratemask, ICMPV6_PKT_TOOBIG, 1); + net->ipv6.sysctl.icmpv6_ratemask_ptr = net->ipv6.sysctl.icmpv6_ratemask; + net->ipv6.sysctl.flowlabel_consistency = 1; net->ipv6.sysctl.auto_flowlabels = IP6_DEFAULT_AUTO_FLOW_LABELS; net->ipv6.sysctl.idgen_retries = 3; diff --git a/net/ipv6/icmp.c b/net/ipv6/icmp.c index cc14b9998941..afb915807cd0 100644 --- a/net/ipv6/icmp.c +++ b/net/ipv6/icmp.c @@ -168,22 +168,21 @@ static bool is_ineligible(const struct sk_buff *skb) return false; } -static bool icmpv6_mask_allow(int type) +static bool icmpv6_mask_allow(struct net *net, int type) { - /* Informational messages are not limited. */ - if (type & ICMPV6_INFOMSG_MASK) + if (type > ICMPV6_MSG_MAX) return true; - /* Do not limit pmtu discovery, it would break it. */ - if (type == ICMPV6_PKT_TOOBIG) + /* Limit if icmp type is set in ratemask. 
*/ + if (!test_bit(type, net->ipv6.sysctl.icmpv6_ratemask)) return true; return false; } -static bool icmpv6_global_allow(int type) +static bool icmpv6_global_allow(struct net *net, int type) { - if (icmpv6_mask_allow(type)) + if (icmpv6_mask_allow(net, type)) return true; if (icmp_global_allow()) @@ -202,7 +201,7 @@ static bool icmpv6_xrlim_allow(struct sock *sk, u8 type, struct dst_entry *dst; bool res = false; - if (icmpv6_mask_allow(type)) + if (icmpv6_mask_allow(net, type)) return true; /* @@ -511,7 +510,7 @@ static void icmp6_send(struct sk_buff *skb, u8 type, u8 code, __u32 info, local_bh_disable(); /* Check global sysctl_icmp_msgs_per_sec ratelimit */ - if (!(skb->dev->flags&IFF_LOOPBACK) && !icmpv6_global_allow(type)) + if (!(skb->dev->flags & IFF_LOOPBACK) && !icmpv6_global_allow(net, type)) goto out_bh_enable; mip6_addr_swap(skb); @@ -731,6 +730,11 @@ static void icmpv6_echo_reply(struct sk_buff *skb) if (IS_ERR(dst)) goto out; + /* Check the ratelimit */ + if ((!(skb->dev->flags & IFF_LOOPBACK) && !icmpv6_global_allow(net, ICMPV6_ECHO_REPLY)) || + !icmpv6_xrlim_allow(sk, ICMPV6_ECHO_REPLY, &fl6)) + goto out_dst_release; + idev = __in6_dev_get(skb->dev); msg.skb = skb; @@ -751,6 +755,7 @@ static void icmpv6_echo_reply(struct sk_buff *skb) icmpv6_push_pending_frames(sk, &fl6, &tmp_hdr, skb->len + sizeof(struct icmp6hdr)); } +out_dst_release: dst_release(dst); out: icmpv6_xmit_unlock(sk); @@ -1137,6 +1142,13 @@ static struct ctl_table ipv6_icmp_table_template[] = { .mode = 0644, .proc_handler = proc_dointvec, }, + { + .procname = "ratemask", + .data = &init_net.ipv6.sysctl.icmpv6_ratemask_ptr, + .maxlen = ICMPV6_MSG_MAX + 1, + .mode = 0644, + .proc_handler = proc_do_large_bitmap, + }, { }, }; @@ -1153,6 +1165,7 @@ struct ctl_table * __net_init ipv6_icmp_sysctl_init(struct net *net) table[1].data = &net->ipv6.sysctl.icmpv6_echo_ignore_all; table[2].data = &net->ipv6.sysctl.icmpv6_echo_ignore_multicast; table[3].data = 
&net->ipv6.sysctl.icmpv6_echo_ignore_anycast; + table[4].data = &net->ipv6.sysctl.icmpv6_ratemask_ptr; } return table; } -- cgit v1.2.3 From 80695946737dff4cfc1ecdefd4ebf300f132d8ee Mon Sep 17 00:00:00 2001 From: Stanislav Fomichev Date: Thu, 18 Apr 2019 16:47:52 -0700 Subject: bpf: move BPF_PROG_TYPE_FLOW_DISSECTOR documentation to a new common place In commit da7031491786 ("bpf: Document BPF_PROG_TYPE_CGROUP_SYSCTL") Andrey proposes to put per-prog type docs under Documentation/bpf/ Let's move flow dissector documentation there as well. Signed-off-by: Stanislav Fomichev Signed-off-by: Alexei Starovoitov --- Documentation/bpf/index.rst | 1 + Documentation/bpf/prog_flow_dissector.rst | 126 ++++++++++++++++++++++++ Documentation/networking/bpf_flow_dissector.rst | 126 ------------------------ Documentation/networking/index.rst | 1 - 4 files changed, 127 insertions(+), 127 deletions(-) create mode 100644 Documentation/bpf/prog_flow_dissector.rst delete mode 100644 Documentation/networking/bpf_flow_dissector.rst (limited to 'Documentation') diff --git a/Documentation/bpf/index.rst b/Documentation/bpf/index.rst index dadcaa9a9f5f..d3fe4cac0c90 100644 --- a/Documentation/bpf/index.rst +++ b/Documentation/bpf/index.rst @@ -43,6 +43,7 @@ Program types :maxdepth: 1 prog_cgroup_sysctl + prog_flow_dissector .. Links: diff --git a/Documentation/bpf/prog_flow_dissector.rst b/Documentation/bpf/prog_flow_dissector.rst new file mode 100644 index 000000000000..ed343abe541e --- /dev/null +++ b/Documentation/bpf/prog_flow_dissector.rst @@ -0,0 +1,126 @@ +.. SPDX-License-Identifier: GPL-2.0 + +============================ +BPF_PROG_TYPE_FLOW_DISSECTOR +============================ + +Overview +======== + +Flow dissector is a routine that parses metadata out of the packets. It's +used in the various places in the networking subsystem (RFS, flow hash, etc). 
+ +BPF flow dissector is an attempt to reimplement C-based flow dissector logic +in BPF to gain all the benefits of BPF verifier (namely, limits on the +number of instructions and tail calls). + +API +=== + +BPF flow dissector programs operate on an ``__sk_buff``. However, only the +limited set of fields is allowed: ``data``, ``data_end`` and ``flow_keys``. +``flow_keys`` is ``struct bpf_flow_keys`` and contains flow dissector input +and output arguments. + +The inputs are: + * ``nhoff`` - initial offset of the networking header + * ``thoff`` - initial offset of the transport header, initialized to nhoff + * ``n_proto`` - L3 protocol type, parsed out of L2 header + +Flow dissector BPF program should fill out the rest of the ``struct +bpf_flow_keys`` fields. Input arguments ``nhoff/thoff/n_proto`` should be +also adjusted accordingly. + +The return code of the BPF program is either BPF_OK to indicate successful +dissection, or BPF_DROP to indicate parsing error. + +__sk_buff->data +=============== + +In the VLAN-less case, this is what the initial state of the BPF flow +dissector looks like:: + + +------+------+------------+-----------+ + | DMAC | SMAC | ETHER_TYPE | L3_HEADER | + +------+------+------------+-----------+ + ^ + | + +-- flow dissector starts here + + +.. code:: c + + skb->data + flow_keys->nhoff point to the first byte of L3_HEADER + flow_keys->thoff = nhoff + flow_keys->n_proto = ETHER_TYPE + +In case of VLAN, flow dissector can be called with the two different states. + +Pre-VLAN parsing:: + + +------+------+------+-----+-----------+-----------+ + | DMAC | SMAC | TPID | TCI |ETHER_TYPE | L3_HEADER | + +------+------+------+-----+-----------+-----------+ + ^ + | + +-- flow dissector starts here + +.. 
code:: c + + skb->data + flow_keys->nhoff point to the first byte of TCI + flow_keys->thoff = nhoff + flow_keys->n_proto = TPID + +Please note that TPID can be 802.1AD and, hence, BPF program would +have to parse VLAN information twice for double tagged packets. + + +Post-VLAN parsing:: + + +------+------+------+-----+-----------+-----------+ + | DMAC | SMAC | TPID | TCI |ETHER_TYPE | L3_HEADER | + +------+------+------+-----+-----------+-----------+ + ^ + | + +-- flow dissector starts here + +.. code:: c + + skb->data + flow_keys->nhoff point to the first byte of L3_HEADER + flow_keys->thoff = nhoff + flow_keys->n_proto = ETHER_TYPE + +In this case VLAN information has been processed before the flow dissector +and BPF flow dissector is not required to handle it. + + +The takeaway here is as follows: BPF flow dissector program can be called with +the optional VLAN header and should gracefully handle both cases: when single +or double VLAN is present and when it is not present. The same program +can be called for both cases and would have to be written carefully to +handle both cases. + + +Reference Implementation +======================== + +See ``tools/testing/selftests/bpf/progs/bpf_flow.c`` for the reference +implementation and ``tools/testing/selftests/bpf/flow_dissector_load.[hc]`` +for the loader. bpftool can be used to load BPF flow dissector program as well. + +The reference implementation is organized as follows: + * ``jmp_table`` map that contains sub-programs for each supported L3 protocol + * ``_dissect`` routine - entry point; it does input ``n_proto`` parsing and + does ``bpf_tail_call`` to the appropriate L3 handler + +Since BPF at this point doesn't support looping (or any jumping back), +jmp_table is used instead to handle multiple levels of encapsulation (and +IPv6 options). + + +Current Limitations +=================== +BPF flow dissector doesn't support exporting all the metadata that in-kernel +C-based implementation can export.
Notable example is single VLAN (802.1Q) +and double VLAN (802.1AD) tags. Please refer to the ``struct bpf_flow_keys`` +for a set of information that can currently be exported from the BPF context. diff --git a/Documentation/networking/bpf_flow_dissector.rst b/Documentation/networking/bpf_flow_dissector.rst deleted file mode 100644 index b375ae2ec2c4..000000000000 --- a/Documentation/networking/bpf_flow_dissector.rst +++ /dev/null @@ -1,126 +0,0 @@ -.. SPDX-License-Identifier: GPL-2.0 - -================== -BPF Flow Dissector -================== - -Overview -======== - -Flow dissector is a routine that parses metadata out of the packets. It's -used in the various places in the networking subsystem (RFS, flow hash, etc). - -BPF flow dissector is an attempt to reimplement C-based flow dissector logic -in BPF to gain all the benefits of BPF verifier (namely, limits on the -number of instructions and tail calls). - -API -=== - -BPF flow dissector programs operate on an ``__sk_buff``. However, only the -limited set of fields is allowed: ``data``, ``data_end`` and ``flow_keys``. -``flow_keys`` is ``struct bpf_flow_keys`` and contains flow dissector input -and output arguments. - -The inputs are: - * ``nhoff`` - initial offset of the networking header - * ``thoff`` - initial offset of the transport header, initialized to nhoff - * ``n_proto`` - L3 protocol type, parsed out of L2 header - -Flow dissector BPF program should fill out the rest of the ``struct -bpf_flow_keys`` fields. Input arguments ``nhoff/thoff/n_proto`` should be -also adjusted accordingly. - -The return code of the BPF program is either BPF_OK to indicate successful -dissection, or BPF_DROP to indicate parsing error.
- -__sk_buff->data -=============== - -In the VLAN-less case, this is what the initial state of the BPF flow -dissector looks like:: - - +------+------+------------+-----------+ - | DMAC | SMAC | ETHER_TYPE | L3_HEADER | - +------+------+------------+-----------+ - ^ - | - +-- flow dissector starts here - - -.. code:: c - - skb->data + flow_keys->nhoff point to the first byte of L3_HEADER - flow_keys->thoff = nhoff - flow_keys->n_proto = ETHER_TYPE - -In case of VLAN, flow dissector can be called with the two different states. - -Pre-VLAN parsing:: - - +------+------+------+-----+-----------+-----------+ - | DMAC | SMAC | TPID | TCI |ETHER_TYPE | L3_HEADER | - +------+------+------+-----+-----------+-----------+ - ^ - | - +-- flow dissector starts here - -.. code:: c - - skb->data + flow_keys->nhoff point the to first byte of TCI - flow_keys->thoff = nhoff - flow_keys->n_proto = TPID - -Please note that TPID can be 802.1AD and, hence, BPF program would -have to parse VLAN information twice for double tagged packets. - - -Post-VLAN parsing:: - - +------+------+------+-----+-----------+-----------+ - | DMAC | SMAC | TPID | TCI |ETHER_TYPE | L3_HEADER | - +------+------+------+-----+-----------+-----------+ - ^ - | - +-- flow dissector starts here - -.. code:: c - - skb->data + flow_keys->nhoff point the to first byte of L3_HEADER - flow_keys->thoff = nhoff - flow_keys->n_proto = ETHER_TYPE - -In this case VLAN information has been processed before the flow dissector -and BPF flow dissector is not required to handle it. - - -The takeaway here is as follows: BPF flow dissector program can be called with -the optional VLAN header and should gracefully handle both cases: when single -or double VLAN is present and when it is not present. The same program -can be called for both cases and would have to be written carefully to -handle both cases. 
- - -Reference Implementation -======================== - -See ``tools/testing/selftests/bpf/progs/bpf_flow.c`` for the reference -implementation and ``tools/testing/selftests/bpf/flow_dissector_load.[hc]`` -for the loader. bpftool can be used to load BPF flow dissector program as well. - -The reference implementation is organized as follows: - * ``jmp_table`` map that contains sub-programs for each supported L3 protocol - * ``_dissect`` routine - entry point; it does input ``n_proto`` parsing and - does ``bpf_tail_call`` to the appropriate L3 handler - -Since BPF at this point doesn't support looping (or any jumping back), -jmp_table is used instead to handle multiple levels of encapsulation (and -IPv6 options). - - -Current Limitations -=================== -BPF flow dissector doesn't support exporting all the metadata that in-kernel -C-based implementation can export. Notable example is single VLAN (802.1Q) -and double VLAN (802.1AD) tags. Please refer to the ``struct bpf_flow_keys`` -for a set of information that's currently can be exported from the BPF context. diff --git a/Documentation/networking/index.rst b/Documentation/networking/index.rst index 984e68f9e026..5449149be496 100644 --- a/Documentation/networking/index.rst +++ b/Documentation/networking/index.rst @@ -9,7 +9,6 @@ Contents: netdev-FAQ af_xdp batman-adv - bpf_flow_dissector can can_ucan_protocol device_drivers/freescale/dpaa2/index -- cgit v1.2.3 From b54dd90cab00f5b64ed8ce533991c20bf781a3cd Mon Sep 17 00:00:00 2001 From: David Bauer Date: Wed, 17 Apr 2019 23:59:20 +0200 Subject: dt-bindings: net: add PHY reset controller binding Add the documentation for PHY reset lines controlled by a reset controller. Signed-off-by: David Bauer Reviewed-by: Andrew Lunn Signed-off-by: David S. 
Miller --- Documentation/devicetree/bindings/net/phy.txt | 6 ++++++ 1 file changed, 6 insertions(+) (limited to 'Documentation') diff --git a/Documentation/devicetree/bindings/net/phy.txt b/Documentation/devicetree/bindings/net/phy.txt index 17c1d2bd00f6..9b9e5b1765dd 100644 --- a/Documentation/devicetree/bindings/net/phy.txt +++ b/Documentation/devicetree/bindings/net/phy.txt @@ -51,6 +51,10 @@ Optional Properties: to ensure the integrated PHY is used. The absence of this property indicates the muxers should be configured so that the external PHY is used. +- resets: The reset-controller phandle and specifier for the PHY reset signal. + +- reset-names: Must be "phy" for the PHY reset signal. + - reset-gpios: The GPIO phandle and specifier for the PHY reset signal. - reset-assert-us: Delay after the reset was asserted in microseconds. @@ -67,6 +71,8 @@ ethernet-phy@0 { interrupts = <35 IRQ_TYPE_EDGE_RISING>; reg = <0>; + resets = <&rst 8>; + reset-names = "phy"; reset-gpios = <&gpio1 4 GPIO_ACTIVE_LOW>; reset-assert-us = <1000>; reset-deassert-us = <2000>; -- cgit v1.2.3 From 3b8802446d27522cd6d32178ba975cc492611f31 Mon Sep 17 00:00:00 2001 From: Alexei Starovoitov Date: Wed, 17 Apr 2019 18:27:01 -0700 Subject: bpf: document the verifier limits Document the verifier limits. Signed-off-by: Alexei Starovoitov Acked-by: Yonghong Song Signed-off-by: Daniel Borkmann --- Documentation/bpf/bpf_design_QA.rst | 29 +++++++++++++++++++++++++++-- 1 file changed, 27 insertions(+), 2 deletions(-) (limited to 'Documentation') diff --git a/Documentation/bpf/bpf_design_QA.rst b/Documentation/bpf/bpf_design_QA.rst index 10453c627135..cb402c59eca5 100644 --- a/Documentation/bpf/bpf_design_QA.rst +++ b/Documentation/bpf/bpf_design_QA.rst @@ -85,8 +85,33 @@ Q: Can loops be supported in a safe way? A: It's not clear yet. BPF developers are trying to find a way to -support bounded loops where the verifier can guarantee that -the program terminates in less than 4096 instructions. 
+support bounded loops. + +Q: What are the verifier limits? +-------------------------------- +A: The only limit known to the user space is BPF_MAXINSNS (4096). +It's the maximum number of instructions that the unprivileged bpf +program can have. The verifier has various internal limits, +such as the maximum number of instructions that can be explored during +program analysis. Currently, that limit is set to 1 million, +which essentially means that the largest program can consist +of 1 million NOP instructions. There is a limit to the maximum number +of subsequent branches, a limit to the number of nested bpf-to-bpf +calls, a limit to the number of the verifier states per instruction, +and a limit to the number of maps used by the program. +All these limits can be hit with a sufficiently complex program. +There are also non-numerical limits that can cause the program +to be rejected. The verifier used to recognize only pointer + constant +expressions. Now it can recognize pointer + bounded_register. +bpf_map_lookup_elem(key) had a requirement that 'key' must be +a pointer to the stack. Now, 'key' can be a pointer to map value. +The verifier is steadily getting 'smarter'. The limits are +being removed. The only way to know that the program is going to +be accepted by the verifier is to try to load it. +The bpf development process guarantees that the future kernel +versions will accept all bpf programs that were accepted by +the earlier versions. + + Instruction level questions --------------------------- -- cgit v1.2.3 From c1b0f9fa064a90532a4184a31ce0847a979069f5 Mon Sep 17 00:00:00 2001 From: Andrew Lunn Date: Sun, 28 Apr 2019 02:56:24 +0200 Subject: dt-bindings: net: DSA: Remove legacy binding Now that the code to support the legacy binding has been removed, remove the documentation for it. Signed-off-by: Andrew Lunn Signed-off-by: David S.
Miller --- Documentation/devicetree/bindings/net/dsa/dsa.txt | 155 ---------------------- 1 file changed, 155 deletions(-) (limited to 'Documentation') diff --git a/Documentation/devicetree/bindings/net/dsa/dsa.txt b/Documentation/devicetree/bindings/net/dsa/dsa.txt index d66a5292b9d3..c107d2848888 100644 --- a/Documentation/devicetree/bindings/net/dsa/dsa.txt +++ b/Documentation/devicetree/bindings/net/dsa/dsa.txt @@ -1,12 +1,6 @@ Distributed Switch Architecture Device Tree Bindings ---------------------------------------------------- -Two bindings exist, one of which has been deprecated due to -limitations. - -Current Binding ---------------- - Switches are true Linux devices and can be probed by any means. Once probed, they register to the DSA framework, passing a node pointer. This node is expected to fulfil the following binding, and @@ -262,152 +256,3 @@ linked into one DSA cluster. }; }; }; - -Deprecated Binding ------------------- - -The deprecated binding makes use of a platform device to represent the -switches. The switches themselves are not Linux devices, and make use -of an MDIO bus for management. - -Required properties: -- compatible : Should be "marvell,dsa" -- #address-cells : Must be 2, first cell is the address on the MDIO bus - and second cell is the address in the switch tree. - Second cell is used only when cascading/chaining. -- #size-cells : Must be 0 -- dsa,ethernet : Should be a phandle to a valid Ethernet device node -- dsa,mii-bus : Should be a phandle to a valid MDIO bus device node - -Optional properties: -- interrupts : property with a value describing the switch - interrupt number (not supported by the driver) - -A DSA node can contain multiple switch chips which are therefore child nodes of -the parent DSA node. The maximum number of allowed child nodes is 4 -(DSA_MAX_SWITCHES). -Each of these switch child nodes should have the following required properties: - -- reg : Contains two fields. 
The first one describes the - address on the MII bus. The second is the switch - number that must be unique in cascaded configurations -- #address-cells : Must be 1 -- #size-cells : Must be 0 - -A switch child node has the following optional property: - -- eeprom-length : Set to the length of an EEPROM connected to the - switch. Must be set if the switch can not detect - the presence and/or size of a connected EEPROM, - otherwise optional. - -A switch may have multiple "port" children nodes - -Each port children node must have the following mandatory properties: -- reg : Describes the port address in the switch -- label : Describes the label associated with this port, special - labels are "cpu" to indicate a CPU port and "dsa" to - indicate an uplink/downlink port. - -Note that a port labelled "dsa" will imply checking for the uplink phandle -described below. - -Optional property: -- link : Should be a list of phandles to another switch's DSA port. - This property is only used when switches are being - chained/cascaded together. This port is used as outgoing port - towards the phandle port, which can be more than one hop away. - -- phy-handle : Phandle to a PHY on an external MDIO bus, not the - switch internal one. See - Documentation/devicetree/bindings/net/ethernet.txt - for details. - -- phy-mode : String representing the connection to the designated - PHY node specified by the 'phy-handle' property. See - Documentation/devicetree/bindings/net/ethernet.txt - for details. - -- mii-bus : Should be a phandle to a valid MDIO bus device node. - This mii-bus will be used in preference to the - global dsa,mii-bus defined above, for this switch. - -Optional subnodes: -- fixed-link : Fixed-link subnode describing a link to a non-MDIO - managed entity. See - Documentation/devicetree/bindings/net/fixed-link.txt - for details. 
- -Example: - - dsa@0 { - compatible = "marvell,dsa"; - #address-cells = <2>; - #size-cells = <0>; - - interrupts = <10>; - dsa,ethernet = <&ethernet0>; - dsa,mii-bus = <&mii_bus0>; - - switch@0 { - #address-cells = <1>; - #size-cells = <0>; - reg = <16 0>; /* MDIO address 16, switch 0 in tree */ - - port@0 { - reg = <0>; - label = "lan1"; - phy-handle = <&phy0>; - }; - - port@1 { - reg = <1>; - label = "lan2"; - }; - - port@5 { - reg = <5>; - label = "cpu"; - }; - - switch0port6: port@6 { - reg = <6>; - label = "dsa"; - link = <&switch1port0 - &switch2port0>; - }; - }; - - switch@1 { - #address-cells = <1>; - #size-cells = <0>; - reg = <17 1>; /* MDIO address 17, switch 1 in tree */ - mii-bus = <&mii_bus1>; - reset-gpios = <&gpio5 1 GPIO_ACTIVE_LOW>; - - switch1port0: port@0 { - reg = <0>; - label = "dsa"; - link = <&switch0port6>; - }; - switch1port1: port@1 { - reg = <1>; - label = "dsa"; - link = <&switch2port1>; - }; - }; - - switch@2 { - #address-cells = <1>; - #size-cells = <0>; - reg = <18 2>; /* MDIO address 18, switch 2 in tree */ - mii-bus = <&mii_bus1>; - - switch2port0: port@0 { - reg = <0>; - label = "dsa"; - link = <&switch1port1 - &switch0port6>; - }; - }; - }; -- cgit v1.2.3 From 04fdd5dd79a9012dad7b0e6ba676c5a61ca592e4 Mon Sep 17 00:00:00 2001 From: Harish Bandi Date: Fri, 26 Apr 2019 19:26:02 +0530 Subject: dt-bindings: net: bluetooth: Add device tree bindings for QTI chip WCN3998 Add compatible string for the Qualcomm WCN3998 Bluetooth controller Signed-off-by: Harish Bandi Reviewed-by: Rob Herring Signed-off-by: Marcel Holtmann --- Documentation/devicetree/bindings/net/qualcomm-bluetooth.txt | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) (limited to 'Documentation') diff --git a/Documentation/devicetree/bindings/net/qualcomm-bluetooth.txt b/Documentation/devicetree/bindings/net/qualcomm-bluetooth.txt index 824c0e23c544..7ef6118abd3d 100644 --- a/Documentation/devicetree/bindings/net/qualcomm-bluetooth.txt +++
b/Documentation/devicetree/bindings/net/qualcomm-bluetooth.txt @@ -11,20 +11,21 @@ Required properties: - compatible: should contain one of the following: * "qcom,qca6174-bt" * "qcom,wcn3990-bt" + * "qcom,wcn3998-bt" Optional properties for compatible string qcom,qca6174-bt: - enable-gpios: gpio specifier used to enable chip - clocks: clock provided to the controller (SUSCLK_32KHZ) -Required properties for compatible string qcom,wcn3990-bt: +Required properties for compatible string qcom,wcn399x-bt: - vddio-supply: VDD_IO supply regulator handle. - vddxo-supply: VDD_XO supply regulator handle. - vddrf-supply: VDD_RF supply regulator handle. - vddch0-supply: VDD_CH0 supply regulator handle. -Optional properties for compatible string qcom,wcn3990-bt: +Optional properties for compatible string qcom,wcn399x-bt: - max-speed: see Documentation/devicetree/bindings/serial/slave-device.txt -- cgit v1.2.3 From 554aae35007e49f533d3d10e788295f7141725bc Mon Sep 17 00:00:00 2001 From: Vladimir Oltean Date: Thu, 2 May 2019 23:23:29 +0300 Subject: lib: Add support for generic packing operations This provides an unified API for accessing register bit fields regardless of memory layout. The basic unit of data for these API functions is the u64. The process of transforming an u64 from native CPU encoding into the peripheral's encoding is called 'pack', and transforming it from peripheral to native CPU encoding is 'unpack'. Signed-off-by: Vladimir Oltean Signed-off-by: David S. 
Miller --- Documentation/packing.txt | 149 ++++++++++++++++++++++++++++++++ MAINTAINERS | 8 ++ include/linux/packing.h | 49 +++++++++++ lib/Kconfig | 17 ++++ lib/Makefile | 1 + lib/packing.c | 213 ++++++++++++++++++++++++++++++++++++++++++++++ 6 files changed, 437 insertions(+) create mode 100644 Documentation/packing.txt create mode 100644 include/linux/packing.h create mode 100644 lib/packing.c (limited to 'Documentation') diff --git a/Documentation/packing.txt b/Documentation/packing.txt new file mode 100644 index 000000000000..f830c98645f1 --- /dev/null +++ b/Documentation/packing.txt @@ -0,0 +1,149 @@ +================================================ +Generic bitfield packing and unpacking functions +================================================ + +Problem statement +----------------- + +When working with hardware, one has to choose between several approaches of +interfacing with it. +One can memory-map a pointer to a carefully crafted struct over the hardware +device's memory region, and access its fields as struct members (potentially +declared as bitfields). But writing code this way would make it less portable, +due to potential endianness mismatches between the CPU and the hardware device. +Additionally, one has to pay close attention when translating register +definitions from the hardware documentation into bit field indices for the +structs. Also, some hardware (typically networking equipment) tends to group +its register fields in ways that violate any reasonable word boundaries +(sometimes even 64 bit ones). This creates the inconvenience of having to +define "high" and "low" portions of register fields within the struct. +A more robust alternative to struct field definitions would be to extract the +required fields by shifting the appropriate number of bits. But this would +still not protect from endianness mismatches, except if all memory accesses +were performed byte-by-byte. 
Also the code can easily get cluttered, and the +high-level idea might get lost among the many bit shifts required. +Many drivers take the bit-shifting approach and then attempt to reduce the +clutter with tailored macros, but more often than not these macros take +shortcuts that still prevent the code from being truly portable. + +The solution +------------ + +This API deals with 2 basic operations: + - Packing a CPU-usable number into a memory buffer (with hardware + constraints/quirks) + - Unpacking a memory buffer (which has hardware constraints/quirks) + into a CPU-usable number. + +The API offers an abstraction over said hardware constraints and quirks, +over CPU endianness and therefore over possible mismatches between +the two. + +The basic unit of these API functions is the u64. From the CPU's +perspective, bit 63 always means bit offset 7 of byte 7, albeit only +logically. The question is: where do we lay this bit out in memory? + +The following examples cover the memory layout of a packed u64 field. +The byte offsets in the packed buffer are always implicitly 0, 1, ... 7. +What the examples show is where the logical bytes and bits sit. + +1. Normally (no quirks), we would do it like this: + +63 62 61 60 59 58 57 56 55 54 53 52 51 50 49 48 47 46 45 44 43 42 41 40 39 38 37 36 35 34 33 32 +7 6 5 4 +31 30 29 28 27 26 25 24 23 22 21 20 19 18 17 16 15 14 13 12 11 10 9 8 7 6 5 4 3 2 1 0 +3 2 1 0 + +That is, the MSByte (7) of the CPU-usable u64 sits at memory offset 0, and the +LSByte (0) of the u64 sits at memory offset 7. +This corresponds to what most folks would regard as "big endian", where +bit i corresponds to the number 2^i. This is also referred to in the code +comments as "logical" notation. + + +2.
If QUIRK_MSB_ON_THE_RIGHT is set, we do it like this: + +56 57 58 59 60 61 62 63 48 49 50 51 52 53 54 55 40 41 42 43 44 45 46 47 32 33 34 35 36 37 38 39 +7 6 5 4 +24 25 26 27 28 29 30 31 16 17 18 19 20 21 22 23 8 9 10 11 12 13 14 15 0 1 2 3 4 5 6 7 +3 2 1 0 + +That is, QUIRK_MSB_ON_THE_RIGHT does not affect byte positioning, but +inverts bit offsets inside a byte. + + +3. If QUIRK_LITTLE_ENDIAN is set, we do it like this: + +39 38 37 36 35 34 33 32 47 46 45 44 43 42 41 40 55 54 53 52 51 50 49 48 63 62 61 60 59 58 57 56 +4 5 6 7 +7 6 5 4 3 2 1 0 15 14 13 12 11 10 9 8 23 22 21 20 19 18 17 16 31 30 29 28 27 26 25 24 +0 1 2 3 + +Therefore, QUIRK_LITTLE_ENDIAN means that inside the memory region, every +byte from each 4-byte word is placed at its mirrored position compared to +the boundary of that word. + +4. If QUIRK_MSB_ON_THE_RIGHT and QUIRK_LITTLE_ENDIAN are both set, we do it + like this: + +32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 +4 5 6 7 +0 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 +0 1 2 3 + + +5. If just QUIRK_LSW32_IS_FIRST is set, we do it like this: + +31 30 29 28 27 26 25 24 23 22 21 20 19 18 17 16 15 14 13 12 11 10 9 8 7 6 5 4 3 2 1 0 +3 2 1 0 +63 62 61 60 59 58 57 56 55 54 53 52 51 50 49 48 47 46 45 44 43 42 41 40 39 38 37 36 35 34 33 32 +7 6 5 4 + +In this case the 8 byte memory region is interpreted as follows: first +4 bytes correspond to the least significant 4-byte word, next 4 bytes to +the more significant 4-byte word. + + +6. If QUIRK_LSW32_IS_FIRST and QUIRK_MSB_ON_THE_RIGHT are set, we do it like + this: + +24 25 26 27 28 29 30 31 16 17 18 19 20 21 22 23 8 9 10 11 12 13 14 15 0 1 2 3 4 5 6 7 +3 2 1 0 +56 57 58 59 60 61 62 63 48 49 50 51 52 53 54 55 40 41 42 43 44 45 46 47 32 33 34 35 36 37 38 39 +7 6 5 4 + + +7. 
If QUIRK_LSW32_IS_FIRST and QUIRK_LITTLE_ENDIAN are set, it looks like + this: + +7 6 5 4 3 2 1 0 15 14 13 12 11 10 9 8 23 22 21 20 19 18 17 16 31 30 29 28 27 26 25 24 +0 1 2 3 +39 38 37 36 35 34 33 32 47 46 45 44 43 42 41 40 55 54 53 52 51 50 49 48 63 62 61 60 59 58 57 56 +4 5 6 7 + + +8. If QUIRK_LSW32_IS_FIRST, QUIRK_LITTLE_ENDIAN and QUIRK_MSB_ON_THE_RIGHT + are set, it looks like this: + +0 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 +0 1 2 3 +32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 +4 5 6 7 + + +We always think of our offsets as if there were no quirk, and we translate +them afterwards, before accessing the memory region. + +Intended use +------------ + +Drivers that opt to use this API first need to identify which combination of +the above 3 quirks (8 combinations in total) matches what the hardware documentation +describes. Then they should wrap the packing() function, creating a new +xxx_packing() that calls it using the proper QUIRK_* one-hot bits set. + +The packing() function returns an int-encoded error code, which protects the +programmer against incorrect API use. The errors are not expected to occur +during runtime, therefore it is reasonable for xxx_packing() to return void +and simply swallow those errors. Optionally it can dump stack or print the +error description.
diff --git a/MAINTAINERS b/MAINTAINERS index 0af66fa919a8..ff029f3d0f13 100644 --- a/MAINTAINERS +++ b/MAINTAINERS @@ -11673,6 +11673,14 @@ L: linux-i2c@vger.kernel.org S: Orphan F: drivers/i2c/busses/i2c-pasemi.c +PACKING +M: Vladimir Oltean +L: netdev@vger.kernel.org +S: Supported +F: lib/packing.c +F: include/linux/packing.h +F: Documentation/packing.txt + PADATA PARALLEL EXECUTION MECHANISM M: Steffen Klassert L: linux-crypto@vger.kernel.org diff --git a/include/linux/packing.h b/include/linux/packing.h new file mode 100644 index 000000000000..54667735cc67 --- /dev/null +++ b/include/linux/packing.h @@ -0,0 +1,49 @@ +/* SPDX-License-Identifier: BSD-3-Clause + * Copyright (c) 2016-2018, NXP Semiconductors + * Copyright (c) 2018-2019, Vladimir Oltean + */ +#ifndef _LINUX_PACKING_H +#define _LINUX_PACKING_H + +#include +#include + +#define QUIRK_MSB_ON_THE_RIGHT BIT(0) +#define QUIRK_LITTLE_ENDIAN BIT(1) +#define QUIRK_LSW32_IS_FIRST BIT(2) + +enum packing_op { + PACK, + UNPACK, +}; + +/** + * packing - Convert numbers (currently u64) between a packed and an unpacked + * format. Unpacked means laid out in memory in the CPU's native + * understanding of integers, while packed means anything else that + * requires translation. + * + * @pbuf: Pointer to a buffer holding the packed value. + * @uval: Pointer to an u64 holding the unpacked value. + * @startbit: The index (in logical notation, compensated for quirks) where + * the packed value starts within pbuf. Must be larger than, or + * equal to, endbit. + * @endbit: The index (in logical notation, compensated for quirks) where + * the packed value ends within pbuf. Must be smaller than, or equal + * to, startbit. + * @op: If PACK, then uval will be treated as const pointer and copied (packed) + * into pbuf, between startbit and endbit. + * If UNPACK, then pbuf will be treated as const pointer and the logical + * value between startbit and endbit will be copied (unpacked) to uval. 
+ * @quirks: A bit mask of QUIRK_LITTLE_ENDIAN, QUIRK_LSW32_IS_FIRST and + * QUIRK_MSB_ON_THE_RIGHT. + * + * Return: 0 on success, EINVAL or ERANGE if called incorrectly. Assuming + * correct usage, return code may be discarded. + * If op is PACK, pbuf is modified. + * If op is UNPACK, uval is modified. + */ +int packing(void *pbuf, u64 *uval, int startbit, int endbit, size_t pbuflen, + enum packing_op op, u8 quirks); + +#endif diff --git a/lib/Kconfig b/lib/Kconfig index a9e56539bd11..ac1fcf06d8ea 100644 --- a/lib/Kconfig +++ b/lib/Kconfig @@ -18,6 +18,23 @@ config RAID6_PQ_BENCHMARK Benchmark all available RAID6 PQ functions on init and choose the fastest one. +config PACKING + bool "Generic bitfield packing and unpacking" + default n + help + This option provides the packing() helper function, which permits + converting bitfields between a CPU-usable representation and a + memory representation that can have any combination of these quirks: + - Is little endian (bytes are reversed within a 32-bit group) + - The least-significant 32-bit word comes first (within a 64-bit + group) + - The most significant bit of a byte is at its right (bit 0 of a + register description is numerically 2^7). + Drivers may use these helpers to match the bit indices as described + in the data sheets of the peripherals they are in control of. + + When in doubt, say N. 
+ config BITREVERSE tristate diff --git a/lib/Makefile b/lib/Makefile index 3b08673e8881..7d4db18fabf1 100644 --- a/lib/Makefile +++ b/lib/Makefile @@ -108,6 +108,7 @@ obj-$(CONFIG_DEBUG_LIST) += list_debug.o obj-$(CONFIG_DEBUG_OBJECTS) += debugobjects.o obj-$(CONFIG_BITREVERSE) += bitrev.o +obj-$(CONFIG_PACKING) += packing.o obj-$(CONFIG_RATIONAL) += rational.o obj-$(CONFIG_CRC_CCITT) += crc-ccitt.o obj-$(CONFIG_CRC16) += crc16.o diff --git a/lib/packing.c b/lib/packing.c new file mode 100644 index 000000000000..50d1e9f2f5a7 --- /dev/null +++ b/lib/packing.c @@ -0,0 +1,213 @@ +// SPDX-License-Identifier: BSD-3-Clause OR GPL-2.0 +/* Copyright (c) 2016-2018, NXP Semiconductors + * Copyright (c) 2018-2019, Vladimir Oltean + */ +#include +#include +#include +#include +#include + +static int get_le_offset(int offset) +{ + int closest_multiple_of_4; + + closest_multiple_of_4 = (offset / 4) * 4; + offset -= closest_multiple_of_4; + return closest_multiple_of_4 + (3 - offset); +} + +static int get_reverse_lsw32_offset(int offset, size_t len) +{ + int closest_multiple_of_4; + int word_index; + + word_index = offset / 4; + closest_multiple_of_4 = word_index * 4; + offset -= closest_multiple_of_4; + word_index = (len / 4) - word_index - 1; + return word_index * 4 + offset; +} + +static u64 bit_reverse(u64 val, unsigned int width) +{ + u64 new_val = 0; + unsigned int bit; + unsigned int i; + + for (i = 0; i < width; i++) { + bit = (val & (1 << i)) != 0; + new_val |= (bit << (width - i - 1)); + } + return new_val; +} + +static void adjust_for_msb_right_quirk(u64 *to_write, int *box_start_bit, + int *box_end_bit, u8 *box_mask) +{ + int box_bit_width = *box_start_bit - *box_end_bit + 1; + int new_box_start_bit, new_box_end_bit; + + *to_write >>= *box_end_bit; + *to_write = bit_reverse(*to_write, box_bit_width); + *to_write <<= *box_end_bit; + + new_box_end_bit = box_bit_width - *box_start_bit - 1; + new_box_start_bit = box_bit_width - *box_end_bit - 1; + *box_mask = 
GENMASK_ULL(new_box_start_bit, new_box_end_bit); + *box_start_bit = new_box_start_bit; + *box_end_bit = new_box_end_bit; +} + +/** + * packing - Convert numbers (currently u64) between a packed and an unpacked + * format. Unpacked means laid out in memory in the CPU's native + * understanding of integers, while packed means anything else that + * requires translation. + * + * @pbuf: Pointer to a buffer holding the packed value. + * @uval: Pointer to an u64 holding the unpacked value. + * @startbit: The index (in logical notation, compensated for quirks) where + * the packed value starts within pbuf. Must be larger than, or + * equal to, endbit. + * @endbit: The index (in logical notation, compensated for quirks) where + * the packed value ends within pbuf. Must be smaller than, or equal + * to, startbit. + * @op: If PACK, then uval will be treated as const pointer and copied (packed) + * into pbuf, between startbit and endbit. + * If UNPACK, then pbuf will be treated as const pointer and the logical + * value between startbit and endbit will be copied (unpacked) to uval. + * @quirks: A bit mask of QUIRK_LITTLE_ENDIAN, QUIRK_LSW32_IS_FIRST and + * QUIRK_MSB_ON_THE_RIGHT. + * + * Return: 0 on success, EINVAL or ERANGE if called incorrectly. Assuming + * correct usage, return code may be discarded. + * If op is PACK, pbuf is modified. + * If op is UNPACK, uval is modified. + */ +int packing(void *pbuf, u64 *uval, int startbit, int endbit, size_t pbuflen, + enum packing_op op, u8 quirks) +{ + /* Number of bits for storing "uval" + * also width of the field to access in the pbuf + */ + u64 value_width; + /* Logical byte indices corresponding to the + * start and end of the field. 
+ */ + int plogical_first_u8, plogical_last_u8, box; + + /* startbit is expected to be larger than endbit */ + if (startbit < endbit) + /* Invalid function call */ + return -EINVAL; + + value_width = startbit - endbit + 1; + if (value_width > 64) + return -ERANGE; + + /* Check if "uval" fits in "value_width" bits. + * If value_width is 64, the check will fail, but any + * 64-bit uval will surely fit. + */ + if (op == PACK && value_width < 64 && (*uval >= (1ull << value_width))) + /* Cannot store "uval" inside "value_width" bits. + * Truncating "uval" is most certainly not desirable, + * so simply erroring out is appropriate. + */ + return -ERANGE; + + /* Initialize parameter */ + if (op == UNPACK) + *uval = 0; + + /* Iterate through an idealistic view of the pbuf as an u64 with + * no quirks, u8 by u8 (aligned at u8 boundaries), from high to low + * logical bit significance. "box" denotes the current logical u8. + */ + plogical_first_u8 = startbit / 8; + plogical_last_u8 = endbit / 8; + + for (box = plogical_first_u8; box >= plogical_last_u8; box--) { + /* Bit indices into the currently accessed 8-bit box */ + int box_start_bit, box_end_bit, box_addr; + u8 box_mask; + /* Corresponding bits from the unpacked u64 parameter */ + int proj_start_bit, proj_end_bit; + u64 proj_mask; + + /* This u8 may need to be accessed in its entirety + * (from bit 7 to bit 0), or not, depending on the + * input arguments startbit and endbit. + */ + if (box == plogical_first_u8) + box_start_bit = startbit % 8; + else + box_start_bit = 7; + if (box == plogical_last_u8) + box_end_bit = endbit % 8; + else + box_end_bit = 0; + + /* We have determined the box bit start and end. + * Now we calculate where this (masked) u8 box would fit + * in the unpacked (CPU-readable) u64 - the u8 box's + * projection onto the unpacked u64. Though the + * box is u8, the projection is u64 because it may fall + * anywhere within the unpacked u64. 
+ */ + proj_start_bit = ((box * 8) + box_start_bit) - endbit; + proj_end_bit = ((box * 8) + box_end_bit) - endbit; + proj_mask = GENMASK_ULL(proj_start_bit, proj_end_bit); + box_mask = GENMASK_ULL(box_start_bit, box_end_bit); + + /* Determine the offset of the u8 box inside the pbuf, + * adjusted for quirks. The adjusted box_addr will be used for + * effective addressing inside the pbuf (so it's not + * logical any longer). + */ + box_addr = pbuflen - box - 1; + if (quirks & QUIRK_LITTLE_ENDIAN) + box_addr = get_le_offset(box_addr); + if (quirks & QUIRK_LSW32_IS_FIRST) + box_addr = get_reverse_lsw32_offset(box_addr, + pbuflen); + + if (op == UNPACK) { + u64 pval; + + /* Read from pbuf, write to uval */ + pval = ((u8 *)pbuf)[box_addr] & box_mask; + if (quirks & QUIRK_MSB_ON_THE_RIGHT) + adjust_for_msb_right_quirk(&pval, + &box_start_bit, + &box_end_bit, + &box_mask); + + pval >>= box_end_bit; + pval <<= proj_end_bit; + *uval &= ~proj_mask; + *uval |= pval; + } else { + u64 pval; + + /* Write to pbuf, read from uval */ + pval = (*uval) & proj_mask; + pval >>= proj_end_bit; + if (quirks & QUIRK_MSB_ON_THE_RIGHT) + adjust_for_msb_right_quirk(&pval, + &box_start_bit, + &box_end_bit, + &box_mask); + + pval <<= box_end_bit; + ((u8 *)pbuf)[box_addr] &= ~box_mask; + ((u8 *)pbuf)[box_addr] |= pval; + } + } + return 0; +} +EXPORT_SYMBOL(packing); + +MODULE_LICENSE("GPL v2"); +MODULE_DESCRIPTION("Generic bitfield packing and unpacking"); -- cgit v1.2.3 From 4759209732d3f4657f65720e624fdd73419f7134 Mon Sep 17 00:00:00 2001 From: Vladimir Oltean Date: Thu, 2 May 2019 23:23:39 +0300 Subject: Documentation: net: dsa: Add details about NXP SJA1105 driver Signed-off-by: Vladimir Oltean Signed-off-by: David S. 
Miller --- Documentation/networking/dsa/index.rst | 1 + Documentation/networking/dsa/sja1105.rst | 166 +++++++++++++++++++++++++++++++ 2 files changed, 167 insertions(+) create mode 100644 Documentation/networking/dsa/sja1105.rst (limited to 'Documentation') diff --git a/Documentation/networking/dsa/index.rst b/Documentation/networking/dsa/index.rst index 5c488d345a1e..0e5b7a9be406 100644 --- a/Documentation/networking/dsa/index.rst +++ b/Documentation/networking/dsa/index.rst @@ -8,3 +8,4 @@ Distributed Switch Architecture dsa bcm_sf2 lan9303 + sja1105 diff --git a/Documentation/networking/dsa/sja1105.rst b/Documentation/networking/dsa/sja1105.rst new file mode 100644 index 000000000000..7c13b40915c0 --- /dev/null +++ b/Documentation/networking/dsa/sja1105.rst @@ -0,0 +1,166 @@ +========================= +NXP SJA1105 switch driver +========================= + +Overview +======== + +The NXP SJA1105 is a family of 6 devices: + +- SJA1105E: First generation, no TTEthernet +- SJA1105T: First generation, TTEthernet +- SJA1105P: Second generation, no TTEthernet, no SGMII +- SJA1105Q: Second generation, TTEthernet, no SGMII +- SJA1105R: Second generation, no TTEthernet, SGMII +- SJA1105S: Second generation, TTEthernet, SGMII + +These are SPI-managed automotive switches, with all ports being gigabit +capable, and supporting MII/RMII/RGMII and optionally SGMII on one port. + +Being automotive parts, their configuration interface is geared towards +set-and-forget use, with minimal dynamic interaction at runtime. They +require a static configuration to be composed by software and packed +with CRC and table headers, and sent over SPI. + +The static configuration is composed of several configuration tables. Each +table takes a number of entries. Some configuration tables can be (partially) +reconfigured at runtime, some not. 
Some tables are mandatory, some not: + +============================= ================== ============================= +Table Mandatory Reconfigurable +============================= ================== ============================= +Schedule no no +Schedule entry points if Scheduling no +VL Lookup no no +VL Policing if VL Lookup no +VL Forwarding if VL Lookup no +L2 Lookup no no +L2 Policing yes no +VLAN Lookup yes yes +L2 Forwarding yes partially (fully on P/Q/R/S) +MAC Config yes partially (fully on P/Q/R/S) +Schedule Params if Scheduling no +Schedule Entry Points Params if Scheduling no +VL Forwarding Params if VL Forwarding no +L2 Lookup Params no partially (fully on P/Q/R/S) +L2 Forwarding Params yes no +Clock Sync Params no no +AVB Params no no +General Params yes partially +Retagging no yes +xMII Params yes no +SGMII no yes +============================= ================== ============================= + + +Also the configuration is write-only (software cannot read it back from the +switch except for very few exceptions). + +The driver creates a static configuration at probe time, and keeps it at +all times in memory, as a shadow for the hardware state. When required to +change a hardware setting, the static configuration is also updated. +If that changed setting can be transmitted to the switch through the dynamic +reconfiguration interface, it is; otherwise the switch is reset and +reprogrammed with the updated static configuration. + +Switching features +================== + +The driver supports the configuration of L2 forwarding rules in hardware for +port bridging. The forwarding, broadcast and flooding domain between ports can +be restricted through two methods: either at the L2 forwarding level (isolate +one bridge's ports from another's) or at the VLAN port membership level +(isolate ports within the same bridge). The final forwarding decision taken by +the hardware is a logical AND of these two sets of rules. 
+ +The hardware tags all traffic internally with a port-based VLAN (pvid), or it +decodes the VLAN information from the 802.1Q tag. Advanced VLAN classification +is not possible. Once attributed a VLAN tag, frames are checked against the +port's membership rules and dropped at ingress if they don't match any VLAN. +This behavior is available when switch ports are enslaved to a bridge with +``vlan_filtering 1``. + +Normally the hardware is not configurable with respect to VLAN awareness, but +by changing what TPID the switch searches 802.1Q tags for, the semantics of a +bridge with ``vlan_filtering 0`` can be kept (accept all traffic, tagged or +untagged), and therefore this mode is also supported. + +Segregating the switch ports in multiple bridges is supported (e.g. 2 + 2), but +all bridges should have the same level of VLAN awareness (either both have +``vlan_filtering`` 0, or both 1). Also an inevitable limitation of the fact +that VLAN awareness is global at the switch level is that once a bridge with +``vlan_filtering`` enslaves at least one switch port, the other un-bridged +ports are no longer available for standalone traffic termination. + +Device Tree bindings and board design +===================================== + +This section references ``Documentation/devicetree/bindings/net/dsa/sja1105.txt`` +and aims to showcase some potential switch caveats. + +RMII PHY role and out-of-band signaling +--------------------------------------- + +In the RMII spec, the 50 MHz clock signals are either driven by the MAC or by +an external oscillator (but not by the PHY). +But the spec is rather loose and devices go outside it in several ways. +Some PHYs go against the spec and may provide an output pin where they source +the 50 MHz clock themselves, in an attempt to be helpful. +On the other hand, the SJA1105 is only binary configurable - when in the RMII +MAC role it will also attempt to drive the clock signal. 
To prevent this from +happening it must be put in RMII PHY role. +But doing so has some unintended consequences. +In the RMII spec, the PHY can transmit extra out-of-band signals via RXD[1:0]. +These are practically some extra code words (/J/ and /K/) sent prior to the +preamble of each frame. The MAC does not have this out-of-band signaling +mechanism defined by the RMII spec. +So when the SJA1105 port is put in PHY role to avoid having 2 drivers on the +clock signal, inevitably an RMII PHY-to-PHY connection is created. The SJA1105 +emulates a PHY interface fully and generates the /J/ and /K/ symbols prior to +frame preambles, which the real PHY is not expected to understand. So the PHY +simply encodes the extra symbols received from the SJA1105-as-PHY onto the +100Base-Tx wire. +On the other side of the wire, some link partners might discard these extra +symbols, while others might choke on them and discard the entire Ethernet +frames that follow along. This looks like packet loss with some link partners +but not with others. +The take-away is that in RMII mode, the SJA1105 must be allowed to drive the +reference clock if connected to a PHY. + +RGMII fixed-link and internal delays +------------------------------------ + +As mentioned in the bindings document, the second generation of devices has +tunable delay lines as part of the MAC, which can be used to establish the +correct RGMII timing budget. +When powered up, these can shift the Rx and Tx clocks with a phase difference +between 73.8 and 101.7 degrees. +The catch is that the delay lines need to lock onto a clock signal with a +stable frequency. This means that there must be at least 2 microseconds of +silence between the clock at the old vs at the new frequency. Otherwise the +lock is lost and the delay lines must be reset (powered down and back up).
+In RGMII the clock frequency changes with link speed (125 MHz at 1000 Mbps, 25 +MHz at 100 Mbps and 2.5 MHz at 10 Mbps), and link speed might change during the +AN process. +In the situation where the switch port is connected through an RGMII fixed-link +to a link partner whose link state life cycle is outside the control of Linux +(such as a different SoC), then the delay lines would remain unlocked (and +inactive) until there is manual intervention (ifdown/ifup on the switch port). +The take-away is that in RGMII mode, the switch's internal delays are only +reliable if the link partner never changes link speeds, or if it does, it does +so in a way that is coordinated with the switch port (practically, both ends of +the fixed-link are under control of the same Linux system). +As to why would a fixed-link interface ever change link speeds: there are +Ethernet controllers out there which come out of reset in 100 Mbps mode, and +their driver inevitably needs to change the speed and clock frequency if it's +required to work at gigabit. + +MDIO bus and PHY management +--------------------------- + +The SJA1105 does not have an MDIO bus and does not perform in-band AN either. +Therefore there is no link state notification coming from the switch device. +A board would need to hook up the PHYs connected to the switch to any other +MDIO bus available to Linux within the system (e.g. to the DSA master's MDIO +bus). Link state management then works by the driver manually keeping in sync +(over SPI commands) the MAC link speed with the settings negotiated by the PHY. -- cgit v1.2.3 From 013fe01d45ed095189fd746e45fdf75016fec1f6 Mon Sep 17 00:00:00 2001 From: Vladimir Oltean Date: Thu, 2 May 2019 23:23:40 +0300 Subject: dt-bindings: net: dsa: Add documentation for NXP SJA1105 driver Signed-off-by: Vladimir Oltean Reviewed-by: Florian Fainelli Signed-off-by: David S. 
Miller --- .../devicetree/bindings/net/dsa/sja1105.txt | 156 +++++++++++++++++++++ 1 file changed, 156 insertions(+) create mode 100644 Documentation/devicetree/bindings/net/dsa/sja1105.txt (limited to 'Documentation') diff --git a/Documentation/devicetree/bindings/net/dsa/sja1105.txt b/Documentation/devicetree/bindings/net/dsa/sja1105.txt new file mode 100644 index 000000000000..13fd21074d48 --- /dev/null +++ b/Documentation/devicetree/bindings/net/dsa/sja1105.txt @@ -0,0 +1,156 @@ +NXP SJA1105 switch driver +========================= + +Required properties: + +- compatible: + Must be one of: + - "nxp,sja1105e" + - "nxp,sja1105t" + - "nxp,sja1105p" + - "nxp,sja1105q" + - "nxp,sja1105r" + - "nxp,sja1105s" + + Although the device ID could be detected at runtime, explicit bindings + are required in order to be able to statically check their validity. + For example, SGMII can only be specified on port 4 of R and S devices, + and the non-SGMII devices, while pin-compatible, are not equal in terms + of support for RGMII internal delays (supported on P/Q/R/S, but not on + E/T). + +Optional properties: + +- sja1105,role-mac: +- sja1105,role-phy: + Boolean properties that can be assigned under each port node. By + default (unless otherwise specified) a port is configured as MAC if it + is driving a PHY (phy-handle is present) or as PHY if it is PHY-less + (fixed-link specified, presumably because it is connected to a MAC). + The effect of this property (in either its implicit or explicit form) + is: + - In the case of MII or RMII it specifies whether the SJA1105 port is a + clock source or sink for this interface (not applicable for RGMII + where there is a Tx and an Rx clock). + - In the case of RGMII it affects the behavior regarding internal + delays: + 1. If sja1105,role-mac is specified, and the phy-mode property is one + of "rgmii-id", "rgmii-txid" or "rgmii-rxid", then the entity + designated to apply the delay/clock skew necessary for RGMII + is the PHY. 
The SJA1105 MAC does not apply any internal delays. + 2. If sja1105,role-phy is specified, and the phy-mode property is one + of the above, the designated entity to apply the internal delays + is the SJA1105 MAC (if hardware-supported). This is only supported + by the second-generation (P/Q/R/S) hardware. On a first-generation + E or T device, it is an error to specify an RGMII phy-mode other + than "rgmii" for a port that is in fixed-link mode. In that case, + the clock skew must either be added by the MAC at the other end of + the fixed-link, or by PCB serpentine traces on the board. + These properties are required, for example, in the case where SJA1105 + ports are at both ends of a MII/RMII PHY-less setup. One end would need + to have sja1105,role-mac, while the other sja1105,role-phy. + +See Documentation/devicetree/bindings/net/dsa/dsa.txt for the list of standard +DSA required and optional properties. + +Other observations +------------------ + +The SJA1105 SPI interface requires a CS-to-CLK time (t2 in UM10944) of at least +one half of t_CLK. At an SPI frequency of 1MHz, this means a minimum +cs_sck_delay of 500ns. Ensuring that this SPI timing requirement is observed +depends on the SPI bus master driver. 
+ +Example +------- + +Ethernet switch connected via SPI to the host, CPU port wired to enet2: + +arch/arm/boot/dts/ls1021a-tsn.dts: + +/* SPI controller of the LS1021 */ +&dspi0 { + sja1105@1 { + reg = <0x1>; + #address-cells = <1>; + #size-cells = <0>; + compatible = "nxp,sja1105t"; + spi-max-frequency = <4000000>; + fsl,spi-cs-sck-delay = <1000>; + fsl,spi-sck-cs-delay = <1000>; + ports { + #address-cells = <1>; + #size-cells = <0>; + port@0 { + /* ETH5 written on chassis */ + label = "swp5"; + phy-handle = <&rgmii_phy6>; + phy-mode = "rgmii-id"; + reg = <0>; + /* Implicit "sja1105,role-mac;" */ + }; + port@1 { + /* ETH2 written on chassis */ + label = "swp2"; + phy-handle = <&rgmii_phy3>; + phy-mode = "rgmii-id"; + reg = <1>; + /* Implicit "sja1105,role-mac;" */ + }; + port@2 { + /* ETH3 written on chassis */ + label = "swp3"; + phy-handle = <&rgmii_phy4>; + phy-mode = "rgmii-id"; + reg = <2>; + /* Implicit "sja1105,role-mac;" */ + }; + port@3 { + /* ETH4 written on chassis */ + phy-handle = <&rgmii_phy5>; + label = "swp4"; + phy-mode = "rgmii-id"; + reg = <3>; + /* Implicit "sja1105,role-mac;" */ + }; + port@4 { + /* Internal port connected to eth2 */ + ethernet = <&enet2>; + phy-mode = "rgmii"; + reg = <4>; + /* Implicit "sja1105,role-phy;" */ + fixed-link { + speed = <1000>; + full-duplex; + }; + }; + }; + }; +}; + +/* MDIO controller of the LS1021 */ +&mdio0 { + /* BCM5464 */ + rgmii_phy3: ethernet-phy@3 { + reg = <0x3>; + }; + rgmii_phy4: ethernet-phy@4 { + reg = <0x4>; + }; + rgmii_phy5: ethernet-phy@5 { + reg = <0x5>; + }; + rgmii_phy6: ethernet-phy@6 { + reg = <0x6>; + }; +}; + +/* Ethernet master port of the LS1021 */ +&enet2 { + phy-connection-type = "rgmii"; + status = "ok"; + fixed-link { + speed = <1000>; + full-duplex; + }; +}; -- cgit v1.2.3 From 687e3d5550c7b0e4dca0179103741a44cd3f7864 Mon Sep 17 00:00:00 2001 From: Petr Štetiar Date: Fri, 3 May 2019 16:27:07 +0200 Subject: dt-bindings: doc: reflect new NVMEM of_get_mac_address behaviour 
MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit As of_get_mac_address now supports NVMEM under the hood, we need to update the bindings documentation with the new nvmem-cell* properties, which would mean copy&pasting a lot of redundant information to every binding documentation currently referencing some of the MAC address properties. So I've just removed all the references to the optional MAC address properties and replaced them with the small note referencing net/ethernet.txt file. Signed-off-by: Petr Štetiar Signed-off-by: David S. Miller --- Documentation/devicetree/bindings/net/altera_tse.txt | 5 ++--- Documentation/devicetree/bindings/net/amd-xgbe.txt | 5 +++-- Documentation/devicetree/bindings/net/brcm,amac.txt | 4 ++-- Documentation/devicetree/bindings/net/cpsw.txt | 4 +++- Documentation/devicetree/bindings/net/davinci_emac.txt | 5 +++-- Documentation/devicetree/bindings/net/dsa/dsa.txt | 5 ++--- Documentation/devicetree/bindings/net/ethernet.txt | 6 ++++-- Documentation/devicetree/bindings/net/hisilicon-femac.txt | 4 +++- .../devicetree/bindings/net/hisilicon-hix5hd2-gmac.txt | 4 +++- Documentation/devicetree/bindings/net/keystone-netcp.txt | 10 +++++----- Documentation/devicetree/bindings/net/macb.txt | 5 ++--- Documentation/devicetree/bindings/net/marvell-pxa168.txt | 4 +++- Documentation/devicetree/bindings/net/microchip,enc28j60.txt | 3 ++- Documentation/devicetree/bindings/net/microchip,lan78xx.txt | 5 ++--- Documentation/devicetree/bindings/net/qca,qca7000.txt | 4 +++- Documentation/devicetree/bindings/net/samsung-sxgbe.txt | 4 +++- .../devicetree/bindings/net/snps,dwc-qos-ethernet.txt | 5 +++-- .../devicetree/bindings/net/socionext,uniphier-ave4.txt | 4 ++-- Documentation/devicetree/bindings/net/socionext-netsec.txt | 5 +++-- .../devicetree/bindings/net/wireless/mediatek,mt76.txt | 5 +++-- Documentation/devicetree/bindings/net/wireless/qca,ath9k.txt | 4 ++-- 21 files changed, 58 insertions(+), 42 
deletions(-) (limited to 'Documentation') diff --git a/Documentation/devicetree/bindings/net/altera_tse.txt b/Documentation/devicetree/bindings/net/altera_tse.txt index 0e21df94a53f..0b7d4d3758ea 100644 --- a/Documentation/devicetree/bindings/net/altera_tse.txt +++ b/Documentation/devicetree/bindings/net/altera_tse.txt @@ -46,9 +46,8 @@ Required properties: - reg: phy id used to communicate to phy. - device_type: Must be "ethernet-phy". -Optional properties: -- local-mac-address: See ethernet.txt in the same directory. -- max-frame-size: See ethernet.txt in the same directory. +The MAC address will be determined using the optional properties defined in +ethernet.txt. Example: diff --git a/Documentation/devicetree/bindings/net/amd-xgbe.txt b/Documentation/devicetree/bindings/net/amd-xgbe.txt index 93dcb79a5f16..9c27dfcd1133 100644 --- a/Documentation/devicetree/bindings/net/amd-xgbe.txt +++ b/Documentation/devicetree/bindings/net/amd-xgbe.txt @@ -24,8 +24,6 @@ Required properties: - phy-mode: See ethernet.txt file in the same directory Optional properties: -- mac-address: mac address to be assigned to the device. Can be overridden - by UEFI. - dma-coherent: Present if dma operations are coherent - amd,per-channel-interrupt: Indicates that Rx and Tx complete will generate a unique interrupt for each DMA channel - this requires an additional @@ -34,6 +32,9 @@ Optional properties: 0 - 1GbE and 10GbE (default) 1 - 2.5GbE and 10GbE +The MAC address will be determined using the optional properties defined in +ethernet.txt. + The following optional properties are represented by an array with each value corresponding to a particular speed. 
The first array value represents the setting for the 1GbE speed, the second value for the 2.5GbE speed and diff --git a/Documentation/devicetree/bindings/net/brcm,amac.txt b/Documentation/devicetree/bindings/net/brcm,amac.txt index 0bfad656a9ff..0120ebe93262 100644 --- a/Documentation/devicetree/bindings/net/brcm,amac.txt +++ b/Documentation/devicetree/bindings/net/brcm,amac.txt @@ -16,8 +16,8 @@ Required properties: registers (required for Northstar2) - interrupts: Interrupt number -Optional properties: -- mac-address: See ethernet.txt file in the same directory +The MAC address will be determined using the optional properties +defined in ethernet.txt. Examples: diff --git a/Documentation/devicetree/bindings/net/cpsw.txt b/Documentation/devicetree/bindings/net/cpsw.txt index 3264e1978d25..7c7ac5eb0313 100644 --- a/Documentation/devicetree/bindings/net/cpsw.txt +++ b/Documentation/devicetree/bindings/net/cpsw.txt @@ -49,10 +49,12 @@ Required properties: Optional properties: - dual_emac_res_vlan : Specifies VID to be used to segregate the ports -- mac-address : See ethernet.txt file in the same directory - phy_id : Specifies slave phy id (deprecated, use phy-handle) - phy-handle : See ethernet.txt file in the same directory +The MAC address will be determined using the optional properties +defined in ethernet.txt. + Slave sub-nodes: - fixed-link : See fixed-link.txt file in the same directory diff --git a/Documentation/devicetree/bindings/net/davinci_emac.txt b/Documentation/devicetree/bindings/net/davinci_emac.txt index ca83dcc84fb8..5e3579e72e2d 100644 --- a/Documentation/devicetree/bindings/net/davinci_emac.txt +++ b/Documentation/devicetree/bindings/net/davinci_emac.txt @@ -20,11 +20,12 @@ Required properties: Optional properties: - phy-handle: See ethernet.txt file in the same directory. If absent, davinci_emac driver defaults to 100/FULL. 
-- nvmem-cells: phandle, reference to an nvmem node for the MAC address -- nvmem-cell-names: string, should be "mac-address" if nvmem is to be used - ti,davinci-rmii-en: 1 byte, 1 means use RMII - ti,davinci-no-bd-ram: boolean, does EMAC have BD RAM? +The MAC address will be determined using the optional properties +defined in ethernet.txt. + Example (enbw_cmc board): eth0: emac@1e20000 { compatible = "ti,davinci-dm6467-emac"; diff --git a/Documentation/devicetree/bindings/net/dsa/dsa.txt b/Documentation/devicetree/bindings/net/dsa/dsa.txt index c107d2848888..f66bb7ecdb82 100644 --- a/Documentation/devicetree/bindings/net/dsa/dsa.txt +++ b/Documentation/devicetree/bindings/net/dsa/dsa.txt @@ -65,9 +65,8 @@ properties, described in binding documents: Documentation/devicetree/bindings/net/fixed-link.txt for details. -- local-mac-address : See - Documentation/devicetree/bindings/net/ethernet.txt - for details. +The MAC address will be determined using the optional properties +defined in ethernet.txt. Example diff --git a/Documentation/devicetree/bindings/net/ethernet.txt b/Documentation/devicetree/bindings/net/ethernet.txt index a68621580584..699244428a28 100644 --- a/Documentation/devicetree/bindings/net/ethernet.txt +++ b/Documentation/devicetree/bindings/net/ethernet.txt @@ -4,12 +4,14 @@ NOTE: All 'phy*' properties documented below are Ethernet specific. For the generic PHY 'phys' property, see Documentation/devicetree/bindings/phy/phy-bindings.txt. 
-- local-mac-address: array of 6 bytes, specifies the MAC address that was - assigned to the network device; - mac-address: array of 6 bytes, specifies the MAC address that was last used by the boot program; should be used in cases where the MAC address assigned to the device by the boot program is different from the "local-mac-address" property; +- local-mac-address: array of 6 bytes, specifies the MAC address that was + assigned to the network device; +- nvmem-cells: phandle, reference to an nvmem node for the MAC address +- nvmem-cell-names: string, should be "mac-address" if nvmem is to be used - max-speed: number, specifies maximum speed in Mbit/s supported by the device; - max-frame-size: number, maximum transfer unit (IEEE defined MTU), rather than the maximum frame size (there's contradiction in the Devicetree diff --git a/Documentation/devicetree/bindings/net/hisilicon-femac.txt b/Documentation/devicetree/bindings/net/hisilicon-femac.txt index d11af5ecace8..5f96976f3cea 100644 --- a/Documentation/devicetree/bindings/net/hisilicon-femac.txt +++ b/Documentation/devicetree/bindings/net/hisilicon-femac.txt @@ -14,7 +14,6 @@ Required properties: the PHY reset signal(optional). - reset-names: should contain the reset signal name "mac"(required) and "phy"(optional). -- mac-address: see ethernet.txt [1]. - phy-mode: see ethernet.txt [1]. - phy-handle: see ethernet.txt [1]. - hisilicon,phy-reset-delays-us: triplet of delays if PHY reset signal given. @@ -22,6 +21,9 @@ Required properties: The 2nd cell is reset pulse in micro seconds. The 3rd cell is reset post-delay in micro seconds. +The MAC address will be determined using the optional properties +defined in ethernet.txt[1]. 
+ [1] Documentation/devicetree/bindings/net/ethernet.txt Example: diff --git a/Documentation/devicetree/bindings/net/hisilicon-hix5hd2-gmac.txt b/Documentation/devicetree/bindings/net/hisilicon-hix5hd2-gmac.txt index eea73adc678f..cddf46bf6b63 100644 --- a/Documentation/devicetree/bindings/net/hisilicon-hix5hd2-gmac.txt +++ b/Documentation/devicetree/bindings/net/hisilicon-hix5hd2-gmac.txt @@ -18,7 +18,6 @@ Required properties: - #size-cells: must be <0>. - phy-mode: see ethernet.txt [1]. - phy-handle: see ethernet.txt [1]. -- mac-address: see ethernet.txt [1]. - clocks: clock phandle and specifier pair. - clock-names: contain the clock name "mac_core"(required) and "mac_ifc"(optional). - resets: should contain the phandle to the MAC core reset signal(optional), @@ -31,6 +30,9 @@ Required properties: The 2nd cell is reset pulse in micro seconds. The 3rd cell is reset post-delay in micro seconds. +The MAC address will be determined using the properties defined in +ethernet.txt[1]. + - PHY subnode: inherits from phy binding [2] [1] Documentation/devicetree/bindings/net/ethernet.txt diff --git a/Documentation/devicetree/bindings/net/keystone-netcp.txt b/Documentation/devicetree/bindings/net/keystone-netcp.txt index 04ba1dc34fd6..3a65aabc76a2 100644 --- a/Documentation/devicetree/bindings/net/keystone-netcp.txt +++ b/Documentation/devicetree/bindings/net/keystone-netcp.txt @@ -135,14 +135,14 @@ Optional properties: are swapped. The netcp driver will swap the two DWORDs back to the proper order when this property is set to 2 when it obtains the mac address from efuse. -- local-mac-address: the driver is designed to use the of_get_mac_address api - only if efuse-mac is 0. When efuse-mac is 0, the MAC - address is obtained from local-mac-address. If this - attribute is not present, then the driver will use a - random MAC address. - "netcp-device label": phandle to the device specification for each of NetCP sub-module attached to this interface. 
+The MAC address will be determined using the optional properties defined in +ethernet.txt, as provided by the of_get_mac_address API and only if efuse-mac +is set to 0. If none of the optional MAC address properties is present, +then the driver will use a random MAC address. + Example binding: netcp: netcp@2000000 { diff --git a/Documentation/devicetree/bindings/net/macb.txt b/Documentation/devicetree/bindings/net/macb.txt index 8b80515729d7..9c5e94482b5f 100644 --- a/Documentation/devicetree/bindings/net/macb.txt +++ b/Documentation/devicetree/bindings/net/macb.txt @@ -26,9 +26,8 @@ Required properties: Optional elements: 'tsu_clk' - clocks: Phandles to input clocks. -Optional properties: -- nvmem-cells: phandle, reference to an nvmem node for the MAC address -- nvmem-cell-names: string, should be "mac-address" if nvmem is to be used +The MAC address will be determined using the optional properties +defined in ethernet.txt. Optional properties for PHY child node: - reset-gpios : Should specify the gpio for phy reset diff --git a/Documentation/devicetree/bindings/net/marvell-pxa168.txt b/Documentation/devicetree/bindings/net/marvell-pxa168.txt index 845a148a346e..5574af3554aa 100644 --- a/Documentation/devicetree/bindings/net/marvell-pxa168.txt +++ b/Documentation/devicetree/bindings/net/marvell-pxa168.txt @@ -11,7 +11,9 @@ Optional properties: - #address-cells: must be 1 when using sub-nodes. - #size-cells: must be 0 when using sub-nodes. - phy-handle: see ethernet.txt file in the same directory. -- local-mac-address: see ethernet.txt file in the same directory. + +The MAC address will be determined using the optional properties +defined in ethernet.txt. Sub-nodes: Each PHY can be represented as a sub-node. This is not mandatory.
diff --git a/Documentation/devicetree/bindings/net/microchip,enc28j60.txt b/Documentation/devicetree/bindings/net/microchip,enc28j60.txt index 24626e082b83..a8275921a896 100644 --- a/Documentation/devicetree/bindings/net/microchip,enc28j60.txt +++ b/Documentation/devicetree/bindings/net/microchip,enc28j60.txt @@ -21,8 +21,9 @@ Optional properties: - spi-max-frequency: Maximum frequency of the SPI bus when accessing the ENC28J60. According to the ENC28J80 datasheet, the chip allows a maximum of 20 MHz, however, board designs may need to limit this value. -- local-mac-address: See ethernet.txt in the same directory. +The MAC address will be determined using the optional properties +defined in ethernet.txt. Example (for NXP i.MX28 with pin control stuff for GPIO irq): diff --git a/Documentation/devicetree/bindings/net/microchip,lan78xx.txt b/Documentation/devicetree/bindings/net/microchip,lan78xx.txt index 76786a0f6d3d..11a679530ae6 100644 --- a/Documentation/devicetree/bindings/net/microchip,lan78xx.txt +++ b/Documentation/devicetree/bindings/net/microchip,lan78xx.txt @@ -7,9 +7,8 @@ The Device Tree properties, if present, override the OTP and EEPROM. Required properties: - compatible: Should be one of "usb424,7800", "usb424,7801" or "usb424,7850". -Optional properties: -- local-mac-address: see ethernet.txt -- mac-address: see ethernet.txt +The MAC address will be determined using the optional properties +defined in ethernet.txt. Optional properties of the embedded PHY: - microchip,led-modes: a 0..4 element vector, with each element configuring diff --git a/Documentation/devicetree/bindings/net/qca,qca7000.txt b/Documentation/devicetree/bindings/net/qca,qca7000.txt index e4a8a51086df..21c36e524993 100644 --- a/Documentation/devicetree/bindings/net/qca,qca7000.txt +++ b/Documentation/devicetree/bindings/net/qca,qca7000.txt @@ -23,7 +23,6 @@ Optional properties: Numbers smaller than 1000000 or greater than 16000000 are invalid. 
Missing the property will set the SPI frequency to 8000000 Hertz. -- local-mac-address : see ./ethernet.txt - qca,legacy-mode : Set the SPI data transfer of the QCA7000 to legacy mode. In this mode the SPI master must toggle the chip select between each data word. In burst mode these gaps aren't @@ -31,6 +30,9 @@ Optional properties: the QCA7000 is setup via GPIO pin strapping. If the property is missing the driver defaults to burst mode. +The MAC address will be determined using the optional properties +defined in ethernet.txt. + SPI Example: /* Freescale i.MX28 SPI master*/ diff --git a/Documentation/devicetree/bindings/net/samsung-sxgbe.txt b/Documentation/devicetree/bindings/net/samsung-sxgbe.txt index 46e591178911..2cff6d8a585a 100644 --- a/Documentation/devicetree/bindings/net/samsung-sxgbe.txt +++ b/Documentation/devicetree/bindings/net/samsung-sxgbe.txt @@ -21,10 +21,12 @@ Required properties: range. Optional properties: -- mac-address: 6 bytes, mac address - max-frame-size: Maximum Transfer Unit (IEEE defined MTU), rather than the maximum frame size. +The MAC address will be determined using the optional properties +defined in ethernet.txt. + Example: aliases { diff --git a/Documentation/devicetree/bindings/net/snps,dwc-qos-ethernet.txt b/Documentation/devicetree/bindings/net/snps,dwc-qos-ethernet.txt index 36f1aef585f0..ad3c6e109ce1 100644 --- a/Documentation/devicetree/bindings/net/snps,dwc-qos-ethernet.txt +++ b/Documentation/devicetree/bindings/net/snps,dwc-qos-ethernet.txt @@ -103,8 +103,6 @@ Required properties: Optional properties: - dma-coherent: Present if dma operations are coherent -- mac-address: See ethernet.txt in the same directory -- local-mac-address: See ethernet.txt in the same directory - phy-reset-gpios: Phandle and specifier for any GPIO used to reset the PHY. See ../gpio/gpio.txt. - snps,en-lpi: If present it enables use of the AXI low-power interface @@ -133,6 +131,9 @@ Optional properties: - device_type: Must be "ethernet-phy". 
- fixed-mode device tree subnode: see fixed-link.txt in the same directory +The MAC address will be determined using the optional properties +defined in ethernet.txt. + Examples: ethernet2@40010000 { clock-names = "phy_ref_clk", "apb_pclk"; diff --git a/Documentation/devicetree/bindings/net/socionext,uniphier-ave4.txt b/Documentation/devicetree/bindings/net/socionext,uniphier-ave4.txt index fc8f01718690..4e85fc495e87 100644 --- a/Documentation/devicetree/bindings/net/socionext,uniphier-ave4.txt +++ b/Documentation/devicetree/bindings/net/socionext,uniphier-ave4.txt @@ -31,8 +31,8 @@ Required properties: - socionext,syscon-phy-mode: A phandle to syscon with one argument that configures phy mode. The argument is the ID of MAC instance. -Optional properties: - - local-mac-address: See ethernet.txt in the same directory. +The MAC address will be determined using the optional properties +defined in ethernet.txt. Required subnode: - mdio: A container for child nodes representing phy nodes. diff --git a/Documentation/devicetree/bindings/net/socionext-netsec.txt b/Documentation/devicetree/bindings/net/socionext-netsec.txt index 0cff94fb0433..9d6c9feb12ff 100644 --- a/Documentation/devicetree/bindings/net/socionext-netsec.txt +++ b/Documentation/devicetree/bindings/net/socionext-netsec.txt @@ -26,11 +26,12 @@ Required properties: Optional properties: (See ethernet.txt file in the same directory) - dma-coherent: Boolean property, must only be present if memory accesses performed by the device are cache coherent. -- local-mac-address: See ethernet.txt in the same directory. -- mac-address: See ethernet.txt in the same directory. - max-speed: See ethernet.txt in the same directory. - max-frame-size: See ethernet.txt in the same directory. +The MAC address will be determined using the optional properties +defined in ethernet.txt. 
+ Example: eth0: ethernet@522d0000 { compatible = "socionext,synquacer-netsec"; diff --git a/Documentation/devicetree/bindings/net/wireless/mediatek,mt76.txt b/Documentation/devicetree/bindings/net/wireless/mediatek,mt76.txt index 7b9a776230c0..74665502f4cf 100644 --- a/Documentation/devicetree/bindings/net/wireless/mediatek,mt76.txt +++ b/Documentation/devicetree/bindings/net/wireless/mediatek,mt76.txt @@ -13,11 +13,12 @@ properties: Optional properties: -- mac-address: See ethernet.txt in the parent directory -- local-mac-address: See ethernet.txt in the parent directory - ieee80211-freq-limit: See ieee80211.txt - mediatek,mtd-eeprom: Specify a MTD partition + offset containing EEPROM data +The driver uses the of_get_mac_address API, so the MAC address can also +be set with the corresponding optional properties defined in net/ethernet.txt. + Optional nodes: - led: Properties for a connected LED Optional properties: diff --git a/Documentation/devicetree/bindings/net/wireless/qca,ath9k.txt b/Documentation/devicetree/bindings/net/wireless/qca,ath9k.txt index b7396c8c271c..aaaeeb5f935b 100644 --- a/Documentation/devicetree/bindings/net/wireless/qca,ath9k.txt +++ b/Documentation/devicetree/bindings/net/wireless/qca,ath9k.txt @@ -34,9 +34,9 @@ Optional properties: ath9k wireless chip (in this case the calibration / EEPROM data will be loaded from userspace using the kernel firmware loader). -- mac-address: See ethernet.txt in the parent directory -- local-mac-address: See ethernet.txt in the parent directory +The MAC address will be determined using the optional properties defined in +net/ethernet.txt.
In this example, the node is defined as child node of the PCI controller: &pci0 { -- cgit v1.2.3 From 0a58d471de3a34e435c5358cf533e74905eb0e7a Mon Sep 17 00:00:00 2001 From: Vladimir Oltean Date: Sun, 5 May 2019 13:19:29 +0300 Subject: Documentation: net: dsa: sja1105: Add info about supported traffic modes This adds a table which illustrates what combinations of management / regular traffic work depending on the state the switch ports are in. Signed-off-by: Vladimir Oltean Reviewed-by: Florian Fainelli Signed-off-by: David S. Miller --- Documentation/networking/dsa/sja1105.rst | 54 ++++++++++++++++++++++++++++++++ 1 file changed, 54 insertions(+) (limited to 'Documentation') diff --git a/Documentation/networking/dsa/sja1105.rst b/Documentation/networking/dsa/sja1105.rst index 7c13b40915c0..ea7bac438cfd 100644 --- a/Documentation/networking/dsa/sja1105.rst +++ b/Documentation/networking/dsa/sja1105.rst @@ -63,6 +63,38 @@ If that changed setting can be transmitted to the switch through the dynamic reconfiguration interface, it is; otherwise the switch is reset and reprogrammed with the updated static configuration. +Traffic support +=============== + +The switches do not support switch tagging in hardware. But they do support +customizing the TPID by which VLAN traffic is identified as such. The switch +driver is leveraging ``CONFIG_NET_DSA_TAG_8021Q`` by requesting that special +VLANs (with a custom TPID of ``ETH_P_EDSA`` instead of ``ETH_P_8021Q``) are +installed on its ports when not in ``vlan_filtering`` mode. This does not +interfere with the reception and transmission of real 802.1Q-tagged traffic, +because the switch no longer parses those packets as VLAN after the TPID +change. +The TPID is restored when ``vlan_filtering`` is requested by the user through +the bridge layer, and general IP termination is no longer possible through +the switch netdevices in this mode. + +The switches have two programmable filters for link-local destination MACs.
+These are used to trap BPDUs and PTP traffic to the master netdevice, and are +further used to support STP and 1588 ordinary clock/boundary clock +functionality. + +The following traffic modes are supported over the switch netdevices: + ++--------------------+------------+------------------+------------------+ +| | Standalone | Bridged with | Bridged with | +| | ports | vlan_filtering 0 | vlan_filtering 1 | ++====================+============+==================+==================+ +| Regular traffic | Yes | Yes | No (use master) | ++--------------------+------------+------------------+------------------+ +| Management traffic | Yes | Yes | Yes | +| (BPDU, PTP) | | | | ++--------------------+------------+------------------+------------------+ + Switching features ================== @@ -92,6 +124,28 @@ that VLAN awareness is global at the switch level is that once a bridge with ``vlan_filtering`` enslaves at least one switch port, the other un-bridged ports are no longer available for standalone traffic termination. +Topology and loop detection through STP is supported. + +L2 FDB manipulation (add/delete/dump) is currently possible for the first +generation devices. Aging time of FDB entries, as well as enabling fully static +management (no address learning and no flooding of unknown traffic) is not yet +configurable in the driver. + +A special comment about bridging with other netdevices (illustrated with an +example): + +A board has eth0, eth1, swp0@eth1, swp1@eth1, swp2@eth1, swp3@eth1. +The switch ports (swp0-3) are under br0. +It is desired that eth0 is turned into another switched port that communicates +with swp0-3. + +If br0 has vlan_filtering 0, then eth0 can simply be added to br0 with the +intended results. +If br0 has vlan_filtering 1, then a new br1 interface needs to be created that +enslaves eth0 and eth1 (the DSA master of the switch ports). 
This is because in +this mode, the switch ports beneath br0 are not capable of regular traffic, and +are only used as a conduit for switchdev operations. + Device Tree bindings and board design ===================================== -- cgit v1.2.3 From 822dd046d7e22a8d01728200a003da230e4c6f7f Mon Sep 17 00:00:00 2001 From: Maxime Chevallier Date: Tue, 7 May 2019 17:35:55 +0200 Subject: dt-bindings: net: Fix a typo in the phy-mode list for ethernet bindings The phy_mode "2000base-x" is actually supposed to be "1000base-x", even though the commit title of the original patch says otherwise. Fixes: 55601a880690 ("net: phy: Add 2000base-x, 2500base-x and rxaui modes") Signed-off-by: Maxime Chevallier Reviewed-by: Andrew Lunn Signed-off-by: David S. Miller --- Documentation/devicetree/bindings/net/ethernet.txt | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'Documentation') diff --git a/Documentation/devicetree/bindings/net/ethernet.txt b/Documentation/devicetree/bindings/net/ethernet.txt index a68621580584..d45b5b56fa39 100644 --- a/Documentation/devicetree/bindings/net/ethernet.txt +++ b/Documentation/devicetree/bindings/net/ethernet.txt @@ -36,7 +36,7 @@ Documentation/devicetree/bindings/phy/phy-bindings.txt. * "smii" * "xgmii" * "trgmii" - * "2000base-x", + * "1000base-x", * "2500base-x", * "rxaui" * "xaui" -- cgit v1.2.3