45 files changed, 1209 insertions, 140 deletions
diff --git a/Documentation/networking/scaling.rst b/Documentation/networking/scaling.rst index 8f0347b9fb3d..3d435caa3ef2 100644 --- a/Documentation/networking/scaling.rst +++ b/Documentation/networking/scaling.rst @@ -465,9 +465,9 @@ XPS Configuration ----------------- XPS is only available if the kconfig symbol CONFIG_XPS is enabled (on by -default for SMP). The functionality remains disabled until explicitly -configured. To enable XPS, the bitmap of CPUs/receive-queues that may -use a transmit queue is configured using the sysfs file entry: +default for SMP). If compiled in, it is driver dependent whether, and +how, XPS is configured at device init. The mapping of CPUs/receive-queues +to transmit queue can be inspected and configured using sysfs: For selection based on CPUs map:: diff --git a/drivers/net/can/m_can/m_can_platform.c b/drivers/net/can/m_can/m_can_platform.c index 38ea5e600fb8..e6d0cb9ee02f 100644 --- a/drivers/net/can/m_can/m_can_platform.c +++ b/drivers/net/can/m_can/m_can_platform.c @@ -144,8 +144,6 @@ static int __maybe_unused m_can_runtime_suspend(struct device *dev) struct net_device *ndev = dev_get_drvdata(dev); struct m_can_classdev *mcan_class = netdev_priv(ndev); - m_can_class_suspend(dev); - clk_disable_unprepare(mcan_class->cclk); clk_disable_unprepare(mcan_class->hclk); diff --git a/drivers/net/dsa/microchip/ksz_common.c b/drivers/net/dsa/microchip/ksz_common.c index 01f5784f69cb..0ef854911f21 100644 --- a/drivers/net/dsa/microchip/ksz_common.c +++ b/drivers/net/dsa/microchip/ksz_common.c @@ -103,14 +103,8 @@ void ksz_init_mib_timer(struct ksz_device *dev) INIT_DELAYED_WORK(&dev->mib_read, ksz_mib_read_work); - /* Read MIB counters every 30 seconds to avoid overflow. */ - dev->mib_read_interval = msecs_to_jiffies(30000); - for (i = 0; i < dev->mib_port_cnt; i++) dev->dev_ops->port_init_cnt(dev, i); - - /* Start the timer 2 seconds later. */ - schedule_delayed_work(&dev->mib_read, msecs_to_jiffies(2000)); } EXPORT_SYMBOL_GPL(ksz_init_mib_timer); @@ -143,7 +137,9 @@ void ksz_mac_link_down(struct dsa_switch *ds, int port, unsigned int mode, /* Read all MIB counters when the link is going down. */ p->read = true; - schedule_delayed_work(&dev->mib_read, 0); + /* timer started */ + if (dev->mib_read_interval) + schedule_delayed_work(&dev->mib_read, 0); } EXPORT_SYMBOL_GPL(ksz_mac_link_down); @@ -451,6 +447,12 @@ int ksz_switch_register(struct ksz_device *dev, return ret; } + /* Read MIB counters every 30 seconds to avoid overflow. */ + dev->mib_read_interval = msecs_to_jiffies(30000); + + /* Start the MIB timer. 
*/ + schedule_delayed_work(&dev->mib_read, 0); + return 0; } EXPORT_SYMBOL(ksz_switch_register); diff --git a/drivers/net/ethernet/chelsio/cxgb4/cxgb4_tc_flower.c b/drivers/net/ethernet/chelsio/cxgb4/cxgb4_tc_flower.c index f642c1b475c4..1b88bd1c2dbe 100644 --- a/drivers/net/ethernet/chelsio/cxgb4/cxgb4_tc_flower.c +++ b/drivers/net/ethernet/chelsio/cxgb4/cxgb4_tc_flower.c @@ -60,6 +60,89 @@ static struct ch_tc_pedit_fields pedits[] = { PEDIT_FIELDS(IP6_, DST_127_96, 4, nat_lip, 12), }; +static const struct cxgb4_natmode_config cxgb4_natmode_config_array[] = { + /* Default supported NAT modes */ + { + .chip = CHELSIO_T5, + .flags = CXGB4_ACTION_NATMODE_NONE, + .natmode = NAT_MODE_NONE, + }, + { + .chip = CHELSIO_T5, + .flags = CXGB4_ACTION_NATMODE_DIP, + .natmode = NAT_MODE_DIP, + }, + { + .chip = CHELSIO_T5, + .flags = CXGB4_ACTION_NATMODE_DIP | CXGB4_ACTION_NATMODE_DPORT, + .natmode = NAT_MODE_DIP_DP, + }, + { + .chip = CHELSIO_T5, + .flags = CXGB4_ACTION_NATMODE_DIP | CXGB4_ACTION_NATMODE_DPORT | + CXGB4_ACTION_NATMODE_SIP, + .natmode = NAT_MODE_DIP_DP_SIP, + }, + { + .chip = CHELSIO_T5, + .flags = CXGB4_ACTION_NATMODE_DIP | CXGB4_ACTION_NATMODE_DPORT | + CXGB4_ACTION_NATMODE_SPORT, + .natmode = NAT_MODE_DIP_DP_SP, + }, + { + .chip = CHELSIO_T5, + .flags = CXGB4_ACTION_NATMODE_SIP | CXGB4_ACTION_NATMODE_SPORT, + .natmode = NAT_MODE_SIP_SP, + }, + { + .chip = CHELSIO_T5, + .flags = CXGB4_ACTION_NATMODE_DIP | CXGB4_ACTION_NATMODE_SIP | + CXGB4_ACTION_NATMODE_SPORT, + .natmode = NAT_MODE_DIP_SIP_SP, + }, + { + .chip = CHELSIO_T5, + .flags = CXGB4_ACTION_NATMODE_DIP | CXGB4_ACTION_NATMODE_SIP | + CXGB4_ACTION_NATMODE_DPORT | + CXGB4_ACTION_NATMODE_SPORT, + .natmode = NAT_MODE_ALL, + }, + /* T6+ can ignore L4 ports when they're disabled. */ + { + .chip = CHELSIO_T6, + .flags = CXGB4_ACTION_NATMODE_SIP, + .natmode = NAT_MODE_SIP_SP, + }, + { + .chip = CHELSIO_T6, + .flags = CXGB4_ACTION_NATMODE_DIP | CXGB4_ACTION_NATMODE_SPORT, + .natmode = NAT_MODE_DIP_DP_SP, + }, + { + .chip = CHELSIO_T6, + .flags = CXGB4_ACTION_NATMODE_DIP | CXGB4_ACTION_NATMODE_SIP, + .natmode = NAT_MODE_ALL, + }, +}; + +static void cxgb4_action_natmode_tweak(struct ch_filter_specification *fs, + u8 natmode_flags) +{ + u8 i = 0; + + /* Translate the enabled NAT 4-tuple fields to one of the + * hardware supported NAT mode configurations. This ensures + * that we pick a valid combination, where the disabled fields + * do not get overwritten to 0. 
+ */ + for (i = 0; i < ARRAY_SIZE(cxgb4_natmode_config_array); i++) { + if (cxgb4_natmode_config_array[i].flags == natmode_flags) { + fs->nat_mode = cxgb4_natmode_config_array[i].natmode; + return; + } + } +} + static struct ch_tc_flower_entry *allocate_flower_entry(void) { struct ch_tc_flower_entry *new = kzalloc(sizeof(*new), GFP_KERNEL); @@ -289,7 +372,8 @@ static void offload_pedit(struct ch_filter_specification *fs, u32 val, u32 mask, } static void process_pedit_field(struct ch_filter_specification *fs, u32 val, - u32 mask, u32 offset, u8 htype) + u32 mask, u32 offset, u8 htype, + u8 *natmode_flags) { switch (htype) { case FLOW_ACT_MANGLE_HDR_TYPE_ETH: @@ -314,60 +398,94 @@ static void process_pedit_field(struct ch_filter_specification *fs, u32 val, switch (offset) { case PEDIT_IP4_SRC: offload_pedit(fs, val, mask, IP4_SRC); + *natmode_flags |= CXGB4_ACTION_NATMODE_SIP; break; case PEDIT_IP4_DST: offload_pedit(fs, val, mask, IP4_DST); + *natmode_flags |= CXGB4_ACTION_NATMODE_DIP; } - fs->nat_mode = NAT_MODE_ALL; break; case FLOW_ACT_MANGLE_HDR_TYPE_IP6: switch (offset) { case PEDIT_IP6_SRC_31_0: offload_pedit(fs, val, mask, IP6_SRC_31_0); + *natmode_flags |= CXGB4_ACTION_NATMODE_SIP; break; case PEDIT_IP6_SRC_63_32: offload_pedit(fs, val, mask, IP6_SRC_63_32); + *natmode_flags |= CXGB4_ACTION_NATMODE_SIP; break; case PEDIT_IP6_SRC_95_64: offload_pedit(fs, val, mask, IP6_SRC_95_64); + *natmode_flags |= CXGB4_ACTION_NATMODE_SIP; break; case PEDIT_IP6_SRC_127_96: offload_pedit(fs, val, mask, IP6_SRC_127_96); + *natmode_flags |= CXGB4_ACTION_NATMODE_SIP; break; case PEDIT_IP6_DST_31_0: offload_pedit(fs, val, mask, IP6_DST_31_0); + *natmode_flags |= CXGB4_ACTION_NATMODE_DIP; break; case PEDIT_IP6_DST_63_32: offload_pedit(fs, val, mask, IP6_DST_63_32); + *natmode_flags |= CXGB4_ACTION_NATMODE_DIP; break; case PEDIT_IP6_DST_95_64: offload_pedit(fs, val, mask, IP6_DST_95_64); + *natmode_flags |= CXGB4_ACTION_NATMODE_DIP; break; case PEDIT_IP6_DST_127_96: offload_pedit(fs, val, mask, IP6_DST_127_96); + *natmode_flags |= CXGB4_ACTION_NATMODE_DIP; } - fs->nat_mode = NAT_MODE_ALL; break; case FLOW_ACT_MANGLE_HDR_TYPE_TCP: switch (offset) { case PEDIT_TCP_SPORT_DPORT: - if (~mask & PEDIT_TCP_UDP_SPORT_MASK) + if (~mask & PEDIT_TCP_UDP_SPORT_MASK) { fs->nat_fport = val; - else + *natmode_flags |= CXGB4_ACTION_NATMODE_SPORT; + } else { fs->nat_lport = val >> 16; + *natmode_flags |= CXGB4_ACTION_NATMODE_DPORT; + } } - fs->nat_mode = NAT_MODE_ALL; break; case FLOW_ACT_MANGLE_HDR_TYPE_UDP: switch (offset) { case PEDIT_UDP_SPORT_DPORT: - if (~mask & PEDIT_TCP_UDP_SPORT_MASK) + if (~mask & PEDIT_TCP_UDP_SPORT_MASK) { fs->nat_fport = val; - else + *natmode_flags |= CXGB4_ACTION_NATMODE_SPORT; + } else { fs->nat_lport = val >> 16; + *natmode_flags |= CXGB4_ACTION_NATMODE_DPORT; + } } - fs->nat_mode = NAT_MODE_ALL; + break; + } +} + +static int cxgb4_action_natmode_validate(struct adapter *adap, u8 natmode_flags, + struct netlink_ext_ack *extack) +{ + u8 i = 0; + + /* Extract the NAT mode to enable based on what 4-tuple fields + * are enabled to be overwritten. This ensures that the + * disabled fields don't get overwritten to 0. 
+ */ + for (i = 0; i < ARRAY_SIZE(cxgb4_natmode_config_array); i++) { + const struct cxgb4_natmode_config *c; + + c = &cxgb4_natmode_config_array[i]; + if (CHELSIO_CHIP_VERSION(adap->params.chip) >= c->chip && + natmode_flags == c->flags) + return 0; } + NL_SET_ERR_MSG_MOD(extack, "Unsupported NAT mode 4-tuple combination"); + return -EOPNOTSUPP; } void cxgb4_process_flow_actions(struct net_device *in, @@ -375,6 +493,7 @@ void cxgb4_process_flow_actions(struct net_device *in, struct ch_filter_specification *fs) { struct flow_action_entry *act; + u8 natmode_flags = 0; int i; flow_action_for_each(i, act, actions) { @@ -426,7 +545,8 @@ void cxgb4_process_flow_actions(struct net_device *in, val = act->mangle.val; offset = act->mangle.offset; - process_pedit_field(fs, val, mask, offset, htype); + process_pedit_field(fs, val, mask, offset, htype, + &natmode_flags); } break; case FLOW_ACTION_QUEUE: @@ -438,6 +558,9 @@ void cxgb4_process_flow_actions(struct net_device *in, break; } } + if (natmode_flags) + cxgb4_action_natmode_tweak(fs, natmode_flags); + } static bool valid_l4_mask(u32 mask) @@ -454,7 +577,8 @@ static bool valid_l4_mask(u32 mask) } static bool valid_pedit_action(struct net_device *dev, - const struct flow_action_entry *act) + const struct flow_action_entry *act, + u8 *natmode_flags) { u32 mask, offset; u8 htype; @@ -479,7 +603,10 @@ static bool valid_pedit_action(struct net_device *dev, case FLOW_ACT_MANGLE_HDR_TYPE_IP4: switch (offset) { case PEDIT_IP4_SRC: + *natmode_flags |= CXGB4_ACTION_NATMODE_SIP; + break; case PEDIT_IP4_DST: + *natmode_flags |= CXGB4_ACTION_NATMODE_DIP; break; default: netdev_err(dev, "%s: Unsupported pedit field\n", @@ -493,10 +620,13 @@ static bool valid_pedit_action(struct net_device *dev, case PEDIT_IP6_SRC_63_32: case PEDIT_IP6_SRC_95_64: case PEDIT_IP6_SRC_127_96: + *natmode_flags |= CXGB4_ACTION_NATMODE_SIP; + break; case PEDIT_IP6_DST_31_0: case PEDIT_IP6_DST_63_32: case PEDIT_IP6_DST_95_64: case PEDIT_IP6_DST_127_96: + *natmode_flags |= CXGB4_ACTION_NATMODE_DIP; break; default: netdev_err(dev, "%s: Unsupported pedit field\n", @@ -512,6 +642,10 @@ static bool valid_pedit_action(struct net_device *dev, __func__); return false; } + if (~mask & PEDIT_TCP_UDP_SPORT_MASK) + *natmode_flags |= CXGB4_ACTION_NATMODE_SPORT; + else + *natmode_flags |= CXGB4_ACTION_NATMODE_DPORT; break; default: netdev_err(dev, "%s: Unsupported pedit field\n", @@ -527,6 +661,10 @@ static bool valid_pedit_action(struct net_device *dev, __func__); return false; } + if (~mask & PEDIT_TCP_UDP_SPORT_MASK) + *natmode_flags |= CXGB4_ACTION_NATMODE_SPORT; + else + *natmode_flags |= CXGB4_ACTION_NATMODE_DPORT; break; default: netdev_err(dev, "%s: Unsupported pedit field\n", @@ -546,10 +684,12 @@ int cxgb4_validate_flow_actions(struct net_device *dev, struct netlink_ext_ack *extack, u8 matchall_filter) { + struct adapter *adap = netdev2adap(dev); struct flow_action_entry *act; bool act_redir = false; bool act_pedit = false; bool act_vlan = false; + u8 natmode_flags = 0; int i; if (!flow_action_basic_hw_stats_check(actions, extack)) @@ -563,7 +703,6 @@ int cxgb4_validate_flow_actions(struct net_device *dev, break; case FLOW_ACTION_MIRRED: case FLOW_ACTION_REDIRECT: { - struct adapter *adap = netdev2adap(dev); struct net_device *n_dev, *target_dev; bool found = false; unsigned int i; @@ -620,7 +759,8 @@ int cxgb4_validate_flow_actions(struct net_device *dev, } break; case FLOW_ACTION_MANGLE: { - bool pedit_valid = valid_pedit_action(dev, act); + bool pedit_valid = valid_pedit_action(dev, 
act, + &natmode_flags); if (!pedit_valid) return -EOPNOTSUPP; @@ -642,6 +782,15 @@ int cxgb4_validate_flow_actions(struct net_device *dev, return -EINVAL; } + if (act_pedit) { + int ret; + + ret = cxgb4_action_natmode_validate(adap, natmode_flags, + extack); + if (ret) + return ret; + } + return 0; } diff --git a/drivers/net/ethernet/chelsio/cxgb4/cxgb4_tc_flower.h b/drivers/net/ethernet/chelsio/cxgb4/cxgb4_tc_flower.h index 6296e1d5a12b..3a2fa00c8cde 100644 --- a/drivers/net/ethernet/chelsio/cxgb4/cxgb4_tc_flower.h +++ b/drivers/net/ethernet/chelsio/cxgb4/cxgb4_tc_flower.h @@ -108,6 +108,21 @@ struct ch_tc_pedit_fields { #define PEDIT_TCP_SPORT_DPORT 0x0 #define PEDIT_UDP_SPORT_DPORT 0x0 +enum cxgb4_action_natmode_flags { + CXGB4_ACTION_NATMODE_NONE = 0, + CXGB4_ACTION_NATMODE_DIP = (1 << 0), + CXGB4_ACTION_NATMODE_SIP = (1 << 1), + CXGB4_ACTION_NATMODE_DPORT = (1 << 2), + CXGB4_ACTION_NATMODE_SPORT = (1 << 3), +}; + +/* TC PEDIT action to NATMODE translation entry */ +struct cxgb4_natmode_config { + enum chip_type chip; + u8 flags; + u8 natmode; +}; + void cxgb4_process_flow_actions(struct net_device *in, struct flow_action *actions, struct ch_filter_specification *fs); diff --git a/drivers/net/ethernet/freescale/fec_main.c b/drivers/net/ethernet/freescale/fec_main.c index c043afb38b6e..8f7eca1e7716 100644 --- a/drivers/net/ethernet/freescale/fec_main.c +++ b/drivers/net/ethernet/freescale/fec_main.c @@ -1912,6 +1912,27 @@ out: return ret; } +static void fec_enet_phy_reset_after_clk_enable(struct net_device *ndev) +{ + struct fec_enet_private *fep = netdev_priv(ndev); + struct phy_device *phy_dev = ndev->phydev; + + if (phy_dev) { + phy_reset_after_clk_enable(phy_dev); + } else if (fep->phy_node) { + /* + * If the PHY still is not bound to the MAC, but there is + * OF PHY node and a matching PHY device instance already, + * use the OF PHY node to obtain the PHY device instance, + * and then use that PHY device instance when triggering + * the PHY reset. + */ + phy_dev = of_phy_find_device(fep->phy_node); + phy_reset_after_clk_enable(phy_dev); + put_device(&phy_dev->mdio.dev); + } +} + static int fec_enet_clk_enable(struct net_device *ndev, bool enable) { struct fec_enet_private *fep = netdev_priv(ndev); @@ -1938,7 +1959,7 @@ static int fec_enet_clk_enable(struct net_device *ndev, bool enable) if (ret) goto failed_clk_ref; - phy_reset_after_clk_enable(ndev->phydev); + fec_enet_phy_reset_after_clk_enable(ndev); } else { clk_disable_unprepare(fep->clk_enet_out); if (fep->clk_ptp) { @@ -2983,16 +3004,16 @@ fec_enet_open(struct net_device *ndev) /* Init MAC prior to mii bus probe */ fec_restart(ndev); - /* Probe and connect to PHY when open the interface */ - ret = fec_enet_mii_probe(ndev); - if (ret) - goto err_enet_mii_probe; - /* Call phy_reset_after_clk_enable() again if it failed during * phy_reset_after_clk_enable() before because the PHY wasn't probed. 
*/ if (reset_again) - phy_reset_after_clk_enable(ndev->phydev); + fec_enet_phy_reset_after_clk_enable(ndev); + + /* Probe and connect to PHY when open the interface */ + ret = fec_enet_mii_probe(ndev); + if (ret) + goto err_enet_mii_probe; if (fep->quirks & FEC_QUIRK_ERR006687) imx6q_cpuidle_fec_irqs_used(); diff --git a/drivers/net/ethernet/ibm/ibmveth.c b/drivers/net/ethernet/ibm/ibmveth.c index c5c732601e35..7ef3369953b6 100644 --- a/drivers/net/ethernet/ibm/ibmveth.c +++ b/drivers/net/ethernet/ibm/ibmveth.c @@ -1349,6 +1349,7 @@ static int ibmveth_poll(struct napi_struct *napi, int budget) int offset = ibmveth_rxq_frame_offset(adapter); int csum_good = ibmveth_rxq_csum_good(adapter); int lrg_pkt = ibmveth_rxq_large_packet(adapter); + __sum16 iph_check = 0; skb = ibmveth_rxq_get_buffer(adapter); @@ -1385,16 +1386,26 @@ static int ibmveth_poll(struct napi_struct *napi, int budget) skb_put(skb, length); skb->protocol = eth_type_trans(skb, netdev); - if (csum_good) { - skb->ip_summed = CHECKSUM_UNNECESSARY; - ibmveth_rx_csum_helper(skb, adapter); + /* PHYP without PLSO support places a -1 in the ip + * checksum for large send frames. + */ + if (skb->protocol == cpu_to_be16(ETH_P_IP)) { + struct iphdr *iph = (struct iphdr *)skb->data; + + iph_check = iph->check; } - if (length > netdev->mtu + ETH_HLEN) { + if ((length > netdev->mtu + ETH_HLEN) || + lrg_pkt || iph_check == 0xffff) { ibmveth_rx_mss_helper(skb, mss, lrg_pkt); adapter->rx_large_packets++; } + if (csum_good) { + skb->ip_summed = CHECKSUM_UNNECESSARY; + ibmveth_rx_csum_helper(skb, adapter); + } + napi_gro_receive(napi, skb); /* send it up */ netdev->stats.rx_packets++; diff --git a/drivers/net/ethernet/korina.c b/drivers/net/ethernet/korina.c index 03e034918d14..af441d699a57 100644 --- a/drivers/net/ethernet/korina.c +++ b/drivers/net/ethernet/korina.c @@ -1113,7 +1113,7 @@ out: return rc; probe_err_register: - kfree(lp->td_ring); + kfree(KSEG0ADDR(lp->td_ring)); probe_err_td_ring: iounmap(lp->tx_dma_regs); probe_err_dma_tx: @@ -1133,6 +1133,7 @@ static int korina_remove(struct platform_device *pdev) iounmap(lp->eth_regs); iounmap(lp->rx_dma_regs); iounmap(lp->tx_dma_regs); + kfree(KSEG0ADDR(lp->td_ring)); unregister_netdev(bif->dev); free_netdev(bif->dev); diff --git a/drivers/net/ethernet/mellanox/mlx4/en_rx.c b/drivers/net/ethernet/mellanox/mlx4/en_rx.c index 99d7737e8ad6..502d1b97855c 100644 --- a/drivers/net/ethernet/mellanox/mlx4/en_rx.c +++ b/drivers/net/ethernet/mellanox/mlx4/en_rx.c @@ -943,6 +943,9 @@ int mlx4_en_poll_rx_cq(struct napi_struct *napi, int budget) bool clean_complete = true; int done; + if (!budget) + return 0; + if (priv->tx_ring_num[TX_XDP]) { xdp_tx_cq = priv->tx_cq[TX_XDP][cq->ring]; if (xdp_tx_cq->xdp_busy) { diff --git a/drivers/net/ethernet/mellanox/mlx4/en_tx.c b/drivers/net/ethernet/mellanox/mlx4/en_tx.c index 45869b29368f..3ddb7268e415 100644 --- a/drivers/net/ethernet/mellanox/mlx4/en_tx.c +++ b/drivers/net/ethernet/mellanox/mlx4/en_tx.c @@ -350,7 +350,7 @@ u32 mlx4_en_recycle_tx_desc(struct mlx4_en_priv *priv, .dma = tx_info->map0_dma, }; - if (!mlx4_en_rx_recycle(ring->recycle_ring, &frame)) { + if (!napi_mode || !mlx4_en_rx_recycle(ring->recycle_ring, &frame)) { dma_unmap_page(priv->ddev, tx_info->map0_dma, PAGE_SIZE, priv->dma_dir); put_page(tx_info->page); diff --git a/drivers/net/ethernet/ti/tlan.c b/drivers/net/ethernet/ti/tlan.c index 317fad787d78..267c080ee084 100644 --- a/drivers/net/ethernet/ti/tlan.c +++ b/drivers/net/ethernet/ti/tlan.c @@ -2504,7 +2504,7 @@ static void 
tlan_phy_power_down(struct net_device *dev) } /* Wait for 50 ms and powerup - * This is abitrary. It is intended to make sure the + * This is arbitrary. It is intended to make sure the * transceiver settles. */ tlan_set_timer(dev, msecs_to_jiffies(50), TLAN_TIMER_PHY_PUP); diff --git a/drivers/net/ipa/ipa_endpoint.c b/drivers/net/ipa/ipa_endpoint.c index 24d688e3cd93..b40b711cf4bd 100644 --- a/drivers/net/ipa/ipa_endpoint.c +++ b/drivers/net/ipa/ipa_endpoint.c @@ -1432,6 +1432,9 @@ void ipa_endpoint_resume_one(struct ipa_endpoint *endpoint) void ipa_endpoint_suspend(struct ipa *ipa) { + if (!ipa->setup_complete) + return; + if (ipa->modem_netdev) ipa_modem_suspend(ipa->modem_netdev); @@ -1443,6 +1446,9 @@ void ipa_endpoint_suspend(struct ipa *ipa) void ipa_endpoint_resume(struct ipa *ipa) { + if (!ipa->setup_complete) + return; + ipa_endpoint_resume_one(ipa->name_map[IPA_ENDPOINT_AP_COMMAND_TX]); ipa_endpoint_resume_one(ipa->name_map[IPA_ENDPOINT_AP_LAN_RX]); diff --git a/include/net/netfilter/nf_log.h b/include/net/netfilter/nf_log.h index 0d3920896d50..716db4a0fed8 100644 --- a/include/net/netfilter/nf_log.h +++ b/include/net/netfilter/nf_log.h @@ -108,6 +108,7 @@ int nf_log_dump_tcp_header(struct nf_log_buf *m, const struct sk_buff *skb, unsigned int logflags); void nf_log_dump_sk_uid_gid(struct net *net, struct nf_log_buf *m, struct sock *sk); +void nf_log_dump_vlan(struct nf_log_buf *m, const struct sk_buff *skb); void nf_log_dump_packet_common(struct nf_log_buf *m, u_int8_t pf, unsigned int hooknum, const struct sk_buff *skb, const struct net_device *in, diff --git a/include/net/tc_act/tc_tunnel_key.h b/include/net/tc_act/tc_tunnel_key.h index e1057b255f69..879fe8cff581 100644 --- a/include/net/tc_act/tc_tunnel_key.h +++ b/include/net/tc_act/tc_tunnel_key.h @@ -56,7 +56,10 @@ static inline struct ip_tunnel_info *tcf_tunnel_info(const struct tc_action *a) { #ifdef CONFIG_NET_CLS_ACT struct tcf_tunnel_key *t = to_tunnel_key(a); - struct tcf_tunnel_key_params *params = rtnl_dereference(t->params); + struct tcf_tunnel_key_params *params; + + params = rcu_dereference_protected(t->params, + lockdep_is_held(&a->tcfa_lock)); return ¶ms->tcft_enc_metadata->u.tun_info; #else diff --git a/include/net/tls.h b/include/net/tls.h index e5dac7e74e79..baf1e99d8193 100644 --- a/include/net/tls.h +++ b/include/net/tls.h @@ -679,10 +679,6 @@ int decrypt_skb(struct sock *sk, struct sk_buff *skb, struct scatterlist *sgout); struct sk_buff *tls_encrypt_skb(struct sk_buff *skb); -struct sk_buff *tls_validate_xmit_skb(struct sock *sk, - struct net_device *dev, - struct sk_buff *skb); - int tls_sw_fallback_init(struct sock *sk, struct tls_offload_context_tx *offload_ctx, struct tls_crypto_info *crypto_info); diff --git a/net/can/j1939/transport.c b/net/can/j1939/transport.c index 0cec4152f979..e09d087ba240 100644 --- a/net/can/j1939/transport.c +++ b/net/can/j1939/transport.c @@ -580,6 +580,7 @@ sk_buff *j1939_tp_tx_dat_new(struct j1939_priv *priv, skb->dev = priv->ndev; can_skb_reserve(skb); can_skb_prv(skb)->ifindex = priv->ndev->ifindex; + can_skb_prv(skb)->skbcnt = 0; /* reserve CAN header */ skb_reserve(skb, offsetof(struct can_frame, data)); @@ -1487,6 +1488,7 @@ j1939_session *j1939_session_fresh_new(struct j1939_priv *priv, skb->dev = priv->ndev; can_skb_reserve(skb); can_skb_prv(skb)->ifindex = priv->ndev->ifindex; + can_skb_prv(skb)->skbcnt = 0; skcb = j1939_skb_to_cb(skb); memcpy(skcb, rel_skcb, sizeof(*skcb)); diff --git a/net/core/sock.c b/net/core/sock.c index d9a537e6876a..4e8729357122 100644 
--- a/net/core/sock.c +++ b/net/core/sock.c @@ -757,7 +757,6 @@ static void __sock_set_timestamps(struct sock *sk, bool val, bool new, bool ns) } else { sock_reset_flag(sk, SOCK_RCVTSTAMP); sock_reset_flag(sk, SOCK_RCVTSTAMPNS); - sock_reset_flag(sk, SOCK_TSTAMP_NEW); } } @@ -994,8 +993,6 @@ set_sndbuf: __sock_set_timestamps(sk, valbool, true, true); break; case SO_TIMESTAMPING_NEW: - sock_set_flag(sk, SOCK_TSTAMP_NEW); - fallthrough; case SO_TIMESTAMPING_OLD: if (val & ~SOF_TIMESTAMPING_MASK) { ret = -EINVAL; @@ -1024,16 +1021,14 @@ set_sndbuf: } sk->sk_tsflags = val; + sock_valbool_flag(sk, SOCK_TSTAMP_NEW, optname == SO_TIMESTAMPING_NEW); + if (val & SOF_TIMESTAMPING_RX_SOFTWARE) sock_enable_timestamp(sk, SOCK_TIMESTAMPING_RX_SOFTWARE); - else { - if (optname == SO_TIMESTAMPING_NEW) - sock_reset_flag(sk, SOCK_TSTAMP_NEW); - + else sock_disable_timestamp(sk, (1UL << SOCK_TIMESTAMPING_RX_SOFTWARE)); - } break; case SO_RCVLOWAT: diff --git a/net/ipv4/icmp.c b/net/ipv4/icmp.c index 8f2e974a1e4d..e8d704de728c 100644 --- a/net/ipv4/icmp.c +++ b/net/ipv4/icmp.c @@ -457,6 +457,23 @@ out_bh_enable: local_bh_enable(); } +/* + * The device used for looking up which routing table to use for sending an ICMP + * error is preferably the source whenever it is set, which should ensure the + * icmp error can be sent to the source host, else lookup using the routing + * table of the destination device, else use the main routing table (index 0). + */ +static struct net_device *icmp_get_route_lookup_dev(struct sk_buff *skb) +{ + struct net_device *route_lookup_dev = NULL; + + if (skb->dev) + route_lookup_dev = skb->dev; + else if (skb_dst(skb)) + route_lookup_dev = skb_dst(skb)->dev; + return route_lookup_dev; +} + static struct rtable *icmp_route_lookup(struct net *net, struct flowi4 *fl4, struct sk_buff *skb_in, @@ -465,6 +482,7 @@ static struct rtable *icmp_route_lookup(struct net *net, int type, int code, struct icmp_bxm *param) { + struct net_device *route_lookup_dev; struct rtable *rt, *rt2; struct flowi4 fl4_dec; int err; @@ -479,7 +497,8 @@ static struct rtable *icmp_route_lookup(struct net *net, fl4->flowi4_proto = IPPROTO_ICMP; fl4->fl4_icmp_type = type; fl4->fl4_icmp_code = code; - fl4->flowi4_oif = l3mdev_master_ifindex(skb_dst(skb_in)->dev); + route_lookup_dev = icmp_get_route_lookup_dev(skb_in); + fl4->flowi4_oif = l3mdev_master_ifindex(route_lookup_dev); security_skb_classify_flow(skb_in, flowi4_to_flowi(fl4)); rt = ip_route_output_key_hash(net, fl4, skb_in); @@ -503,7 +522,7 @@ static struct rtable *icmp_route_lookup(struct net *net, if (err) goto relookup_failed; - if (inet_addr_type_dev_table(net, skb_dst(skb_in)->dev, + if (inet_addr_type_dev_table(net, route_lookup_dev, fl4_dec.saddr) == RTN_LOCAL) { rt2 = __ip_route_output_key(net, &fl4_dec); if (IS_ERR(rt2)) diff --git a/net/ipv4/ip_gre.c b/net/ipv4/ip_gre.c index 4e31f23e4117..e70291748889 100644 --- a/net/ipv4/ip_gre.c +++ b/net/ipv4/ip_gre.c @@ -625,9 +625,7 @@ static netdev_tx_t ipgre_xmit(struct sk_buff *skb, } if (dev->header_ops) { - /* Need space for new headers */ - if (skb_cow_head(skb, dev->needed_headroom - - (tunnel->hlen + sizeof(struct iphdr)))) + if (skb_cow_head(skb, 0)) goto free_skb; tnl_params = (const struct iphdr *)skb->data; @@ -748,7 +746,11 @@ static void ipgre_link_update(struct net_device *dev, bool set_mtu) len = tunnel->tun_hlen - len; tunnel->hlen = tunnel->hlen + len; - dev->needed_headroom = dev->needed_headroom + len; + if (dev->header_ops) + dev->hard_header_len += len; + else + dev->needed_headroom += 
len; + if (set_mtu) dev->mtu = max_t(int, dev->mtu - len, 68); @@ -944,6 +946,7 @@ static void __gre_tunnel_init(struct net_device *dev) tunnel->parms.iph.protocol = IPPROTO_GRE; tunnel->hlen = tunnel->tun_hlen + tunnel->encap_hlen; + dev->needed_headroom = tunnel->hlen + sizeof(tunnel->parms.iph); dev->features |= GRE_FEATURES; dev->hw_features |= GRE_FEATURES; @@ -987,10 +990,14 @@ static int ipgre_tunnel_init(struct net_device *dev) return -EINVAL; dev->flags = IFF_BROADCAST; dev->header_ops = &ipgre_header_ops; + dev->hard_header_len = tunnel->hlen + sizeof(*iph); + dev->needed_headroom = 0; } #endif } else if (!tunnel->collect_md) { dev->header_ops = &ipgre_header_ops; + dev->hard_header_len = tunnel->hlen + sizeof(*iph); + dev->needed_headroom = 0; } return ip_tunnel_init(dev); diff --git a/net/ipv4/netfilter/nf_log_arp.c b/net/ipv4/netfilter/nf_log_arp.c index 7a83f881efa9..136030ad2e54 100644 --- a/net/ipv4/netfilter/nf_log_arp.c +++ b/net/ipv4/netfilter/nf_log_arp.c @@ -43,16 +43,31 @@ static void dump_arp_packet(struct nf_log_buf *m, const struct nf_loginfo *info, const struct sk_buff *skb, unsigned int nhoff) { - const struct arphdr *ah; - struct arphdr _arph; const struct arppayload *ap; struct arppayload _arpp; + const struct arphdr *ah; + unsigned int logflags; + struct arphdr _arph; ah = skb_header_pointer(skb, 0, sizeof(_arph), &_arph); if (ah == NULL) { nf_log_buf_add(m, "TRUNCATED"); return; } + + if (info->type == NF_LOG_TYPE_LOG) + logflags = info->u.log.logflags; + else + logflags = NF_LOG_DEFAULT_MASK; + + if (logflags & NF_LOG_MACDECODE) { + nf_log_buf_add(m, "MACSRC=%pM MACDST=%pM ", + eth_hdr(skb)->h_source, eth_hdr(skb)->h_dest); + nf_log_dump_vlan(m, skb); + nf_log_buf_add(m, "MACPROTO=%04x ", + ntohs(eth_hdr(skb)->h_proto)); + } + nf_log_buf_add(m, "ARP HTYPE=%d PTYPE=0x%04x OPCODE=%d", ntohs(ah->ar_hrd), ntohs(ah->ar_pro), ntohs(ah->ar_op)); diff --git a/net/ipv4/netfilter/nf_log_ipv4.c b/net/ipv4/netfilter/nf_log_ipv4.c index 0c72156130b6..d07583fac8f8 100644 --- a/net/ipv4/netfilter/nf_log_ipv4.c +++ b/net/ipv4/netfilter/nf_log_ipv4.c @@ -284,8 +284,10 @@ static void dump_ipv4_mac_header(struct nf_log_buf *m, switch (dev->type) { case ARPHRD_ETHER: - nf_log_buf_add(m, "MACSRC=%pM MACDST=%pM MACPROTO=%04x ", - eth_hdr(skb)->h_source, eth_hdr(skb)->h_dest, + nf_log_buf_add(m, "MACSRC=%pM MACDST=%pM ", + eth_hdr(skb)->h_source, eth_hdr(skb)->h_dest); + nf_log_dump_vlan(m, skb); + nf_log_buf_add(m, "MACPROTO=%04x ", ntohs(eth_hdr(skb)->h_proto)); return; default: diff --git a/net/ipv4/route.c b/net/ipv4/route.c index d15a78b26dfa..dc2a399cd9f4 100644 --- a/net/ipv4/route.c +++ b/net/ipv4/route.c @@ -2770,10 +2770,12 @@ struct rtable *ip_route_output_flow(struct net *net, struct flowi4 *flp4, if (IS_ERR(rt)) return rt; - if (flp4->flowi4_proto) + if (flp4->flowi4_proto) { + flp4->flowi4_oif = rt->dst.dev->ifindex; rt = (struct rtable *)xfrm_lookup_route(net, &rt->dst, flowi4_to_flowi(flp4), sk, 0); + } return rt; } diff --git a/net/ipv6/icmp.c b/net/ipv6/icmp.c index a4e4912ad607..91209a2760aa 100644 --- a/net/ipv6/icmp.c +++ b/net/ipv6/icmp.c @@ -501,8 +501,11 @@ void icmp6_send(struct sk_buff *skb, u8 type, u8 code, __u32 info, if (__ipv6_addr_needs_scope_id(addr_type)) { iif = icmp6_iif(skb); } else { - dst = skb_dst(skb); - iif = l3mdev_master_ifindex(dst ? dst->dev : skb->dev); + /* + * The source device is used for looking up which routing table + * to use for sending an ICMP error. 
+ */ + iif = l3mdev_master_ifindex(skb->dev); } /* diff --git a/net/ipv6/ip6_fib.c b/net/ipv6/ip6_fib.c index 141c0a4c569a..605cdd38a919 100644 --- a/net/ipv6/ip6_fib.c +++ b/net/ipv6/ip6_fib.c @@ -2622,8 +2622,10 @@ static void *ipv6_route_seq_start(struct seq_file *seq, loff_t *pos) iter->skip = *pos; if (iter->tbl) { + loff_t p = 0; + ipv6_route_seq_setup_walk(iter, net); - return ipv6_route_seq_next(seq, NULL, pos); + return ipv6_route_seq_next(seq, NULL, &p); } else { return NULL; } diff --git a/net/ipv6/ip6_output.c b/net/ipv6/ip6_output.c index a2a65e327f49..5941bfbc7c47 100644 --- a/net/ipv6/ip6_output.c +++ b/net/ipv6/ip6_output.c @@ -468,8 +468,6 @@ int ip6_forward(struct sk_buff *skb) * check and decrement ttl */ if (hdr->hop_limit <= 1) { - /* Force OUTPUT device used as source address */ - skb->dev = dst->dev; icmpv6_send(skb, ICMPV6_TIME_EXCEED, ICMPV6_EXC_HOPLIMIT, 0); __IP6_INC_STATS(net, idev, IPSTATS_MIB_INHDRERRORS); diff --git a/net/ipv6/netfilter/nf_log_ipv6.c b/net/ipv6/netfilter/nf_log_ipv6.c index da64550a5707..8210ff34ed9b 100644 --- a/net/ipv6/netfilter/nf_log_ipv6.c +++ b/net/ipv6/netfilter/nf_log_ipv6.c @@ -297,9 +297,11 @@ static void dump_ipv6_mac_header(struct nf_log_buf *m, switch (dev->type) { case ARPHRD_ETHER: - nf_log_buf_add(m, "MACSRC=%pM MACDST=%pM MACPROTO=%04x ", - eth_hdr(skb)->h_source, eth_hdr(skb)->h_dest, - ntohs(eth_hdr(skb)->h_proto)); + nf_log_buf_add(m, "MACSRC=%pM MACDST=%pM ", + eth_hdr(skb)->h_source, eth_hdr(skb)->h_dest); + nf_log_dump_vlan(m, skb); + nf_log_buf_add(m, "MACPROTO=%04x ", + ntohs(eth_hdr(skb)->h_proto)); return; default: break; diff --git a/net/ipv6/route.c b/net/ipv6/route.c index bde6d48a9942..7e0ce7af8234 100644 --- a/net/ipv6/route.c +++ b/net/ipv6/route.c @@ -2745,7 +2745,8 @@ static void __ip6_rt_update_pmtu(struct dst_entry *dst, const struct sock *sk, if (confirm_neigh) dst_confirm_neigh(dst, daddr); - mtu = max_t(u32, mtu, IPV6_MIN_MTU); + if (mtu < IPV6_MIN_MTU) + return; if (mtu >= dst_mtu(dst)) return; diff --git a/net/mptcp/options.c b/net/mptcp/options.c index 4d35fa998fcc..092a2d48bfd3 100644 --- a/net/mptcp/options.c +++ b/net/mptcp/options.c @@ -517,7 +517,7 @@ static bool mptcp_established_options_dss(struct sock *sk, struct sk_buff *skb, return ret; } - if (subflow->use_64bit_ack) { + if (READ_ONCE(msk->use_64bit_ack)) { ack_size = TCPOLEN_MPTCP_DSS_ACK64; opts->ext_copy.data_ack = READ_ONCE(msk->ack_seq); opts->ext_copy.ack64 = 1; @@ -657,6 +657,12 @@ bool mptcp_established_options(struct sock *sk, struct sk_buff *skb, if (unlikely(mptcp_check_fallback(sk))) return false; + /* prevent adding of any MPTCP related options on reset packet + * until we support MP_TCPRST/MP_FASTCLOSE + */ + if (unlikely(skb && TCP_SKB_CB(skb)->tcp_flags & TCPHDR_RST)) + return false; + if (mptcp_established_options_mp(sk, skb, &opt_size, remaining, opts)) ret = true; else if (mptcp_established_options_dss(sk, skb, &opt_size, remaining, @@ -711,7 +717,7 @@ bool mptcp_synack_options(const struct request_sock *req, unsigned int *size, return false; } -static bool check_fully_established(struct mptcp_sock *msk, struct sock *sk, +static bool check_fully_established(struct mptcp_sock *msk, struct sock *ssk, struct mptcp_subflow_context *subflow, struct sk_buff *skb, struct mptcp_options_received *mp_opt) @@ -728,15 +734,20 @@ static bool check_fully_established(struct mptcp_sock *msk, struct sock *sk, TCP_SKB_CB(skb)->end_seq == TCP_SKB_CB(skb)->seq && subflow->mp_join && mp_opt->mp_join && READ_ONCE(msk->pm.server_side)) - 
tcp_send_ack(sk); + tcp_send_ack(ssk); goto fully_established; } - /* we should process OoO packets before the first subflow is fully - * established, but not expected for MP_JOIN subflows + /* we must process OoO packets before the first subflow is fully + * established. OoO packets are instead a protocol violation + * for MP_JOIN subflows as the peer must not send any data + * before receiving the forth ack - cfr. RFC 8684 section 3.2. */ - if (TCP_SKB_CB(skb)->seq != subflow->ssn_offset + 1) + if (TCP_SKB_CB(skb)->seq != subflow->ssn_offset + 1) { + if (subflow->mp_join) + goto reset; return subflow->mp_capable; + } if (mp_opt->dss && mp_opt->use_ack) { /* subflows are fully established as soon as we get any @@ -748,9 +759,12 @@ static bool check_fully_established(struct mptcp_sock *msk, struct sock *sk, } /* If the first established packet does not contain MP_CAPABLE + data - * then fallback to TCP + * then fallback to TCP. Fallback scenarios requires a reset for + * MP_JOIN subflows. */ if (!mp_opt->mp_capable) { + if (subflow->mp_join) + goto reset; subflow->mp_capable = 0; pr_fallback(msk); __mptcp_do_fallback(msk); @@ -767,12 +781,16 @@ fully_established: subflow->pm_notified = 1; if (subflow->mp_join) { - clear_3rdack_retransmission(sk); + clear_3rdack_retransmission(ssk); mptcp_pm_subflow_established(msk, subflow); } else { mptcp_pm_fully_established(msk); } return true; + +reset: + mptcp_subflow_reset(ssk); + return false; } static u64 expand_ack(u64 old_ack, u64 cur_ack, bool use_64bit) diff --git a/net/mptcp/protocol.c b/net/mptcp/protocol.c index 9816608da452..185dacb39781 100644 --- a/net/mptcp/protocol.c +++ b/net/mptcp/protocol.c @@ -1710,6 +1710,20 @@ static void pm_work(struct mptcp_sock *msk) spin_unlock_bh(&msk->pm.lock); } +static void __mptcp_close_subflow(struct mptcp_sock *msk) +{ + struct mptcp_subflow_context *subflow, *tmp; + + list_for_each_entry_safe(subflow, tmp, &msk->conn_list, node) { + struct sock *ssk = mptcp_subflow_tcp_sock(subflow); + + if (inet_sk_state_load(ssk) != TCP_CLOSE) + continue; + + __mptcp_close_ssk((struct sock *)msk, ssk, subflow, 0); + } +} + static void mptcp_worker(struct work_struct *work) { struct mptcp_sock *msk = container_of(work, struct mptcp_sock, work); @@ -1727,6 +1741,9 @@ static void mptcp_worker(struct work_struct *work) mptcp_clean_una(sk); mptcp_check_data_fin_ack(sk); __mptcp_flush_join_list(msk); + if (test_and_clear_bit(MPTCP_WORK_CLOSE_SUBFLOW, &msk->flags)) + __mptcp_close_subflow(msk); + __mptcp_move_skbs(msk); if (msk->pm.status) diff --git a/net/mptcp/protocol.h b/net/mptcp/protocol.h index aa0ab18d2e57..13ab89dc1914 100644 --- a/net/mptcp/protocol.h +++ b/net/mptcp/protocol.h @@ -90,6 +90,7 @@ #define MPTCP_WORK_RTX 2 #define MPTCP_WORK_EOF 3 #define MPTCP_FALLBACK_DONE 4 +#define MPTCP_WORK_CLOSE_SUBFLOW 5 struct mptcp_options_received { u64 sndr_key; @@ -211,6 +212,7 @@ struct mptcp_sock { bool fully_established; bool rcv_data_fin; bool snd_data_fin_enable; + bool use_64bit_ack; /* Set when we received a 64-bit DSN */ spinlock_t join_list_lock; struct work_struct work; struct sk_buff *ooo_last_skb; @@ -310,7 +312,6 @@ struct mptcp_subflow_context { mpc_map : 1, backup : 1, rx_eof : 1, - use_64bit_ack : 1, /* Set when we received a 64-bit DSN */ can_ack : 1; /* only after processing the remote a key */ enum mptcp_data_avail data_avail; u32 remote_nonce; @@ -369,6 +370,7 @@ void mptcp_subflow_shutdown(struct sock *sk, struct sock *ssk, int how); void __mptcp_close_ssk(struct sock *sk, struct sock *ssk, struct 
mptcp_subflow_context *subflow, long timeout); +void mptcp_subflow_reset(struct sock *ssk); /* called with sk socket lock held */ int __mptcp_subflow_connect(struct sock *sk, const struct mptcp_addr_info *loc, diff --git a/net/mptcp/subflow.c b/net/mptcp/subflow.c index 5d91e3a2cd30..ac4a1fe3550b 100644 --- a/net/mptcp/subflow.c +++ b/net/mptcp/subflow.c @@ -271,6 +271,19 @@ static bool subflow_thmac_valid(struct mptcp_subflow_context *subflow) return thmac == subflow->thmac; } +void mptcp_subflow_reset(struct sock *ssk) +{ + struct mptcp_subflow_context *subflow = mptcp_subflow_ctx(ssk); + struct sock *sk = subflow->conn; + + tcp_set_state(ssk, TCP_CLOSE); + tcp_send_active_reset(ssk, GFP_ATOMIC); + tcp_done(ssk); + if (!test_and_set_bit(MPTCP_WORK_CLOSE_SUBFLOW, &mptcp_sk(sk)->flags) && + schedule_work(&mptcp_sk(sk)->work)) + sock_hold(sk); +} + static void subflow_finish_connect(struct sock *sk, const struct sk_buff *skb) { struct mptcp_subflow_context *subflow = mptcp_subflow_ctx(sk); @@ -343,8 +356,7 @@ fallback: return; do_reset: - tcp_send_active_reset(sk, GFP_ATOMIC); - tcp_done(sk); + mptcp_subflow_reset(sk); } struct request_sock_ops mptcp_subflow_request_sock_ops; @@ -770,12 +782,11 @@ static enum mapping_status get_mapping_status(struct sock *ssk, if (!mpext->dsn64) { map_seq = expand_seq(subflow->map_seq, subflow->map_data_len, mpext->data_seq); - subflow->use_64bit_ack = 0; pr_debug("expanded seq=%llu", subflow->map_seq); } else { map_seq = mpext->data_seq; - subflow->use_64bit_ack = 1; } + WRITE_ONCE(mptcp_sk(subflow->conn)->use_64bit_ack, !!mpext->dsn64); if (subflow->map_valid) { /* Allow replacing only with an identical map */ diff --git a/net/netfilter/ipvs/ip_vs_xmit.c b/net/netfilter/ipvs/ip_vs_xmit.c index b00866d777fe..d2e5a8f644b8 100644 --- a/net/netfilter/ipvs/ip_vs_xmit.c +++ b/net/netfilter/ipvs/ip_vs_xmit.c @@ -609,6 +609,8 @@ static inline int ip_vs_tunnel_xmit_prepare(struct sk_buff *skb, if (ret == NF_ACCEPT) { nf_reset_ct(skb); skb_forward_csum(skb); + if (skb->dev) + skb->tstamp = 0; } return ret; } @@ -649,6 +651,8 @@ static inline int ip_vs_nat_send_or_cont(int pf, struct sk_buff *skb, if (!local) { skb_forward_csum(skb); + if (skb->dev) + skb->tstamp = 0; NF_HOOK(pf, NF_INET_LOCAL_OUT, cp->ipvs->net, NULL, skb, NULL, skb_dst(skb)->dev, dst_output); } else @@ -669,6 +673,8 @@ static inline int ip_vs_send_or_cont(int pf, struct sk_buff *skb, if (!local) { ip_vs_drop_early_demux_sk(skb); skb_forward_csum(skb); + if (skb->dev) + skb->tstamp = 0; NF_HOOK(pf, NF_INET_LOCAL_OUT, cp->ipvs->net, NULL, skb, NULL, skb_dst(skb)->dev, dst_output); } else diff --git a/net/netfilter/nf_log_common.c b/net/netfilter/nf_log_common.c index ae5628ddbe6d..fd7c5f0f5c25 100644 --- a/net/netfilter/nf_log_common.c +++ b/net/netfilter/nf_log_common.c @@ -171,6 +171,18 @@ nf_log_dump_packet_common(struct nf_log_buf *m, u_int8_t pf, } EXPORT_SYMBOL_GPL(nf_log_dump_packet_common); +void nf_log_dump_vlan(struct nf_log_buf *m, const struct sk_buff *skb) +{ + u16 vid; + + if (!skb_vlan_tag_present(skb)) + return; + + vid = skb_vlan_tag_get(skb); + nf_log_buf_add(m, "VPROTO=%04x VID=%u ", ntohs(skb->vlan_proto), vid); +} +EXPORT_SYMBOL_GPL(nf_log_dump_vlan); + /* bridge and netdev logging families share this code. 
*/ void nf_log_l2packet(struct net *net, u_int8_t pf, __be16 protocol, diff --git a/net/netfilter/nf_tables_api.c b/net/netfilter/nf_tables_api.c index 7f1c184c00d2..9957e0ed8658 100644 --- a/net/netfilter/nf_tables_api.c +++ b/net/netfilter/nf_tables_api.c @@ -2138,7 +2138,8 @@ static bool nft_hook_list_equal(struct list_head *hook_list1, } static int nf_tables_updchain(struct nft_ctx *ctx, u8 genmask, u8 policy, - u32 flags) + u32 flags, const struct nlattr *attr, + struct netlink_ext_ack *extack) { const struct nlattr * const *nla = ctx->nla; struct nft_table *table = ctx->table; @@ -2154,9 +2155,10 @@ static int nf_tables_updchain(struct nft_ctx *ctx, u8 genmask, u8 policy, return -EOPNOTSUPP; if (nla[NFTA_CHAIN_HOOK]) { - if (!nft_is_base_chain(chain)) + if (!nft_is_base_chain(chain)) { + NL_SET_BAD_ATTR(extack, attr); return -EEXIST; - + } err = nft_chain_parse_hook(ctx->net, nla, &hook, ctx->family, false); if (err < 0) @@ -2165,6 +2167,7 @@ static int nf_tables_updchain(struct nft_ctx *ctx, u8 genmask, u8 policy, basechain = nft_base_chain(chain); if (basechain->type != hook.type) { nft_chain_release_hook(&hook); + NL_SET_BAD_ATTR(extack, attr); return -EEXIST; } @@ -2172,6 +2175,7 @@ static int nf_tables_updchain(struct nft_ctx *ctx, u8 genmask, u8 policy, if (!nft_hook_list_equal(&basechain->hook_list, &hook.list)) { nft_chain_release_hook(&hook); + NL_SET_BAD_ATTR(extack, attr); return -EEXIST; } } else { @@ -2179,6 +2183,7 @@ static int nf_tables_updchain(struct nft_ctx *ctx, u8 genmask, u8 policy, if (ops->hooknum != hook.num || ops->priority != hook.priority) { nft_chain_release_hook(&hook); + NL_SET_BAD_ATTR(extack, attr); return -EEXIST; } } @@ -2191,8 +2196,10 @@ static int nf_tables_updchain(struct nft_ctx *ctx, u8 genmask, u8 policy, chain2 = nft_chain_lookup(ctx->net, table, nla[NFTA_CHAIN_NAME], genmask); - if (!IS_ERR(chain2)) + if (!IS_ERR(chain2)) { + NL_SET_BAD_ATTR(extack, nla[NFTA_CHAIN_NAME]); return -EEXIST; + } } if (nla[NFTA_CHAIN_COUNTERS]) { @@ -2235,6 +2242,7 @@ static int nf_tables_updchain(struct nft_ctx *ctx, u8 genmask, u8 policy, nft_trans_chain_update(tmp) && nft_trans_chain_name(tmp) && strcmp(name, nft_trans_chain_name(tmp)) == 0) { + NL_SET_BAD_ATTR(extack, nla[NFTA_CHAIN_NAME]); kfree(name); goto err; } @@ -2357,7 +2365,8 @@ static int nf_tables_newchain(struct net *net, struct sock *nlsk, return -EOPNOTSUPP; flags |= chain->flags & NFT_CHAIN_BASE; - return nf_tables_updchain(&ctx, genmask, policy, flags); + return nf_tables_updchain(&ctx, genmask, policy, flags, attr, + extack); } return nf_tables_addchain(&ctx, family, genmask, policy, flags); diff --git a/net/smc/smc_core.c b/net/smc/smc_core.c index f1dbb5025c0b..d790c43c473f 100644 --- a/net/smc/smc_core.c +++ b/net/smc/smc_core.c @@ -1596,7 +1596,7 @@ out: return rc; } -#define SMCD_DMBE_SIZES 7 /* 0 -> 16KB, 1 -> 32KB, .. 6 -> 1MB */ +#define SMCD_DMBE_SIZES 6 /* 0 -> 16KB, 1 -> 32KB, .. 6 -> 1MB */ static struct smc_buf_desc *smcd_new_buf_create(struct smc_link_group *lgr, bool is_dmb, int bufsize) @@ -1615,7 +1615,8 @@ static struct smc_buf_desc *smcd_new_buf_create(struct smc_link_group *lgr, rc = smc_ism_register_dmb(lgr, bufsize, buf_desc); if (rc) { kfree(buf_desc); - return (rc == -ENOMEM) ? ERR_PTR(-EAGAIN) : ERR_PTR(rc); + return (rc == -ENOMEM) ? ERR_PTR(-EAGAIN) : + ERR_PTR(-EIO); } buf_desc->pages = virt_to_page(buf_desc->cpu_addr); /* CDC header stored in buf. 
So, pretend it was smaller */ diff --git a/net/smc/smc_llc.c b/net/smc/smc_llc.c index 2db967f2fb50..273eaf1bfe49 100644 --- a/net/smc/smc_llc.c +++ b/net/smc/smc_llc.c @@ -233,8 +233,6 @@ static bool smc_llc_flow_start(struct smc_llc_flow *flow, default: flow->type = SMC_LLC_FLOW_NONE; } - if (qentry == lgr->delayed_event) - lgr->delayed_event = NULL; smc_llc_flow_qentry_set(flow, qentry); spin_unlock_bh(&lgr->llc_flow_lock); return true; @@ -1209,7 +1207,7 @@ static void smc_llc_process_srv_add_link(struct smc_link_group *lgr) /* enqueue a local add_link req to trigger a new add_link flow */ void smc_llc_add_link_local(struct smc_link *link) { - struct smc_llc_msg_add_link add_llc = {0}; + struct smc_llc_msg_add_link add_llc = {}; add_llc.hd.length = sizeof(add_llc); add_llc.hd.common.type = SMC_LLC_ADD_LINK; @@ -1242,7 +1240,7 @@ out: */ void smc_llc_srv_delete_link_local(struct smc_link *link, u8 del_link_id) { - struct smc_llc_msg_del_link del_llc = {0}; + struct smc_llc_msg_del_link del_llc = {}; del_llc.hd.length = sizeof(del_llc); del_llc.hd.common.type = SMC_LLC_DELETE_LINK; @@ -1314,7 +1312,7 @@ out: */ void smc_llc_send_link_delete_all(struct smc_link_group *lgr, bool ord, u32 rsn) { - struct smc_llc_msg_del_link delllc = {0}; + struct smc_llc_msg_del_link delllc = {}; int i; delllc.hd.common.type = SMC_LLC_DELETE_LINK; @@ -1603,13 +1601,12 @@ static void smc_llc_event_work(struct work_struct *work) struct smc_llc_qentry *qentry; if (!lgr->llc_flow_lcl.type && lgr->delayed_event) { - if (smc_link_usable(lgr->delayed_event->link)) { - smc_llc_event_handler(lgr->delayed_event); - } else { - qentry = lgr->delayed_event; - lgr->delayed_event = NULL; + qentry = lgr->delayed_event; + lgr->delayed_event = NULL; + if (smc_link_usable(qentry->link)) + smc_llc_event_handler(qentry); + else kfree(qentry); - } } again: diff --git a/net/tipc/msg.c b/net/tipc/msg.c index 11a429f4c6cd..2a78aa701572 100644 --- a/net/tipc/msg.c +++ b/net/tipc/msg.c @@ -150,7 +150,8 @@ int tipc_buf_append(struct sk_buff **headbuf, struct sk_buff **buf) if (fragid == FIRST_FRAGMENT) { if (unlikely(head)) goto err; - frag = skb_unshare(frag, GFP_ATOMIC); + if (skb_cloned(frag)) + frag = skb_copy(frag, GFP_ATOMIC); if (unlikely(!frag)) goto err; head = *headbuf = frag; diff --git a/net/tipc/name_distr.c b/net/tipc/name_distr.c index 2f9c148f17e2..fe4edce459ad 100644 --- a/net/tipc/name_distr.c +++ b/net/tipc/name_distr.c @@ -327,8 +327,13 @@ static struct sk_buff *tipc_named_dequeue(struct sk_buff_head *namedq, struct tipc_msg *hdr; u16 seqno; + spin_lock_bh(&namedq->lock); skb_queue_walk_safe(namedq, skb, tmp) { - skb_linearize(skb); + if (unlikely(skb_linearize(skb))) { + __skb_unlink(skb, namedq); + kfree_skb(skb); + continue; + } hdr = buf_msg(skb); seqno = msg_named_seqno(hdr); if (msg_is_last_bulk(hdr)) { @@ -338,12 +343,14 @@ static struct sk_buff *tipc_named_dequeue(struct sk_buff_head *namedq, if (msg_is_bulk(hdr) || msg_is_legacy(hdr)) { __skb_unlink(skb, namedq); + spin_unlock_bh(&namedq->lock); return skb; } if (*open && (*rcv_nxt == seqno)) { (*rcv_nxt)++; __skb_unlink(skb, namedq); + spin_unlock_bh(&namedq->lock); return skb; } @@ -353,6 +360,7 @@ static struct sk_buff *tipc_named_dequeue(struct sk_buff_head *namedq, continue; } } + spin_unlock_bh(&namedq->lock); return NULL; } diff --git a/net/tipc/node.c b/net/tipc/node.c index cf4b239fc569..d269ebe382e1 100644 --- a/net/tipc/node.c +++ b/net/tipc/node.c @@ -1496,7 +1496,7 @@ static void node_lost_contact(struct tipc_node *n, /* Clean up broadcast 
state */ tipc_bcast_remove_peer(n->net, n->bc_entry.link); - __skb_queue_purge(&n->bc_entry.namedq); + skb_queue_purge(&n->bc_entry.namedq); /* Abort any ongoing link failover */ for (i = 0; i < MAX_BEARERS; i++) { diff --git a/net/tls/tls_device.c b/net/tls/tls_device.c index b74e2741f74f..cec86229a6a0 100644 --- a/net/tls/tls_device.c +++ b/net/tls/tls_device.c @@ -418,14 +418,14 @@ static int tls_push_data(struct sock *sk, struct tls_context *tls_ctx = tls_get_ctx(sk); struct tls_prot_info *prot = &tls_ctx->prot_info; struct tls_offload_context_tx *ctx = tls_offload_ctx_tx(tls_ctx); - int more = flags & (MSG_SENDPAGE_NOTLAST | MSG_MORE); struct tls_record_info *record = ctx->open_record; int tls_push_record_flags; struct page_frag *pfrag; size_t orig_size = size; u32 max_open_record_len; - int copy, rc = 0; + bool more = false; bool done = false; + int copy, rc = 0; long timeo; if (flags & @@ -492,9 +492,8 @@ handle_error: if (!size) { last_record: tls_push_record_flags = flags; - if (more) { - tls_ctx->pending_open_record_frags = - !!record->num_frags; + if (flags & (MSG_SENDPAGE_NOTLAST | MSG_MORE)) { + more = true; break; } @@ -526,6 +525,8 @@ last_record: } } while (!done); + tls_ctx->pending_open_record_frags = more; + if (orig_size - size > 0) rc = orig_size - size; diff --git a/tools/testing/selftests/net/Makefile b/tools/testing/selftests/net/Makefile index 4773ce72edbd..ef352477cac6 100644 --- a/tools/testing/selftests/net/Makefile +++ b/tools/testing/selftests/net/Makefile @@ -20,6 +20,7 @@ TEST_PROGS += vrf-xfrm-tests.sh TEST_PROGS += rxtimestamp.sh TEST_PROGS += devlink_port_split.py TEST_PROGS += drop_monitor_tests.sh +TEST_PROGS += vrf_route_leaking.sh TEST_PROGS_EXTENDED := in_netns.sh TEST_GEN_FILES = socket nettest TEST_GEN_FILES += psock_fanout psock_tpacket msg_zerocopy reuseport_addr_any diff --git a/tools/testing/selftests/net/vrf_route_leaking.sh b/tools/testing/selftests/net/vrf_route_leaking.sh new file mode 100755 index 000000000000..23cf924754a5 --- /dev/null +++ b/tools/testing/selftests/net/vrf_route_leaking.sh @@ -0,0 +1,626 @@ +#!/bin/bash +# SPDX-License-Identifier: GPL-2.0 +# +# Copyright (c) 2019 David Ahern <dsahern@gmail.com>. All rights reserved. +# Copyright (c) 2020 Michael Jeanson <mjeanson@efficios.com>. All rights reserved. +# +# Requires CONFIG_NET_VRF, CONFIG_VETH, CONFIG_BRIDGE and CONFIG_NET_NS. +# +# +# Symmetric routing topology +# +# blue red +# +----+ .253 +----+ .253 +----+ +# | h1 |-------------------| r1 |-------------------| h2 | +# +----+ .1 +----+ .2 +----+ +# 172.16.1/24 172.16.2/24 +# 2001:db8:16:1/64 2001:db8:16:2/64 +# +# +# Route from h1 to h2 and back goes through r1, incoming vrf blue has a route +# to the outgoing vrf red for the n2 network and red has a route back to n1. +# The red VRF interface has a MTU of 1400. +# +# The first test sends a ping with a ttl of 1 from h1 to h2 and parses the +# output of the command to check that a ttl expired error is received. +# +# The second test runs traceroute from h1 to h2 and parses the output to check +# for a hop on r1. +# +# The third test sends a ping with a packet size of 1450 from h1 to h2 and +# parses the output of the command to check that a fragmentation error is +# received. +# +# +# Asymmetric routing topology +# +# This topology represents a customer setup where the issue with icmp errors +# and VRF route leaking was initialy reported. The MTU test isn't done here +# because of the lack of a return route in the red VRF. 
+# +# blue red +# .253 +----+ .253 +# +----| r1 |----+ +# | +----+ | +# +----+ | | +----+ +# | h1 |--------------+ +--------------| h2 | +# +----+ .1 | | .2 +----+ +# 172.16.1/24 | +----+ | 172.16.2/24 +# 2001:db8:16:1/64 +----| r2 |----+ 2001:db8:16:2/64 +# .254 +----+ .254 +# +# +# Route from h1 to h2 goes through r1, incoming vrf blue has a route to the +# outgoing vrf red for the n2 network but red doesn't have a route back to n1. +# Route from h2 to h1 goes through r2. +# +# The objective is to check that the incoming vrf routing table is selected +# to send an ICMP error back to the source when the ttl of a packet reaches 1 +# while it is forwarded between different vrfs. + +VERBOSE=0 +PAUSE_ON_FAIL=no +DEFAULT_TTYPE=sym + +H1_N1=172.16.1.0/24 +H1_N1_6=2001:db8:16:1::/64 + +H1_N1_IP=172.16.1.1 +R1_N1_IP=172.16.1.253 +R2_N1_IP=172.16.1.254 + +H1_N1_IP6=2001:db8:16:1::1 +R1_N1_IP6=2001:db8:16:1::253 +R2_N1_IP6=2001:db8:16:1::254 + +H2_N2=172.16.2.0/24 +H2_N2_6=2001:db8:16:2::/64 + +H2_N2_IP=172.16.2.2 +R1_N2_IP=172.16.2.253 +R2_N2_IP=172.16.2.254 + +H2_N2_IP6=2001:db8:16:2::2 +R1_N2_IP6=2001:db8:16:2::253 +R2_N2_IP6=2001:db8:16:2::254 + +################################################################################ +# helpers + +log_section() +{ + echo + echo "###########################################################################" + echo "$*" + echo "###########################################################################" + echo +} + +log_test() +{ + local rc=$1 + local expected=$2 + local msg="$3" + + if [ "${rc}" -eq "${expected}" ]; then + printf "TEST: %-60s [ OK ]\n" "${msg}" + nsuccess=$((nsuccess+1)) + else + ret=1 + nfail=$((nfail+1)) + printf "TEST: %-60s [FAIL]\n" "${msg}" + if [ "${PAUSE_ON_FAIL}" = "yes" ]; then + echo + echo "hit enter to continue, 'q' to quit" + read -r a + [ "$a" = "q" ] && exit 1 + fi + fi +} + +run_cmd() +{ + local cmd="$*" + local out + local rc + + if [ "$VERBOSE" = "1" ]; then + echo "COMMAND: $cmd" + fi + + # shellcheck disable=SC2086 + out=$(eval $cmd 2>&1) + rc=$? + if [ "$VERBOSE" = "1" ] && [ -n "$out" ]; then + echo "$out" + fi + + [ "$VERBOSE" = "1" ] && echo + + return $rc +} + +run_cmd_grep() +{ + local grep_pattern="$1" + shift + local cmd="$*" + local out + local rc + + if [ "$VERBOSE" = "1" ]; then + echo "COMMAND: $cmd" + fi + + # shellcheck disable=SC2086 + out=$(eval $cmd 2>&1) + if [ "$VERBOSE" = "1" ] && [ -n "$out" ]; then + echo "$out" + fi + + echo "$out" | grep -q "$grep_pattern" + rc=$? 
+ + [ "$VERBOSE" = "1" ] && echo + + return $rc +} + +################################################################################ +# setup and teardown + +cleanup() +{ + local ns + + for ns in h1 h2 r1 r2; do + ip netns del $ns 2>/dev/null + done +} + +setup_vrf() +{ + local ns=$1 + + ip -netns "${ns}" rule del pref 0 + ip -netns "${ns}" rule add pref 32765 from all lookup local + ip -netns "${ns}" -6 rule del pref 0 + ip -netns "${ns}" -6 rule add pref 32765 from all lookup local +} + +create_vrf() +{ + local ns=$1 + local vrf=$2 + local table=$3 + + ip -netns "${ns}" link add "${vrf}" type vrf table "${table}" + ip -netns "${ns}" link set "${vrf}" up + ip -netns "${ns}" route add vrf "${vrf}" unreachable default metric 8192 + ip -netns "${ns}" -6 route add vrf "${vrf}" unreachable default metric 8192 + + ip -netns "${ns}" addr add 127.0.0.1/8 dev "${vrf}" + ip -netns "${ns}" -6 addr add ::1 dev "${vrf}" nodad +} + +setup_sym() +{ + local ns + + # make sure we are starting with a clean slate + cleanup + + # + # create nodes as namespaces + # + for ns in h1 h2 r1; do + ip netns add $ns + ip -netns $ns link set lo up + + case "${ns}" in + h[12]) ip netns exec $ns sysctl -q -w net.ipv6.conf.all.forwarding=0 + ip netns exec $ns sysctl -q -w net.ipv6.conf.all.keep_addr_on_down=1 + ;; + r1) ip netns exec $ns sysctl -q -w net.ipv4.ip_forward=1 + ip netns exec $ns sysctl -q -w net.ipv6.conf.all.forwarding=1 + esac + done + + # + # create interconnects + # + ip -netns h1 link add eth0 type veth peer name r1h1 + ip -netns h1 link set r1h1 netns r1 name eth0 up + + ip -netns h2 link add eth0 type veth peer name r1h2 + ip -netns h2 link set r1h2 netns r1 name eth1 up + + # + # h1 + # + ip -netns h1 addr add dev eth0 ${H1_N1_IP}/24 + ip -netns h1 -6 addr add dev eth0 ${H1_N1_IP6}/64 nodad + ip -netns h1 link set eth0 up + + # h1 to h2 via r1 + ip -netns h1 route add ${H2_N2} via ${R1_N1_IP} dev eth0 + ip -netns h1 -6 route add ${H2_N2_6} via "${R1_N1_IP6}" dev eth0 + + # + # h2 + # + ip -netns h2 addr add dev eth0 ${H2_N2_IP}/24 + ip -netns h2 -6 addr add dev eth0 ${H2_N2_IP6}/64 nodad + ip -netns h2 link set eth0 up + + # h2 to h1 via r1 + ip -netns h2 route add default via ${R1_N2_IP} dev eth0 + ip -netns h2 -6 route add default via ${R1_N2_IP6} dev eth0 + + # + # r1 + # + setup_vrf r1 + create_vrf r1 blue 1101 + create_vrf r1 red 1102 + ip -netns r1 link set mtu 1400 dev eth1 + ip -netns r1 link set eth0 vrf blue up + ip -netns r1 link set eth1 vrf red up + ip -netns r1 addr add dev eth0 ${R1_N1_IP}/24 + ip -netns r1 -6 addr add dev eth0 ${R1_N1_IP6}/64 nodad + ip -netns r1 addr add dev eth1 ${R1_N2_IP}/24 + ip -netns r1 -6 addr add dev eth1 ${R1_N2_IP6}/64 nodad + + # Route leak from blue to red + ip -netns r1 route add vrf blue ${H2_N2} dev red + ip -netns r1 -6 route add vrf blue ${H2_N2_6} dev red + + # Route leak from red to blue + ip -netns r1 route add vrf red ${H1_N1} dev blue + ip -netns r1 -6 route add vrf red ${H1_N1_6} dev blue + + + # Wait for ip config to settle + sleep 2 +} + +setup_asym() +{ + local ns + + # make sure we are starting with a clean slate + cleanup + + # + # create nodes as namespaces + # + for ns in h1 h2 r1 r2; do + ip netns add $ns + ip -netns $ns link set lo up + + case "${ns}" in + h[12]) ip netns exec $ns sysctl -q -w net.ipv6.conf.all.forwarding=0 + ip netns exec $ns sysctl -q -w net.ipv6.conf.all.keep_addr_on_down=1 + ;; + r[12]) ip netns exec $ns sysctl -q -w net.ipv4.ip_forward=1 + ip netns exec $ns sysctl -q -w net.ipv6.conf.all.forwarding=1 + esac + 
done + + # + # create interconnects + # + ip -netns h1 link add eth0 type veth peer name r1h1 + ip -netns h1 link set r1h1 netns r1 name eth0 up + + ip -netns h1 link add eth1 type veth peer name r2h1 + ip -netns h1 link set r2h1 netns r2 name eth0 up + + ip -netns h2 link add eth0 type veth peer name r1h2 + ip -netns h2 link set r1h2 netns r1 name eth1 up + + ip -netns h2 link add eth1 type veth peer name r2h2 + ip -netns h2 link set r2h2 netns r2 name eth1 up + + # + # h1 + # + ip -netns h1 link add br0 type bridge + ip -netns h1 link set br0 up + ip -netns h1 addr add dev br0 ${H1_N1_IP}/24 + ip -netns h1 -6 addr add dev br0 ${H1_N1_IP6}/64 nodad + ip -netns h1 link set eth0 master br0 up + ip -netns h1 link set eth1 master br0 up + + # h1 to h2 via r1 + ip -netns h1 route add ${H2_N2} via ${R1_N1_IP} dev br0 + ip -netns h1 -6 route add ${H2_N2_6} via "${R1_N1_IP6}" dev br0 + + # + # h2 + # + ip -netns h2 link add br0 type bridge + ip -netns h2 link set br0 up + ip -netns h2 addr add dev br0 ${H2_N2_IP}/24 + ip -netns h2 -6 addr add dev br0 ${H2_N2_IP6}/64 nodad + ip -netns h2 link set eth0 master br0 up + ip -netns h2 link set eth1 master br0 up + + # h2 to h1 via r2 + ip -netns h2 route add default via ${R2_N2_IP} dev br0 + ip -netns h2 -6 route add default via ${R2_N2_IP6} dev br0 + + # + # r1 + # + setup_vrf r1 + create_vrf r1 blue 1101 + create_vrf r1 red 1102 + ip -netns r1 link set mtu 1400 dev eth1 + ip -netns r1 link set eth0 vrf blue up + ip -netns r1 link set eth1 vrf red up + ip -netns r1 addr add dev eth0 ${R1_N1_IP}/24 + ip -netns r1 -6 addr add dev eth0 ${R1_N1_IP6}/64 nodad + ip -netns r1 addr add dev eth1 ${R1_N2_IP}/24 + ip -netns r1 -6 addr add dev eth1 ${R1_N2_IP6}/64 nodad + + # Route leak from blue to red + ip -netns r1 route add vrf blue ${H2_N2} dev red + ip -netns r1 -6 route add vrf blue ${H2_N2_6} dev red + + # No route leak from red to blue + + # + # r2 + # + ip -netns r2 addr add dev eth0 ${R2_N1_IP}/24 + ip -netns r2 -6 addr add dev eth0 ${R2_N1_IP6}/64 nodad + ip -netns r2 addr add dev eth1 ${R2_N2_IP}/24 + ip -netns r2 -6 addr add dev eth1 ${R2_N2_IP6}/64 nodad + + # Wait for ip config to settle + sleep 2 +} + +check_connectivity() +{ + ip netns exec h1 ping -c1 -w1 ${H2_N2_IP} >/dev/null 2>&1 + log_test $? 0 "Basic IPv4 connectivity" + return $? +} + +check_connectivity6() +{ + ip netns exec h1 "${ping6}" -c1 -w1 ${H2_N2_IP6} >/dev/null 2>&1 + log_test $? 0 "Basic IPv6 connectivity" + return $? +} + +check_traceroute() +{ + if [ ! -x "$(command -v traceroute)" ]; then + echo "SKIP: Could not run IPV4 test without traceroute" + return 1 + fi +} + +check_traceroute6() +{ + if [ ! -x "$(command -v traceroute6)" ]; then + echo "SKIP: Could not run IPV6 test without traceroute6" + return 1 + fi +} + +ipv4_traceroute() +{ + local ttype="$1" + + [ "x$ttype" = "x" ] && ttype="$DEFAULT_TTYPE" + + log_section "IPv4 ($ttype route): VRF ICMP error route lookup traceroute" + + check_traceroute || return + + setup_"$ttype" + + check_connectivity || return + + run_cmd_grep "${R1_N1_IP}" ip netns exec h1 traceroute ${H2_N2_IP} + log_test $? 0 "Traceroute reports a hop on r1" +} + +ipv4_traceroute_asym() +{ + ipv4_traceroute asym +} + +ipv6_traceroute() +{ + local ttype="$1" + + [ "x$ttype" = "x" ] && ttype="$DEFAULT_TTYPE" + + log_section "IPv6 ($ttype route): VRF ICMP error route lookup traceroute" + + check_traceroute6 || return + + setup_"$ttype" + + check_connectivity6 || return + + run_cmd_grep "${R1_N1_IP6}" ip netns exec h1 traceroute6 ${H2_N2_IP6} + log_test $? 
0 "Traceroute6 reports a hop on r1" +} + +ipv6_traceroute_asym() +{ + ipv6_traceroute asym +} + +ipv4_ping_ttl() +{ + local ttype="$1" + + [ "x$ttype" = "x" ] && ttype="$DEFAULT_TTYPE" + + log_section "IPv4 ($ttype route): VRF ICMP ttl error route lookup ping" + + setup_"$ttype" + + check_connectivity || return + + run_cmd_grep "Time to live exceeded" ip netns exec h1 ping -t1 -c1 -W2 ${H2_N2_IP} + log_test $? 0 "Ping received ICMP ttl exceeded" +} + +ipv4_ping_ttl_asym() +{ + ipv4_ping_ttl asym +} + +ipv4_ping_frag() +{ + local ttype="$1" + + [ "x$ttype" = "x" ] && ttype="$DEFAULT_TTYPE" + + log_section "IPv4 ($ttype route): VRF ICMP fragmentation error route lookup ping" + + setup_"$ttype" + + check_connectivity || return + + run_cmd_grep "Frag needed" ip netns exec h1 ping -s 1450 -Mdo -c1 -W2 ${H2_N2_IP} + log_test $? 0 "Ping received ICMP Frag needed" +} + +ipv4_ping_frag_asym() +{ + ipv4_ping_frag asym +} + +ipv6_ping_ttl() +{ + local ttype="$1" + + [ "x$ttype" = "x" ] && ttype="$DEFAULT_TTYPE" + + log_section "IPv6 ($ttype route): VRF ICMP ttl error route lookup ping" + + setup_"$ttype" + + check_connectivity6 || return + + run_cmd_grep "Time exceeded: Hop limit" ip netns exec h1 "${ping6}" -t1 -c1 -W2 ${H2_N2_IP6} + log_test $? 0 "Ping received ICMP Hop limit" +} + +ipv6_ping_ttl_asym() +{ + ipv6_ping_ttl asym +} + +ipv6_ping_frag() +{ + local ttype="$1" + + [ "x$ttype" = "x" ] && ttype="$DEFAULT_TTYPE" + + log_section "IPv6 ($ttype route): VRF ICMP fragmentation error route lookup ping" + + setup_"$ttype" + + check_connectivity6 || return + + run_cmd_grep "Packet too big" ip netns exec h1 "${ping6}" -s 1450 -Mdo -c1 -W2 ${H2_N2_IP6} + log_test $? 0 "Ping received ICMP Packet too big" +} + +ipv6_ping_frag_asym() +{ + ipv6_ping_frag asym +} + +################################################################################ +# usage + +usage() +{ + cat <<EOF +usage: ${0##*/} OPTS + + -4 Run IPv4 tests only + -6 Run IPv6 tests only + -t TEST Run only TEST + -p Pause on fail + -v verbose mode (show commands and output) +EOF +} + +################################################################################ +# main + +# Some systems don't have a ping6 binary anymore +command -v ping6 > /dev/null 2>&1 && ping6=$(command -v ping6) || ping6=$(command -v ping) + +TESTS_IPV4="ipv4_ping_ttl ipv4_traceroute ipv4_ping_frag ipv4_ping_ttl_asym ipv4_traceroute_asym" +TESTS_IPV6="ipv6_ping_ttl ipv6_traceroute ipv6_ping_frag ipv6_ping_ttl_asym ipv6_traceroute_asym" + +ret=0 +nsuccess=0 +nfail=0 + +while getopts :46t:pvh o +do + case $o in + 4) TESTS=ipv4;; + 6) TESTS=ipv6;; + t) TESTS=$OPTARG;; + p) PAUSE_ON_FAIL=yes;; + v) VERBOSE=1;; + h) usage; exit 0;; + *) usage; exit 1;; + esac +done + +# +# show user test config +# +if [ -z "$TESTS" ]; then + TESTS="$TESTS_IPV4 $TESTS_IPV6" +elif [ "$TESTS" = "ipv4" ]; then + TESTS="$TESTS_IPV4" +elif [ "$TESTS" = "ipv6" ]; then + TESTS="$TESTS_IPV6" +fi + +for t in $TESTS +do + case $t in + ipv4_ping_ttl|ping) ipv4_ping_ttl;;& + ipv4_ping_ttl_asym|ping) ipv4_ping_ttl_asym;;& + ipv4_traceroute|traceroute) ipv4_traceroute;;& + ipv4_traceroute_asym|traceroute) ipv4_traceroute_asym;;& + ipv4_ping_frag|ping) ipv4_ping_frag;;& + + ipv6_ping_ttl|ping) ipv6_ping_ttl;;& + ipv6_ping_ttl_asym|ping) ipv6_ping_ttl_asym;;& + ipv6_traceroute|traceroute) ipv6_traceroute;;& + ipv6_traceroute_asym|traceroute) ipv6_traceroute_asym;;& + ipv6_ping_frag|ping) ipv6_ping_frag;;& + + # setup namespaces and config, but do not run any tests + setup_sym|setup) setup_sym; exit 0;; + 
setup_asym) setup_asym; exit 0;; + + help) echo "Test names: $TESTS"; exit 0;; + esac +done + +cleanup + +printf "\nTests passed: %3d\n" ${nsuccess} +printf "Tests failed: %3d\n" ${nfail} + +exit $ret diff --git a/tools/testing/selftests/netfilter/nf-queue.c b/tools/testing/selftests/netfilter/nf-queue.c index 29c73bce38fa..9e56b9d47037 100644 --- a/tools/testing/selftests/netfilter/nf-queue.c +++ b/tools/testing/selftests/netfilter/nf-queue.c @@ -17,9 +17,12 @@ struct options { bool count_packets; + bool gso_enabled; int verbose; unsigned int queue_num; unsigned int timeout; + uint32_t verdict; + uint32_t delay_ms; }; static unsigned int queue_stats[5]; @@ -27,7 +30,7 @@ static struct options opts; static void help(const char *p) { - printf("Usage: %s [-c|-v [-vv] ] [-t timeout] [-q queue_num]\n", p); + printf("Usage: %s [-c|-v [-vv] ] [-t timeout] [-q queue_num] [-Qdst_queue ] [ -d ms_delay ] [-G]\n", p); } static int parse_attr_cb(const struct nlattr *attr, void *data) @@ -162,7 +165,7 @@ nfq_build_cfg_params(char *buf, uint8_t mode, int range, int queue_num) } static struct nlmsghdr * -nfq_build_verdict(char *buf, int id, int queue_num, int verd) +nfq_build_verdict(char *buf, int id, int queue_num, uint32_t verd) { struct nfqnl_msg_verdict_hdr vh = { .verdict = htonl(verd), @@ -189,9 +192,6 @@ static void print_stats(void) unsigned int last, total; int i; - if (!opts.count_packets) - return; - total = 0; last = queue_stats[0]; @@ -234,7 +234,8 @@ struct mnl_socket *open_queue(void) nlh = nfq_build_cfg_params(buf, NFQNL_COPY_PACKET, 0xFFFF, queue_num); - flags = NFQA_CFG_F_GSO | NFQA_CFG_F_UID_GID; + flags = opts.gso_enabled ? NFQA_CFG_F_GSO : 0; + flags |= NFQA_CFG_F_UID_GID; mnl_attr_put_u32(nlh, NFQA_CFG_FLAGS, htonl(flags)); mnl_attr_put_u32(nlh, NFQA_CFG_MASK, htonl(flags)); @@ -255,6 +256,17 @@ struct mnl_socket *open_queue(void) return nl; } +static void sleep_ms(uint32_t delay) +{ + struct timespec ts = { .tv_sec = delay / 1000 }; + + delay %= 1000; + + ts.tv_nsec = delay * 1000llu * 1000llu; + + nanosleep(&ts, NULL); +} + static int mainloop(void) { unsigned int buflen = 64 * 1024 + MNL_SOCKET_BUFFER_SIZE; @@ -278,7 +290,7 @@ static int mainloop(void) ret = mnl_socket_recvfrom(nl, buf, buflen); if (ret == -1) { - if (errno == ENOBUFS) + if (errno == ENOBUFS || errno == EINTR) continue; if (errno == EAGAIN) { @@ -298,7 +310,10 @@ static int mainloop(void) } id = ret - MNL_CB_OK; - nlh = nfq_build_verdict(buf, id, opts.queue_num, NF_ACCEPT); + if (opts.delay_ms) + sleep_ms(opts.delay_ms); + + nlh = nfq_build_verdict(buf, id, opts.queue_num, opts.verdict); if (mnl_socket_sendto(nl, nlh, nlh->nlmsg_len) < 0) { perror("mnl_socket_sendto"); exit(EXIT_FAILURE); @@ -314,7 +329,7 @@ static void parse_opts(int argc, char **argv) { int c; - while ((c = getopt(argc, argv, "chvt:q:")) != -1) { + while ((c = getopt(argc, argv, "chvt:q:Q:d:G")) != -1) { switch (c) { case 'c': opts.count_packets = true; @@ -328,20 +343,48 @@ static void parse_opts(int argc, char **argv) if (opts.queue_num > 0xffff) opts.queue_num = 0; break; + case 'Q': + opts.verdict = atoi(optarg); + if (opts.verdict > 0xffff) { + fprintf(stderr, "Expected destination queue number\n"); + exit(1); + } + + opts.verdict <<= 16; + opts.verdict |= NF_QUEUE; + break; + case 'd': + opts.delay_ms = atoi(optarg); + if (opts.delay_ms == 0) { + fprintf(stderr, "Expected nonzero delay (in milliseconds)\n"); + exit(1); + } + break; case 't': opts.timeout = atoi(optarg); break; + case 'G': + opts.gso_enabled = false; + break; case 'v': 
opts.verbose++; break; } } + + if (opts.verdict != NF_ACCEPT && (opts.verdict >> 16 == opts.queue_num)) { + fprintf(stderr, "Cannot use same destination and source queue\n"); + exit(1); + } } int main(int argc, char *argv[]) { int ret; + opts.verdict = NF_ACCEPT; + opts.gso_enabled = true; + parse_opts(argc, argv); ret = mainloop(); diff --git a/tools/testing/selftests/netfilter/nft_meta.sh b/tools/testing/selftests/netfilter/nft_meta.sh index d250b84dd5bc..087f0e6e71ce 100755 --- a/tools/testing/selftests/netfilter/nft_meta.sh +++ b/tools/testing/selftests/netfilter/nft_meta.sh @@ -7,8 +7,7 @@ ksft_skip=4 sfx=$(mktemp -u "XXXXXXXX") ns0="ns0-$sfx" -nft --version > /dev/null 2>&1 -if [ $? -ne 0 ];then +if ! nft --version > /dev/null 2>&1; then echo "SKIP: Could not run test without nft tool" exit $ksft_skip fi @@ -24,6 +23,8 @@ ip -net "$ns0" addr add 127.0.0.1 dev lo trap cleanup EXIT +currentyear=$(date +%G) +lastyear=$((currentyear-1)) ip netns exec "$ns0" nft -f /dev/stdin <<EOF table inet filter { counter iifcount {} @@ -33,6 +34,9 @@ table inet filter { counter infproto4count {} counter il4protocounter {} counter imarkcounter {} + counter icpu0counter {} + counter ilastyearcounter {} + counter icurrentyearcounter {} counter oifcount {} counter oifnamecount {} @@ -54,6 +58,9 @@ table inet filter { meta nfproto ipv4 counter name "infproto4count" meta l4proto icmp counter name "il4protocounter" meta mark 42 counter name "imarkcounter" + meta cpu 0 counter name "icpu0counter" + meta time "$lastyear-01-01" - "$lastyear-12-31" counter name ilastyearcounter + meta time "$currentyear-01-01" - "$currentyear-12-31" counter name icurrentyearcounter } chain output { @@ -84,11 +91,10 @@ check_one_counter() local want="packets $2" local verbose="$3" - cnt=$(ip netns exec "$ns0" nft list counter inet filter $cname | grep -q "$want") - if [ $? -ne 0 ];then + if ! 
ip netns exec "$ns0" nft list counter inet filter $cname | grep -q "$want"; then echo "FAIL: $cname, want \"$want\", got" ret=1 - ip netns exec "$ns0" nft list counter inet filter $counter + ip netns exec "$ns0" nft list counter inet filter $cname fi } @@ -100,8 +106,7 @@ check_lo_counters() for counter in iifcount iifnamecount iifgroupcount iiftypecount infproto4count \ oifcount oifnamecount oifgroupcount oiftypecount onfproto4count \ - il4protocounter \ - ol4protocounter \ + il4protocounter icurrentyearcounter ol4protocounter \ ; do check_one_counter "$counter" "$want" "$verbose" done @@ -116,9 +121,22 @@ check_one_counter oskuidcounter "1" true check_one_counter oskgidcounter "1" true check_one_counter imarkcounter "1" true check_one_counter omarkcounter "1" true +check_one_counter ilastyearcounter "0" true if [ $ret -eq 0 ];then echo "OK: nftables meta iif/oif counters at expected values" +else + exit $ret +fi + +#First CPU execution and counter +taskset -p 01 $$ > /dev/null +ip netns exec "$ns0" nft reset counters > /dev/null +ip netns exec "$ns0" ping -q -c 1 127.0.0.1 > /dev/null +check_one_counter icpu0counter "2" true + +if [ $ret -eq 0 ];then + echo "OK: nftables meta cpu counter at expected values" fi exit $ret diff --git a/tools/testing/selftests/netfilter/nft_queue.sh b/tools/testing/selftests/netfilter/nft_queue.sh index 6898448b4266..3d202b90b33d 100755 --- a/tools/testing/selftests/netfilter/nft_queue.sh +++ b/tools/testing/selftests/netfilter/nft_queue.sh @@ -12,6 +12,7 @@ sfx=$(mktemp -u "XXXXXXXX") ns1="ns1-$sfx" ns2="ns2-$sfx" nsrouter="nsrouter-$sfx" +timeout=4 cleanup() { @@ -20,6 +21,7 @@ cleanup() ip netns del ${nsrouter} rm -f "$TMPFILE0" rm -f "$TMPFILE1" + rm -f "$TMPFILE2" "$TMPFILE3" } nft --version > /dev/null 2>&1 @@ -42,6 +44,8 @@ fi TMPFILE0=$(mktemp) TMPFILE1=$(mktemp) +TMPFILE2=$(mktemp) +TMPFILE3=$(mktemp) trap cleanup EXIT ip netns add ${ns1} @@ -83,7 +87,7 @@ load_ruleset() { local name=$1 local prio=$2 -ip netns exec ${nsrouter} nft -f - <<EOF +ip netns exec ${nsrouter} nft -f /dev/stdin <<EOF table inet $name { chain nfq { ip protocol icmp queue bypass @@ -118,7 +122,7 @@ EOF load_counter_ruleset() { local prio=$1 -ip netns exec ${nsrouter} nft -f - <<EOF +ip netns exec ${nsrouter} nft -f /dev/stdin <<EOF table inet countrules { chain pre { type filter hook prerouting priority $prio; policy accept; @@ -175,7 +179,7 @@ test_ping_router() { test_queue_blackhole() { local proto=$1 -ip netns exec ${nsrouter} nft -f - <<EOF +ip netns exec ${nsrouter} nft -f /dev/stdin <<EOF table $proto blackh { chain forward { type filter hook forward priority 0; policy accept; @@ -184,10 +188,10 @@ table $proto blackh { } EOF if [ $proto = "ip" ] ;then - ip netns exec ${ns1} ping -c 1 -q 10.0.2.99 > /dev/null + ip netns exec ${ns1} ping -W 2 -c 1 -q 10.0.2.99 > /dev/null lret=$? elif [ $proto = "ip6" ]; then - ip netns exec ${ns1} ping -c 1 -q dead:2::99 > /dev/null + ip netns exec ${ns1} ping -W 2 -c 1 -q dead:2::99 > /dev/null lret=$? else lret=111 @@ -214,8 +218,8 @@ test_queue() local last="" # spawn nf-queue listeners - ip netns exec ${nsrouter} ./nf-queue -c -q 0 -t 3 > "$TMPFILE0" & - ip netns exec ${nsrouter} ./nf-queue -c -q 1 -t 3 > "$TMPFILE1" & + ip netns exec ${nsrouter} ./nf-queue -c -q 0 -t $timeout > "$TMPFILE0" & + ip netns exec ${nsrouter} ./nf-queue -c -q 1 -t $timeout > "$TMPFILE1" & sleep 1 test_ping ret=$? 
@@ -250,11 +254,11 @@ test_queue()
 
 test_tcp_forward()
 {
-	ip netns exec ${nsrouter} ./nf-queue -q 2 -t 10 &
+	ip netns exec ${nsrouter} ./nf-queue -q 2 -t $timeout &
 	local nfqpid=$!
 
 	tmpfile=$(mktemp) || exit 1
-	dd conv=sparse status=none if=/dev/zero bs=1M count=100 of=$tmpfile
+	dd conv=sparse status=none if=/dev/zero bs=1M count=200 of=$tmpfile
 	ip netns exec ${ns2} nc -w 5 -l -p 12345 <"$tmpfile" >/dev/null &
 	local rpid=$!
 
@@ -270,15 +274,13 @@ test_tcp_forward()
 
 test_tcp_localhost()
 {
-	tc -net "${nsrouter}" qdisc add dev lo root netem loss random 1%
-
 	tmpfile=$(mktemp) || exit 1
-	dd conv=sparse status=none if=/dev/zero bs=1M count=900 of=$tmpfile
+	dd conv=sparse status=none if=/dev/zero bs=1M count=200 of=$tmpfile
 	ip netns exec ${nsrouter} nc -w 5 -l -p 12345 <"$tmpfile" >/dev/null &
 	local rpid=$!
 
-	ip netns exec ${nsrouter} ./nf-queue -q 3 -t 30 &
+	ip netns exec ${nsrouter} ./nf-queue -q 3 -t $timeout &
 	local nfqpid=$!
 
 	sleep 1
@@ -287,6 +289,47 @@ test_tcp_localhost()
 	wait $rpid
 	[ $? -eq 0 ] && echo "PASS: tcp via loopback"
 
+	wait 2>/dev/null
+}
+
+test_tcp_localhost_requeue()
+{
+ip netns exec ${nsrouter} nft -f /dev/stdin <<EOF
+flush ruleset
+table inet filter {
+	chain output {
+		type filter hook output priority 0; policy accept;
+		tcp dport 12345 limit rate 1/second burst 1 packets counter queue num 0
+	}
+	chain post {
+		type filter hook postrouting priority 0; policy accept;
+		tcp dport 12345 limit rate 1/second burst 1 packets counter queue num 0
+	}
+}
+EOF
+	tmpfile=$(mktemp) || exit 1
+	dd conv=sparse status=none if=/dev/zero bs=1M count=200 of=$tmpfile
+	ip netns exec ${nsrouter} nc -w 5 -l -p 12345 <"$tmpfile" >/dev/null &
+	local rpid=$!
+
+	ip netns exec ${nsrouter} ./nf-queue -c -q 1 -t $timeout > "$TMPFILE2" &
+
+	# Queue 0 is hit first (both hooks queue to num 0); after a short
+	# delay it re-queues each packet to the queue 1 listener above.
+	ip netns exec ${nsrouter} ./nf-queue -G -d 150 -c -q 0 -Q 1 -t $timeout > "$TMPFILE3" &
+
+	sleep 1
+	ip netns exec ${nsrouter} nc -w 5 127.0.0.1 12345 <"$tmpfile" > /dev/null
+	rm -f "$tmpfile"
+
+	wait
+
+	if ! diff -u "$TMPFILE2" "$TMPFILE3" ; then
+		echo "FAIL: lost packets during requeue?!" 1>&2
+		return
+	fi
+
+	echo "PASS: tcp via loopback and re-queueing"
 }
 
 ip netns exec ${nsrouter} sysctl net.ipv6.conf.all.forwarding=1 > /dev/null
@@ -328,5 +371,6 @@ test_queue 20
 
 test_tcp_forward
 test_tcp_localhost
+test_tcp_localhost_requeue
 
 exit $ret
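
For readers of the nf-queue changes above: the new -Q option builds a "re-queue to another queue" verdict by packing the destination queue number into the upper 16 bits of the verdict word (opts.verdict <<= 16; opts.verdict |= NF_QUEUE), and nfq_build_verdict() then emits that value as htonl(verd) in struct nfqnl_msg_verdict_hdr. A minimal standalone sketch of that encoding follows; the helper name requeue_verdict() is illustrative and not part of the patch.

/* Sketch of the NF_QUEUE re-queue verdict encoding used by nf-queue's -Q
 * option: destination queue number in the upper 16 bits, NF_QUEUE in the
 * low bits.  Illustrative only; not part of the patch above.
 */
#include <stdio.h>
#include <stdint.h>
#include <arpa/inet.h>		/* htonl() */
#include <linux/netfilter.h>	/* NF_ACCEPT, NF_QUEUE */

static uint32_t requeue_verdict(uint16_t dst_queue)
{
	/* same as: opts.verdict = dst_queue; opts.verdict <<= 16;
	 * opts.verdict |= NF_QUEUE;
	 */
	return ((uint32_t)dst_queue << 16) | NF_QUEUE;
}

int main(void)
{
	uint32_t verdict = requeue_verdict(1);	/* what "-Q 1" produces */

	/* nfq_build_verdict() places htonl() of this value into
	 * struct nfqnl_msg_verdict_hdr.verdict
	 */
	printf("re-queue verdict: 0x%08x (wire order 0x%08x)\n",
	       verdict, htonl(verdict));
	printf("plain NF_ACCEPT:  0x%08x\n", (uint32_t)NF_ACCEPT);
	return 0;
}

With this encoding, test_tcp_localhost_requeue() has the queue 0 listener turn every packet around to the queue 1 listener, and the test then diffs the two -c count outputs (TMPFILE2 vs TMPFILE3) to verify that no packet was lost in the hand-off.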