diff options
54 files changed, 1174 insertions, 437 deletions
diff --git a/MAINTAINERS b/MAINTAINERS index f72bf454516d..3f9b7d5382d7 100644 --- a/MAINTAINERS +++ b/MAINTAINERS @@ -8782,14 +8782,16 @@ F: drivers/net/ethernet/neterion/ NETFILTER M: Pablo Neira Ayuso <pablo@netfilter.org> M: Jozsef Kadlecsik <kadlec@blackhole.kfki.hu> +M: Florian Westphal <fw@strlen.de> L: netfilter-devel@vger.kernel.org L: coreteam@netfilter.org W: http://www.netfilter.org/ W: http://www.iptables.org/ +W: http://www.nftables.org/ Q: http://patchwork.ozlabs.org/project/netfilter-devel/list/ T: git git://git.kernel.org/pub/scm/linux/kernel/git/pablo/nf.git T: git git://git.kernel.org/pub/scm/linux/kernel/git/pablo/nf-next.git -S: Supported +S: Maintained F: include/linux/netfilter* F: include/linux/netfilter/ F: include/net/netfilter/ diff --git a/drivers/net/ethernet/aquantia/atlantic/aq_cfg.h b/drivers/net/ethernet/aquantia/atlantic/aq_cfg.h index 5f99237a9d52..214986436ece 100644 --- a/drivers/net/ethernet/aquantia/atlantic/aq_cfg.h +++ b/drivers/net/ethernet/aquantia/atlantic/aq_cfg.h @@ -68,7 +68,7 @@ #define AQ_CFG_DRV_AUTHOR "aQuantia" #define AQ_CFG_DRV_DESC "aQuantia Corporation(R) Network Driver" -#define AQ_CFG_DRV_NAME "aquantia" +#define AQ_CFG_DRV_NAME "atlantic" #define AQ_CFG_DRV_VERSION __stringify(NIC_MAJOR_DRIVER_VERSION)"."\ __stringify(NIC_MINOR_DRIVER_VERSION)"."\ __stringify(NIC_BUILD_DRIVER_VERSION)"."\ diff --git a/drivers/net/ethernet/broadcom/tg3.c b/drivers/net/ethernet/broadcom/tg3.c index f395b951f5e7..537d571ee601 100644 --- a/drivers/net/ethernet/broadcom/tg3.c +++ b/drivers/net/ethernet/broadcom/tg3.c @@ -11729,10 +11729,6 @@ static int tg3_close(struct net_device *dev) tg3_stop(tp); - /* Clear stats across close / open calls */ - memset(&tp->net_stats_prev, 0, sizeof(tp->net_stats_prev)); - memset(&tp->estats_prev, 0, sizeof(tp->estats_prev)); - if (pci_device_is_present(tp->pdev)) { tg3_power_down_prepare(tp); diff --git a/drivers/net/ethernet/ibm/ibmvnic.c b/drivers/net/ethernet/ibm/ibmvnic.c index 
4fcd2f0378ba..4f2d329dba99 100644 --- a/drivers/net/ethernet/ibm/ibmvnic.c +++ b/drivers/net/ethernet/ibm/ibmvnic.c @@ -194,7 +194,8 @@ static void free_long_term_buff(struct ibmvnic_adapter *adapter, if (!ltb->buff) return; - if (!adapter->failover) + if (adapter->reset_reason != VNIC_RESET_FAILOVER && + adapter->reset_reason != VNIC_RESET_MOBILITY) send_request_unmap(adapter, ltb->map_id); dma_free_coherent(dev, ltb->size, ltb->buff, ltb->addr); } @@ -292,9 +293,6 @@ static void replenish_pools(struct ibmvnic_adapter *adapter) { int i; - if (adapter->migrated) - return; - adapter->replenish_task_cycles++; for (i = 0; i < be32_to_cpu(adapter->login_rsp_buf->num_rxadd_subcrqs); i++) { @@ -350,7 +348,7 @@ static void release_rx_pools(struct ibmvnic_adapter *adapter) free_long_term_buff(adapter, &rx_pool->long_term_buff); if (!rx_pool->rx_buff) - continue; + continue; for (j = 0; j < rx_pool->size; j++) { if (rx_pool->rx_buff[j].skb) { @@ -554,11 +552,20 @@ static int ibmvnic_login(struct net_device *netdev) static void release_resources(struct ibmvnic_adapter *adapter) { + int i; + release_tx_pools(adapter); release_rx_pools(adapter); release_stats_token(adapter); release_error_buffers(adapter); + + if (adapter->napi) { + for (i = 0; i < adapter->req_rx_queues; i++) { + if (&adapter->napi[i]) + netif_napi_del(&adapter->napi[i]); + } + } } static int set_link_state(struct ibmvnic_adapter *adapter, u8 link_state) @@ -569,11 +576,6 @@ static int set_link_state(struct ibmvnic_adapter *adapter, u8 link_state) bool resend; int rc; - if (adapter->logical_link_state == link_state) { - netdev_dbg(netdev, "Link state already %d\n", link_state); - return 0; - } - netdev_err(netdev, "setting link state %d\n", link_state); memset(&crq, 0, sizeof(crq)); crq.logical_link_state.first = IBMVNIC_CRQ_CMD; @@ -624,22 +626,10 @@ static int set_real_num_queues(struct net_device *netdev) return rc; } -static int ibmvnic_open(struct net_device *netdev) +static int init_resources(struct 
ibmvnic_adapter *adapter) { - struct ibmvnic_adapter *adapter = netdev_priv(netdev); - struct device *dev = &adapter->vdev->dev; - int rc = 0; - int i; - - if (adapter->is_closed) { - rc = ibmvnic_init(adapter); - if (rc) - return rc; - } - - rc = ibmvnic_login(netdev); - if (rc) - return rc; + struct net_device *netdev = adapter->netdev; + int i, rc; rc = set_real_num_queues(netdev); if (rc) @@ -647,7 +637,7 @@ static int ibmvnic_open(struct net_device *netdev) rc = init_sub_crq_irqs(adapter); if (rc) { - dev_err(dev, "failed to initialize sub crq irqs\n"); + netdev_err(netdev, "failed to initialize sub crq irqs\n"); return -1; } @@ -659,90 +649,184 @@ static int ibmvnic_open(struct net_device *netdev) adapter->napi = kcalloc(adapter->req_rx_queues, sizeof(struct napi_struct), GFP_KERNEL); if (!adapter->napi) - goto ibmvnic_open_fail; + return -ENOMEM; + for (i = 0; i < adapter->req_rx_queues; i++) { netif_napi_add(netdev, &adapter->napi[i], ibmvnic_poll, NAPI_POLL_WEIGHT); - napi_enable(&adapter->napi[i]); } send_map_query(adapter); rc = init_rx_pools(netdev); if (rc) - goto ibmvnic_open_fail; + return rc; rc = init_tx_pools(netdev); - if (rc) - goto ibmvnic_open_fail; + return rc; +} + +static int __ibmvnic_open(struct net_device *netdev) +{ + struct ibmvnic_adapter *adapter = netdev_priv(netdev); + enum vnic_state prev_state = adapter->state; + int i, rc; + adapter->state = VNIC_OPENING; replenish_pools(adapter); + for (i = 0; i < adapter->req_rx_queues; i++) + napi_enable(&adapter->napi[i]); + /* We're ready to receive frames, enable the sub-crq interrupts and * set the logical link state to up */ - for (i = 0; i < adapter->req_rx_queues; i++) - enable_scrq_irq(adapter, adapter->rx_scrq[i]); + for (i = 0; i < adapter->req_rx_queues; i++) { + if (prev_state == VNIC_CLOSED) + enable_irq(adapter->rx_scrq[i]->irq); + else + enable_scrq_irq(adapter, adapter->rx_scrq[i]); + } - for (i = 0; i < adapter->req_tx_queues; i++) - enable_scrq_irq(adapter, 
adapter->tx_scrq[i]); + for (i = 0; i < adapter->req_tx_queues; i++) { + if (prev_state == VNIC_CLOSED) + enable_irq(adapter->tx_scrq[i]->irq); + else + enable_scrq_irq(adapter, adapter->tx_scrq[i]); + } rc = set_link_state(adapter, IBMVNIC_LOGICAL_LNK_UP); - if (rc) - goto ibmvnic_open_fail; + if (rc) { + for (i = 0; i < adapter->req_rx_queues; i++) + napi_disable(&adapter->napi[i]); + release_resources(adapter); + return rc; + } netif_tx_start_all_queues(netdev); - adapter->is_closed = false; - return 0; + if (prev_state == VNIC_CLOSED) { + for (i = 0; i < adapter->req_rx_queues; i++) + napi_schedule(&adapter->napi[i]); + } -ibmvnic_open_fail: - for (i = 0; i < adapter->req_rx_queues; i++) - napi_disable(&adapter->napi[i]); - release_resources(adapter); - return -ENOMEM; + adapter->state = VNIC_OPEN; + return rc; } -static void disable_sub_crqs(struct ibmvnic_adapter *adapter) +static int ibmvnic_open(struct net_device *netdev) { - int i; + struct ibmvnic_adapter *adapter = netdev_priv(netdev); + int rc; - if (adapter->tx_scrq) { - for (i = 0; i < adapter->req_tx_queues; i++) - if (adapter->tx_scrq[i]) - disable_irq(adapter->tx_scrq[i]->irq); + mutex_lock(&adapter->reset_lock); + + if (adapter->state != VNIC_CLOSED) { + rc = ibmvnic_login(netdev); + if (rc) { + mutex_unlock(&adapter->reset_lock); + return rc; + } + + rc = init_resources(adapter); + if (rc) { + netdev_err(netdev, "failed to initialize resources\n"); + release_resources(adapter); + mutex_unlock(&adapter->reset_lock); + return rc; + } } - if (adapter->rx_scrq) { - for (i = 0; i < adapter->req_rx_queues; i++) - if (adapter->rx_scrq[i]) - disable_irq(adapter->rx_scrq[i]->irq); + rc = __ibmvnic_open(netdev); + mutex_unlock(&adapter->reset_lock); + + return rc; +} + +static void clean_tx_pools(struct ibmvnic_adapter *adapter) +{ + struct ibmvnic_tx_pool *tx_pool; + u64 tx_entries; + int tx_scrqs; + int i, j; + + if (!adapter->tx_pool) + return; + + tx_scrqs = 
be32_to_cpu(adapter->login_rsp_buf->num_txsubm_subcrqs); + tx_entries = adapter->req_tx_entries_per_subcrq; + + /* Free any remaining skbs in the tx buffer pools */ + for (i = 0; i < tx_scrqs; i++) { + tx_pool = &adapter->tx_pool[i]; + if (!tx_pool) + continue; + + for (j = 0; j < tx_entries; j++) { + if (tx_pool->tx_buff[j].skb) { + dev_kfree_skb_any(tx_pool->tx_buff[j].skb); + tx_pool->tx_buff[j].skb = NULL; + } + } } } -static int ibmvnic_close(struct net_device *netdev) +static int __ibmvnic_close(struct net_device *netdev) { struct ibmvnic_adapter *adapter = netdev_priv(netdev); int rc = 0; int i; - adapter->closing = true; - disable_sub_crqs(adapter); + adapter->state = VNIC_CLOSING; + netif_tx_stop_all_queues(netdev); if (adapter->napi) { for (i = 0; i < adapter->req_rx_queues; i++) napi_disable(&adapter->napi[i]); } - if (!adapter->failover) - netif_tx_stop_all_queues(netdev); + clean_tx_pools(adapter); + + if (adapter->tx_scrq) { + for (i = 0; i < adapter->req_tx_queues; i++) + if (adapter->tx_scrq[i]->irq) + disable_irq(adapter->tx_scrq[i]->irq); + } rc = set_link_state(adapter, IBMVNIC_LOGICAL_LNK_DN); + if (rc) + return rc; - release_resources(adapter); + if (adapter->rx_scrq) { + for (i = 0; i < adapter->req_rx_queues; i++) { + int retries = 10; + + while (pending_scrq(adapter, adapter->rx_scrq[i])) { + retries--; + mdelay(100); + + if (retries == 0) + break; + } + + if (adapter->rx_scrq[i]->irq) + disable_irq(adapter->rx_scrq[i]->irq); + } + } + + adapter->state = VNIC_CLOSED; + return rc; +} + +static int ibmvnic_close(struct net_device *netdev) +{ + struct ibmvnic_adapter *adapter = netdev_priv(netdev); + int rc; + + mutex_lock(&adapter->reset_lock); + rc = __ibmvnic_close(netdev); + mutex_unlock(&adapter->reset_lock); - adapter->is_closed = true; - adapter->closing = false; return rc; } @@ -901,13 +985,7 @@ static int ibmvnic_xmit(struct sk_buff *skb, struct net_device *netdev) int index = 0; int ret = 0; - tx_pool = &adapter->tx_pool[queue_num]; - 
tx_scrq = adapter->tx_scrq[queue_num]; - txq = netdev_get_tx_queue(netdev, skb_get_queue_mapping(skb)); - handle_array = (u64 *)((u8 *)(adapter->login_rsp_buf) + - be32_to_cpu(adapter->login_rsp_buf-> - off_txsubm_subcrqs)); - if (adapter->migrated) { + if (adapter->resetting) { if (!netif_subqueue_stopped(netdev, skb)) netif_stop_subqueue(netdev, queue_num); dev_kfree_skb_any(skb); @@ -918,6 +996,12 @@ static int ibmvnic_xmit(struct sk_buff *skb, struct net_device *netdev) goto out; } + tx_pool = &adapter->tx_pool[queue_num]; + tx_scrq = adapter->tx_scrq[queue_num]; + txq = netdev_get_tx_queue(netdev, skb_get_queue_mapping(skb)); + handle_array = (u64 *)((u8 *)(adapter->login_rsp_buf) + + be32_to_cpu(adapter->login_rsp_buf->off_txsubm_subcrqs)); + index = tx_pool->free_map[tx_pool->consumer_index]; offset = index * adapter->req_mtu; dst = tx_pool->long_term_buff.buff + offset; @@ -1099,18 +1183,185 @@ static int ibmvnic_set_mac(struct net_device *netdev, void *p) return 0; } -static void ibmvnic_tx_timeout(struct net_device *dev) +/** + * do_reset returns zero if we are able to keep processing reset events, or + * non-zero if we hit a fatal error and must halt. + */ +static int do_reset(struct ibmvnic_adapter *adapter, + struct ibmvnic_rwi *rwi, u32 reset_state) { - struct ibmvnic_adapter *adapter = netdev_priv(dev); - int rc; + struct net_device *netdev = adapter->netdev; + int i, rc; + + netif_carrier_off(netdev); + adapter->reset_reason = rwi->reset_reason; + + if (rwi->reset_reason == VNIC_RESET_MOBILITY) { + rc = ibmvnic_reenable_crq_queue(adapter); + if (rc) + return 0; + } - /* Adapter timed out, resetting it */ + rc = __ibmvnic_close(netdev); + if (rc) + return rc; + + /* remove the closed state so when we call open it appears + * we are coming from the probed state. 
+ */ + adapter->state = VNIC_PROBED; + + release_resources(adapter); release_sub_crqs(adapter); - rc = ibmvnic_reset_crq(adapter); + release_crq_queue(adapter); + + rc = ibmvnic_init(adapter); if (rc) - dev_err(&adapter->vdev->dev, "Adapter timeout, reset failed\n"); - else - ibmvnic_send_crq_init(adapter); + return 0; + + /* If the adapter was in PROBE state prior to the reset, exit here. */ + if (reset_state == VNIC_PROBED) + return 0; + + rc = ibmvnic_login(netdev); + if (rc) { + adapter->state = VNIC_PROBED; + return 0; + } + + rtnl_lock(); + rc = init_resources(adapter); + rtnl_unlock(); + if (rc) + return rc; + + if (reset_state == VNIC_CLOSED) + return 0; + + rc = __ibmvnic_open(netdev); + if (rc) { + if (list_empty(&adapter->rwi_list)) + adapter->state = VNIC_CLOSED; + else + adapter->state = reset_state; + + return 0; + } + + netif_carrier_on(netdev); + + /* kick napi */ + for (i = 0; i < adapter->req_rx_queues; i++) + napi_schedule(&adapter->napi[i]); + + return 0; +} + +static struct ibmvnic_rwi *get_next_rwi(struct ibmvnic_adapter *adapter) +{ + struct ibmvnic_rwi *rwi; + + mutex_lock(&adapter->rwi_lock); + + if (!list_empty(&adapter->rwi_list)) { + rwi = list_first_entry(&adapter->rwi_list, struct ibmvnic_rwi, + list); + list_del(&rwi->list); + } else { + rwi = NULL; + } + + mutex_unlock(&adapter->rwi_lock); + return rwi; +} + +static void free_all_rwi(struct ibmvnic_adapter *adapter) +{ + struct ibmvnic_rwi *rwi; + + rwi = get_next_rwi(adapter); + while (rwi) { + kfree(rwi); + rwi = get_next_rwi(adapter); + } +} + +static void __ibmvnic_reset(struct work_struct *work) +{ + struct ibmvnic_rwi *rwi; + struct ibmvnic_adapter *adapter; + struct net_device *netdev; + u32 reset_state; + int rc; + + adapter = container_of(work, struct ibmvnic_adapter, ibmvnic_reset); + netdev = adapter->netdev; + + mutex_lock(&adapter->reset_lock); + adapter->resetting = true; + reset_state = adapter->state; + + rwi = get_next_rwi(adapter); + while (rwi) { + rc = 
do_reset(adapter, rwi, reset_state); + kfree(rwi); + if (rc) + break; + + rwi = get_next_rwi(adapter); + } + + if (rc) { + free_all_rwi(adapter); + return; + } + + adapter->resetting = false; + mutex_unlock(&adapter->reset_lock); +} + +static void ibmvnic_reset(struct ibmvnic_adapter *adapter, + enum ibmvnic_reset_reason reason) +{ + struct ibmvnic_rwi *rwi, *tmp; + struct net_device *netdev = adapter->netdev; + struct list_head *entry; + + if (adapter->state == VNIC_REMOVING || + adapter->state == VNIC_REMOVED) { + netdev_dbg(netdev, "Adapter removing, skipping reset\n"); + return; + } + + mutex_lock(&adapter->rwi_lock); + + list_for_each(entry, &adapter->rwi_list) { + tmp = list_entry(entry, struct ibmvnic_rwi, list); + if (tmp->reset_reason == reason) { + netdev_err(netdev, "Matching reset found, skipping\n"); + mutex_unlock(&adapter->rwi_lock); + return; + } + } + + rwi = kzalloc(sizeof(*rwi), GFP_KERNEL); + if (!rwi) { + mutex_unlock(&adapter->rwi_lock); + ibmvnic_close(netdev); + return; + } + + rwi->reset_reason = reason; + list_add_tail(&rwi->list, &adapter->rwi_list); + mutex_unlock(&adapter->rwi_lock); + schedule_work(&adapter->ibmvnic_reset); +} + +static void ibmvnic_tx_timeout(struct net_device *dev) +{ + struct ibmvnic_adapter *adapter = netdev_priv(dev); + + ibmvnic_reset(adapter, VNIC_RESET_TIMEOUT); } static void remove_buff_from_pool(struct ibmvnic_adapter *adapter, @@ -1153,7 +1404,7 @@ restart_poll: /* free the entry */ next->rx_comp.first = 0; remove_buff_from_pool(adapter, rx_buff); - break; + continue; } length = be32_to_cpu(next->rx_comp.len); @@ -1177,6 +1428,7 @@ restart_poll: skb_put(skb, length); skb->protocol = eth_type_trans(skb, netdev); + skb_record_rx_queue(skb, scrq_num); if (flags & IBMVNIC_IP_CHKSUM_GOOD && flags & IBMVNIC_TCP_UDP_CHKSUM_GOOD) { @@ -1557,19 +1809,8 @@ restart_loop: } if (txbuff->last_frag) { - if (atomic_sub_return(next->tx_comp.num_comps, - &scrq->used) <= - (adapter->req_tx_entries_per_subcrq / 2) && - 
netif_subqueue_stopped(adapter->netdev, - txbuff->skb)) { - netif_wake_subqueue(adapter->netdev, - scrq->pool_index); - netdev_dbg(adapter->netdev, - "Started queue %d\n", - scrq->pool_index); - } - dev_kfree_skb_any(txbuff->skb); + txbuff->skb = NULL; } adapter->tx_pool[pool].free_map[adapter->tx_pool[pool]. @@ -1580,6 +1821,15 @@ restart_loop: } /* remove tx_comp scrq*/ next->tx_comp.first = 0; + + if (atomic_sub_return(next->tx_comp.num_comps, &scrq->used) <= + (adapter->req_tx_entries_per_subcrq / 2) && + __netif_subqueue_stopped(adapter->netdev, + scrq->pool_index)) { + netif_wake_subqueue(adapter->netdev, scrq->pool_index); + netdev_info(adapter->netdev, "Started queue %d\n", + scrq->pool_index); + } } enable_scrq_irq(adapter, scrq); @@ -1853,7 +2103,8 @@ static int pending_scrq(struct ibmvnic_adapter *adapter, { union sub_crq *entry = &scrq->msgs[scrq->cur]; - if (entry->generic.first & IBMVNIC_CRQ_CMD_RSP || adapter->closing) + if (entry->generic.first & IBMVNIC_CRQ_CMD_RSP || + adapter->state == VNIC_CLOSING) return 1; else return 0; @@ -1991,18 +2242,6 @@ static int ibmvnic_send_crq_init(struct ibmvnic_adapter *adapter) return ibmvnic_send_crq(adapter, &crq); } -static int ibmvnic_send_crq_init_complete(struct ibmvnic_adapter *adapter) -{ - union ibmvnic_crq crq; - - memset(&crq, 0, sizeof(crq)); - crq.generic.first = IBMVNIC_CRQ_INIT_CMD; - crq.generic.cmd = IBMVNIC_CRQ_INIT_COMPLETE; - netdev_dbg(adapter->netdev, "Sending CRQ init complete\n"); - - return ibmvnic_send_crq(adapter, &crq); -} - static int send_version_xchg(struct ibmvnic_adapter *adapter) { union ibmvnic_crq crq; @@ -2500,6 +2739,9 @@ static void handle_error_indication(union ibmvnic_crq *crq, if (be32_to_cpu(crq->error_indication.error_id)) request_error_information(adapter, crq); + + if (crq->error_indication.flags & IBMVNIC_FATAL_ERROR) + ibmvnic_reset(adapter, VNIC_RESET_FATAL); } static void handle_change_mac_rsp(union ibmvnic_crq *crq, @@ -2888,26 +3130,6 @@ out: } } -static void 
ibmvnic_xport_event(struct work_struct *work) -{ - struct ibmvnic_adapter *adapter = container_of(work, - struct ibmvnic_adapter, - ibmvnic_xport); - struct device *dev = &adapter->vdev->dev; - long rc; - - release_sub_crqs(adapter); - if (adapter->migrated) { - rc = ibmvnic_reenable_crq_queue(adapter); - if (rc) - dev_err(dev, "Error after enable rc=%ld\n", rc); - adapter->migrated = false; - rc = ibmvnic_send_crq_init(adapter); - if (rc) - dev_err(dev, "Error sending init rc=%ld\n", rc); - } -} - static void ibmvnic_handle_crq(union ibmvnic_crq *crq, struct ibmvnic_adapter *adapter) { @@ -2925,12 +3147,6 @@ static void ibmvnic_handle_crq(union ibmvnic_crq *crq, switch (gen_crq->cmd) { case IBMVNIC_CRQ_INIT: dev_info(dev, "Partner initialized\n"); - /* Send back a response */ - rc = ibmvnic_send_crq_init_complete(adapter); - if (!rc) - schedule_work(&adapter->vnic_crq_init); - else - dev_err(dev, "Can't send initrsp rc=%ld\n", rc); break; case IBMVNIC_CRQ_INIT_COMPLETE: dev_info(dev, "Partner initialization complete\n"); @@ -2941,19 +3157,18 @@ static void ibmvnic_handle_crq(union ibmvnic_crq *crq, } return; case IBMVNIC_CRQ_XPORT_EVENT: + netif_carrier_off(netdev); if (gen_crq->cmd == IBMVNIC_PARTITION_MIGRATED) { - dev_info(dev, "Re-enabling adapter\n"); - adapter->migrated = true; - schedule_work(&adapter->ibmvnic_xport); + dev_info(dev, "Migrated, re-enabling adapter\n"); + ibmvnic_reset(adapter, VNIC_RESET_MOBILITY); } else if (gen_crq->cmd == IBMVNIC_DEVICE_FAILOVER) { dev_info(dev, "Backing device failover detected\n"); - netif_carrier_off(netdev); - adapter->failover = true; + ibmvnic_reset(adapter, VNIC_RESET_FAILOVER); } else { /* The adapter lost the connection */ dev_err(dev, "Virtual Adapter failed (rc=%d)\n", gen_crq->cmd); - schedule_work(&adapter->ibmvnic_xport); + ibmvnic_reset(adapter, VNIC_RESET_FATAL); } return; case IBMVNIC_CRQ_CMD_RSP: @@ -3234,64 +3449,6 @@ map_failed: return retrc; } -static void handle_crq_init_rsp(struct work_struct 
*work) -{ - struct ibmvnic_adapter *adapter = container_of(work, - struct ibmvnic_adapter, - vnic_crq_init); - struct device *dev = &adapter->vdev->dev; - struct net_device *netdev = adapter->netdev; - unsigned long timeout = msecs_to_jiffies(30000); - bool restart = false; - int rc; - - if (adapter->failover) { - release_sub_crqs(adapter); - if (netif_running(netdev)) { - netif_tx_disable(netdev); - ibmvnic_close(netdev); - restart = true; - } - } - - reinit_completion(&adapter->init_done); - send_version_xchg(adapter); - if (!wait_for_completion_timeout(&adapter->init_done, timeout)) { - dev_err(dev, "Passive init timeout\n"); - goto task_failed; - } - - netdev->mtu = adapter->req_mtu - ETH_HLEN; - - if (adapter->failover) { - adapter->failover = false; - if (restart) { - rc = ibmvnic_open(netdev); - if (rc) - goto restart_failed; - } - netif_carrier_on(netdev); - return; - } - - rc = register_netdev(netdev); - if (rc) { - dev_err(dev, - "failed to register netdev rc=%d\n", rc); - goto register_failed; - } - dev_info(dev, "ibmvnic registered\n"); - - return; - -restart_failed: - dev_err(dev, "Failed to restart ibmvnic, rc=%d\n", rc); -register_failed: - release_sub_crqs(adapter); -task_failed: - dev_err(dev, "Passive initialization was not successful\n"); -} - static int ibmvnic_init(struct ibmvnic_adapter *adapter) { struct device *dev = &adapter->vdev->dev; @@ -3346,10 +3503,10 @@ static int ibmvnic_probe(struct vio_dev *dev, const struct vio_device_id *id) return -ENOMEM; adapter = netdev_priv(netdev); + adapter->state = VNIC_PROBING; dev_set_drvdata(&dev->dev, netdev); adapter->vdev = dev; adapter->netdev = netdev; - adapter->failover = false; ether_addr_copy(adapter->mac_addr, mac_addr_p); ether_addr_copy(netdev->dev_addr, adapter->mac_addr); @@ -3358,14 +3515,17 @@ static int ibmvnic_probe(struct vio_dev *dev, const struct vio_device_id *id) netdev->ethtool_ops = &ibmvnic_ethtool_ops; SET_NETDEV_DEV(netdev, &dev->dev); - INIT_WORK(&adapter->vnic_crq_init, 
handle_crq_init_rsp); - INIT_WORK(&adapter->ibmvnic_xport, ibmvnic_xport_event); - spin_lock_init(&adapter->stats_lock); INIT_LIST_HEAD(&adapter->errors); spin_lock_init(&adapter->error_list_lock); + INIT_WORK(&adapter->ibmvnic_reset, __ibmvnic_reset); + INIT_LIST_HEAD(&adapter->rwi_list); + mutex_init(&adapter->reset_lock); + mutex_init(&adapter->rwi_lock); + adapter->resetting = false; + rc = ibmvnic_init(adapter); if (rc) { free_netdev(netdev); @@ -3373,7 +3533,6 @@ static int ibmvnic_probe(struct vio_dev *dev, const struct vio_device_id *id) } netdev->mtu = adapter->req_mtu - ETH_HLEN; - adapter->is_closed = false; rc = register_netdev(netdev); if (rc) { @@ -3383,6 +3542,7 @@ static int ibmvnic_probe(struct vio_dev *dev, const struct vio_device_id *id) } dev_info(&dev->dev, "ibmvnic registered\n"); + adapter->state = VNIC_PROBED; return 0; } @@ -3391,12 +3551,17 @@ static int ibmvnic_remove(struct vio_dev *dev) struct net_device *netdev = dev_get_drvdata(&dev->dev); struct ibmvnic_adapter *adapter = netdev_priv(netdev); + adapter->state = VNIC_REMOVING; unregister_netdev(netdev); + mutex_lock(&adapter->reset_lock); release_resources(adapter); release_sub_crqs(adapter); release_crq_queue(adapter); + adapter->state = VNIC_REMOVED; + + mutex_unlock(&adapter->reset_lock); free_netdev(netdev); dev_set_drvdata(&dev->dev, NULL); diff --git a/drivers/net/ethernet/ibm/ibmvnic.h b/drivers/net/ethernet/ibm/ibmvnic.h index a69979f6f19d..4702b48cfa44 100644 --- a/drivers/net/ethernet/ibm/ibmvnic.h +++ b/drivers/net/ethernet/ibm/ibmvnic.h @@ -913,6 +913,25 @@ struct ibmvnic_error_buff { __be32 error_id; }; +enum vnic_state {VNIC_PROBING = 1, + VNIC_PROBED, + VNIC_OPENING, + VNIC_OPEN, + VNIC_CLOSING, + VNIC_CLOSED, + VNIC_REMOVING, + VNIC_REMOVED}; + +enum ibmvnic_reset_reason {VNIC_RESET_FAILOVER = 1, + VNIC_RESET_MOBILITY, + VNIC_RESET_FATAL, + VNIC_RESET_TIMEOUT}; + +struct ibmvnic_rwi { + enum ibmvnic_reset_reason reset_reason; + struct list_head list; +}; + struct 
ibmvnic_adapter { struct vio_dev *vdev; struct net_device *netdev; @@ -922,7 +941,6 @@ struct ibmvnic_adapter { dma_addr_t ip_offload_tok; struct ibmvnic_control_ip_offload_buffer ip_offload_ctrl; dma_addr_t ip_offload_ctrl_tok; - bool migrated; u32 msg_enable; /* Statistics */ @@ -962,7 +980,6 @@ struct ibmvnic_adapter { u64 promisc; struct ibmvnic_tx_pool *tx_pool; - bool closing; struct completion init_done; int init_done_rc; @@ -1007,9 +1024,11 @@ struct ibmvnic_adapter { __be64 tx_rx_desc_req; u8 map_id; - struct work_struct vnic_crq_init; - struct work_struct ibmvnic_xport; struct tasklet_struct tasklet; - bool failover; - bool is_closed; + enum vnic_state state; + enum ibmvnic_reset_reason reset_reason; + struct mutex reset_lock, rwi_lock; + struct list_head rwi_list; + struct work_struct ibmvnic_reset; + bool resetting; }; diff --git a/drivers/net/ethernet/netronome/nfp/nfp_net_common.c b/drivers/net/ethernet/netronome/nfp/nfp_net_common.c index db20376260f5..82bd6b0935f1 100644 --- a/drivers/net/ethernet/netronome/nfp/nfp_net_common.c +++ b/drivers/net/ethernet/netronome/nfp/nfp_net_common.c @@ -2532,11 +2532,11 @@ nfp_net_check_config(struct nfp_net *nn, struct nfp_net_dp *dp, if (!dp->xdp_prog) return 0; if (dp->fl_bufsz > PAGE_SIZE) { - NL_MOD_TRY_SET_ERR_MSG(extack, "MTU too large w/ XDP enabled"); + NL_SET_ERR_MSG_MOD(extack, "MTU too large w/ XDP enabled"); return -EINVAL; } if (dp->num_tx_rings > nn->max_tx_rings) { - NL_MOD_TRY_SET_ERR_MSG(extack, "Insufficient number of TX rings w/ XDP enabled"); + NL_SET_ERR_MSG_MOD(extack, "Insufficient number of TX rings w/ XDP enabled"); return -EINVAL; } diff --git a/drivers/net/ethernet/nvidia/forcedeth.c b/drivers/net/ethernet/nvidia/forcedeth.c index 978d32944c80..aa912f43e15f 100644 --- a/drivers/net/ethernet/nvidia/forcedeth.c +++ b/drivers/net/ethernet/nvidia/forcedeth.c @@ -4248,11 +4248,9 @@ static int nv_get_link_ksettings(struct net_device *dev, /* We do not track link speed / duplex setting if the 
* interface is disabled. Force a link check */ if (nv_update_linkspeed(dev)) { - if (!netif_carrier_ok(dev)) - netif_carrier_on(dev); + netif_carrier_on(dev); } else { - if (netif_carrier_ok(dev)) - netif_carrier_off(dev); + netif_carrier_off(dev); } } diff --git a/drivers/net/ethernet/qlogic/qed/qed_dev.c b/drivers/net/ethernet/qlogic/qed/qed_dev.c index 5f31140d0b77..bb70522ad362 100644 --- a/drivers/net/ethernet/qlogic/qed/qed_dev.c +++ b/drivers/net/ethernet/qlogic/qed/qed_dev.c @@ -2536,6 +2536,9 @@ static int qed_hw_get_nvm_info(struct qed_hwfn *p_hwfn, struct qed_ptt *p_ptt) DP_NOTICE(p_hwfn, "Unknown Speed in 0x%08x\n", link_temp); } + p_hwfn->mcp_info->link_capabilities.default_speed_autoneg = + link->speed.autoneg; + link_temp &= NVM_CFG1_PORT_DRV_FLOW_CONTROL_MASK; link_temp >>= NVM_CFG1_PORT_DRV_FLOW_CONTROL_OFFSET; link->pause.autoneg = !!(link_temp & @@ -3586,7 +3589,7 @@ static int qed_set_coalesce(struct qed_hwfn *p_hwfn, struct qed_ptt *p_ptt, } int qed_set_rxq_coalesce(struct qed_hwfn *p_hwfn, struct qed_ptt *p_ptt, - u16 coalesce, u8 qid, u16 sb_id) + u16 coalesce, u16 qid, u16 sb_id) { struct ustorm_eth_queue_zone eth_qzone; u8 timeset, timer_res; @@ -3607,7 +3610,7 @@ int qed_set_rxq_coalesce(struct qed_hwfn *p_hwfn, struct qed_ptt *p_ptt, } timeset = (u8)(coalesce >> timer_res); - rc = qed_fw_l2_queue(p_hwfn, (u16)qid, &fw_qid); + rc = qed_fw_l2_queue(p_hwfn, qid, &fw_qid); if (rc) return rc; @@ -3628,7 +3631,7 @@ out: } int qed_set_txq_coalesce(struct qed_hwfn *p_hwfn, struct qed_ptt *p_ptt, - u16 coalesce, u8 qid, u16 sb_id) + u16 coalesce, u16 qid, u16 sb_id) { struct xstorm_eth_queue_zone eth_qzone; u8 timeset, timer_res; @@ -3649,7 +3652,7 @@ int qed_set_txq_coalesce(struct qed_hwfn *p_hwfn, struct qed_ptt *p_ptt, } timeset = (u8)(coalesce >> timer_res); - rc = qed_fw_l2_queue(p_hwfn, (u16)qid, &fw_qid); + rc = qed_fw_l2_queue(p_hwfn, qid, &fw_qid); if (rc) return rc; diff --git a/drivers/net/ethernet/qlogic/qed/qed_dev_api.h 
b/drivers/net/ethernet/qlogic/qed/qed_dev_api.h index cefe3ee9064a..12d16c096e36 100644 --- a/drivers/net/ethernet/qlogic/qed/qed_dev_api.h +++ b/drivers/net/ethernet/qlogic/qed/qed_dev_api.h @@ -454,7 +454,7 @@ int qed_final_cleanup(struct qed_hwfn *p_hwfn, * @return int */ int qed_set_rxq_coalesce(struct qed_hwfn *p_hwfn, struct qed_ptt *p_ptt, - u16 coalesce, u8 qid, u16 sb_id); + u16 coalesce, u16 qid, u16 sb_id); /** * @brief qed_set_txq_coalesce - Configure coalesce parameters for a Tx queue @@ -471,7 +471,7 @@ int qed_set_rxq_coalesce(struct qed_hwfn *p_hwfn, struct qed_ptt *p_ptt, * @return int */ int qed_set_txq_coalesce(struct qed_hwfn *p_hwfn, struct qed_ptt *p_ptt, - u16 coalesce, u8 qid, u16 sb_id); + u16 coalesce, u16 qid, u16 sb_id); const char *qed_hw_get_resc_name(enum qed_resources res_id); #endif diff --git a/drivers/net/ethernet/qlogic/qed/qed_main.c b/drivers/net/ethernet/qlogic/qed/qed_main.c index 59992cf20d42..b7ad36b91e12 100644 --- a/drivers/net/ethernet/qlogic/qed/qed_main.c +++ b/drivers/net/ethernet/qlogic/qed/qed_main.c @@ -1372,7 +1372,7 @@ static void qed_fill_link(struct qed_hwfn *hwfn, /* TODO - at the moment assume supported and advertised speed equal */ if_link->supported_caps = QED_LM_FIBRE_BIT; - if (params.speed.autoneg) + if (link_caps.default_speed_autoneg) if_link->supported_caps |= QED_LM_Autoneg_BIT; if (params.pause.autoneg || (params.pause.forced_rx && params.pause.forced_tx)) @@ -1382,6 +1382,10 @@ static void qed_fill_link(struct qed_hwfn *hwfn, if_link->supported_caps |= QED_LM_Pause_BIT; if_link->advertised_caps = if_link->supported_caps; + if (params.speed.autoneg) + if_link->advertised_caps |= QED_LM_Autoneg_BIT; + else + if_link->advertised_caps &= ~QED_LM_Autoneg_BIT; if (params.speed.advertised_speeds & NVM_CFG1_PORT_DRV_SPEED_CAPABILITY_MASK_1G) if_link->advertised_caps |= QED_LM_1000baseT_Half_BIT | @@ -1521,7 +1525,7 @@ static void qed_get_coalesce(struct qed_dev *cdev, u16 *rx_coal, u16 *tx_coal) } static 
int qed_set_coalesce(struct qed_dev *cdev, u16 rx_coal, u16 tx_coal, - u8 qid, u16 sb_id) + u16 qid, u16 sb_id) { struct qed_hwfn *hwfn; struct qed_ptt *ptt; diff --git a/drivers/net/ethernet/qlogic/qed/qed_mcp.h b/drivers/net/ethernet/qlogic/qed/qed_mcp.h index 5ae35d6cc7d1..2b09b8545236 100644 --- a/drivers/net/ethernet/qlogic/qed/qed_mcp.h +++ b/drivers/net/ethernet/qlogic/qed/qed_mcp.h @@ -61,6 +61,7 @@ struct qed_mcp_link_params { struct qed_mcp_link_capabilities { u32 speed_capabilities; + bool default_speed_autoneg; }; struct qed_mcp_link_state { diff --git a/drivers/net/ethernet/qlogic/qede/qede_ethtool.c b/drivers/net/ethernet/qlogic/qede/qede_ethtool.c index 4dcfe9614731..172b292241a5 100644 --- a/drivers/net/ethernet/qlogic/qede/qede_ethtool.c +++ b/drivers/net/ethernet/qlogic/qede/qede_ethtool.c @@ -493,6 +493,11 @@ static int qede_set_link_ksettings(struct net_device *dev, params.override_flags |= QED_LINK_OVERRIDE_SPEED_ADV_SPEEDS; params.override_flags |= QED_LINK_OVERRIDE_SPEED_AUTONEG; if (base->autoneg == AUTONEG_ENABLE) { + if (!(current_link.supported_caps & QED_LM_Autoneg_BIT)) { + DP_INFO(edev, "Auto negotiation is not supported\n"); + return -EOPNOTSUPP; + } + params.autoneg = true; params.forced_speed = 0; QEDE_ETHTOOL_TO_DRV_CAPS(params.adv_speeds, cmd, advertising) @@ -706,8 +711,7 @@ static int qede_set_coalesce(struct net_device *dev, { struct qede_dev *edev = netdev_priv(dev); int i, rc = 0; - u16 rxc, txc; - u8 sb_id; + u16 rxc, txc, sb_id; if (!netif_running(dev)) { DP_INFO(edev, "Interface is down\n"); @@ -729,7 +733,7 @@ static int qede_set_coalesce(struct net_device *dev, for_each_queue(i) { sb_id = edev->fp_array[i].sb_info->igu_sb_id; rc = edev->ops->common->set_coalesce(edev->cdev, rxc, txc, - (u8)i, sb_id); + (u16)i, sb_id); if (rc) { DP_INFO(edev, "Set coalesce error, rc = %d\n", rc); return rc; diff --git a/drivers/net/ethernet/smsc/smsc911x.c b/drivers/net/ethernet/smsc/smsc911x.c index fa5ca0992be6..ea1bbc355b4d 100644 --- 
a/drivers/net/ethernet/smsc/smsc911x.c +++ b/drivers/net/ethernet/smsc/smsc911x.c @@ -25,7 +25,7 @@ * LAN9215, LAN9216, LAN9217, LAN9218 * LAN9210, LAN9211 * LAN9220, LAN9221 - * LAN89218 + * LAN89218,LAN9250 * */ @@ -1450,6 +1450,8 @@ static int smsc911x_soft_reset(struct smsc911x_data *pdata) unsigned int timeout; unsigned int temp; int ret; + unsigned int reset_offset = HW_CFG; + unsigned int reset_mask = HW_CFG_SRST_; /* * Make sure to power-up the PHY chip before doing a reset, otherwise @@ -1476,15 +1478,23 @@ static int smsc911x_soft_reset(struct smsc911x_data *pdata) } } + if ((pdata->idrev & 0xFFFF0000) == LAN9250) { + /* special reset for LAN9250 */ + reset_offset = RESET_CTL; + reset_mask = RESET_CTL_DIGITAL_RST_; + } + /* Reset the LAN911x */ - smsc911x_reg_write(pdata, HW_CFG, HW_CFG_SRST_); + smsc911x_reg_write(pdata, reset_offset, reset_mask); + + /* verify reset bit is cleared */ timeout = 10; do { udelay(10); - temp = smsc911x_reg_read(pdata, HW_CFG); - } while ((--timeout) && (temp & HW_CFG_SRST_)); + temp = smsc911x_reg_read(pdata, reset_offset); + } while ((--timeout) && (temp & reset_mask)); - if (unlikely(temp & HW_CFG_SRST_)) { + if (unlikely(temp & reset_mask)) { SMSC_WARN(pdata, drv, "Failed to complete reset"); return -EIO; } @@ -2253,28 +2263,29 @@ static int smsc911x_init(struct net_device *dev) pdata->idrev = smsc911x_reg_read(pdata, ID_REV); switch (pdata->idrev & 0xFFFF0000) { - case 0x01180000: - case 0x01170000: - case 0x01160000: - case 0x01150000: - case 0x218A0000: + case LAN9118: + case LAN9117: + case LAN9116: + case LAN9115: + case LAN89218: /* LAN911[5678] family */ pdata->generation = pdata->idrev & 0x0000FFFF; break; - case 0x118A0000: - case 0x117A0000: - case 0x116A0000: - case 0x115A0000: + case LAN9218: + case LAN9217: + case LAN9216: + case LAN9215: /* LAN921[5678] family */ pdata->generation = 3; break; - case 0x92100000: - case 0x92110000: - case 0x92200000: - case 0x92210000: - /* LAN9210/LAN9211/LAN9220/LAN9221 */ 
+ case LAN9210: + case LAN9211: + case LAN9220: + case LAN9221: + case LAN9250: + /* LAN9210/LAN9211/LAN9220/LAN9221/LAN9250 */ pdata->generation = 4; break; diff --git a/drivers/net/ethernet/smsc/smsc911x.h b/drivers/net/ethernet/smsc/smsc911x.h index 54d648920a1b..8d75508acd2b 100644 --- a/drivers/net/ethernet/smsc/smsc911x.h +++ b/drivers/net/ethernet/smsc/smsc911x.h @@ -20,6 +20,22 @@ #ifndef __SMSC911X_H__ #define __SMSC911X_H__ +/*Chip ID*/ +#define LAN9115 0x01150000 +#define LAN9116 0x01160000 +#define LAN9117 0x01170000 +#define LAN9118 0x01180000 +#define LAN9215 0x115A0000 +#define LAN9216 0x116A0000 +#define LAN9217 0x117A0000 +#define LAN9218 0x118A0000 +#define LAN9210 0x92100000 +#define LAN9211 0x92110000 +#define LAN9220 0x92200000 +#define LAN9221 0x92210000 +#define LAN9250 0x92500000 +#define LAN89218 0x218A0000 + #define TX_FIFO_LOW_THRESHOLD ((u32)1600) #define SMSC911X_EEPROM_SIZE ((u32)128) #define USE_DEBUG 0 @@ -303,6 +319,9 @@ #define E2P_DATA_EEPROM_DATA_ 0x000000FF #define LAN_REGISTER_EXTENT 0x00000100 +#define RESET_CTL 0x1F8 +#define RESET_CTL_DIGITAL_RST_ 0x00000001 + /* * MAC Control and Status Register (Indirect Address) * Offset (through the MAC_CSR CMD and DATA port) diff --git a/drivers/net/hyperv/netvsc.c b/drivers/net/hyperv/netvsc.c index 15749d359e60..652453d9fb08 100644 --- a/drivers/net/hyperv/netvsc.c +++ b/drivers/net/hyperv/netvsc.c @@ -1322,6 +1322,10 @@ int netvsc_device_add(struct hv_device *device, nvchan->channel = device->channel; } + /* Enable NAPI handler before init callbacks */ + netif_napi_add(ndev, &net_device->chan_table[0].napi, + netvsc_poll, NAPI_POLL_WEIGHT); + /* Open the channel */ ret = vmbus_open(device->channel, ring_size * PAGE_SIZE, ring_size * PAGE_SIZE, NULL, 0, @@ -1329,6 +1333,7 @@ int netvsc_device_add(struct hv_device *device, net_device->chan_table); if (ret != 0) { + netif_napi_del(&net_device->chan_table[0].napi); netdev_err(ndev, "unable to open channel: %d\n", ret); goto cleanup; } @@ 
-1336,9 +1341,6 @@ int netvsc_device_add(struct hv_device *device, /* Channel is opened */ netdev_dbg(ndev, "hv_netvsc channel opened successfully\n"); - /* Enable NAPI handler for init callbacks */ - netif_napi_add(ndev, &net_device->chan_table[0].napi, - netvsc_poll, NAPI_POLL_WEIGHT); napi_enable(&net_device->chan_table[0].napi); /* Writing nvdev pointer unlocks netvsc_send(), make sure chn_table is diff --git a/drivers/net/hyperv/rndis_filter.c b/drivers/net/hyperv/rndis_filter.c index ab92c3c95951..f9d5b0b8209a 100644 --- a/drivers/net/hyperv/rndis_filter.c +++ b/drivers/net/hyperv/rndis_filter.c @@ -1018,7 +1018,7 @@ static void netvsc_sc_open(struct vmbus_channel *new_sc) if (ret == 0) napi_enable(&nvchan->napi); else - netdev_err(ndev, "sub channel open failed (%d)\n", ret); + netif_napi_del(&nvchan->napi); if (refcount_dec_and_test(&nvscdev->sc_offered)) complete(&nvscdev->channel_init_wait); diff --git a/drivers/net/usb/qmi_wwan.c b/drivers/net/usb/qmi_wwan.c index a3ed8115747c..d7165767ca9d 100644 --- a/drivers/net/usb/qmi_wwan.c +++ b/drivers/net/usb/qmi_wwan.c @@ -1201,6 +1201,7 @@ static const struct usb_device_id products[] = { {QMI_FIXED_INTF(0x2357, 0x0201, 4)}, /* TP-LINK HSUPA Modem MA180 */ {QMI_FIXED_INTF(0x2357, 0x9000, 4)}, /* TP-LINK MA260 */ {QMI_QUIRK_SET_DTR(0x1bc7, 0x1040, 2)}, /* Telit LE922A */ + {QMI_FIXED_INTF(0x1bc7, 0x1100, 3)}, /* Telit ME910 */ {QMI_FIXED_INTF(0x1bc7, 0x1200, 5)}, /* Telit LE920 */ {QMI_QUIRK_SET_DTR(0x1bc7, 0x1201, 2)}, /* Telit LE920, LE920A4 */ {QMI_FIXED_INTF(0x1c9e, 0x9b01, 3)}, /* XS Stick W100-2 from 4G Systems */ diff --git a/drivers/net/virtio_net.c b/drivers/net/virtio_net.c index 3d0bc484b3d7..1c6d3923c224 100644 --- a/drivers/net/virtio_net.c +++ b/drivers/net/virtio_net.c @@ -1891,17 +1891,17 @@ static int virtnet_xdp_set(struct net_device *dev, struct bpf_prog *prog, virtio_has_feature(vi->vdev, VIRTIO_NET_F_GUEST_TSO6) || virtio_has_feature(vi->vdev, VIRTIO_NET_F_GUEST_ECN) || 
virtio_has_feature(vi->vdev, VIRTIO_NET_F_GUEST_UFO)) { - NL_SET_ERR_MSG(extack, "can't set XDP while host is implementing LRO, disable LRO first"); + NL_SET_ERR_MSG_MOD(extack, "Can't set XDP while host is implementing LRO, disable LRO first"); return -EOPNOTSUPP; } if (vi->mergeable_rx_bufs && !vi->any_header_sg) { - NL_SET_ERR_MSG(extack, "XDP expects header/data in single page, any_header_sg required"); + NL_SET_ERR_MSG_MOD(extack, "XDP expects header/data in single page, any_header_sg required"); return -EINVAL; } if (dev->mtu > max_sz) { - NL_SET_ERR_MSG(extack, "MTU too large to enable XDP"); + NL_SET_ERR_MSG_MOD(extack, "MTU too large to enable XDP"); netdev_warn(dev, "XDP requires MTU less than %lu\n", max_sz); return -EINVAL; } @@ -1912,7 +1912,7 @@ static int virtnet_xdp_set(struct net_device *dev, struct bpf_prog *prog, /* XDP requires extra queues for XDP_TX */ if (curr_qp + xdp_qp > vi->max_queue_pairs) { - NL_SET_ERR_MSG(extack, "Too few free TX rings available"); + NL_SET_ERR_MSG_MOD(extack, "Too few free TX rings available"); netdev_warn(dev, "request %i queues but max is %i\n", curr_qp + xdp_qp, vi->max_queue_pairs); return -ENOMEM; diff --git a/include/linux/netlink.h b/include/linux/netlink.h index c20395edf2de..5fff5ba5964e 100644 --- a/include/linux/netlink.h +++ b/include/linux/netlink.h @@ -86,19 +86,16 @@ struct netlink_ext_ack { * Currently string formatting is not supported (due * to the lack of an output buffer.) 
*/ -#define NL_SET_ERR_MSG(extack, msg) do { \ - static const char _msg[] = (msg); \ - \ - (extack)->_msg = _msg; \ +#define NL_SET_ERR_MSG(extack, msg) do { \ + static const char __msg[] = (msg); \ + struct netlink_ext_ack *__extack = (extack); \ + \ + if (__extack) \ + __extack->_msg = __msg; \ } while (0) -#define NL_MOD_TRY_SET_ERR_MSG(extack, msg) do { \ - static const char _msg[] = KBUILD_MODNAME ": " msg; \ - struct netlink_ext_ack *_extack = (extack); \ - \ - if (_extack) \ - _extack->_msg = _msg; \ -} while (0) +#define NL_SET_ERR_MSG_MOD(extack, msg) \ + NL_SET_ERR_MSG((extack), KBUILD_MODNAME ": " msg) extern void netlink_kernel_release(struct sock *sk); extern int __netlink_change_ngroups(struct sock *sk, unsigned int groups); diff --git a/include/linux/qed/qed_if.h b/include/linux/qed/qed_if.h index 5544d7b2f2bb..c70ac13a97e6 100644 --- a/include/linux/qed/qed_if.h +++ b/include/linux/qed/qed_if.h @@ -635,7 +635,7 @@ struct qed_common_ops { * @return 0 on success, error otherwise. 
*/ int (*set_coalesce)(struct qed_dev *cdev, u16 rx_coal, u16 tx_coal, - u8 qid, u16 sb_id); + u16 qid, u16 sb_id); /** * @brief set_led - Configure LED mode diff --git a/include/net/cfg80211.h b/include/net/cfg80211.h index 6e90f1a4950f..15d6599b8bc6 100644 --- a/include/net/cfg80211.h +++ b/include/net/cfg80211.h @@ -1013,9 +1013,9 @@ enum rate_info_flags { * @RATE_INFO_BW_160: 160 MHz bandwidth */ enum rate_info_bw { + RATE_INFO_BW_20 = 0, RATE_INFO_BW_5, RATE_INFO_BW_10, - RATE_INFO_BW_20, RATE_INFO_BW_40, RATE_INFO_BW_80, RATE_INFO_BW_160, diff --git a/include/net/ip6_route.h b/include/net/ip6_route.h index 9dc2c182a263..f5e625f53367 100644 --- a/include/net/ip6_route.h +++ b/include/net/ip6_route.h @@ -84,6 +84,7 @@ struct dst_entry *ip6_route_lookup(struct net *net, struct flowi6 *fl6, struct rt6_info *ip6_pol_route(struct net *net, struct fib6_table *table, int ifindex, struct flowi6 *fl6, int flags); +void ip6_route_init_special_entries(void); int ip6_route_init(void); void ip6_route_cleanup(void); diff --git a/include/uapi/linux/netfilter/nf_conntrack_common.h b/include/uapi/linux/netfilter/nf_conntrack_common.h index a8072cc7fa0b..dc947e59d03a 100644 --- a/include/uapi/linux/netfilter/nf_conntrack_common.h +++ b/include/uapi/linux/netfilter/nf_conntrack_common.h @@ -84,10 +84,6 @@ enum ip_conntrack_status { IPS_DYING_BIT = 9, IPS_DYING = (1 << IPS_DYING_BIT), - /* Bits that cannot be altered from userland. */ - IPS_UNCHANGEABLE_MASK = (IPS_NAT_DONE_MASK | IPS_NAT_MASK | - IPS_EXPECTED | IPS_CONFIRMED | IPS_DYING), - /* Connection has fixed timeout. */ IPS_FIXED_TIMEOUT_BIT = 10, IPS_FIXED_TIMEOUT = (1 << IPS_FIXED_TIMEOUT_BIT), @@ -103,6 +99,15 @@ enum ip_conntrack_status { /* Conntrack got a helper explicitly attached via CT target. */ IPS_HELPER_BIT = 13, IPS_HELPER = (1 << IPS_HELPER_BIT), + + /* Be careful here, modifying these bits can make things messy, + * so don't let users modify them directly. 
+ */ + IPS_UNCHANGEABLE_MASK = (IPS_NAT_DONE_MASK | IPS_NAT_MASK | + IPS_EXPECTED | IPS_CONFIRMED | IPS_DYING | + IPS_SEQ_ADJUST | IPS_TEMPLATE), + + __IPS_MAX_BIT = 14, }; /* Connection tracking event types */ diff --git a/lib/test_bpf.c b/lib/test_bpf.c index a0f66280ea50..889bc31785be 100644 --- a/lib/test_bpf.c +++ b/lib/test_bpf.c @@ -4769,8 +4769,8 @@ static struct bpf_test tests[] = { BPF_LD_IMM64(R1, 3), BPF_LD_IMM64(R2, 2), BPF_JMP_REG(BPF_JGE, R1, R2, 2), - BPF_LD_IMM64(R0, 0xffffffffffffffffUL), - BPF_LD_IMM64(R0, 0xeeeeeeeeeeeeeeeeUL), + BPF_LD_IMM64(R0, 0xffffffffffffffffULL), + BPF_LD_IMM64(R0, 0xeeeeeeeeeeeeeeeeULL), BPF_EXIT_INSN(), }, INTERNAL, @@ -4784,7 +4784,7 @@ static struct bpf_test tests[] = { BPF_LD_IMM64(R1, 3), BPF_LD_IMM64(R2, 2), BPF_JMP_REG(BPF_JGE, R1, R2, 0), - BPF_LD_IMM64(R0, 0xffffffffffffffffUL), + BPF_LD_IMM64(R0, 0xffffffffffffffffULL), BPF_EXIT_INSN(), }, INTERNAL, @@ -4798,8 +4798,8 @@ static struct bpf_test tests[] = { BPF_LD_IMM64(R1, 3), BPF_LD_IMM64(R2, 2), BPF_JMP_REG(BPF_JGE, R1, R2, 4), - BPF_LD_IMM64(R0, 0xffffffffffffffffUL), - BPF_LD_IMM64(R0, 0xeeeeeeeeeeeeeeeeUL), + BPF_LD_IMM64(R0, 0xffffffffffffffffULL), + BPF_LD_IMM64(R0, 0xeeeeeeeeeeeeeeeeULL), BPF_EXIT_INSN(), }, INTERNAL, diff --git a/net/bridge/netfilter/ebt_dnat.c b/net/bridge/netfilter/ebt_dnat.c index 4e0b0c359325..e0bb624c3845 100644 --- a/net/bridge/netfilter/ebt_dnat.c +++ b/net/bridge/netfilter/ebt_dnat.c @@ -9,6 +9,7 @@ */ #include <linux/module.h> #include <net/sock.h> +#include "../br_private.h" #include <linux/netfilter.h> #include <linux/netfilter/x_tables.h> #include <linux/netfilter_bridge/ebtables.h> @@ -18,11 +19,30 @@ static unsigned int ebt_dnat_tg(struct sk_buff *skb, const struct xt_action_param *par) { const struct ebt_nat_info *info = par->targinfo; + struct net_device *dev; if (!skb_make_writable(skb, 0)) return EBT_DROP; ether_addr_copy(eth_hdr(skb)->h_dest, info->mac); + + if (is_multicast_ether_addr(info->mac)) { + if 
(is_broadcast_ether_addr(info->mac)) + skb->pkt_type = PACKET_BROADCAST; + else + skb->pkt_type = PACKET_MULTICAST; + } else { + if (xt_hooknum(par) != NF_BR_BROUTING) + dev = br_port_get_rcu(xt_in(par))->br->dev; + else + dev = xt_in(par); + + if (ether_addr_equal(info->mac, dev->dev_addr)) + skb->pkt_type = PACKET_HOST; + else + skb->pkt_type = PACKET_OTHERHOST; + } + return info->target; } diff --git a/net/core/rtnetlink.c b/net/core/rtnetlink.c index 6e67315ec368..bcb0f610ee42 100644 --- a/net/core/rtnetlink.c +++ b/net/core/rtnetlink.c @@ -1054,7 +1054,7 @@ static int rtnl_phys_port_name_fill(struct sk_buff *skb, struct net_device *dev) return err; } - if (nla_put(skb, IFLA_PHYS_PORT_NAME, strlen(name), name)) + if (nla_put_string(skb, IFLA_PHYS_PORT_NAME, name)) return -EMSGSIZE; return 0; diff --git a/net/ipv4/raw.c b/net/ipv4/raw.c index 9d943974de2b..bdffad875691 100644 --- a/net/ipv4/raw.c +++ b/net/ipv4/raw.c @@ -358,6 +358,9 @@ static int raw_send_hdrinc(struct sock *sk, struct flowi4 *fl4, rt->dst.dev->mtu); return -EMSGSIZE; } + if (length < sizeof(struct iphdr)) + return -EINVAL; + if (flags&MSG_PROBE) goto out; diff --git a/net/ipv4/tcp_minisocks.c b/net/ipv4/tcp_minisocks.c index 8f6373b0cd77..717be4de5324 100644 --- a/net/ipv4/tcp_minisocks.c +++ b/net/ipv4/tcp_minisocks.c @@ -523,6 +523,7 @@ struct sock *tcp_create_openreq_child(const struct sock *sk, newicsk->icsk_ack.last_seg_size = skb->len - newtp->tcp_header_len; newtp->rx_opt.mss_clamp = req->mss; tcp_ecn_openreq_child(newtp, req); + newtp->fastopen_req = NULL; newtp->fastopen_rsk = NULL; newtp->syn_data_acked = 0; newtp->rack.mstamp.v64 = 0; diff --git a/net/ipv6/addrconf.c b/net/ipv6/addrconf.c index b09ac38d8dc4..77a4bd526d6e 100644 --- a/net/ipv6/addrconf.c +++ b/net/ipv6/addrconf.c @@ -3328,7 +3328,8 @@ static int fixup_permanent_addr(struct inet6_dev *idev, idev->dev, 0, 0); } - addrconf_dad_start(ifp); + if (ifp->state == INET6_IFADDR_STATE_PREDAD) + addrconf_dad_start(ifp); return 
0; } @@ -3683,7 +3684,7 @@ restart: if (keep) { /* set state to skip the notifier below */ state = INET6_IFADDR_STATE_DEAD; - ifa->state = 0; + ifa->state = INET6_IFADDR_STATE_PREDAD; if (!(ifa->flags & IFA_F_NODAD)) ifa->flags |= IFA_F_TENTATIVE; @@ -6572,6 +6573,8 @@ int __init addrconf_init(void) goto errlo; } + ip6_route_init_special_entries(); + for (i = 0; i < IN6_ADDR_HSIZE; i++) INIT_HLIST_HEAD(&inet6_addr_lst[i]); diff --git a/net/ipv6/netfilter/nf_nat_l3proto_ipv6.c b/net/ipv6/netfilter/nf_nat_l3proto_ipv6.c index bf3ad3e7b647..b2b4f031b3a1 100644 --- a/net/ipv6/netfilter/nf_nat_l3proto_ipv6.c +++ b/net/ipv6/netfilter/nf_nat_l3proto_ipv6.c @@ -235,7 +235,7 @@ int nf_nat_icmpv6_reply_translation(struct sk_buff *skb, inside->icmp6.icmp6_cksum = csum_ipv6_magic(&ipv6h->saddr, &ipv6h->daddr, skb->len - hdrlen, IPPROTO_ICMPV6, - csum_partial(&inside->icmp6, + skb_checksum(skb, hdrlen, skb->len - hdrlen, 0)); } diff --git a/net/ipv6/raw.c b/net/ipv6/raw.c index 0da6a12b5472..1f992d9e261d 100644 --- a/net/ipv6/raw.c +++ b/net/ipv6/raw.c @@ -632,6 +632,8 @@ static int rawv6_send_hdrinc(struct sock *sk, struct msghdr *msg, int length, ipv6_local_error(sk, EMSGSIZE, fl6, rt->dst.dev->mtu); return -EMSGSIZE; } + if (length < sizeof(struct ipv6hdr)) + return -EINVAL; if (flags&MSG_PROBE) goto out; diff --git a/net/ipv6/route.c b/net/ipv6/route.c index a1bf426c959b..2f1136627dcb 100644 --- a/net/ipv6/route.c +++ b/net/ipv6/route.c @@ -4027,6 +4027,21 @@ static struct notifier_block ip6_route_dev_notifier = { .priority = 0, }; +void __init ip6_route_init_special_entries(void) +{ + /* Registering of the loopback is done before this portion of code, + * the loopback reference in rt6_info will not be taken, do it + * manually for init_net */ + init_net.ipv6.ip6_null_entry->dst.dev = init_net.loopback_dev; + init_net.ipv6.ip6_null_entry->rt6i_idev = in6_dev_get(init_net.loopback_dev); + #ifdef CONFIG_IPV6_MULTIPLE_TABLES + init_net.ipv6.ip6_prohibit_entry->dst.dev = 
init_net.loopback_dev; + init_net.ipv6.ip6_prohibit_entry->rt6i_idev = in6_dev_get(init_net.loopback_dev); + init_net.ipv6.ip6_blk_hole_entry->dst.dev = init_net.loopback_dev; + init_net.ipv6.ip6_blk_hole_entry->rt6i_idev = in6_dev_get(init_net.loopback_dev); + #endif +} + int __init ip6_route_init(void) { int ret; @@ -4053,17 +4068,6 @@ int __init ip6_route_init(void) ip6_dst_blackhole_ops.kmem_cachep = ip6_dst_ops_template.kmem_cachep; - /* Registering of the loopback is done before this portion of code, - * the loopback reference in rt6_info will not be taken, do it - * manually for init_net */ - init_net.ipv6.ip6_null_entry->dst.dev = init_net.loopback_dev; - init_net.ipv6.ip6_null_entry->rt6i_idev = in6_dev_get(init_net.loopback_dev); - #ifdef CONFIG_IPV6_MULTIPLE_TABLES - init_net.ipv6.ip6_prohibit_entry->dst.dev = init_net.loopback_dev; - init_net.ipv6.ip6_prohibit_entry->rt6i_idev = in6_dev_get(init_net.loopback_dev); - init_net.ipv6.ip6_blk_hole_entry->dst.dev = init_net.loopback_dev; - init_net.ipv6.ip6_blk_hole_entry->rt6i_idev = in6_dev_get(init_net.loopback_dev); - #endif ret = fib6_init(); if (ret) goto out_register_subsys; diff --git a/net/netfilter/ipvs/ip_vs_ctl.c b/net/netfilter/ipvs/ip_vs_ctl.c index 668d9643f0cc..1fa3c2307b6e 100644 --- a/net/netfilter/ipvs/ip_vs_ctl.c +++ b/net/netfilter/ipvs/ip_vs_ctl.c @@ -3078,6 +3078,17 @@ nla_put_failure: return skb->len; } +static bool ip_vs_is_af_valid(int af) +{ + if (af == AF_INET) + return true; +#ifdef CONFIG_IP_VS_IPV6 + if (af == AF_INET6 && ipv6_mod_enabled()) + return true; +#endif + return false; +} + static int ip_vs_genl_parse_service(struct netns_ipvs *ipvs, struct ip_vs_service_user_kern *usvc, struct nlattr *nla, int full_entry, @@ -3105,11 +3116,7 @@ static int ip_vs_genl_parse_service(struct netns_ipvs *ipvs, memset(usvc, 0, sizeof(*usvc)); usvc->af = nla_get_u16(nla_af); -#ifdef CONFIG_IP_VS_IPV6 - if (usvc->af != AF_INET && usvc->af != AF_INET6) -#else - if (usvc->af != AF_INET) -#endif 
+ if (!ip_vs_is_af_valid(usvc->af)) return -EAFNOSUPPORT; if (nla_fwmark) { @@ -3612,6 +3619,11 @@ static int ip_vs_genl_set_cmd(struct sk_buff *skb, struct genl_info *info) if (udest.af == 0) udest.af = svc->af; + if (!ip_vs_is_af_valid(udest.af)) { + ret = -EAFNOSUPPORT; + goto out; + } + if (udest.af != svc->af && cmd != IPVS_CMD_DEL_DEST) { /* The synchronization protocol is incompatible * with mixed family services diff --git a/net/netfilter/nf_conntrack_core.c b/net/netfilter/nf_conntrack_core.c index f9245dbfe435..3c8f1ed2f555 100644 --- a/net/netfilter/nf_conntrack_core.c +++ b/net/netfilter/nf_conntrack_core.c @@ -1853,7 +1853,7 @@ EXPORT_SYMBOL_GPL(nf_conntrack_set_hashsize); module_param_call(hashsize, nf_conntrack_set_hashsize, param_get_uint, &nf_conntrack_htable_size, 0600); -static unsigned int total_extension_size(void) +static __always_inline unsigned int total_extension_size(void) { /* remember to add new extensions below */ BUILD_BUG_ON(NF_CT_EXT_NUM > 9); diff --git a/net/netfilter/nf_conntrack_helper.c b/net/netfilter/nf_conntrack_helper.c index 4b9dfe3eef62..3a60efa7799b 100644 --- a/net/netfilter/nf_conntrack_helper.c +++ b/net/netfilter/nf_conntrack_helper.c @@ -385,7 +385,7 @@ int nf_conntrack_helper_register(struct nf_conntrack_helper *me) struct nf_conntrack_tuple_mask mask = { .src.u.all = htons(0xFFFF) }; unsigned int h = helper_hash(&me->tuple); struct nf_conntrack_helper *cur; - int ret = 0; + int ret = 0, i; BUG_ON(me->expect_policy == NULL); BUG_ON(me->expect_class_max >= NF_CT_MAX_EXPECT_CLASSES); @@ -395,10 +395,26 @@ int nf_conntrack_helper_register(struct nf_conntrack_helper *me) return -EINVAL; mutex_lock(&nf_ct_helper_mutex); - hlist_for_each_entry(cur, &nf_ct_helper_hash[h], hnode) { - if (nf_ct_tuple_src_mask_cmp(&cur->tuple, &me->tuple, &mask)) { - ret = -EEXIST; - goto out; + for (i = 0; i < nf_ct_helper_hsize; i++) { + hlist_for_each_entry(cur, &nf_ct_helper_hash[i], hnode) { + if (!strcmp(cur->name, me->name) && + 
(cur->tuple.src.l3num == NFPROTO_UNSPEC || + cur->tuple.src.l3num == me->tuple.src.l3num) && + cur->tuple.dst.protonum == me->tuple.dst.protonum) { + ret = -EEXIST; + goto out; + } + } + } + + /* avoid unpredictable behaviour for auto_assign_helper */ + if (!(me->flags & NF_CT_HELPER_F_USERSPACE)) { + hlist_for_each_entry(cur, &nf_ct_helper_hash[h], hnode) { + if (nf_ct_tuple_src_mask_cmp(&cur->tuple, &me->tuple, + &mask)) { + ret = -EEXIST; + goto out; + } } } hlist_add_head_rcu(&me->hnode, &nf_ct_helper_hash[h]); diff --git a/net/netfilter/nf_conntrack_netlink.c b/net/netfilter/nf_conntrack_netlink.c index 5f6f2f388928..dcf561b5c97a 100644 --- a/net/netfilter/nf_conntrack_netlink.c +++ b/net/netfilter/nf_conntrack_netlink.c @@ -417,8 +417,7 @@ nla_put_failure: return -1; } -static int ctnetlink_dump_ct_seq_adj(struct sk_buff *skb, - const struct nf_conn *ct) +static int ctnetlink_dump_ct_seq_adj(struct sk_buff *skb, struct nf_conn *ct) { struct nf_conn_seqadj *seqadj = nfct_seqadj(ct); struct nf_ct_seqadj *seq; @@ -426,15 +425,20 @@ static int ctnetlink_dump_ct_seq_adj(struct sk_buff *skb, if (!(ct->status & IPS_SEQ_ADJUST) || !seqadj) return 0; + spin_lock_bh(&ct->lock); seq = &seqadj->seq[IP_CT_DIR_ORIGINAL]; if (dump_ct_seq_adj(skb, seq, CTA_SEQ_ADJ_ORIG) == -1) - return -1; + goto err; seq = &seqadj->seq[IP_CT_DIR_REPLY]; if (dump_ct_seq_adj(skb, seq, CTA_SEQ_ADJ_REPLY) == -1) - return -1; + goto err; + spin_unlock_bh(&ct->lock); return 0; +err: + spin_unlock_bh(&ct->lock); + return -1; } static int ctnetlink_dump_id(struct sk_buff *skb, const struct nf_conn *ct) @@ -1417,6 +1421,24 @@ ctnetlink_parse_nat_setup(struct nf_conn *ct, } #endif +static void +__ctnetlink_change_status(struct nf_conn *ct, unsigned long on, + unsigned long off) +{ + unsigned int bit; + + /* Ignore these unchangable bits */ + on &= ~IPS_UNCHANGEABLE_MASK; + off &= ~IPS_UNCHANGEABLE_MASK; + + for (bit = 0; bit < __IPS_MAX_BIT; bit++) { + if (on & (1 << bit)) + set_bit(bit, 
&ct->status); + else if (off & (1 << bit)) + clear_bit(bit, &ct->status); + } +} + static int ctnetlink_change_status(struct nf_conn *ct, const struct nlattr * const cda[]) { @@ -1436,10 +1458,7 @@ ctnetlink_change_status(struct nf_conn *ct, const struct nlattr * const cda[]) /* ASSURED bit can only be set */ return -EBUSY; - /* Be careful here, modifying NAT bits can screw up things, - * so don't let users modify them directly if they don't pass - * nf_nat_range. */ - ct->status |= status & ~(IPS_NAT_DONE_MASK | IPS_NAT_MASK); + __ctnetlink_change_status(ct, status, 0); return 0; } @@ -1508,23 +1527,11 @@ static int ctnetlink_change_helper(struct nf_conn *ct, return 0; } + rcu_read_lock(); helper = __nf_conntrack_helper_find(helpname, nf_ct_l3num(ct), nf_ct_protonum(ct)); if (helper == NULL) { -#ifdef CONFIG_MODULES - spin_unlock_bh(&nf_conntrack_expect_lock); - - if (request_module("nfct-helper-%s", helpname) < 0) { - spin_lock_bh(&nf_conntrack_expect_lock); - return -EOPNOTSUPP; - } - - spin_lock_bh(&nf_conntrack_expect_lock); - helper = __nf_conntrack_helper_find(helpname, nf_ct_l3num(ct), - nf_ct_protonum(ct)); - if (helper) - return -EAGAIN; -#endif + rcu_read_unlock(); return -EOPNOTSUPP; } @@ -1533,13 +1540,16 @@ static int ctnetlink_change_helper(struct nf_conn *ct, /* update private helper data if allowed. 
*/ if (helper->from_nlattr) helper->from_nlattr(helpinfo, ct); - return 0; + err = 0; } else - return -EBUSY; + err = -EBUSY; + } else { + /* we cannot set a helper for an existing conntrack */ + err = -EOPNOTSUPP; } - /* we cannot set a helper for an existing conntrack */ - return -EOPNOTSUPP; + rcu_read_unlock(); + return err; } static int ctnetlink_change_timeout(struct nf_conn *ct, @@ -1630,25 +1640,30 @@ ctnetlink_change_seq_adj(struct nf_conn *ct, if (!seqadj) return 0; + spin_lock_bh(&ct->lock); if (cda[CTA_SEQ_ADJ_ORIG]) { ret = change_seq_adj(&seqadj->seq[IP_CT_DIR_ORIGINAL], cda[CTA_SEQ_ADJ_ORIG]); if (ret < 0) - return ret; + goto err; - ct->status |= IPS_SEQ_ADJUST; + set_bit(IPS_SEQ_ADJUST_BIT, &ct->status); } if (cda[CTA_SEQ_ADJ_REPLY]) { ret = change_seq_adj(&seqadj->seq[IP_CT_DIR_REPLY], cda[CTA_SEQ_ADJ_REPLY]); if (ret < 0) - return ret; + goto err; - ct->status |= IPS_SEQ_ADJUST; + set_bit(IPS_SEQ_ADJUST_BIT, &ct->status); } + spin_unlock_bh(&ct->lock); return 0; +err: + spin_unlock_bh(&ct->lock); + return ret; } static int @@ -1959,9 +1974,7 @@ static int ctnetlink_new_conntrack(struct net *net, struct sock *ctnl, err = -EEXIST; ct = nf_ct_tuplehash_to_ctrack(h); if (!(nlh->nlmsg_flags & NLM_F_EXCL)) { - spin_lock_bh(&nf_conntrack_expect_lock); err = ctnetlink_change_conntrack(ct, cda); - spin_unlock_bh(&nf_conntrack_expect_lock); if (err == 0) { nf_conntrack_eventmask_report((1 << IPCT_REPLY) | (1 << IPCT_ASSURED) | @@ -2294,10 +2307,10 @@ ctnetlink_update_status(struct nf_conn *ct, const struct nlattr * const cda[]) /* This check is less strict than ctnetlink_change_status() * because callers often flip IPS_EXPECTED bits when sending * an NFQA_CT attribute to the kernel. So ignore the - * unchangeable bits but do not error out. + * unchangeable bits but do not error out. Also user programs + * are allowed to clear the bits that they are allowed to change. 
*/ - ct->status = (status & ~IPS_UNCHANGEABLE_MASK) | - (ct->status & IPS_UNCHANGEABLE_MASK); + __ctnetlink_change_status(ct, status, ~status); return 0; } @@ -2351,11 +2364,7 @@ ctnetlink_glue_parse(const struct nlattr *attr, struct nf_conn *ct) if (ret < 0) return ret; - spin_lock_bh(&nf_conntrack_expect_lock); - ret = ctnetlink_glue_parse_ct((const struct nlattr **)cda, ct); - spin_unlock_bh(&nf_conntrack_expect_lock); - - return ret; + return ctnetlink_glue_parse_ct((const struct nlattr **)cda, ct); } static int ctnetlink_glue_exp_parse(const struct nlattr * const *cda, diff --git a/net/netfilter/nf_tables_api.c b/net/netfilter/nf_tables_api.c index 1c6482d2c4dc..559225029740 100644 --- a/net/netfilter/nf_tables_api.c +++ b/net/netfilter/nf_tables_api.c @@ -3778,6 +3778,11 @@ static int nft_add_set_elem(struct nft_ctx *ctx, struct nft_set *set, err = set->ops->insert(ctx->net, set, &elem, &ext2); if (err) { if (err == -EEXIST) { + if (nft_set_ext_exists(ext, NFT_SET_EXT_DATA) ^ + nft_set_ext_exists(ext2, NFT_SET_EXT_DATA) || + nft_set_ext_exists(ext, NFT_SET_EXT_OBJREF) ^ + nft_set_ext_exists(ext2, NFT_SET_EXT_OBJREF)) + return -EBUSY; if ((nft_set_ext_exists(ext, NFT_SET_EXT_DATA) && nft_set_ext_exists(ext2, NFT_SET_EXT_DATA) && memcmp(nft_set_ext_data(ext), diff --git a/net/netfilter/nft_dynset.c b/net/netfilter/nft_dynset.c index 3948da380259..66221ad891a9 100644 --- a/net/netfilter/nft_dynset.c +++ b/net/netfilter/nft_dynset.c @@ -82,8 +82,7 @@ static void nft_dynset_eval(const struct nft_expr *expr, nft_set_ext_exists(ext, NFT_SET_EXT_EXPIRATION)) { timeout = priv->timeout ? 
: set->timeout; *nft_set_ext_expiration(ext) = jiffies + timeout; - } else if (sexpr == NULL) - goto out; + } if (sexpr != NULL) sexpr->ops->eval(sexpr, regs, pkt); @@ -92,7 +91,7 @@ static void nft_dynset_eval(const struct nft_expr *expr, regs->verdict.code = NFT_BREAK; return; } -out: + if (!priv->invert) regs->verdict.code = NFT_BREAK; } diff --git a/net/netfilter/nft_set_bitmap.c b/net/netfilter/nft_set_bitmap.c index 8ebbc2940f4c..b988162b5b15 100644 --- a/net/netfilter/nft_set_bitmap.c +++ b/net/netfilter/nft_set_bitmap.c @@ -257,6 +257,11 @@ static int nft_bitmap_init(const struct nft_set *set, static void nft_bitmap_destroy(const struct nft_set *set) { + struct nft_bitmap *priv = nft_set_priv(set); + struct nft_bitmap_elem *be, *n; + + list_for_each_entry_safe(be, n, &priv->list, head) + nft_set_elem_destroy(set, be, true); } static bool nft_bitmap_estimate(const struct nft_set_desc *desc, u32 features, diff --git a/net/netfilter/x_tables.c b/net/netfilter/x_tables.c index 14857afc9937..f134d384852f 100644 --- a/net/netfilter/x_tables.c +++ b/net/netfilter/x_tables.c @@ -1051,8 +1051,10 @@ struct xt_table *xt_find_table_lock(struct net *net, u_int8_t af, list_for_each_entry(t, &init_net.xt.tables[af], list) { if (strcmp(t->name, name)) continue; - if (!try_module_get(t->me)) + if (!try_module_get(t->me)) { + mutex_unlock(&xt[af].mutex); return NULL; + } mutex_unlock(&xt[af].mutex); if (t->table_init(net) != 0) { diff --git a/net/netfilter/xt_CT.c b/net/netfilter/xt_CT.c index 3cbe1bcf6a74..bb7ad82dcd56 100644 --- a/net/netfilter/xt_CT.c +++ b/net/netfilter/xt_CT.c @@ -168,8 +168,10 @@ xt_ct_set_timeout(struct nf_conn *ct, const struct xt_tgchk_param *par, goto err_put_timeout; } timeout_ext = nf_ct_timeout_ext_add(ct, timeout, GFP_ATOMIC); - if (timeout_ext == NULL) + if (!timeout_ext) { ret = -ENOMEM; + goto err_put_timeout; + } rcu_read_unlock(); return ret; @@ -201,6 +203,7 @@ static int xt_ct_tg_check(const struct xt_tgchk_param *par, struct 
xt_ct_target_info_v1 *info) { struct nf_conntrack_zone zone; + struct nf_conn_help *help; struct nf_conn *ct; int ret = -EOPNOTSUPP; @@ -249,7 +252,7 @@ static int xt_ct_tg_check(const struct xt_tgchk_param *par, if (info->timeout[0]) { ret = xt_ct_set_timeout(ct, par, info->timeout); if (ret < 0) - goto err3; + goto err4; } __set_bit(IPS_CONFIRMED_BIT, &ct->status); nf_conntrack_get(&ct->ct_general); @@ -257,6 +260,10 @@ out: info->ct = ct; return 0; +err4: + help = nfct_help(ct); + if (help) + module_put(help->helper->me); err3: nf_ct_tmpl_free(ct); err2: diff --git a/net/netfilter/xt_socket.c b/net/netfilter/xt_socket.c index 770bbec878f1..e75ef39669c5 100644 --- a/net/netfilter/xt_socket.c +++ b/net/netfilter/xt_socket.c @@ -152,7 +152,7 @@ static int socket_mt_enable_defrag(struct net *net, int family) switch (family) { case NFPROTO_IPV4: return nf_defrag_ipv4_enable(net); -#ifdef XT_SOCKET_HAVE_IPV6 +#if IS_ENABLED(CONFIG_IP6_NF_IPTABLES) case NFPROTO_IPV6: return nf_defrag_ipv6_enable(net); #endif diff --git a/net/openvswitch/conntrack.c b/net/openvswitch/conntrack.c index 42a95919df09..bf602e33c40a 100644 --- a/net/openvswitch/conntrack.c +++ b/net/openvswitch/conntrack.c @@ -516,10 +516,38 @@ ovs_ct_expect_find(struct net *net, const struct nf_conntrack_zone *zone, u16 proto, const struct sk_buff *skb) { struct nf_conntrack_tuple tuple; + struct nf_conntrack_expect *exp; if (!nf_ct_get_tuplepr(skb, skb_network_offset(skb), proto, net, &tuple)) return NULL; - return __nf_ct_expect_find(net, zone, &tuple); + + exp = __nf_ct_expect_find(net, zone, &tuple); + if (exp) { + struct nf_conntrack_tuple_hash *h; + + /* Delete existing conntrack entry, if it clashes with the + * expectation. This can happen since conntrack ALGs do not + * check for clashes between (new) expectations and existing + * conntrack entries. 
nf_conntrack_in() will check the + * expectations only if a conntrack entry can not be found, + * which can lead to OVS finding the expectation (here) in the + * init direction, but which will not be removed by the + * nf_conntrack_in() call, if a matching conntrack entry is + * found instead. In this case all init direction packets + * would be reported as new related packets, while reply + * direction packets would be reported as un-related + * established packets. + */ + h = nf_conntrack_find_get(net, zone, &tuple); + if (h) { + struct nf_conn *ct = nf_ct_tuplehash_to_ctrack(h); + + nf_ct_delete(ct, 0, 0); + nf_conntrack_put(&ct->ct_general); + } + } + + return exp; } /* This replicates logic from nf_conntrack_core.c that is not exported. */ diff --git a/net/sched/cls_matchall.c b/net/sched/cls_matchall.c index 2efb36c08f2a..dee469fed967 100644 --- a/net/sched/cls_matchall.c +++ b/net/sched/cls_matchall.c @@ -203,8 +203,7 @@ static int mall_change(struct net *net, struct sk_buff *in_skb, *arg = (unsigned long) head; rcu_assign_pointer(tp->root, new); - if (head) - call_rcu(&head->rcu, mall_destroy_rcu); + call_rcu(&head->rcu, mall_destroy_rcu); return 0; err_replace_hw_filter: diff --git a/samples/bpf/bpf_load.c b/samples/bpf/bpf_load.c index 4221dc359453..74456b3eb89a 100644 --- a/samples/bpf/bpf_load.c +++ b/samples/bpf/bpf_load.c @@ -39,6 +39,9 @@ int event_fd[MAX_PROGS]; int prog_cnt; int prog_array_fd = -1; +struct bpf_map_data map_data[MAX_MAPS]; +int map_data_count = 0; + static int populate_prog_array(const char *event, int prog_fd) { int ind = atoi(event), err; @@ -186,42 +189,45 @@ static int load_and_attach(const char *event, struct bpf_insn *prog, int size) return 0; } -static int load_maps(struct bpf_map_def *maps, int nr_maps, - const char **map_names, fixup_map_cb fixup_map) +static int load_maps(struct bpf_map_data *maps, int nr_maps, + fixup_map_cb fixup_map) { int i; - /* - * Warning: Using "maps" pointing to ELF data_maps->d_buf as - * an 
array of struct bpf_map_def is a wrong assumption about - * the ELF maps section format. - */ + for (i = 0; i < nr_maps; i++) { - if (fixup_map) - fixup_map(&maps[i], map_names[i], i); - - if (maps[i].type == BPF_MAP_TYPE_ARRAY_OF_MAPS || - maps[i].type == BPF_MAP_TYPE_HASH_OF_MAPS) { - int inner_map_fd = map_fd[maps[i].inner_map_idx]; - - map_fd[i] = bpf_create_map_in_map(maps[i].type, - maps[i].key_size, - inner_map_fd, - maps[i].max_entries, - maps[i].map_flags); + if (fixup_map) { + fixup_map(&maps[i], i); + /* Allow userspace to assign map FD prior to creation */ + if (maps[i].fd != -1) { + map_fd[i] = maps[i].fd; + continue; + } + } + + if (maps[i].def.type == BPF_MAP_TYPE_ARRAY_OF_MAPS || + maps[i].def.type == BPF_MAP_TYPE_HASH_OF_MAPS) { + int inner_map_fd = map_fd[maps[i].def.inner_map_idx]; + + map_fd[i] = bpf_create_map_in_map(maps[i].def.type, + maps[i].def.key_size, + inner_map_fd, + maps[i].def.max_entries, + maps[i].def.map_flags); } else { - map_fd[i] = bpf_create_map(maps[i].type, - maps[i].key_size, - maps[i].value_size, - maps[i].max_entries, - maps[i].map_flags); + map_fd[i] = bpf_create_map(maps[i].def.type, + maps[i].def.key_size, + maps[i].def.value_size, + maps[i].def.max_entries, + maps[i].def.map_flags); } if (map_fd[i] < 0) { printf("failed to create a map: %d %s\n", errno, strerror(errno)); return 1; } + maps[i].fd = map_fd[i]; - if (maps[i].type == BPF_MAP_TYPE_PROG_ARRAY) + if (maps[i].def.type == BPF_MAP_TYPE_PROG_ARRAY) prog_array_fd = map_fd[i]; } return 0; @@ -251,7 +257,8 @@ static int get_sec(Elf *elf, int i, GElf_Ehdr *ehdr, char **shname, } static int parse_relo_and_apply(Elf_Data *data, Elf_Data *symbols, - GElf_Shdr *shdr, struct bpf_insn *insn) + GElf_Shdr *shdr, struct bpf_insn *insn, + struct bpf_map_data *maps, int nr_maps) { int i, nrels; @@ -261,6 +268,8 @@ static int parse_relo_and_apply(Elf_Data *data, Elf_Data *symbols, GElf_Sym sym; GElf_Rel rel; unsigned int insn_idx; + bool match = false; + int j, map_idx; 
gelf_getrel(data, i, &rel); @@ -274,11 +283,21 @@ static int parse_relo_and_apply(Elf_Data *data, Elf_Data *symbols, return 1; } insn[insn_idx].src_reg = BPF_PSEUDO_MAP_FD; - /* - * Warning: Using sizeof(struct bpf_map_def) here is a - * wrong assumption about ELF maps section format - */ - insn[insn_idx].imm = map_fd[sym.st_value / sizeof(struct bpf_map_def)]; + + /* Match FD relocation against recorded map_data[] offset */ + for (map_idx = 0; map_idx < nr_maps; map_idx++) { + if (maps[map_idx].elf_offset == sym.st_value) { + match = true; + break; + } + } + if (match) { + insn[insn_idx].imm = maps[map_idx].fd; + } else { + printf("invalid relo for insn[%d] no map_data match\n", + insn_idx); + return 1; + } } return 0; @@ -297,40 +316,112 @@ static int cmp_symbols(const void *l, const void *r) return 0; } -static int get_sorted_map_names(Elf *elf, Elf_Data *symbols, int maps_shndx, - int strtabidx, char **map_names) +static int load_elf_maps_section(struct bpf_map_data *maps, int maps_shndx, + Elf *elf, Elf_Data *symbols, int strtabidx) { - GElf_Sym map_symbols[MAX_MAPS]; - int i, nr_maps = 0; + int map_sz_elf, map_sz_copy; + bool validate_zero = false; + Elf_Data *data_maps; + int i, nr_maps; + GElf_Sym *sym; + Elf_Scn *scn; + int copy_sz; + + if (maps_shndx < 0) + return -EINVAL; + if (!symbols) + return -EINVAL; + + /* Get data for maps section via elf index */ + scn = elf_getscn(elf, maps_shndx); + if (scn) + data_maps = elf_getdata(scn, NULL); + if (!scn || !data_maps) { + printf("Failed to get Elf_Data from maps section %d\n", + maps_shndx); + return -EINVAL; + } - for (i = 0; i < symbols->d_size / sizeof(GElf_Sym); i++) { - assert(nr_maps < MAX_MAPS); - if (!gelf_getsym(symbols, i, &map_symbols[nr_maps])) + /* For each map get corresponding symbol table entry */ + sym = calloc(MAX_MAPS+1, sizeof(GElf_Sym)); + for (i = 0, nr_maps = 0; i < symbols->d_size / sizeof(GElf_Sym); i++) { + assert(nr_maps < MAX_MAPS+1); + if (!gelf_getsym(symbols, i, &sym[nr_maps])) 
continue; - if (map_symbols[nr_maps].st_shndx != maps_shndx) + if (sym[nr_maps].st_shndx != maps_shndx) continue; + /* Only increment if maps section */ nr_maps++; } - qsort(map_symbols, nr_maps, sizeof(GElf_Sym), cmp_symbols); + /* Align to map_fd[] order, via sort on offset in sym.st_value */ + qsort(sym, nr_maps, sizeof(GElf_Sym), cmp_symbols); + + /* Keeping compatible with ELF maps section changes + * ------------------------------------------------ + * The program size of struct bpf_map_def is known by loader + * code, but struct stored in ELF file can be different. + * + * Unfortunately sym[i].st_size is zero. To calculate the + * struct size stored in the ELF file, assume all structs have + * the same size, and simply divide by the number of map + * symbols. + */ + map_sz_elf = data_maps->d_size / nr_maps; + map_sz_copy = sizeof(struct bpf_map_def); + if (map_sz_elf < map_sz_copy) { + /* + * Backward compat, loading older ELF file with + * smaller struct, keeping remaining bytes zero. + */ + map_sz_copy = map_sz_elf; + } else if (map_sz_elf > map_sz_copy) { + /* + * Forward compat, loading newer ELF file with larger + * struct with unknown features. Assume zero means + * feature not used. Thus, validate rest of struct + * data is zero. 
+ */ + validate_zero = true; + } + /* Memcpy relevant part of ELF maps data to loader maps */ for (i = 0; i < nr_maps; i++) { - char *map_name; - - map_name = elf_strptr(elf, strtabidx, map_symbols[i].st_name); - if (!map_name) { - printf("cannot get map symbol\n"); - return -1; - } - - map_names[i] = strdup(map_name); - if (!map_names[i]) { + unsigned char *addr, *end; + struct bpf_map_def *def; + const char *map_name; + size_t offset; + + map_name = elf_strptr(elf, strtabidx, sym[i].st_name); + maps[i].name = strdup(map_name); + if (!maps[i].name) { printf("strdup(%s): %s(%d)\n", map_name, strerror(errno), errno); - return -1; + free(sym); + return -errno; + } + + /* Symbol value is offset into ELF maps section data area */ + offset = sym[i].st_value; + def = (struct bpf_map_def *)(data_maps->d_buf + offset); + maps[i].elf_offset = offset; + memset(&maps[i].def, 0, sizeof(struct bpf_map_def)); + memcpy(&maps[i].def, def, map_sz_copy); + + /* Verify no newer features were requested */ + if (validate_zero) { + addr = (unsigned char*) def + map_sz_copy; + end = (unsigned char*) def + map_sz_elf; + for (; addr < end; addr++) { + if (*addr != 0) { + free(sym); + return -EFBIG; + } + } } } + free(sym); return nr_maps; } @@ -341,7 +432,8 @@ static int do_load_bpf_file(const char *path, fixup_map_cb fixup_map) GElf_Ehdr ehdr; GElf_Shdr shdr, shdr_prog; Elf_Data *data, *data_prog, *data_maps = NULL, *symbols = NULL; - char *shname, *shname_prog, *map_names[MAX_MAPS] = { NULL }; + char *shname, *shname_prog; + int nr_maps = 0; /* reset global variables */ kern_version = 0; @@ -389,8 +481,12 @@ static int do_load_bpf_file(const char *path, fixup_map_cb fixup_map) } memcpy(&kern_version, data->d_buf, sizeof(int)); } else if (strcmp(shname, "maps") == 0) { + int j; + maps_shndx = i; data_maps = data; + for (j = 0; j < MAX_MAPS; j++) + map_data[j].fd = -1; } else if (shdr.sh_type == SHT_SYMTAB) { strtabidx = shdr.sh_link; symbols = data; @@ -405,27 +501,17 @@ static int 
do_load_bpf_file(const char *path, fixup_map_cb fixup_map) } if (data_maps) { - int nr_maps; - int prog_elf_map_sz; - - nr_maps = get_sorted_map_names(elf, symbols, maps_shndx, - strtabidx, map_names); - if (nr_maps < 0) - goto done; - - /* Deduce map struct size stored in ELF maps section */ - prog_elf_map_sz = data_maps->d_size / nr_maps; - if (prog_elf_map_sz != sizeof(struct bpf_map_def)) { - printf("Error: ELF maps sec wrong size (%d/%lu)," - " old kern.o file?\n", - prog_elf_map_sz, sizeof(struct bpf_map_def)); + nr_maps = load_elf_maps_section(map_data, maps_shndx, + elf, symbols, strtabidx); + if (nr_maps < 0) { + printf("Error: Failed loading ELF maps (errno:%d):%s\n", + nr_maps, strerror(-nr_maps)); ret = 1; goto done; } - - if (load_maps(data_maps->d_buf, nr_maps, - (const char **)map_names, fixup_map)) + if (load_maps(map_data, nr_maps, fixup_map)) goto done; + map_data_count = nr_maps; processed_sec[maps_shndx] = true; } @@ -453,7 +539,8 @@ static int do_load_bpf_file(const char *path, fixup_map_cb fixup_map) processed_sec[shdr.sh_info] = true; processed_sec[i] = true; - if (parse_relo_and_apply(data, symbols, &shdr, insns)) + if (parse_relo_and_apply(data, symbols, &shdr, insns, + map_data, nr_maps)) continue; if (memcmp(shname_prog, "kprobe/", 7) == 0 || @@ -488,8 +575,6 @@ static int do_load_bpf_file(const char *path, fixup_map_cb fixup_map) ret = 0; done: - for (i = 0; i < MAX_MAPS; i++) - free(map_names[i]); close(fd); return ret; } diff --git a/samples/bpf/bpf_load.h b/samples/bpf/bpf_load.h index 05822f83173a..ca0563d04744 100644 --- a/samples/bpf/bpf_load.h +++ b/samples/bpf/bpf_load.h @@ -15,15 +15,27 @@ struct bpf_map_def { unsigned int inner_map_idx; }; -typedef void (*fixup_map_cb)(struct bpf_map_def *map, const char *map_name, - int idx); +struct bpf_map_data { + int fd; + char *name; + size_t elf_offset; + struct bpf_map_def def; +}; + +typedef void (*fixup_map_cb)(struct bpf_map_data *map, int idx); -extern int map_fd[MAX_MAPS]; extern 
int prog_fd[MAX_PROGS]; extern int event_fd[MAX_PROGS]; extern char bpf_log_buf[BPF_LOG_BUF_SIZE]; extern int prog_cnt; +/* There is a one-to-one mapping between map_fd[] and map_data[]. + * The map_data[] just contains more rich info on the given map. + */ +extern int map_fd[MAX_MAPS]; +extern struct bpf_map_data map_data[MAX_MAPS]; +extern int map_data_count; + /* parses elf file compiled by llvm .c->.o * . parses 'maps' section and creates maps via BPF syscall * . parses 'license' section and passes it to syscall diff --git a/samples/bpf/map_perf_test_user.c b/samples/bpf/map_perf_test_user.c index 6ac778153315..1a8894b5ac51 100644 --- a/samples/bpf/map_perf_test_user.c +++ b/samples/bpf/map_perf_test_user.c @@ -320,21 +320,21 @@ static void fill_lpm_trie(void) assert(!r); } -static void fixup_map(struct bpf_map_def *map, const char *name, int idx) +static void fixup_map(struct bpf_map_data *map, int idx) { int i; - if (!strcmp("inner_lru_hash_map", name)) { + if (!strcmp("inner_lru_hash_map", map->name)) { inner_lru_hash_idx = idx; - inner_lru_hash_size = map->max_entries; + inner_lru_hash_size = map->def.max_entries; } - if (!strcmp("array_of_lru_hashs", name)) { + if (!strcmp("array_of_lru_hashs", map->name)) { if (inner_lru_hash_idx == -1) { printf("inner_lru_hash_map must be defined before array_of_lru_hashs\n"); exit(1); } - map->inner_map_idx = inner_lru_hash_idx; + map->def.inner_map_idx = inner_lru_hash_idx; array_of_lru_hashs_idx = idx; } @@ -345,9 +345,9 @@ static void fixup_map(struct bpf_map_def *map, const char *name, int idx) /* Only change the max_entries for the enabled test(s) */ for (i = 0; i < NR_TESTS; i++) { - if (!strcmp(test_map_names[i], name) && + if (!strcmp(test_map_names[i], map->name) && (check_test_flags(i))) { - map->max_entries = num_map_entries; + map->def.max_entries = num_map_entries; } } } diff --git a/samples/bpf/tracex2_user.c b/samples/bpf/tracex2_user.c index ded9804c5034..7fee0f1ba9a3 100644 --- 
a/samples/bpf/tracex2_user.c +++ b/samples/bpf/tracex2_user.c @@ -4,6 +4,7 @@ #include <signal.h> #include <linux/bpf.h> #include <string.h> +#include <sys/resource.h> #include "libbpf.h" #include "bpf_load.h" @@ -112,6 +113,7 @@ static void int_exit(int sig) int main(int ac, char **argv) { + struct rlimit r = {1024*1024, RLIM_INFINITY}; char filename[256]; long key, next_key, value; FILE *f; @@ -119,6 +121,11 @@ int main(int ac, char **argv) snprintf(filename, sizeof(filename), "%s_kern.o", argv[0]); + if (setrlimit(RLIMIT_MEMLOCK, &r)) { + perror("setrlimit(RLIMIT_MEMLOCK)"); + return 1; + } + signal(SIGINT, int_exit); /* start 'ping' in the background to have some kfree_skb events */ diff --git a/samples/bpf/tracex3_user.c b/samples/bpf/tracex3_user.c index 8f7d199d5945..fe372239d505 100644 --- a/samples/bpf/tracex3_user.c +++ b/samples/bpf/tracex3_user.c @@ -11,6 +11,7 @@ #include <stdbool.h> #include <string.h> #include <linux/bpf.h> +#include <sys/resource.h> #include "libbpf.h" #include "bpf_load.h" @@ -112,11 +113,17 @@ static void print_hist(int fd) int main(int ac, char **argv) { + struct rlimit r = {1024*1024, RLIM_INFINITY}; char filename[256]; int i; snprintf(filename, sizeof(filename), "%s_kern.o", argv[0]); + if (setrlimit(RLIMIT_MEMLOCK, &r)) { + perror("setrlimit(RLIMIT_MEMLOCK)"); + return 1; + } + if (load_bpf_file(filename)) { printf("%s", bpf_log_buf); return 1; diff --git a/samples/bpf/tracex4_user.c b/samples/bpf/tracex4_user.c index 03449f773cb1..22c644f1f4c3 100644 --- a/samples/bpf/tracex4_user.c +++ b/samples/bpf/tracex4_user.c @@ -12,6 +12,8 @@ #include <string.h> #include <time.h> #include <linux/bpf.h> +#include <sys/resource.h> + #include "libbpf.h" #include "bpf_load.h" @@ -50,11 +52,17 @@ static void print_old_objects(int fd) int main(int ac, char **argv) { + struct rlimit r = {RLIM_INFINITY, RLIM_INFINITY}; char filename[256]; int i; snprintf(filename, sizeof(filename), "%s_kern.o", argv[0]); + if (setrlimit(RLIMIT_MEMLOCK, &r)) { 
+ perror("setrlimit(RLIMIT_MEMLOCK, RLIM_INFINITY)"); + return 1; + } + if (load_bpf_file(filename)) { printf("%s", bpf_log_buf); return 1; diff --git a/tools/testing/selftests/bpf/Makefile b/tools/testing/selftests/bpf/Makefile index d8d94b9bd76c..91edd0566237 100644 --- a/tools/testing/selftests/bpf/Makefile +++ b/tools/testing/selftests/bpf/Makefile @@ -13,7 +13,7 @@ LDLIBS += -lcap -lelf TEST_GEN_PROGS = test_verifier test_tag test_maps test_lru_map test_lpm_map test_progs -TEST_GEN_FILES = test_pkt_access.o test_xdp.o test_l4lb.o +TEST_GEN_FILES = test_pkt_access.o test_xdp.o test_l4lb.o test_tcp_estats.o TEST_PROGS := test_kmod.sh @@ -34,6 +34,6 @@ $(BPFOBJ): force CLANG ?= clang %.o: %.c - $(CLANG) -I../../../include/uapi -I../../../../samples/bpf/ \ - -D__x86_64__ -Wno-compare-distinct-pointer-types \ + $(CLANG) -I. -I../../../include/uapi -I../../../../samples/bpf/ \ + -Wno-compare-distinct-pointer-types \ -O2 -target bpf -c $< -o $@ diff --git a/tools/testing/selftests/bpf/gnu/stubs.h b/tools/testing/selftests/bpf/gnu/stubs.h new file mode 100644 index 000000000000..719225b16626 --- /dev/null +++ b/tools/testing/selftests/bpf/gnu/stubs.h @@ -0,0 +1 @@ +/* dummy .h to trick /usr/include/features.h to work with 'clang -target bpf' */ diff --git a/tools/testing/selftests/bpf/test_progs.c b/tools/testing/selftests/bpf/test_progs.c index 4ed049a0b14b..b59f5ed4ae40 100644 --- a/tools/testing/selftests/bpf/test_progs.c +++ b/tools/testing/selftests/bpf/test_progs.c @@ -268,6 +268,21 @@ out: bpf_object__close(obj); } +static void test_tcp_estats(void) +{ + const char *file = "./test_tcp_estats.o"; + int err, prog_fd; + struct bpf_object *obj; + __u32 duration = 0; + + err = bpf_prog_load(file, BPF_PROG_TYPE_TRACEPOINT, &obj, &prog_fd); + CHECK(err, "", "err %d errno %d\n", err, errno); + if (err) + return; + + bpf_object__close(obj); +} + int main(void) { struct rlimit rinf = { RLIM_INFINITY, RLIM_INFINITY }; @@ -277,6 +292,7 @@ int main(void) test_pkt_access(); 
test_xdp(); test_l4lb(); + test_tcp_estats(); printf("Summary: %d PASSED, %d FAILED\n", pass_cnt, error_cnt); return 0; diff --git a/tools/testing/selftests/bpf/test_tcp_estats.c b/tools/testing/selftests/bpf/test_tcp_estats.c new file mode 100644 index 000000000000..bee3bbecc0c4 --- /dev/null +++ b/tools/testing/selftests/bpf/test_tcp_estats.c @@ -0,0 +1,258 @@ +/* Copyright (c) 2017 Facebook + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of version 2 of the GNU General Public + * License as published by the Free Software Foundation. + */ + +/* This program shows clang/llvm is able to generate code pattern + * like: + * _tcp_send_active_reset: + * 0: bf 16 00 00 00 00 00 00 r6 = r1 + * ...... + * 335: b7 01 00 00 0f 00 00 00 r1 = 15 + * 336: 05 00 48 00 00 00 00 00 goto 72 + * + * LBB0_3: + * 337: b7 01 00 00 01 00 00 00 r1 = 1 + * 338: 63 1a d0 ff 00 00 00 00 *(u32 *)(r10 - 48) = r1 + * 408: b7 01 00 00 03 00 00 00 r1 = 3 + * + * LBB0_4: + * 409: 71 a2 fe ff 00 00 00 00 r2 = *(u8 *)(r10 - 2) + * 410: bf a7 00 00 00 00 00 00 r7 = r10 + * 411: 07 07 00 00 b8 ff ff ff r7 += -72 + * 412: bf 73 00 00 00 00 00 00 r3 = r7 + * 413: 0f 13 00 00 00 00 00 00 r3 += r1 + * 414: 73 23 2d 00 00 00 00 00 *(u8 *)(r3 + 45) = r2 + * + * From the above code snippet, the code generated by the compiler + * is reasonable. The "r1" is assigned to different values in basic + * blocks "_tcp_send_active_reset" and "LBB0_3", and used in "LBB0_4". + * The verifier should be able to handle such code patterns. + */ +#include <string.h> +#include <linux/bpf.h> +#include <linux/ipv6.h> +#include <linux/version.h> +#include <sys/socket.h> +#include "bpf_helpers.h" + +#define _(P) ({typeof(P) val = 0; bpf_probe_read(&val, sizeof(val), &P); val;}) +#define TCP_ESTATS_MAGIC 0xBAADBEEF + +/* This test case needs "sock" and "pt_regs" data structure. + * Recursively, "sock" needs "sock_common" and "inet_sock". 
+ * However, this is a unit test case only for + * verifier purpose without bpf program execution. + * We can safely mock much simpler data structures, basically + * only taking the necessary fields from kernel headers. + */ +typedef __u32 __bitwise __portpair; +typedef __u64 __bitwise __addrpair; + +struct sock_common { + unsigned short skc_family; + union { + __addrpair skc_addrpair; + struct { + __be32 skc_daddr; + __be32 skc_rcv_saddr; + }; + }; + union { + __portpair skc_portpair; + struct { + __be16 skc_dport; + __u16 skc_num; + }; + }; + struct in6_addr skc_v6_daddr; + struct in6_addr skc_v6_rcv_saddr; +}; + +struct sock { + struct sock_common __sk_common; +#define sk_family __sk_common.skc_family +#define sk_v6_daddr __sk_common.skc_v6_daddr +#define sk_v6_rcv_saddr __sk_common.skc_v6_rcv_saddr +}; + +struct inet_sock { + struct sock sk; +#define inet_daddr sk.__sk_common.skc_daddr +#define inet_dport sk.__sk_common.skc_dport + __be32 inet_saddr; + __be16 inet_sport; +}; + +struct pt_regs { + long di; +}; + +static inline struct inet_sock *inet_sk(const struct sock *sk) +{ + return (struct inet_sock *)sk; +} + +/* Define various data structures for state recording. + * Some fields are not used due to test simplification. + */ +enum tcp_estats_addrtype { + TCP_ESTATS_ADDRTYPE_IPV4 = 1, + TCP_ESTATS_ADDRTYPE_IPV6 = 2 +}; + +enum tcp_estats_event_type { + TCP_ESTATS_ESTABLISH, + TCP_ESTATS_PERIODIC, + TCP_ESTATS_TIMEOUT, + TCP_ESTATS_RETRANSMIT_TIMEOUT, + TCP_ESTATS_RETRANSMIT_OTHER, + TCP_ESTATS_SYN_RETRANSMIT, + TCP_ESTATS_SYNACK_RETRANSMIT, + TCP_ESTATS_TERM, + TCP_ESTATS_TX_RESET, + TCP_ESTATS_RX_RESET, + TCP_ESTATS_WRITE_TIMEOUT, + TCP_ESTATS_CONN_TIMEOUT, + TCP_ESTATS_ACK_LATENCY, + TCP_ESTATS_NEVENTS, +}; + +struct tcp_estats_event { + int pid; + int cpu; + unsigned long ts; + unsigned int magic; + enum tcp_estats_event_type event_type; +}; + +/* The below data structure is packed in order for + * llvm compiler to generate expected code. 
+ */ +struct tcp_estats_conn_id { + unsigned int localaddressType; + struct { + unsigned char data[16]; + } localaddress; + struct { + unsigned char data[16]; + } remaddress; + unsigned short localport; + unsigned short remport; +} __attribute__((__packed__)); + +struct tcp_estats_basic_event { + struct tcp_estats_event event; + struct tcp_estats_conn_id conn_id; +}; + +struct bpf_map_def SEC("maps") ev_record_map = { + .type = BPF_MAP_TYPE_HASH, + .key_size = sizeof(__u32), + .value_size = sizeof(struct tcp_estats_basic_event), + .max_entries = 1024, +}; + +struct dummy_tracepoint_args { + unsigned long long pad; + struct sock *sock; +}; + +static __always_inline void tcp_estats_ev_init(struct tcp_estats_event *event, + enum tcp_estats_event_type type) +{ + event->magic = TCP_ESTATS_MAGIC; + event->ts = bpf_ktime_get_ns(); + event->event_type = type; +} + +static __always_inline void unaligned_u32_set(unsigned char *to, __u8 *from) +{ + to[0] = _(from[0]); + to[1] = _(from[1]); + to[2] = _(from[2]); + to[3] = _(from[3]); +} + +static __always_inline void conn_id_ipv4_init(struct tcp_estats_conn_id *conn_id, + __be32 *saddr, __be32 *daddr) +{ + conn_id->localaddressType = TCP_ESTATS_ADDRTYPE_IPV4; + + unaligned_u32_set(conn_id->localaddress.data, (__u8 *)saddr); + unaligned_u32_set(conn_id->remaddress.data, (__u8 *)daddr); +} + +static __always_inline void conn_id_ipv6_init(struct tcp_estats_conn_id *conn_id, + __be32 *saddr, __be32 *daddr) +{ + conn_id->localaddressType = TCP_ESTATS_ADDRTYPE_IPV6; + + unaligned_u32_set(conn_id->localaddress.data, (__u8 *)saddr); + unaligned_u32_set(conn_id->localaddress.data + sizeof(__u32), + (__u8 *)(saddr + 1)); + unaligned_u32_set(conn_id->localaddress.data + sizeof(__u32) * 2, + (__u8 *)(saddr + 2)); + unaligned_u32_set(conn_id->localaddress.data + sizeof(__u32) * 3, + (__u8 *)(saddr + 3)); + + unaligned_u32_set(conn_id->remaddress.data, + (__u8 *)(daddr)); + unaligned_u32_set(conn_id->remaddress.data + sizeof(__u32), + (__u8 
*)(daddr + 1)); + unaligned_u32_set(conn_id->remaddress.data + sizeof(__u32) * 2, + (__u8 *)(daddr + 2)); + unaligned_u32_set(conn_id->remaddress.data + sizeof(__u32) * 3, + (__u8 *)(daddr + 3)); +} + +static __always_inline void tcp_estats_conn_id_init(struct tcp_estats_conn_id *conn_id, + struct sock *sk) +{ + conn_id->localport = _(inet_sk(sk)->inet_sport); + conn_id->remport = _(inet_sk(sk)->inet_dport); + + if (_(sk->sk_family) == AF_INET6) + conn_id_ipv6_init(conn_id, + sk->sk_v6_rcv_saddr.s6_addr32, + sk->sk_v6_daddr.s6_addr32); + else + conn_id_ipv4_init(conn_id, + &inet_sk(sk)->inet_saddr, + &inet_sk(sk)->inet_daddr); +} + +static __always_inline void tcp_estats_init(struct sock *sk, + struct tcp_estats_event *event, + struct tcp_estats_conn_id *conn_id, + enum tcp_estats_event_type type) +{ + tcp_estats_ev_init(event, type); + tcp_estats_conn_id_init(conn_id, sk); +} + +static __always_inline void send_basic_event(struct sock *sk, + enum tcp_estats_event_type type) +{ + struct tcp_estats_basic_event ev; + __u32 key = bpf_get_prandom_u32(); + + memset(&ev, 0, sizeof(ev)); + tcp_estats_init(sk, &ev.event, &ev.conn_id, type); + bpf_map_update_elem(&ev_record_map, &key, &ev, BPF_ANY); +} + +SEC("dummy_tracepoint") +int _dummy_tracepoint(struct dummy_tracepoint_args *arg) +{ + if (!arg->sock) + return 0; + + send_basic_event(arg->sock, TCP_ESTATS_TX_RESET); + return 0; +} + +char _license[] SEC("license") = "GPL"; +__u32 _version SEC("version") = 1; /* ignored by tracepoints, required by libbpf.a */ |