From cfbe3650dd3ef2ea9a4420ca89d9a4df98af3fb6 Mon Sep 17 00:00:00 2001 From: Dongliang Mu Date: Wed, 14 Jul 2021 11:27:03 +0800 Subject: netfilter: nf_tables: fix audit memory leak in nf_tables_commit In nf_tables_commit, if nf_tables_commit_audit_alloc fails, it does not free the adp variable. Fix this by adding nf_tables_commit_audit_free which frees the linked list with the head node adl. backtrace: kmalloc include/linux/slab.h:591 [inline] kzalloc include/linux/slab.h:721 [inline] nf_tables_commit_audit_alloc net/netfilter/nf_tables_api.c:8439 [inline] nf_tables_commit+0x16e/0x1760 net/netfilter/nf_tables_api.c:8508 nfnetlink_rcv_batch+0x512/0xa80 net/netfilter/nfnetlink.c:562 nfnetlink_rcv_skb_batch net/netfilter/nfnetlink.c:634 [inline] nfnetlink_rcv+0x1fa/0x220 net/netfilter/nfnetlink.c:652 netlink_unicast_kernel net/netlink/af_netlink.c:1314 [inline] netlink_unicast+0x2c7/0x3e0 net/netlink/af_netlink.c:1340 netlink_sendmsg+0x36b/0x6b0 net/netlink/af_netlink.c:1929 sock_sendmsg_nosec net/socket.c:702 [inline] sock_sendmsg+0x56/0x80 net/socket.c:722 Reported-by: syzbot Reported-by: kernel test robot Fixes: c520292f29b8 ("audit: log nftables configuration change events once per table") Signed-off-by: Dongliang Mu Signed-off-by: Pablo Neira Ayuso --- net/netfilter/nf_tables_api.c | 12 ++++++++++++ 1 file changed, 12 insertions(+) (limited to 'net') diff --git a/net/netfilter/nf_tables_api.c b/net/netfilter/nf_tables_api.c index de182d1f7c4e..081437dd75b7 100644 --- a/net/netfilter/nf_tables_api.c +++ b/net/netfilter/nf_tables_api.c @@ -8445,6 +8445,16 @@ static int nf_tables_commit_audit_alloc(struct list_head *adl, return 0; } +static void nf_tables_commit_audit_free(struct list_head *adl) +{ + struct nft_audit_data *adp, *adn; + + list_for_each_entry_safe(adp, adn, adl, list) { + list_del(&adp->list); + kfree(adp); + } +} + static void nf_tables_commit_audit_collect(struct list_head *adl, struct nft_table *table, u32 op) { @@ -8509,6 +8519,7 @@ static int nf_tables_commit(struct net *net, struct sk_buff *skb) ret = nf_tables_commit_audit_alloc(&adl, trans->ctx.table); if (ret) { nf_tables_commit_chain_prepare_cancel(net); + nf_tables_commit_audit_free(&adl); return ret; } if (trans->msg_type == NFT_MSG_NEWRULE || @@ -8518,6 +8529,7 @@ static int nf_tables_commit(struct net *net, struct sk_buff *skb) ret = nf_tables_commit_chain_prepare(net, chain); if (ret < 0) { nf_tables_commit_chain_prepare_cancel(net); + nf_tables_commit_audit_free(&adl); return ret; } } -- cgit v1.2.3 From ec61cd49bf566401306cfc4855bda8c08bbaa46c Mon Sep 17 00:00:00 2001 From: Johan Almbladh Date: Mon, 28 Jun 2021 14:37:13 +0200 Subject: mac80211: Do not strip skb headroom on monitor frames When a monitor interface is present together with other interfaces, a received skb is copied and received on the monitor netdev. Before, the copied skb was allocated with exactly the amount of space needed for the radiotap header, resulting in an skb without any headroom at all being received on the monitor netdev. With the introduction of eBPF and XDP in the kernel, skbs may be processed by custom eBPF programs. However, since the skb cannot be reallocated in the eBPF program, no more data or headers can be pushed. The old code made sure the final headroom was zero regardless of the value of NET_SKB_PAD, so increasing that constant would have no effect. Now we allocate monitor skb copies with a headroom of NET_SKB_PAD bytes before the radiotap header. Monitor interfaces now behave in the same way as other netdev interfaces that honor the NET_SKB_PAD constant. Signed-off-by: Johan Almbladh Link: https://lore.kernel.org/r/20210628123713.2070753-1-johan.almbladh@anyfinetworks.com Signed-off-by: Johannes Berg --- net/mac80211/rx.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) (limited to 'net') diff --git a/net/mac80211/rx.c b/net/mac80211/rx.c index 771921c057e8..2563473b5cf1 100644 --- a/net/mac80211/rx.c +++ b/net/mac80211/rx.c @@ -730,7 +730,8 @@ ieee80211_make_monitor_skb(struct ieee80211_local *local, * Need to make a copy and possibly remove radiotap header * and FCS from the original. */ - skb = skb_copy_expand(*origskb, needed_headroom, 0, GFP_ATOMIC); + skb = skb_copy_expand(*origskb, needed_headroom + NET_SKB_PAD, + 0, GFP_ATOMIC); if (!skb) return NULL; -- cgit v1.2.3 From 1a7915501ca94a1f10288defe333cd5ade210b63 Mon Sep 17 00:00:00 2001 From: Felix Fietkau Date: Tue, 29 Jun 2021 13:28:53 +0200 Subject: mac80211: fix starting aggregation sessions on mesh interfaces The logic for starting aggregation sessions was recently moved from minstrel_ht to mac80211, into the subif tx handler just after the sta lookup. Unfortunately this didn't work for mesh interfaces, since the sta lookup is deferred until a much later point in time on those. Fix this by also calling the aggregation check right after the deferred sta lookup. Fixes: 08a46c642001 ("mac80211: move A-MPDU session check from minstrel_ht to mac80211") Signed-off-by: Felix Fietkau Link: https://lore.kernel.org/r/20210629112853.29785-1-nbd@nbd.name Signed-off-by: Johannes Berg --- net/mac80211/tx.c | 57 +++++++++++++++++++++++++++++++------------------------ 1 file changed, 32 insertions(+), 25 deletions(-) (limited to 'net') diff --git a/net/mac80211/tx.c b/net/mac80211/tx.c index e96981144358..8509778ff31f 100644 --- a/net/mac80211/tx.c +++ b/net/mac80211/tx.c @@ -1147,6 +1147,29 @@ static bool ieee80211_tx_prep_agg(struct ieee80211_tx_data *tx, return queued; } +static void +ieee80211_aggr_check(struct ieee80211_sub_if_data *sdata, + struct sta_info *sta, + struct sk_buff *skb) +{ + struct rate_control_ref *ref = sdata->local->rate_ctrl; + u16 tid; + + if (!ref || !(ref->ops->capa & RATE_CTRL_CAPA_AMPDU_TRIGGER)) + return; + + if (!sta || !sta->sta.ht_cap.ht_supported || + !sta->sta.wme || skb_get_queue_mapping(skb) == IEEE80211_AC_VO || + skb->protocol == sdata->control_port_protocol) + return; + + tid = skb->priority & IEEE80211_QOS_CTL_TID_MASK; + if (likely(sta->ampdu_mlme.tid_tx[tid])) + return; + + ieee80211_start_tx_ba_session(&sta->sta, tid, 0); +} + /* * initialises @tx * pass %NULL for the station if unknown, a valid pointer if known @@ -1160,6 +1183,7 @@ ieee80211_tx_prepare(struct ieee80211_sub_if_data *sdata, struct ieee80211_local *local = sdata->local; struct ieee80211_hdr *hdr; struct ieee80211_tx_info *info = IEEE80211_SKB_CB(skb); + bool aggr_check = false; int tid; memset(tx, 0, sizeof(*tx)); @@ -1188,8 +1212,10 @@ ieee80211_tx_prepare(struct ieee80211_sub_if_data *sdata, } else if (tx->sdata->control_port_protocol == tx->skb->protocol) { tx->sta = sta_info_get_bss(sdata, hdr->addr1); } - if (!tx->sta && !is_multicast_ether_addr(hdr->addr1)) + if (!tx->sta && !is_multicast_ether_addr(hdr->addr1)) { tx->sta = sta_info_get(sdata, hdr->addr1); + aggr_check = true; + } } if (tx->sta && ieee80211_is_data_qos(hdr->frame_control) && @@ -1199,8 +1225,12 @@ ieee80211_tx_prepare(struct ieee80211_sub_if_data *sdata, struct tid_ampdu_tx *tid_tx; tid = ieee80211_get_tid(hdr); - tid_tx = rcu_dereference(tx->sta->ampdu_mlme.tid_tx[tid]); + if (!tid_tx && aggr_check) { + ieee80211_aggr_check(sdata, tx->sta, skb); + tid_tx = rcu_dereference(tx->sta->ampdu_mlme.tid_tx[tid]); + } + if (tid_tx) { bool queued; @@ -4120,29 +4150,6 @@ void ieee80211_txq_schedule_start(struct ieee80211_hw *hw, u8 ac) } EXPORT_SYMBOL(ieee80211_txq_schedule_start); -static void -ieee80211_aggr_check(struct ieee80211_sub_if_data *sdata, - struct sta_info *sta, - struct sk_buff *skb) -{ - struct rate_control_ref *ref = sdata->local->rate_ctrl; - u16 tid; - - if (!ref || !(ref->ops->capa & RATE_CTRL_CAPA_AMPDU_TRIGGER)) - return; - - if (!sta || !sta->sta.ht_cap.ht_supported || - !sta->sta.wme || skb_get_queue_mapping(skb) == IEEE80211_AC_VO || - skb->protocol == sdata->control_port_protocol) - return; - - tid = skb->priority & IEEE80211_QOS_CTL_TID_MASK; - if (likely(sta->ampdu_mlme.tid_tx[tid])) - return; - - ieee80211_start_tx_ba_session(&sta->sta, tid, 0); -} - void __ieee80211_subif_start_xmit(struct sk_buff *skb, struct net_device *dev, u32 info_flags, -- cgit v1.2.3 From a5d3cbdb09ff1f52cbe040932e06c8b9915c6dad Mon Sep 17 00:00:00 2001 From: Felix Fietkau Date: Fri, 2 Jul 2021 07:01:11 +0200 Subject: mac80211: fix enabling 4-address mode on a sta vif after assoc Notify the driver about the 4-address mode change and also send a nulldata packet to the AP to notify it about the change Fixes: 1ff4e8f2dec8 ("mac80211: notify the driver when a sta uses 4-address mode") Signed-off-by: Felix Fietkau Link: https://lore.kernel.org/r/20210702050111.47546-1-nbd@nbd.name Signed-off-by: Johannes Berg --- net/mac80211/cfg.c | 19 +++++++++++++++++++ net/mac80211/ieee80211_i.h | 2 ++ net/mac80211/mlme.c | 4 ++-- 3 files changed, 23 insertions(+), 2 deletions(-) (limited to 'net') diff --git a/net/mac80211/cfg.c b/net/mac80211/cfg.c index 84cc7733ea66..4e6f11e63df3 100644 --- a/net/mac80211/cfg.c +++ b/net/mac80211/cfg.c @@ -152,6 +152,8 @@ static int ieee80211_change_iface(struct wiphy *wiphy, struct vif_params *params) { struct ieee80211_sub_if_data *sdata = IEEE80211_DEV_TO_SUB_IF(dev); + struct ieee80211_local *local = sdata->local; + struct sta_info *sta; int ret; ret = ieee80211_if_change_type(sdata, type); @@ -162,7 +164,24 @@ static int ieee80211_change_iface(struct wiphy *wiphy, RCU_INIT_POINTER(sdata->u.vlan.sta, NULL); ieee80211_check_fast_rx_iface(sdata); } else if (type == NL80211_IFTYPE_STATION && params->use_4addr >= 0) { + struct ieee80211_if_managed *ifmgd = &sdata->u.mgd; + + if (params->use_4addr == ifmgd->use_4addr) + return 0; + sdata->u.mgd.use_4addr = params->use_4addr; + if (!ifmgd->associated) + return 0; + + mutex_lock(&local->sta_mtx); + sta = sta_info_get(sdata, ifmgd->bssid); + if (sta) + drv_sta_set_4addr(local, sdata, &sta->sta, + params->use_4addr); + mutex_unlock(&local->sta_mtx); + + if (params->use_4addr) + ieee80211_send_4addr_nullfunc(local, sdata); } if (sdata->vif.type == NL80211_IFTYPE_MONITOR) { diff --git a/net/mac80211/ieee80211_i.h b/net/mac80211/ieee80211_i.h index 22549b95d1aa..30ce6d2ec7ce 100644 --- a/net/mac80211/ieee80211_i.h +++ b/net/mac80211/ieee80211_i.h @@ -2201,6 +2201,8 @@ void ieee80211_dynamic_ps_timer(struct timer_list *t); void ieee80211_send_nullfunc(struct ieee80211_local *local, struct ieee80211_sub_if_data *sdata, bool powersave); +void ieee80211_send_4addr_nullfunc(struct ieee80211_local *local, + struct ieee80211_sub_if_data *sdata); void ieee80211_sta_tx_notify(struct ieee80211_sub_if_data *sdata, struct ieee80211_hdr *hdr, bool ack, u16 tx_time); diff --git a/net/mac80211/mlme.c b/net/mac80211/mlme.c index a00f11a33699..c0ea3b1aa9e1 100644 --- a/net/mac80211/mlme.c +++ b/net/mac80211/mlme.c @@ -1095,8 +1095,8 @@ void ieee80211_send_nullfunc(struct ieee80211_local *local, ieee80211_tx_skb(sdata, skb); } -static void ieee80211_send_4addr_nullfunc(struct ieee80211_local *local, - struct ieee80211_sub_if_data *sdata) +void ieee80211_send_4addr_nullfunc(struct ieee80211_local *local, + struct ieee80211_sub_if_data *sdata) { struct sk_buff *skb; struct ieee80211_hdr *nullfunc; -- cgit v1.2.3 From 0d059964504a1605d84938c0b5b38f6573121c4a Mon Sep 17 00:00:00 2001 From: Johannes Berg Date: Mon, 12 Jul 2021 21:53:30 +0200 Subject: nl80211: limit band information in non-split data In non-split data, we shouldn't be adding S1G and 6 GHz data (or future bands) since we're really close to the 4k message size limit. Remove those bands, any modern userspace that can use S1G or 6 GHz should already be using split dumps, and if not then it needs to update. Link: https://lore.kernel.org/r/20210712215329.31444162a2c2.I5555312e4a074c84f8b4e7ad79dc4d1fbfc5126c@changeid Signed-off-by: Johannes Berg --- net/wireless/nl80211.c | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) (limited to 'net') diff --git a/net/wireless/nl80211.c b/net/wireless/nl80211.c index 50eb405b0690..16c88beea48b 100644 --- a/net/wireless/nl80211.c +++ b/net/wireless/nl80211.c @@ -2351,7 +2351,10 @@ static int nl80211_send_wiphy(struct cfg80211_registered_device *rdev, goto nla_put_failure; for (band = state->band_start; - band < NUM_NL80211_BANDS; band++) { + band < (state->split ? + NUM_NL80211_BANDS : + NL80211_BAND_60GHZ + 1); + band++) { struct ieee80211_supported_band *sband; /* omit higher bands for ancient software */ -- cgit v1.2.3 From f9a5c358c8d26fed0cc45f2afc64633d4ba21dff Mon Sep 17 00:00:00 2001 From: Nguyen Dinh Phi Date: Mon, 28 Jun 2021 21:23:34 +0800 Subject: cfg80211: Fix possible memory leak in function cfg80211_bss_update When we exceed the limit of BSS entries, this function will free the new entry, however, at this time, it is the last door to access the inputed ies, so these ies will be unreferenced objects and cause memory leak. Therefore we should free its ies before deallocating the new entry, beside of dropping it from hidden_list. Signed-off-by: Nguyen Dinh Phi Link: https://lore.kernel.org/r/20210628132334.851095-1-phind.uet@gmail.com Signed-off-by: Johannes Berg --- net/wireless/scan.c | 6 ++---- 1 file changed, 2 insertions(+), 4 deletions(-) (limited to 'net') diff --git a/net/wireless/scan.c b/net/wireless/scan.c index f03c7ac8e184..7897b1478c3c 100644 --- a/net/wireless/scan.c +++ b/net/wireless/scan.c @@ -1754,16 +1754,14 @@ cfg80211_bss_update(struct cfg80211_registered_device *rdev, * be grouped with this beacon for updates ... */ if (!cfg80211_combine_bsses(rdev, new)) { - kfree(new); + bss_ref_put(rdev, new); goto drop; } } if (rdev->bss_entries >= bss_entries_limit && !cfg80211_bss_expire_oldest(rdev)) { - if (!list_empty(&new->hidden_list)) - list_del(&new->hidden_list); - kfree(new); + bss_ref_put(rdev, new); goto drop; } -- cgit v1.2.3 From 32c3973d808301e7a980f80fee8818fdf7c82b09 Mon Sep 17 00:00:00 2001 From: Pablo Neira Ayuso Date: Sat, 17 Jul 2021 10:10:29 +0200 Subject: netfilter: flowtable: avoid possible false sharing The flowtable follows the same timeout approach as conntrack, use the same idiom as in cc16921351d8 ("netfilter: conntrack: avoid same-timeout update") but also include the fix provided by e37542ba111f ("netfilter: conntrack: avoid possible false sharing"). Signed-off-by: Pablo Neira Ayuso --- net/netfilter/nf_flow_table_core.c | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) (limited to 'net') diff --git a/net/netfilter/nf_flow_table_core.c b/net/netfilter/nf_flow_table_core.c index 1e50908b1b7e..551976e4284c 100644 --- a/net/netfilter/nf_flow_table_core.c +++ b/net/netfilter/nf_flow_table_core.c @@ -331,7 +331,11 @@ EXPORT_SYMBOL_GPL(flow_offload_add); void flow_offload_refresh(struct nf_flowtable *flow_table, struct flow_offload *flow) { - flow->timeout = nf_flowtable_time_stamp + flow_offload_get_timeout(flow); + u32 timeout; + + timeout = nf_flowtable_time_stamp + flow_offload_get_timeout(flow); + if (READ_ONCE(flow->timeout) != timeout) + WRITE_ONCE(flow->timeout, timeout); if (likely(!nf_flowtable_hw_offload(flow_table))) return; -- cgit v1.2.3 From 32953df7a6eb56bd9b8f18a13034d55f9fc96cfa Mon Sep 17 00:00:00 2001 From: Pablo Neira Ayuso Date: Sat, 17 Jul 2021 10:20:08 +0200 Subject: netfilter: nft_last: avoid possible false sharing Use the idiom described in: https://github.com/google/ktsan/wiki/READ_ONCE-and-WRITE_ONCE#it-may-improve-performance Moreover, prevent a compiler optimization. Fixes: 836382dc2471 ("netfilter: nf_tables: add last expression") Signed-off-by: Pablo Neira Ayuso --- net/netfilter/nft_last.c | 20 +++++++++++++------- 1 file changed, 13 insertions(+), 7 deletions(-) (limited to 'net') diff --git a/net/netfilter/nft_last.c b/net/netfilter/nft_last.c index 8088b99f2ee3..304e33cbed9b 100644 --- a/net/netfilter/nft_last.c +++ b/net/netfilter/nft_last.c @@ -48,24 +48,30 @@ static void nft_last_eval(const struct nft_expr *expr, { struct nft_last_priv *priv = nft_expr_priv(expr); - priv->last_jiffies = jiffies; - priv->last_set = 1; + if (READ_ONCE(priv->last_jiffies) != jiffies) + WRITE_ONCE(priv->last_jiffies, jiffies); + if (READ_ONCE(priv->last_set) == 0) + WRITE_ONCE(priv->last_set, 1); } static int nft_last_dump(struct sk_buff *skb, const struct nft_expr *expr) { struct nft_last_priv *priv = nft_expr_priv(expr); + unsigned long last_jiffies = READ_ONCE(priv->last_jiffies); + u32 last_set = READ_ONCE(priv->last_set); __be64 msecs; - if (time_before(jiffies, priv->last_jiffies)) - priv->last_set = 0; + if (time_before(jiffies, last_jiffies)) { + WRITE_ONCE(priv->last_set, 0); + last_set = 0; + } - if (priv->last_set) - msecs = nf_jiffies64_to_msecs(jiffies - priv->last_jiffies); + if (last_set) + msecs = nf_jiffies64_to_msecs(jiffies - last_jiffies); else msecs = 0; - if (nla_put_be32(skb, NFTA_LAST_SET, htonl(priv->last_set)) || + if (nla_put_be32(skb, NFTA_LAST_SET, htonl(last_set)) || nla_put_be64(skb, NFTA_LAST_MSECS, msecs, NFTA_LAST_PAD)) goto nla_put_failure; -- cgit v1.2.3 From 30a56a2b881821625f79837d4d968c679852444e Mon Sep 17 00:00:00 2001 From: Florian Westphal Date: Sun, 18 Jul 2021 18:36:00 +0200 Subject: netfilter: conntrack: adjust stop timestamp to real expiry value In case the entry is evicted via garbage collection there is delay between the timeout value and the eviction event. This adjusts the stop value based on how much time has passed. Fixes: b87a2f9199ea82 ("netfilter: conntrack: add gc worker to remove timed-out entries") Signed-off-by: Florian Westphal Signed-off-by: Pablo Neira Ayuso --- net/netfilter/nf_conntrack_core.c | 7 ++++++- 1 file changed, 6 insertions(+), 1 deletion(-) (limited to 'net') diff --git a/net/netfilter/nf_conntrack_core.c b/net/netfilter/nf_conntrack_core.c index 83c52df85870..5c03e5106751 100644 --- a/net/netfilter/nf_conntrack_core.c +++ b/net/netfilter/nf_conntrack_core.c @@ -670,8 +670,13 @@ bool nf_ct_delete(struct nf_conn *ct, u32 portid, int report) return false; tstamp = nf_conn_tstamp_find(ct); - if (tstamp && tstamp->stop == 0) + if (tstamp) { + s32 timeout = ct->timeout - nfct_time_stamp; + tstamp->stop = ktime_get_real_ns(); + if (timeout < 0) + tstamp->stop -= jiffies_to_nsecs(-timeout); + } if (nf_conntrack_event_report(IPCT_DESTROY, ct, portid, report) < 0) { -- cgit v1.2.3 From a33f387ecd5aafae514095c2c4a8c24f7aea7e8b Mon Sep 17 00:00:00 2001 From: Pablo Neira Ayuso Date: Tue, 20 Jul 2021 18:22:50 +0200 Subject: netfilter: nft_nat: allow to specify layer 4 protocol NAT only nft_nat reports a bogus EAFNOSUPPORT if no layer 3 information is specified. Fixes: d07db9884a5f ("netfilter: nf_tables: introduce nft_validate_register_load()") Signed-off-by: Pablo Neira Ayuso --- net/netfilter/nft_nat.c | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) (limited to 'net') diff --git a/net/netfilter/nft_nat.c b/net/netfilter/nft_nat.c index 0840c635b752..be1595d6979d 100644 --- a/net/netfilter/nft_nat.c +++ b/net/netfilter/nft_nat.c @@ -201,7 +201,9 @@ static int nft_nat_init(const struct nft_ctx *ctx, const struct nft_expr *expr, alen = sizeof_field(struct nf_nat_range, min_addr.ip6); break; default: - return -EAFNOSUPPORT; + if (tb[NFTA_NAT_REG_ADDR_MIN]) + return -EAFNOSUPPORT; + break; } priv->family = family; -- cgit v1.2.3 From 217e26bd87b2930856726b48a4e71c768b8c9bf5 Mon Sep 17 00:00:00 2001 From: Arnd Bergmann Date: Wed, 21 Jul 2021 17:22:32 +0200 Subject: netfilter: nfnl_hook: fix unused variable warning The only user of this variable is in an #ifdef: net/netfilter/nfnetlink_hook.c: In function 'nfnl_hook_entries_head': net/netfilter/nfnetlink_hook.c:177:28: error: unused variable 'netdev' [-Werror=unused-variable] Fixes: e2cf17d3774c ("netfilter: add new hook nfnl subsystem") Signed-off-by: Arnd Bergmann Signed-off-by: Pablo Neira Ayuso --- net/netfilter/nfnetlink_hook.c | 2 ++ 1 file changed, 2 insertions(+) (limited to 'net') diff --git a/net/netfilter/nfnetlink_hook.c b/net/netfilter/nfnetlink_hook.c index 50b4e3c9347a..202f57d17bab 100644 --- a/net/netfilter/nfnetlink_hook.c +++ b/net/netfilter/nfnetlink_hook.c @@ -174,7 +174,9 @@ static const struct nf_hook_entries * nfnl_hook_entries_head(u8 pf, unsigned int hook, struct net *net, const char *dev) { const struct nf_hook_entries *hook_head = NULL; +#ifdef CONFIG_NETFILTER_INGRESS struct net_device *netdev; +#endif switch (pf) { case NFPROTO_IPV4: -- cgit v1.2.3 From f8dd60de194817c86bf812700980762bb5a8d9a4 Mon Sep 17 00:00:00 2001 From: Xin Long Date: Thu, 22 Jul 2021 12:05:41 -0400 Subject: tipc: fix implicit-connect for SYN+ For implicit-connect, when it's either SYN- or SYN+, an ACK should be sent back to the client immediately. It's not appropriate for the client to enter established state only after receiving data from the server. On client side, after the SYN is sent out, tipc_wait_for_connect() should be called to wait for the ACK if timeout is set. This patch also restricts __tipc_sendstream() to call __sendmsg() only when it's in TIPC_OPEN state, so that the client can program in a single loop doing both connecting and data sending like: for (...) sendmsg(dest, buf); This makes the implicit-connect more implicit. Fixes: b97bf3fd8f6a ("[TIPC] Initial merge") Signed-off-by: Xin Long Acked-by: Jon Maloy Signed-off-by: David S. Miller --- net/tipc/socket.c | 21 +++++++++++++-------- 1 file changed, 13 insertions(+), 8 deletions(-) (limited to 'net') diff --git a/net/tipc/socket.c b/net/tipc/socket.c index 34a97ea36cc8..ebd300c26a44 100644 --- a/net/tipc/socket.c +++ b/net/tipc/socket.c @@ -158,6 +158,7 @@ static void tipc_sk_remove(struct tipc_sock *tsk); static int __tipc_sendstream(struct socket *sock, struct msghdr *m, size_t dsz); static int __tipc_sendmsg(struct socket *sock, struct msghdr *m, size_t dsz); static void tipc_sk_push_backlog(struct tipc_sock *tsk, bool nagle_ack); +static int tipc_wait_for_connect(struct socket *sock, long *timeo_p); static const struct proto_ops packet_ops; static const struct proto_ops stream_ops; @@ -1515,8 +1516,13 @@ static int __tipc_sendmsg(struct socket *sock, struct msghdr *m, size_t dlen) rc = 0; } - if (unlikely(syn && !rc)) + if (unlikely(syn && !rc)) { tipc_set_sk_state(sk, TIPC_CONNECTING); + if (timeout) { + timeout = msecs_to_jiffies(timeout); + tipc_wait_for_connect(sock, &timeout); + } + } return rc ? rc : dlen; } @@ -1564,7 +1570,7 @@ static int __tipc_sendstream(struct socket *sock, struct msghdr *m, size_t dlen) return -EMSGSIZE; /* Handle implicit connection setup */ - if (unlikely(dest)) { + if (unlikely(dest && sk->sk_state == TIPC_OPEN)) { rc = __tipc_sendmsg(sock, m, dlen); if (dlen && dlen == rc) { tsk->peer_caps = tipc_node_get_capabilities(net, dnode); @@ -2689,9 +2695,10 @@ static int tipc_accept(struct socket *sock, struct socket *new_sock, int flags, bool kern) { struct sock *new_sk, *sk = sock->sk; - struct sk_buff *buf; struct tipc_sock *new_tsock; + struct msghdr m = {NULL,}; struct tipc_msg *msg; + struct sk_buff *buf; long timeo; int res; @@ -2737,19 +2744,17 @@ static int tipc_accept(struct socket *sock, struct socket *new_sock, int flags, } /* - * Respond to 'SYN-' by discarding it & returning 'ACK'-. - * Respond to 'SYN+' by queuing it on new socket. + * Respond to 'SYN-' by discarding it & returning 'ACK'. + * Respond to 'SYN+' by queuing it on new socket & returning 'ACK'. */ if (!msg_data_sz(msg)) { - struct msghdr m = {NULL,}; - tsk_advance_rx_queue(sk); - __tipc_sendstream(new_sock, &m, 0); } else { __skb_dequeue(&sk->sk_receive_queue); __skb_queue_head(&new_sk->sk_receive_queue, buf); skb_set_owner_r(buf, new_sk); } + __tipc_sendstream(new_sock, &m, 0); release_sock(new_sk); exit: release_sock(sk); -- cgit v1.2.3 From d237a7f11719ff9320721be5818352e48071aab6 Mon Sep 17 00:00:00 2001 From: Hoang Le Date: Fri, 23 Jul 2021 09:25:34 +0700 Subject: tipc: fix sleeping in tipc accept routine The release_sock() is blocking function, it would change the state after sleeping. In order to evaluate the stated condition outside the socket lock context, switch to use wait_woken() instead. Fixes: 6398e23cdb1d8 ("tipc: standardize accept routine") Acked-by: Jon Maloy Signed-off-by: Hoang Le Signed-off-by: David S. Miller --- net/tipc/socket.c | 9 ++++----- 1 file changed, 4 insertions(+), 5 deletions(-) (limited to 'net') diff --git a/net/tipc/socket.c b/net/tipc/socket.c index ebd300c26a44..75b99b7eda22 100644 --- a/net/tipc/socket.c +++ b/net/tipc/socket.c @@ -2652,7 +2652,7 @@ static int tipc_listen(struct socket *sock, int len) static int tipc_wait_for_accept(struct socket *sock, long timeo) { struct sock *sk = sock->sk; - DEFINE_WAIT(wait); + DEFINE_WAIT_FUNC(wait, woken_wake_function); int err; /* True wake-one mechanism for incoming connections: only @@ -2661,12 +2661,12 @@ static int tipc_wait_for_accept(struct socket *sock, long timeo) * anymore, the common case will execute the loop only once. */ for (;;) { - prepare_to_wait_exclusive(sk_sleep(sk), &wait, - TASK_INTERRUPTIBLE); if (timeo && skb_queue_empty(&sk->sk_receive_queue)) { + add_wait_queue(sk_sleep(sk), &wait); release_sock(sk); - timeo = schedule_timeout(timeo); + timeo = wait_woken(&wait, TASK_INTERRUPTIBLE, timeo); lock_sock(sk); + remove_wait_queue(sk_sleep(sk), &wait); } err = 0; if (!skb_queue_empty(&sk->sk_receive_queue)) @@ -2678,7 +2678,6 @@ static int tipc_wait_for_accept(struct socket *sock, long timeo) if (signal_pending(current)) break; } - finish_wait(sk_sleep(sk), &wait); return err; } -- cgit v1.2.3 From 227adfb2b1dfbc53dfc53b9dd7a93a6298ff7c56 Mon Sep 17 00:00:00 2001 From: Gilad Naaman Date: Thu, 22 Jul 2021 20:01:28 +0300 Subject: net: Set true network header for ECN decapsulation MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit In cases where the header straight after the tunnel header was another ethernet header (TEB), instead of the network header, the ECN decapsulation code would treat the ethernet header as if it was an IP header, resulting in mishandling and possible wrong drops or corruption of the IP header. In this case, ECT(1) is sent, so IP_ECN_decapsulate tries to copy it to the inner IPv4 header, and correct its checksum. The offset of the ECT bits in an IPv4 header corresponds to the lower 2 bits of the second octet of the destination MAC address in the ethernet header. The IPv4 checksum corresponds to end of the source address. In order to reproduce: $ ip netns add A $ ip netns add B $ ip -n A link add _v0 type veth peer name _v1 netns B $ ip -n A link set _v0 up $ ip -n A addr add dev _v0 10.254.3.1/24 $ ip -n A route add default dev _v0 scope global $ ip -n B link set _v1 up $ ip -n B addr add dev _v1 10.254.1.6/24 $ ip -n B route add default dev _v1 scope global $ ip -n B link add gre1 type gretap local 10.254.1.6 remote 10.254.3.1 key 0x49000000 $ ip -n B link set gre1 up # Now send an IPv4/GRE/Eth/IPv4 frame where the outer header has ECT(1), # and the inner header has no ECT bits set: $ cat send_pkt.py #!/usr/bin/env python3 from scapy.all import * pkt = IP(b'E\x01\x00\xa7\x00\x00\x00\x00@/`%\n\xfe\x03\x01\n\xfe\x01\x06 \x00eXI\x00' b'\x00\x00\x18\xbe\x92\xa0\xee&\x18\xb0\x92\xa0l&\x08\x00E\x00\x00}\x8b\x85' b'@\x00\x01\x01\xe4\xf2\x82\x82\x82\x01\x82\x82\x82\x02\x08\x00d\x11\xa6\xeb' b'3\x1e\x1e\\xf3\\xf7`\x00\x00\x00\x00ZN\x00\x00\x00\x00\x00\x00\x10\x11\x12' b'\x13\x14\x15\x16\x17\x18\x19\x1a\x1b\x1c\x1d\x1e\x1f !"#$%&\'()*+,-./01234' b'56789:;<=>?@ABCDEFGHIJKLMNOPQRSTUVWXYZ') send(pkt) $ sudo ip netns exec B tcpdump -neqlllvi gre1 icmp & ; sleep 1 $ sudo ip netns exec A python3 send_pkt.py In the original packet, the source/destinatio MAC addresses are dst=18:be:92:a0:ee:26 src=18:b0:92:a0:6c:26 In the received packet, they are dst=18:bd:92:a0:ee:26 src=18:b0:92:a0:6c:27 Thanks to Lahav Schlesinger and Isaac Garzon for helping me pinpoint the origin. Fixes: b723748750ec ("tunnel: Propagate ECT(1) when decapsulating as recommended by RFC6040") Cc: David S. Miller Cc: Hideaki YOSHIFUJI Cc: David Ahern Cc: Jakub Kicinski Cc: Toke Høiland-Jørgensen Signed-off-by: Gilad Naaman Acked-by: Toke Høiland-Jørgensen Signed-off-by: David S. Miller --- net/ipv4/ip_tunnel.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'net') diff --git a/net/ipv4/ip_tunnel.c b/net/ipv4/ip_tunnel.c index 0dca00745ac3..be75b409445c 100644 --- a/net/ipv4/ip_tunnel.c +++ b/net/ipv4/ip_tunnel.c @@ -390,7 +390,7 @@ int ip_tunnel_rcv(struct ip_tunnel *tunnel, struct sk_buff *skb, tunnel->i_seqno = ntohl(tpi->seq) + 1; } - skb_reset_network_header(skb); + skb_set_network_header(skb, (tunnel->dev->type == ARPHRD_ETHER) ? ETH_HLEN : 0); err = IP_ECN_decapsulate(iph, skb); if (unlikely(err)) { -- cgit v1.2.3 From 46c7655f0b56b1ac864115441064cde9ed124f4a Mon Sep 17 00:00:00 2001 From: Kangmin Park Date: Fri, 23 Jul 2021 02:44:43 +0900 Subject: ipv6: decrease hop limit counter in ip6_forward() Decrease hop limit counter when deliver skb to ndp proxy. Signed-off-by: Kangmin Park Signed-off-by: David S. Miller --- net/ipv6/ip6_output.c | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) (limited to 'net') diff --git a/net/ipv6/ip6_output.c b/net/ipv6/ip6_output.c index e1b9f7ac8bad..8e6ca9ad6812 100644 --- a/net/ipv6/ip6_output.c +++ b/net/ipv6/ip6_output.c @@ -549,9 +549,10 @@ int ip6_forward(struct sk_buff *skb) if (net->ipv6.devconf_all->proxy_ndp && pneigh_lookup(&nd_tbl, net, &hdr->daddr, skb->dev, 0)) { int proxied = ip6_forward_proxy_check(skb); - if (proxied > 0) + if (proxied > 0) { + hdr->hop_limit--; return ip6_input(skb); - else if (proxied < 0) { + } else if (proxied < 0) { __IP6_INC_STATS(net, idev, IPSTATS_MIB_INDISCARDS); goto drop; } -- cgit v1.2.3 From 52f3456a96c06760b9bfae460e39596fec7af22e Mon Sep 17 00:00:00 2001 From: Pavel Skripkin Date: Fri, 23 Jul 2021 18:31:32 +0300 Subject: net: qrtr: fix memory leaks Syzbot reported memory leak in qrtr. The problem was in unputted struct sock. qrtr_local_enqueue() function calls qrtr_port_lookup() which takes sock reference if port was found. Then there is the following check: if (!ipc || &ipc->sk == skb->sk) { ... return -ENODEV; } Since we should drop the reference before returning from this function and ipc can be non-NULL inside this if, we should add qrtr_port_put() inside this if. The similar corner case is in qrtr_endpoint_post() as Manivannan reported. In case of sock_queue_rcv_skb() failure we need to put port reference to avoid leaking struct sock pointer. Fixes: e04df98adf7d ("net: qrtr: Remove receive worker") Fixes: bdabad3e363d ("net: Add Qualcomm IPC router") Reported-and-tested-by: syzbot+35a511c72ea7356cdcf3@syzkaller.appspotmail.com Signed-off-by: Pavel Skripkin Reviewed-by: Manivannan Sadhasivam Signed-off-by: David S. Miller --- net/qrtr/qrtr.c | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) (limited to 'net') diff --git a/net/qrtr/qrtr.c b/net/qrtr/qrtr.c index e6f4a6202f82..171b7f3be6ef 100644 --- a/net/qrtr/qrtr.c +++ b/net/qrtr/qrtr.c @@ -518,8 +518,10 @@ int qrtr_endpoint_post(struct qrtr_endpoint *ep, const void *data, size_t len) if (!ipc) goto err; - if (sock_queue_rcv_skb(&ipc->sk, skb)) + if (sock_queue_rcv_skb(&ipc->sk, skb)) { + qrtr_port_put(ipc); goto err; + } qrtr_port_put(ipc); } @@ -839,6 +841,8 @@ static int qrtr_local_enqueue(struct qrtr_node *node, struct sk_buff *skb, ipc = qrtr_port_lookup(to->sq_port); if (!ipc || &ipc->sk == skb->sk) { /* do not send to self */ + if (ipc) + qrtr_port_put(ipc); kfree_skb(skb); return -ENODEV; } -- cgit v1.2.3 From 54f93336d000229f72c26d8a3f69dd256b744528 Mon Sep 17 00:00:00 2001 From: Ziyang Xuan Date: Thu, 22 Jul 2021 15:08:19 +0800 Subject: can: raw: raw_setsockopt(): fix raw_rcv panic for sock UAF We get a bug during ltp can_filter test as following. =========================================== [60919.264984] BUG: unable to handle kernel NULL pointer dereference at 0000000000000010 [60919.265223] PGD 8000003dda726067 P4D 8000003dda726067 PUD 3dda727067 PMD 0 [60919.265443] Oops: 0000 [#1] SMP PTI [60919.265550] CPU: 30 PID: 3638365 Comm: can_filter Kdump: loaded Tainted: G W 4.19.90+ #1 [60919.266068] RIP: 0010:selinux_socket_sock_rcv_skb+0x3e/0x200 [60919.293289] RSP: 0018:ffff8d53bfc03cf8 EFLAGS: 00010246 [60919.307140] RAX: 0000000000000000 RBX: 000000000000001d RCX: 0000000000000007 [60919.320756] RDX: 0000000000000001 RSI: ffff8d5104a8ed00 RDI: ffff8d53bfc03d30 [60919.334319] RBP: ffff8d9338056800 R08: ffff8d53bfc29d80 R09: 0000000000000001 [60919.347969] R10: ffff8d53bfc03ec0 R11: ffffb8526ef47c98 R12: ffff8d53bfc03d30 [60919.350320] perf: interrupt took too long (3063 > 2500), lowering kernel.perf_event_max_sample_rate to 65000 [60919.361148] R13: 0000000000000001 R14: ffff8d53bcf90000 R15: 0000000000000000 [60919.361151] FS: 00007fb78b6b3600(0000) GS:ffff8d53bfc00000(0000) knlGS:0000000000000000 [60919.400812] CS: 0010 DS: 0000 ES: 0000 CR0: 0000000080050033 [60919.413730] CR2: 0000000000000010 CR3: 0000003e3f784006 CR4: 00000000007606e0 [60919.426479] DR0: 0000000000000000 DR1: 0000000000000000 DR2: 0000000000000000 [60919.439339] DR3: 0000000000000000 DR6: 00000000fffe0ff0 DR7: 0000000000000400 [60919.451608] PKRU: 55555554 [60919.463622] Call Trace: [60919.475617] [60919.487122] ? update_load_avg+0x89/0x5d0 [60919.498478] ? update_load_avg+0x89/0x5d0 [60919.509822] ? account_entity_enqueue+0xc5/0xf0 [60919.520709] security_sock_rcv_skb+0x2a/0x40 [60919.531413] sk_filter_trim_cap+0x47/0x1b0 [60919.542178] ? kmem_cache_alloc+0x38/0x1b0 [60919.552444] sock_queue_rcv_skb+0x17/0x30 [60919.562477] raw_rcv+0x110/0x190 [can_raw] [60919.572539] can_rcv_filter+0xbc/0x1b0 [can] [60919.582173] can_receive+0x6b/0xb0 [can] [60919.591595] can_rcv+0x31/0x70 [can] [60919.600783] __netif_receive_skb_one_core+0x5a/0x80 [60919.609864] process_backlog+0x9b/0x150 [60919.618691] net_rx_action+0x156/0x400 [60919.627310] ? sched_clock_cpu+0xc/0xa0 [60919.635714] __do_softirq+0xe8/0x2e9 [60919.644161] do_softirq_own_stack+0x2a/0x40 [60919.652154] [60919.659899] do_softirq.part.17+0x4f/0x60 [60919.667475] __local_bh_enable_ip+0x60/0x70 [60919.675089] __dev_queue_xmit+0x539/0x920 [60919.682267] ? finish_wait+0x80/0x80 [60919.689218] ? finish_wait+0x80/0x80 [60919.695886] ? sock_alloc_send_pskb+0x211/0x230 [60919.702395] ? can_send+0xe5/0x1f0 [can] [60919.708882] can_send+0xe5/0x1f0 [can] [60919.715037] raw_sendmsg+0x16d/0x268 [can_raw] It's because raw_setsockopt() concurrently with unregister_netdevice_many(). Concurrent scenario as following. cpu0 cpu1 raw_bind raw_setsockopt unregister_netdevice_many unlist_netdevice dev_get_by_index raw_notifier raw_enable_filters ...... can_rx_register can_rcv_list_find(..., net->can.rx_alldev_list) ...... sock_close raw_release(sock_a) ...... can_receive can_rcv_filter(net->can.rx_alldev_list, ...) raw_rcv(skb, sock_a) BUG After unlist_netdevice(), dev_get_by_index() return NULL in raw_setsockopt(). Function raw_enable_filters() will add sock and can_filter to net->can.rx_alldev_list. Then the sock is closed. Followed by, we sock_sendmsg() to a new vcan device use the same can_filter. Protocol stack match the old receiver whose sock has been released on net->can.rx_alldev_list in can_rcv_filter(). Function raw_rcv() uses the freed sock. UAF BUG is triggered. We can find that the key issue is that net_device has not been protected in raw_setsockopt(). Use rtnl_lock to protect net_device in raw_setsockopt(). Fixes: c18ce101f2e4 ("[CAN]: Add raw protocol") Link: https://lore.kernel.org/r/20210722070819.1048263-1-william.xuanziyang@huawei.com Cc: linux-stable Signed-off-by: Ziyang Xuan Acked-by: Oliver Hartkopp Signed-off-by: Marc Kleine-Budde --- net/can/raw.c | 20 ++++++++++++++++++-- 1 file changed, 18 insertions(+), 2 deletions(-) (limited to 'net') diff --git a/net/can/raw.c b/net/can/raw.c index ed4fcb7ab0c3..cd5a49380116 100644 --- a/net/can/raw.c +++ b/net/can/raw.c @@ -546,10 +546,18 @@ static int raw_setsockopt(struct socket *sock, int level, int optname, return -EFAULT; } + rtnl_lock(); lock_sock(sk); - if (ro->bound && ro->ifindex) + if (ro->bound && ro->ifindex) { dev = dev_get_by_index(sock_net(sk), ro->ifindex); + if (!dev) { + if (count > 1) + kfree(filter); + err = -ENODEV; + goto out_fil; + } + } if (ro->bound) { /* (try to) register the new filters */ @@ -588,6 +596,7 @@ static int raw_setsockopt(struct socket *sock, int level, int optname, dev_put(dev); release_sock(sk); + rtnl_unlock(); break; @@ -600,10 +609,16 @@ static int raw_setsockopt(struct socket *sock, int level, int optname, err_mask &= CAN_ERR_MASK; + rtnl_lock(); lock_sock(sk); - if (ro->bound && ro->ifindex) + if (ro->bound && ro->ifindex) { dev = dev_get_by_index(sock_net(sk), ro->ifindex); + if (!dev) { + err = -ENODEV; + goto out_err; + } + } /* remove current error mask */ if (ro->bound) { @@ -627,6 +642,7 @@ static int raw_setsockopt(struct socket *sock, int level, int optname, dev_put(dev); release_sock(sk); + rtnl_unlock(); break; -- cgit v1.2.3 From 0c71437dd50dd687c15d8ca80b3b68f10bb21d63 Mon Sep 17 00:00:00 2001 From: Oleksij Rempel Date: Wed, 14 Jul 2021 13:16:02 +0200 Subject: can: j1939: j1939_session_deactivate(): clarify lifetime of session object The j1939_session_deactivate() is decrementing the session ref-count and potentially can free() the session. This would cause use-after-free situation. However, the code calling j1939_session_deactivate() does always hold another reference to the session, so that it would not be free()ed in this code path. This patch adds a comment to make this clear and a WARN_ON, to ensure that future changes will not violate this requirement. Further this patch avoids dereferencing the session pointer as a precaution to avoid use-after-free if the session is actually free()ed. Fixes: 9d71dd0c7009 ("can: add support of SAE J1939 protocol") Link: https://lore.kernel.org/r/20210714111602.24021-1-o.rempel@pengutronix.de Reported-by: Xiaochen Zou Signed-off-by: Oleksij Rempel Signed-off-by: Marc Kleine-Budde --- net/can/j1939/transport.c | 9 +++++++-- 1 file changed, 7 insertions(+), 2 deletions(-) (limited to 'net') diff --git a/net/can/j1939/transport.c b/net/can/j1939/transport.c index c3946c355882..bb1092c3e7e3 100644 --- a/net/can/j1939/transport.c +++ b/net/can/j1939/transport.c @@ -1075,11 +1075,16 @@ static bool j1939_session_deactivate_locked(struct j1939_session *session) static bool j1939_session_deactivate(struct j1939_session *session) { + struct j1939_priv *priv = session->priv; bool active; - j1939_session_list_lock(session->priv); + j1939_session_list_lock(priv); + /* This function should be called with a session ref-count of at + * least 2. + */ + WARN_ON_ONCE(kref_read(&session->kref) < 2); active = j1939_session_deactivate_locked(session); - j1939_session_list_unlock(session->priv); + j1939_session_list_unlock(priv); return active; } -- cgit v1.2.3 From c6eea1c8bda56737752465a298dc6ce07d6b8ce3 Mon Sep 17 00:00:00 2001 From: Zhang Changzhong Date: Tue, 6 Jul 2021 19:00:08 +0800 Subject: can: j1939: j1939_xtp_rx_dat_one(): fix rxtimer value between consecutive TP.DT to 750ms For receive side, the max time interval between two consecutive TP.DT should be 750ms. Fixes: 9d71dd0c7009 ("can: add support of SAE J1939 protocol") Link: https://lore.kernel.org/r/1625569210-47506-1-git-send-email-zhangchangzhong@huawei.com Cc: linux-stable Signed-off-by: Zhang Changzhong Acked-by: Oleksij Rempel Signed-off-by: Marc Kleine-Budde --- net/can/j1939/transport.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'net') diff --git a/net/can/j1939/transport.c b/net/can/j1939/transport.c index bb1092c3e7e3..bdc95bd7a851 100644 --- a/net/can/j1939/transport.c +++ b/net/can/j1939/transport.c @@ -1874,7 +1874,7 @@ static void j1939_xtp_rx_dat_one(struct j1939_session *session, if (!session->transmission) j1939_tp_schedule_txtimer(session, 0); } else { - j1939_tp_set_rxtimeout(session, 250); + j1939_tp_set_rxtimeout(session, 750); } session->last_cmd = 0xff; consume_skb(se_skb); -- cgit v1.2.3 From 3cf4375a090473d240281a0d2b04a3a5aaeac34b Mon Sep 17 00:00:00 2001 From: Xin Long Date: Fri, 23 Jul 2021 18:46:01 -0400 Subject: tipc: do not write skb_shinfo frags when doing decrytion One skb's skb_shinfo frags are not writable, and they can be shared with other skbs' like by pskb_copy(). To write the frags may cause other skb's data crash. So before doing en/decryption, skb_cow_data() should always be called for a cloned or nonlinear skb if req dst is using the same sg as req src. While at it, the likely branch can be removed, as it will be covered by skb_cow_data(). Note that esp_input() has the same issue, and I will fix it in another patch. tipc_aead_encrypt() doesn't have this issue, as it only processes linear data in the unlikely branch. Fixes: fc1b6d6de220 ("tipc: introduce TIPC encryption & authentication") Reported-by: Shuang Li Signed-off-by: Xin Long Acked-by: Jon Maloy Signed-off-by: David S. Miller --- net/tipc/crypto.c | 14 ++++---------- 1 file changed, 4 insertions(+), 10 deletions(-) (limited to 'net') diff --git a/net/tipc/crypto.c b/net/tipc/crypto.c index e5c43d4d5a75..c9391d38de85 100644 --- a/net/tipc/crypto.c +++ b/net/tipc/crypto.c @@ -898,16 +898,10 @@ static int tipc_aead_decrypt(struct net *net, struct tipc_aead *aead, if (unlikely(!aead)) return -ENOKEY; - /* Cow skb data if needed */ - if (likely(!skb_cloned(skb) && - (!skb_is_nonlinear(skb) || !skb_has_frag_list(skb)))) { - nsg = 1 + skb_shinfo(skb)->nr_frags; - } else { - nsg = skb_cow_data(skb, 0, &unused); - if (unlikely(nsg < 0)) { - pr_err("RX: skb_cow_data() returned %d\n", nsg); - return nsg; - } + nsg = skb_cow_data(skb, 0, &unused); + if (unlikely(nsg < 0)) { + pr_err("RX: skb_cow_data() returned %d\n", nsg); + return nsg; } /* Allocate memory for the AEAD operation */ -- cgit v1.2.3 From 149ea30fdd5c28b89a3bfdecfc75cdab1deddb14 Mon Sep 17 00:00:00 2001 From: Parav Pandit Date: Fri, 23 Jul 2021 17:56:00 +0300 Subject: devlink: Fix phys_port_name of virtual port and merge error Merge commit cited in fixes tag was incorrect. Due to it phys_port_name of the virtual port resulted in incorrect name. Also the phys_port_name of the physical port was written twice due to the merge error. Fix it by removing the old code and inserting back the misplaced code. Related commits of interest in net and net-next branches that resulted in merge conflict are: in net-next branch: commit f285f37cb1e6 ("devlink: append split port number to the port name") in net branch: commit b28d8f0c25a9 ("devlink: Correct VIRTUAL port to not have phys_port attributes") Fixes: 126285651b7 ("Merge ra.kernel.org:/pub/scm/linux/kernel/git/netdev/net") Signed-off-by: Parav Pandit Reported-by: Niklas Schnelle Tested-by: Niklas Schnelle Signed-off-by: David S. Miller --- net/core/devlink.c | 10 ++-------- 1 file changed, 2 insertions(+), 8 deletions(-) (limited to 'net') diff --git a/net/core/devlink.c b/net/core/devlink.c index 8fdd04f00fd7..85032626de24 100644 --- a/net/core/devlink.c +++ b/net/core/devlink.c @@ -9328,18 +9328,10 @@ static int __devlink_port_phys_port_name_get(struct devlink_port *devlink_port, switch (attrs->flavour) { case DEVLINK_PORT_FLAVOUR_PHYSICAL: - case DEVLINK_PORT_FLAVOUR_VIRTUAL: n = snprintf(name, len, "p%u", attrs->phys.port_number); if (n < len && attrs->split) n += snprintf(name + n, len - n, "s%u", attrs->phys.split_subport_number); - if (!attrs->split) - n = snprintf(name, len, "p%u", attrs->phys.port_number); - else - n = snprintf(name, len, "p%us%u", - attrs->phys.port_number, - attrs->phys.split_subport_number); - break; case DEVLINK_PORT_FLAVOUR_CPU: case DEVLINK_PORT_FLAVOUR_DSA: @@ -9381,6 +9373,8 @@ static int __devlink_port_phys_port_name_get(struct devlink_port *devlink_port, n = snprintf(name, len, "pf%usf%u", attrs->pci_sf.pf, attrs->pci_sf.sf); break; + case DEVLINK_PORT_FLAVOUR_VIRTUAL: + return -EOPNOTSUPP; } if (n >= len) -- cgit v1.2.3 From 058e6e0ed0eace43401c945082dec1d669b5b231 Mon Sep 17 00:00:00 2001 From: Xin Long Date: Sun, 25 Jul 2021 13:42:50 -0400 Subject: sctp: improve the code for pmtu probe send and recv update This patch does 3 things: - make sctp_transport_pl_send() and sctp_transport_pl_recv() return bool type to decide if more probe is needed to send. - pr_debug() only when probe is really needed to send. - count pl.raise_count in sctp_transport_pl_send() instead of sctp_transport_pl_recv(), and it's only incremented for the 1st probe for the same size. These are preparations for the next patch to make probes happen only when there's packet loss in Search Complete state. Signed-off-by: Xin Long Signed-off-by: David S. Miller --- include/net/sctp/structs.h | 4 ++-- net/sctp/sm_statefuns.c | 15 +++++++-------- net/sctp/transport.c | 41 +++++++++++++++++++++++------------------ 3 files changed, 32 insertions(+), 28 deletions(-) (limited to 'net') diff --git a/include/net/sctp/structs.h b/include/net/sctp/structs.h index 32fc4a309df5..f3d414ed208e 100644 --- a/include/net/sctp/structs.h +++ b/include/net/sctp/structs.h @@ -1024,8 +1024,8 @@ bool sctp_transport_update_pmtu(struct sctp_transport *t, u32 pmtu); void sctp_transport_immediate_rtx(struct sctp_transport *); void sctp_transport_dst_release(struct sctp_transport *t); void sctp_transport_dst_confirm(struct sctp_transport *t); -void sctp_transport_pl_send(struct sctp_transport *t); -void sctp_transport_pl_recv(struct sctp_transport *t); +bool sctp_transport_pl_send(struct sctp_transport *t); +bool sctp_transport_pl_recv(struct sctp_transport *t); /* This is the structure we use to queue packets as they come into diff --git a/net/sctp/sm_statefuns.c b/net/sctp/sm_statefuns.c index 09a8f23ec709..32df65f68c12 100644 --- a/net/sctp/sm_statefuns.c +++ b/net/sctp/sm_statefuns.c @@ -1109,12 +1109,12 @@ enum sctp_disposition sctp_sf_send_probe(struct net *net, if (!sctp_transport_pl_enabled(transport)) return SCTP_DISPOSITION_CONSUME; - sctp_transport_pl_send(transport); - - reply = sctp_make_heartbeat(asoc, transport, transport->pl.probe_size); - if (!reply) - return SCTP_DISPOSITION_NOMEM; - sctp_add_cmd_sf(commands, SCTP_CMD_REPLY, SCTP_CHUNK(reply)); + if (sctp_transport_pl_send(transport)) { + reply = sctp_make_heartbeat(asoc, transport, transport->pl.probe_size); + if (!reply) + return SCTP_DISPOSITION_NOMEM; + sctp_add_cmd_sf(commands, SCTP_CMD_REPLY, SCTP_CHUNK(reply)); + } sctp_add_cmd_sf(commands, SCTP_CMD_PROBE_TIMER_UPDATE, SCTP_TRANSPORT(transport)); @@ -1274,8 +1274,7 @@ enum sctp_disposition sctp_sf_backbeat_8_3(struct net *net, !sctp_transport_pl_enabled(link)) return SCTP_DISPOSITION_DISCARD; - sctp_transport_pl_recv(link); - if (link->pl.state == SCTP_PL_COMPLETE) + if (sctp_transport_pl_recv(link)) return SCTP_DISPOSITION_CONSUME; return sctp_sf_send_probe(net, ep, asoc, type, link, commands); diff --git a/net/sctp/transport.c b/net/sctp/transport.c index 397a6244dd97..23e7bd3e3bd4 100644 --- a/net/sctp/transport.c +++ b/net/sctp/transport.c @@ -258,16 +258,12 @@ void sctp_transport_pmtu(struct sctp_transport *transport, struct sock *sk) sctp_transport_pl_update(transport); } -void sctp_transport_pl_send(struct sctp_transport *t) +bool sctp_transport_pl_send(struct sctp_transport *t) { - pr_debug("%s: PLPMTUD: transport: %p, state: %d, pmtu: %d, size: %d, high: %d\n", - __func__, t, t->pl.state, t->pl.pmtu, t->pl.probe_size, t->pl.probe_high); - - if (t->pl.probe_count < SCTP_MAX_PROBES) { - t->pl.probe_count++; - return; - } + if (t->pl.probe_count < SCTP_MAX_PROBES) + goto out; + t->pl.probe_count = 0; if (t->pl.state == SCTP_PL_BASE) { if (t->pl.probe_size == SCTP_BASE_PLPMTU) { /* BASE_PLPMTU Confirmation Failed */ t->pl.state = SCTP_PL_ERROR; /* Base -> Error */ @@ -299,10 +295,20 @@ void sctp_transport_pl_send(struct sctp_transport *t) sctp_assoc_sync_pmtu(t->asoc); } } - t->pl.probe_count = 1; + +out: + if (t->pl.state == SCTP_PL_COMPLETE && t->pl.raise_count < 30 && + !t->pl.probe_count) + t->pl.raise_count++; + + pr_debug("%s: PLPMTUD: transport: %p, state: %d, pmtu: %d, size: %d, high: %d\n", + __func__, t, t->pl.state, t->pl.pmtu, t->pl.probe_size, t->pl.probe_high); + + t->pl.probe_count++; + return true; } -void sctp_transport_pl_recv(struct sctp_transport *t) +bool sctp_transport_pl_recv(struct sctp_transport *t) { pr_debug("%s: PLPMTUD: transport: %p, state: %d, pmtu: %d, size: %d, high: %d\n", __func__, t, t->pl.state, t->pl.pmtu, t->pl.probe_size, t->pl.probe_high); @@ -323,7 +329,7 @@ void sctp_transport_pl_recv(struct sctp_transport *t) if (!t->pl.probe_high) { t->pl.probe_size = min(t->pl.probe_size + SCTP_PL_BIG_STEP, SCTP_MAX_PLPMTU); - return; + return false; } t->pl.probe_size += SCTP_PL_MIN_STEP; if (t->pl.probe_size >= t->pl.probe_high) { @@ -335,14 +341,13 @@ void sctp_transport_pl_recv(struct sctp_transport *t) t->pathmtu = t->pl.pmtu + sctp_transport_pl_hlen(t); sctp_assoc_sync_pmtu(t->asoc); } - } else if (t->pl.state == SCTP_PL_COMPLETE) { - t->pl.raise_count++; - if (t->pl.raise_count == 30) { - /* Raise probe_size again after 30 * interval in Search Complete */ - t->pl.state = SCTP_PL_SEARCH; /* Search Complete -> Search */ - t->pl.probe_size += SCTP_PL_MIN_STEP; - } + } else if (t->pl.state == SCTP_PL_COMPLETE && t->pl.raise_count == 30) { + /* Raise probe_size again after 30 * interval in Search Complete */ + t->pl.state = SCTP_PL_SEARCH; /* Search Complete -> Search */ + t->pl.probe_size += SCTP_PL_MIN_STEP; } + + return t->pl.state == SCTP_PL_COMPLETE; } static bool sctp_transport_pl_toobig(struct sctp_transport *t, u32 pmtu) -- cgit v1.2.3 From eacf078cf4c7aa23e9591738511f142cc39b5186 Mon Sep 17 00:00:00 2001 From: Xin Long Date: Sun, 25 Jul 2021 13:42:51 -0400 Subject: sctp: send pmtu probe only if packet loss in Search Complete state This patch is to introduce last_rtx_chunks into sctp_transport to detect if there's any packet retransmission/loss happened by checking against asoc's rtx_data_chunks in sctp_transport_pl_send(). If there is, namely, transport->last_rtx_chunks != asoc->rtx_data_chunks, the pmtu probe will be sent out. Otherwise, increment the pl.raise_count and return when it's in Search Complete state. With this patch, if in Search Complete state, which is a long period, it doesn't need to keep probing the current pmtu unless there's data packet loss. This will save quite some traffic. v1->v2: - add the missing Fixes tag. Fixes: 0dac127c0557 ("sctp: do black hole detection in search complete state") Signed-off-by: Xin Long Signed-off-by: David S. Miller --- include/net/sctp/structs.h | 1 + net/sctp/transport.c | 6 +++++- 2 files changed, 6 insertions(+), 1 deletion(-) (limited to 'net') diff --git a/include/net/sctp/structs.h b/include/net/sctp/structs.h index f3d414ed208e..651bba654d77 100644 --- a/include/net/sctp/structs.h +++ b/include/net/sctp/structs.h @@ -984,6 +984,7 @@ struct sctp_transport { } cacc; struct { + __u32 last_rtx_chunks; __u16 pmtu; __u16 probe_size; __u16 probe_high; diff --git a/net/sctp/transport.c b/net/sctp/transport.c index 23e7bd3e3bd4..a3d3ca6dd63d 100644 --- a/net/sctp/transport.c +++ b/net/sctp/transport.c @@ -263,6 +263,7 @@ bool sctp_transport_pl_send(struct sctp_transport *t) if (t->pl.probe_count < SCTP_MAX_PROBES) goto out; + t->pl.last_rtx_chunks = t->asoc->rtx_data_chunks; t->pl.probe_count = 0; if (t->pl.state == SCTP_PL_BASE) { if (t->pl.probe_size == SCTP_BASE_PLPMTU) { /* BASE_PLPMTU Confirmation Failed */ @@ -298,8 +299,10 @@ bool sctp_transport_pl_send(struct sctp_transport *t) out: if (t->pl.state == SCTP_PL_COMPLETE && t->pl.raise_count < 30 && - !t->pl.probe_count) + !t->pl.probe_count && t->pl.last_rtx_chunks == t->asoc->rtx_data_chunks) { t->pl.raise_count++; + return false; + } pr_debug("%s: PLPMTUD: transport: %p, state: %d, pmtu: %d, size: %d, high: %d\n", __func__, t, t->pl.state, t->pl.pmtu, t->pl.probe_size, t->pl.probe_high); @@ -313,6 +316,7 @@ bool sctp_transport_pl_recv(struct sctp_transport *t) pr_debug("%s: PLPMTUD: transport: %p, state: %d, pmtu: %d, size: %d, high: %d\n", __func__, t, t->pl.state, t->pl.pmtu, t->pl.probe_size, t->pl.probe_high); + t->pl.last_rtx_chunks = t->asoc->rtx_data_chunks; t->pl.pmtu = t->pl.probe_size; t->pl.probe_count = 0; if (t->pl.state == SCTP_PL_BASE) { -- cgit v1.2.3 From 2ebda027148315581b89a2ed2fef84ad53b2aedd Mon Sep 17 00:00:00 2001 From: Chen Shen Date: Mon, 26 Jul 2021 13:47:34 +0800 Subject: sctp: delete addr based on sin6_scope_id sctp_inet6addr_event deletes 'addr' from 'local_addr_list' when setting netdev down, but it is possible to delete the incorrect entry (match the first one with the same ipaddr, but the different 'ifindex'), if there are some netdevs with the same 'local-link' ipaddr added already. It should delete the entry depending on 'sin6_addr' and 'sin6_scope_id' both. otherwise, the endpoint will call 'sctp_sf_ootb' if it can't find the according association when receives 'heartbeat', and finally will reply 'abort'. For example: 1.when linux startup the entries in local_addr_list: ifindex:35 addr:fe80::40:43ff:fe80:0 (eths0.201) ifindex:36 addr:fe80::40:43ff:fe80:0 (eths0.209) ifindex:37 addr:fe80::40:43ff:fe80:0 (eths0.210) the route table: local fe80::40:43ff:fe80:0 dev eths0.201 local fe80::40:43ff:fe80:0 dev eths0.209 local fe80::40:43ff:fe80:0 dev eths0.210 2.after 'ifconfig eths0.209 down' the entries in local_addr_list: ifindex:36 addr:fe80::40:43ff:fe80:0 (eths0.209) ifindex:37 addr:fe80::40:43ff:fe80:0 (eths0.210) the route table: local fe80::40:43ff:fe80:0 dev eths0.201 local fe80::40:43ff:fe80:0 dev eths0.210 3.asoc not found for src:[fe80::40:43ff:fe80:0]:37381 dst:[:1]:53335 ::1->fe80::40:43ff:fe80:0 HEARTBEAT fe80::40:43ff:fe80:0->::1 ABORT Signed-off-by: Chen Shen Signed-off-by: David S. Miller --- net/sctp/ipv6.c | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) (limited to 'net') diff --git a/net/sctp/ipv6.c b/net/sctp/ipv6.c index e48dd909dee5..470dbdc27d58 100644 --- a/net/sctp/ipv6.c +++ b/net/sctp/ipv6.c @@ -100,8 +100,9 @@ static int sctp_inet6addr_event(struct notifier_block *this, unsigned long ev, list_for_each_entry_safe(addr, temp, &net->sctp.local_addr_list, list) { if (addr->a.sa.sa_family == AF_INET6 && - ipv6_addr_equal(&addr->a.v6.sin6_addr, - &ifa->addr)) { + ipv6_addr_equal(&addr->a.v6.sin6_addr, + &ifa->addr) && + addr->a.v6.sin6_scope_id == ifa->idev->dev->ifindex) { sctp_addr_wq_mgmt(net, addr, SCTP_ADDR_DEL); found = 1; addr->valid = 0; -- cgit v1.2.3 From c7c9d2102c9c098916ab9e0ab248006107d00d6c Mon Sep 17 00:00:00 2001 From: Pavel Skripkin Date: Sun, 25 Jul 2021 00:11:59 +0300 Subject: net: llc: fix skb_over_panic Syzbot reported skb_over_panic() in llc_pdu_init_as_xid_cmd(). The problem was in wrong LCC header manipulations. Syzbot's reproducer tries to send XID packet. llc_ui_sendmsg() is doing following steps: 1. skb allocation with size = len + header size len is passed from userpace and header size is 3 since addr->sllc_xid is set. 2. skb_reserve() for header_len = 3 3. filling all other space with memcpy_from_msg() Ok, at this moment we have fully loaded skb, only headers needs to be filled. Then code comes to llc_sap_action_send_xid_c(). This function pushes 3 bytes for LLC PDU header and initializes it. Then comes llc_pdu_init_as_xid_cmd(). It initalizes next 3 bytes *AFTER* LLC PDU header and call skb_push(skb, 3). This looks wrong for 2 reasons: 1. Bytes rigth after LLC header are user data, so this function was overwriting payload. 2. skb_push(skb, 3) call can cause skb_over_panic() since all free space was filled in llc_ui_sendmsg(). (This can happen is user passed 686 len: 686 + 14 (eth header) + 3 (LLC header) = 703. SKB_DATA_ALIGN(703) = 704) So, in this patch I added 2 new private constansts: LLC_PDU_TYPE_U_XID and LLC_PDU_LEN_U_XID. LLC_PDU_LEN_U_XID is used to correctly reserve header size to handle LLC + XID case. LLC_PDU_TYPE_U_XID is used by llc_pdu_header_init() function to push 6 bytes instead of 3. And finally I removed skb_push() call from llc_pdu_init_as_xid_cmd(). This changes should not affect other parts of LLC, since after all steps we just transmit buffer. Fixes: 1da177e4c3f4 ("Linux-2.6.12-rc2") Reported-and-tested-by: syzbot+5e5a981ad7cc54c4b2b4@syzkaller.appspotmail.com Signed-off-by: Pavel Skripkin Signed-off-by: David S. Miller --- include/net/llc_pdu.h | 31 +++++++++++++++++++++++-------- net/llc/af_llc.c | 10 +++++++++- net/llc/llc_s_ac.c | 2 +- 3 files changed, 33 insertions(+), 10 deletions(-) (limited to 'net') diff --git a/include/net/llc_pdu.h b/include/net/llc_pdu.h index c0f0a13ed818..49aa79c7b278 100644 --- a/include/net/llc_pdu.h +++ b/include/net/llc_pdu.h @@ -15,9 +15,11 @@ #include /* Lengths of frame formats */ -#define LLC_PDU_LEN_I 4 /* header and 2 control bytes */ -#define LLC_PDU_LEN_S 4 -#define LLC_PDU_LEN_U 3 /* header and 1 control byte */ +#define LLC_PDU_LEN_I 4 /* header and 2 control bytes */ +#define LLC_PDU_LEN_S 4 +#define LLC_PDU_LEN_U 3 /* header and 1 control byte */ +/* header and 1 control byte and XID info */ +#define LLC_PDU_LEN_U_XID (LLC_PDU_LEN_U + sizeof(struct llc_xid_info)) /* Known SAP addresses */ #define LLC_GLOBAL_SAP 0xFF #define LLC_NULL_SAP 0x00 /* not network-layer visible */ @@ -50,9 +52,10 @@ #define LLC_PDU_TYPE_U_MASK 0x03 /* 8-bit control field */ #define LLC_PDU_TYPE_MASK 0x03 -#define LLC_PDU_TYPE_I 0 /* first bit */ -#define LLC_PDU_TYPE_S 1 /* first two bits */ -#define LLC_PDU_TYPE_U 3 /* first two bits */ +#define LLC_PDU_TYPE_I 0 /* first bit */ +#define LLC_PDU_TYPE_S 1 /* first two bits */ +#define LLC_PDU_TYPE_U 3 /* first two bits */ +#define LLC_PDU_TYPE_U_XID 4 /* private type for detecting XID commands */ #define LLC_PDU_TYPE_IS_I(pdu) \ ((!(pdu->ctrl_1 & LLC_PDU_TYPE_I_MASK)) ? 1 : 0) @@ -230,9 +233,18 @@ static inline struct llc_pdu_un *llc_pdu_un_hdr(struct sk_buff *skb) static inline void llc_pdu_header_init(struct sk_buff *skb, u8 type, u8 ssap, u8 dsap, u8 cr) { - const int hlen = type == LLC_PDU_TYPE_U ? 3 : 4; + int hlen = 4; /* default value for I and S types */ struct llc_pdu_un *pdu; + switch (type) { + case LLC_PDU_TYPE_U: + hlen = 3; + break; + case LLC_PDU_TYPE_U_XID: + hlen = 6; + break; + } + skb_push(skb, hlen); skb_reset_network_header(skb); pdu = llc_pdu_un_hdr(skb); @@ -374,7 +386,10 @@ static inline void llc_pdu_init_as_xid_cmd(struct sk_buff *skb, xid_info->fmt_id = LLC_XID_FMT_ID; /* 0x81 */ xid_info->type = svcs_supported; xid_info->rw = rx_window << 1; /* size of receive window */ - skb_put(skb, sizeof(struct llc_xid_info)); + + /* no need to push/put since llc_pdu_header_init() has already + * pushed 3 + 3 bytes + */ } /** diff --git a/net/llc/af_llc.c b/net/llc/af_llc.c index 7180979114e4..ac5cadd02cfa 100644 --- a/net/llc/af_llc.c +++ b/net/llc/af_llc.c @@ -98,8 +98,16 @@ static inline u8 llc_ui_header_len(struct sock *sk, struct sockaddr_llc *addr) { u8 rc = LLC_PDU_LEN_U; - if (addr->sllc_test || addr->sllc_xid) + if (addr->sllc_test) rc = LLC_PDU_LEN_U; + else if (addr->sllc_xid) + /* We need to expand header to sizeof(struct llc_xid_info) + * since llc_pdu_init_as_xid_cmd() sets 4,5,6 bytes of LLC header + * as XID PDU. In llc_ui_sendmsg() we reserved header size and then + * filled all other space with user data. If we won't reserve this + * bytes, llc_pdu_init_as_xid_cmd() will overwrite user data + */ + rc = LLC_PDU_LEN_U_XID; else if (sk->sk_type == SOCK_STREAM) rc = LLC_PDU_LEN_I; return rc; diff --git a/net/llc/llc_s_ac.c b/net/llc/llc_s_ac.c index b554f26c68ee..79d1cef8f15a 100644 --- a/net/llc/llc_s_ac.c +++ b/net/llc/llc_s_ac.c @@ -79,7 +79,7 @@ int llc_sap_action_send_xid_c(struct llc_sap *sap, struct sk_buff *skb) struct llc_sap_state_ev *ev = llc_sap_ev(skb); int rc; - llc_pdu_header_init(skb, LLC_PDU_TYPE_U, ev->saddr.lsap, + llc_pdu_header_init(skb, LLC_PDU_TYPE_U_XID, ev->saddr.lsap, ev->daddr.lsap, LLC_PDU_CMD); llc_pdu_init_as_xid_cmd(skb, LLC_XID_NULL_CLASS_2, 0); rc = llc_mac_hdr_init(skb, ev->saddr.mac, ev->daddr.mac); -- cgit v1.2.3 From 343597d558e79fe704ba8846b5b2ed24056b89c2 Mon Sep 17 00:00:00 2001 From: John Fastabend Date: Tue, 27 Jul 2021 09:04:58 -0700 Subject: bpf, sockmap: Zap ingress queues after stopping strparser We don't want strparser to run and pass skbs into skmsg handlers when the psock is null. We just sk_drop them in this case. When removing a live socket from map it means extra drops that we do not need to incur. Move the zap below strparser close to avoid this condition. This way we stop the stream parser first stopping it from processing packets and then delete the psock. Fixes: a136678c0bdbb ("bpf: sk_msg, zap ingress queue on psock down") Signed-off-by: John Fastabend Signed-off-by: Andrii Nakryiko Acked-by: Jakub Sitnicki Acked-by: Martin KaFai Lau Link: https://lore.kernel.org/bpf/20210727160500.1713554-2-john.fastabend@gmail.com --- net/core/skmsg.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'net') diff --git a/net/core/skmsg.c b/net/core/skmsg.c index 15d71288e741..28115ef742e8 100644 --- a/net/core/skmsg.c +++ b/net/core/skmsg.c @@ -773,8 +773,6 @@ static void sk_psock_destroy(struct work_struct *work) void sk_psock_drop(struct sock *sk, struct sk_psock *psock) { - sk_psock_stop(psock, false); - write_lock_bh(&sk->sk_callback_lock); sk_psock_restore_proto(sk, psock); rcu_assign_sk_user_data(sk, NULL); @@ -784,6 +782,8 @@ void sk_psock_drop(struct sock *sk, struct sk_psock *psock) sk_psock_stop_verdict(sk, psock); write_unlock_bh(&sk->sk_callback_lock); + sk_psock_stop(psock, false); + INIT_RCU_WORK(&psock->rwork, sk_psock_destroy); queue_rcu_work(system_wq, &psock->rwork); } -- cgit v1.2.3 From 476d98018f32e68e7c5d4e8456940cf2b6d66f10 Mon Sep 17 00:00:00 2001 From: John Fastabend Date: Tue, 27 Jul 2021 09:04:59 -0700 Subject: bpf, sockmap: On cleanup we additionally need to remove cached skb Its possible if a socket is closed and the receive thread is under memory pressure it may have cached a skb. We need to ensure these skbs are free'd along with the normal ingress_skb queue. Before 799aa7f98d53 ("skmsg: Avoid lock_sock() in sk_psock_backlog()") tear down and backlog processing both had sock_lock for the common case of socket close or unhash. So it was not possible to have both running in parrallel so all we would need is the kfree in those kernels. But, latest kernels include the commit 799aa7f98d5e and this requires a bit more work. Without the ingress_lock guarding reading/writing the state->skb case its possible the tear down could run before the state update causing it to leak memory or worse when the backlog reads the state it could potentially run interleaved with the tear down and we might end up free'ing the state->skb from tear down side but already have the reference from backlog side. To resolve such races we wrap accesses in ingress_lock on both sides serializing tear down and backlog case. In both cases this only happens after an EAGAIN error case so having an extra lock in place is likely fine. The normal path will skip the locks. Note, we check state->skb before grabbing lock. This works because we can only enqueue with the mutex we hold already. Avoiding a race on adding state->skb after the check. And if tear down path is running that is also fine if the tear down path then removes state->skb we will simply set skb=NULL and the subsequent goto is skipped. This slight complication avoids locking in normal case. With this fix we no longer see this warning splat from tcp side on socket close when we hit the above case with redirect to ingress self. [224913.935822] WARNING: CPU: 3 PID: 32100 at net/core/stream.c:208 sk_stream_kill_queues+0x212/0x220 [224913.935841] Modules linked in: fuse overlay bpf_preload x86_pkg_temp_thermal intel_uncore wmi_bmof squashfs sch_fq_codel efivarfs ip_tables x_tables uas xhci_pci ixgbe mdio xfrm_algo xhci_hcd wmi [224913.935897] CPU: 3 PID: 32100 Comm: fgs-bench Tainted: G I 5.14.0-rc1alu+ #181 [224913.935908] Hardware name: Dell Inc. Precision 5820 Tower/002KVM, BIOS 1.9.2 01/24/2019 [224913.935914] RIP: 0010:sk_stream_kill_queues+0x212/0x220 [224913.935923] Code: 8b 83 20 02 00 00 85 c0 75 20 5b 5d 41 5c 41 5d 41 5e 41 5f c3 48 89 df e8 2b 11 fe ff eb c3 0f 0b e9 7c ff ff ff 0f 0b eb ce <0f> 0b 5b 5d 41 5c 41 5d 41 5e 41 5f c3 90 0f 1f 44 00 00 41 57 41 [224913.935932] RSP: 0018:ffff88816271fd38 EFLAGS: 00010206 [224913.935941] RAX: 0000000000000ae8 RBX: ffff88815acd5240 RCX: dffffc0000000000 [224913.935948] RDX: 0000000000000003 RSI: 0000000000000ae8 RDI: ffff88815acd5460 [224913.935954] RBP: ffff88815acd5460 R08: ffffffff955c0ae8 R09: fffffbfff2e6f543 [224913.935961] R10: ffffffff9737aa17 R11: fffffbfff2e6f542 R12: ffff88815acd5390 [224913.935967] R13: ffff88815acd5480 R14: ffffffff98d0c080 R15: ffffffff96267500 [224913.935974] FS: 00007f86e6bd1700(0000) GS:ffff888451cc0000(0000) knlGS:0000000000000000 [224913.935981] CS: 0010 DS: 0000 ES: 0000 CR0: 0000000080050033 [224913.935988] CR2: 000000c0008eb000 CR3: 00000001020e0005 CR4: 00000000003706e0 [224913.935994] DR0: 0000000000000000 DR1: 0000000000000000 DR2: 0000000000000000 [224913.936000] DR3: 0000000000000000 DR6: 00000000fffe0ff0 DR7: 0000000000000400 [224913.936007] Call Trace: [224913.936016] inet_csk_destroy_sock+0xba/0x1f0 [224913.936033] __tcp_close+0x620/0x790 [224913.936047] tcp_close+0x20/0x80 [224913.936056] inet_release+0x8f/0xf0 [224913.936070] __sock_release+0x72/0x120 [224913.936083] sock_close+0x14/0x20 Fixes: a136678c0bdbb ("bpf: sk_msg, zap ingress queue on psock down") Signed-off-by: John Fastabend Signed-off-by: Andrii Nakryiko Acked-by: Jakub Sitnicki Acked-by: Martin KaFai Lau Link: https://lore.kernel.org/bpf/20210727160500.1713554-3-john.fastabend@gmail.com --- net/core/skmsg.c | 35 +++++++++++++++++++++++++++++------ 1 file changed, 29 insertions(+), 6 deletions(-) (limited to 'net') diff --git a/net/core/skmsg.c b/net/core/skmsg.c index 28115ef742e8..036cdb33a94a 100644 --- a/net/core/skmsg.c +++ b/net/core/skmsg.c @@ -590,23 +590,42 @@ static void sock_drop(struct sock *sk, struct sk_buff *skb) kfree_skb(skb); } +static void sk_psock_skb_state(struct sk_psock *psock, + struct sk_psock_work_state *state, + struct sk_buff *skb, + int len, int off) +{ + spin_lock_bh(&psock->ingress_lock); + if (sk_psock_test_state(psock, SK_PSOCK_TX_ENABLED)) { + state->skb = skb; + state->len = len; + state->off = off; + } else { + sock_drop(psock->sk, skb); + } + spin_unlock_bh(&psock->ingress_lock); +} + static void sk_psock_backlog(struct work_struct *work) { struct sk_psock *psock = container_of(work, struct sk_psock, work); struct sk_psock_work_state *state = &psock->work_state; - struct sk_buff *skb; + struct sk_buff *skb = NULL; bool ingress; u32 len, off; int ret; mutex_lock(&psock->work_mutex); - if (state->skb) { + if (unlikely(state->skb)) { + spin_lock_bh(&psock->ingress_lock); skb = state->skb; len = state->len; off = state->off; state->skb = NULL; - goto start; + spin_unlock_bh(&psock->ingress_lock); } + if (skb) + goto start; while ((skb = skb_dequeue(&psock->ingress_skb))) { len = skb->len; @@ -621,9 +640,8 @@ start: len, ingress); if (ret <= 0) { if (ret == -EAGAIN) { - state->skb = skb; - state->len = len; - state->off = off; + sk_psock_skb_state(psock, state, skb, + len, off); goto end; } /* Hard errors break pipe and stop xmit. */ @@ -722,6 +740,11 @@ static void __sk_psock_zap_ingress(struct sk_psock *psock) skb_bpf_redirect_clear(skb); sock_drop(psock->sk, skb); } + kfree_skb(psock->work_state.skb); + /* We null the skb here to ensure that calls to sk_psock_backlog + * do not pick up the free'd skb. + */ + psock->work_state.skb = NULL; __sk_psock_purge_ingress_msg(psock); } -- cgit v1.2.3 From 9635720b7c88592214562cb72605bdab6708006c Mon Sep 17 00:00:00 2001 From: John Fastabend Date: Tue, 27 Jul 2021 09:05:00 -0700 Subject: bpf, sockmap: Fix memleak on ingress msg enqueue If backlog handler is running during a tear down operation we may enqueue data on the ingress msg queue while tear down is trying to free it. sk_psock_backlog() sk_psock_handle_skb() skb_psock_skb_ingress() sk_psock_skb_ingress_enqueue() sk_psock_queue_msg(psock,msg) spin_lock(ingress_lock) sk_psock_zap_ingress() _sk_psock_purge_ingerss_msg() _sk_psock_purge_ingress_msg() -- free ingress_msg list -- spin_unlock(ingress_lock) spin_lock(ingress_lock) list_add_tail(msg,ingress_msg) <- entry on list with no one left to free it. spin_unlock(ingress_lock) To fix we only enqueue from backlog if the ENABLED bit is set. The tear down logic clears the bit with ingress_lock set so we wont enqueue the msg in the last step. Fixes: 799aa7f98d53 ("skmsg: Avoid lock_sock() in sk_psock_backlog()") Signed-off-by: John Fastabend Signed-off-by: Andrii Nakryiko Acked-by: Jakub Sitnicki Acked-by: Martin KaFai Lau Link: https://lore.kernel.org/bpf/20210727160500.1713554-4-john.fastabend@gmail.com --- include/linux/skmsg.h | 54 +++++++++++++++++++++++++++++++++------------------ net/core/skmsg.c | 6 ------ 2 files changed, 35 insertions(+), 25 deletions(-) (limited to 'net') diff --git a/include/linux/skmsg.h b/include/linux/skmsg.h index 96f319099744..14ab0c0bc924 100644 --- a/include/linux/skmsg.h +++ b/include/linux/skmsg.h @@ -285,11 +285,45 @@ static inline struct sk_psock *sk_psock(const struct sock *sk) return rcu_dereference_sk_user_data(sk); } +static inline void sk_psock_set_state(struct sk_psock *psock, + enum sk_psock_state_bits bit) +{ + set_bit(bit, &psock->state); +} + +static inline void sk_psock_clear_state(struct sk_psock *psock, + enum sk_psock_state_bits bit) +{ + clear_bit(bit, &psock->state); +} + +static inline bool sk_psock_test_state(const struct sk_psock *psock, + enum sk_psock_state_bits bit) +{ + return test_bit(bit, &psock->state); +} + +static inline void sock_drop(struct sock *sk, struct sk_buff *skb) +{ + sk_drops_add(sk, skb); + kfree_skb(skb); +} + +static inline void drop_sk_msg(struct sk_psock *psock, struct sk_msg *msg) +{ + if (msg->skb) + sock_drop(psock->sk, msg->skb); + kfree(msg); +} + static inline void sk_psock_queue_msg(struct sk_psock *psock, struct sk_msg *msg) { spin_lock_bh(&psock->ingress_lock); - list_add_tail(&msg->list, &psock->ingress_msg); + if (sk_psock_test_state(psock, SK_PSOCK_TX_ENABLED)) + list_add_tail(&msg->list, &psock->ingress_msg); + else + drop_sk_msg(psock, msg); spin_unlock_bh(&psock->ingress_lock); } @@ -406,24 +440,6 @@ static inline void sk_psock_restore_proto(struct sock *sk, psock->psock_update_sk_prot(sk, psock, true); } -static inline void sk_psock_set_state(struct sk_psock *psock, - enum sk_psock_state_bits bit) -{ - set_bit(bit, &psock->state); -} - -static inline void sk_psock_clear_state(struct sk_psock *psock, - enum sk_psock_state_bits bit) -{ - clear_bit(bit, &psock->state); -} - -static inline bool sk_psock_test_state(const struct sk_psock *psock, - enum sk_psock_state_bits bit) -{ - return test_bit(bit, &psock->state); -} - static inline struct sk_psock *sk_psock_get(struct sock *sk) { struct sk_psock *psock; diff --git a/net/core/skmsg.c b/net/core/skmsg.c index 036cdb33a94a..2d6249b28928 100644 --- a/net/core/skmsg.c +++ b/net/core/skmsg.c @@ -584,12 +584,6 @@ static int sk_psock_handle_skb(struct sk_psock *psock, struct sk_buff *skb, return sk_psock_skb_ingress(psock, skb); } -static void sock_drop(struct sock *sk, struct sk_buff *skb) -{ - sk_drops_add(sk, skb); - kfree_skb(skb); -} - static void sk_psock_skb_state(struct sk_psock *psock, struct sk_psock_work_state *state, struct sk_buff *skb, -- cgit v1.2.3 From 557fb5862c9272ad9b21407afe1da8acfd9b53eb Mon Sep 17 00:00:00 2001 From: Marcelo Ricardo Leitner Date: Tue, 27 Jul 2021 23:40:54 -0300 Subject: sctp: fix return value check in __sctp_rcv_asconf_lookup As Ben Hutchings noticed, this check should have been inverted: the call returns true in case of success. Reported-by: Ben Hutchings Fixes: 0c5dc070ff3d ("sctp: validate from_addr_param return") Signed-off-by: Marcelo Ricardo Leitner Reviewed-by: Xin Long Signed-off-by: David S. Miller --- net/sctp/input.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'net') diff --git a/net/sctp/input.c b/net/sctp/input.c index eb3c2a34a31c..5ef86fdb1176 100644 --- a/net/sctp/input.c +++ b/net/sctp/input.c @@ -1203,7 +1203,7 @@ static struct sctp_association *__sctp_rcv_asconf_lookup( if (unlikely(!af)) return NULL; - if (af->from_addr_param(&paddr, param, peer_port, 0)) + if (!af->from_addr_param(&paddr, param, peer_port, 0)) return NULL; return __sctp_lookup_association(net, laddr, &paddr, transportp); -- cgit v1.2.3 From 1e60cebf82948cfdc9497ea4553bab125587593c Mon Sep 17 00:00:00 2001 From: zhang kai Date: Wed, 28 Jul 2021 18:54:18 +0800 Subject: net: let flow have same hash in two directions using same source and destination ip/port for flow hash calculation within the two directions. Signed-off-by: zhang kai Signed-off-by: David S. Miller --- net/core/flow_dissector.c | 18 +++++++++--------- 1 file changed, 9 insertions(+), 9 deletions(-) (limited to 'net') diff --git a/net/core/flow_dissector.c b/net/core/flow_dissector.c index 2aadbfc5193b..4b2415d34873 100644 --- a/net/core/flow_dissector.c +++ b/net/core/flow_dissector.c @@ -1504,7 +1504,7 @@ __be32 flow_get_u32_dst(const struct flow_keys *flow) } EXPORT_SYMBOL(flow_get_u32_dst); -/* Sort the source and destination IP (and the ports if the IP are the same), +/* Sort the source and destination IP and the ports, * to have consistent hash within the two directions */ static inline void __flow_hash_consistentify(struct flow_keys *keys) @@ -1515,11 +1515,11 @@ static inline void __flow_hash_consistentify(struct flow_keys *keys) case FLOW_DISSECTOR_KEY_IPV4_ADDRS: addr_diff = (__force u32)keys->addrs.v4addrs.dst - (__force u32)keys->addrs.v4addrs.src; - if ((addr_diff < 0) || - (addr_diff == 0 && - ((__force u16)keys->ports.dst < - (__force u16)keys->ports.src))) { + if (addr_diff < 0) swap(keys->addrs.v4addrs.src, keys->addrs.v4addrs.dst); + + if ((__force u16)keys->ports.dst < + (__force u16)keys->ports.src) { swap(keys->ports.src, keys->ports.dst); } break; @@ -1527,13 +1527,13 @@ static inline void __flow_hash_consistentify(struct flow_keys *keys) addr_diff = memcmp(&keys->addrs.v6addrs.dst, &keys->addrs.v6addrs.src, sizeof(keys->addrs.v6addrs.dst)); - if ((addr_diff < 0) || - (addr_diff == 0 && - ((__force u16)keys->ports.dst < - (__force u16)keys->ports.src))) { + if (addr_diff < 0) { for (i = 0; i < 4; i++) swap(keys->addrs.v6addrs.src.s6_addr32[i], keys->addrs.v6addrs.dst.s6_addr32[i]); + } + if ((__force u16)keys->ports.dst < + (__force u16)keys->ports.src) { swap(keys->ports.src, keys->ports.dst); } break; -- cgit v1.2.3