From 7c2330f1afe13b3a934140857bc8060d00103a89 Mon Sep 17 00:00:00 2001 From: Randy Dunlap Date: Mon, 16 Sep 2013 18:08:02 -0700 Subject: regulator: fix fatal kernel-doc error Fix fatal kernel-doc error in <linux/regulator/driver.h>: Error(include/linux/regulator/driver.h:52): cannot understand prototype: 'struct regulator_linear_range ' Signed-off-by: Randy Dunlap [Rewrote first line -- broonie] Signed-off-by: Mark Brown --- include/linux/regulator/driver.h | 2 ++ 1 file changed, 2 insertions(+) (limited to 'include') diff --git a/include/linux/regulator/driver.h b/include/linux/regulator/driver.h index 67e13aa5a478..9bdad43ad228 100644 --- a/include/linux/regulator/driver.h +++ b/include/linux/regulator/driver.h @@ -40,6 +40,8 @@ enum regulator_status { }; /** + * struct regulator_linear_range - specify linear voltage ranges + * * Specify a range of voltages for regulator_map_linar_range() and * regulator_list_linear_range(). * -- cgit v1.2.3 From 5e130367d43ff22836bbae380d197d600fe8ddbb Mon Sep 17 00:00:00 2001 From: Johan Hedberg Date: Fri, 13 Sep 2013 08:58:17 +0300 Subject: Bluetooth: Introduce a new HCI_RFKILLED flag This makes it more convenient to check for rfkill (no need to check for dev->rfkill before calling rfkill_blocked()) and also avoids potential races if the RFKILL state needs to be checked from within the rfkill callback. Signed-off-by: Johan Hedberg Cc: stable@vger.kernel.org Acked-by: Marcel Holtmann Signed-off-by: Gustavo Padovan --- include/net/bluetooth/hci.h | 1 + net/bluetooth/hci_core.c | 15 ++++++++++----- 2 files changed, 11 insertions(+), 5 deletions(-) (limited to 'include') diff --git a/include/net/bluetooth/hci.h b/include/net/bluetooth/hci.h index aaeaf0938ec0..15f10841e2b5 100644 --- a/include/net/bluetooth/hci.h +++ b/include/net/bluetooth/hci.h @@ -104,6 +104,7 @@ enum { enum { HCI_SETUP, HCI_AUTO_OFF, + HCI_RFKILLED, HCI_MGMT, HCI_PAIRABLE, HCI_SERVICE_CACHE, diff --git a/net/bluetooth/hci_core.c b/net/bluetooth/hci_core.c index 634debab4d54..0d5bc246b607 100644 --- a/net/bluetooth/hci_core.c +++ b/net/bluetooth/hci_core.c @@ -1146,7 +1146,7 @@ int hci_dev_open(__u16 dev) goto done; } - if (hdev->rfkill && rfkill_blocked(hdev->rfkill)) { + if (test_bit(HCI_RFKILLED, &hdev->dev_flags)) { ret = -ERFKILL; goto done; } @@ -1566,10 +1566,12 @@ static int hci_rfkill_set_block(void *data, bool blocked) BT_DBG("%p name %s blocked %d", hdev, hdev->name, blocked); - if (!blocked) - return 0; - - hci_dev_do_close(hdev); + if (blocked) { + set_bit(HCI_RFKILLED, &hdev->dev_flags); + hci_dev_do_close(hdev); + } else { + clear_bit(HCI_RFKILLED, &hdev->dev_flags); + } return 0; } @@ -2209,6 +2211,9 @@ int hci_register_dev(struct hci_dev *hdev) } } + if (hdev->rfkill && rfkill_blocked(hdev->rfkill)) + set_bit(HCI_RFKILLED, &hdev->dev_flags); + set_bit(HCI_SETUP, &hdev->dev_flags); if (hdev->dev_type != HCI_AMP) -- cgit v1.2.3 From c16526a7b99c1c28e9670a8c8e3dbcf741bb32be Mon Sep 17 00:00:00 2001 From: Simon Kirby Date: Sat, 10 Aug 2013 01:26:18 -0700 Subject: ipvs: fix overflow on dest weight multiply Schedulers such as lblc and lblcr require the weight to be as high as the maximum number of active connections. In commit b552f7e3a9524abcbcdf ("ipvs: unify the formula to estimate the overhead of processing connections"), the consideration of inactconns and activeconns was cleaned up to always count activeconns as 256 times more important than inactconns.
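[Note: for context, a sketch not quoted from either patch — after that unification, ip_vs_dest_conn_overhead() computes the overhead roughly as

    /* include/net/ip_vs.h: activeconns weighted 256x over inactconns */
    return (atomic_read(&dest->activeconns) << 8) +
           atomic_read(&dest->inactconns);

so the schedulers compare products of the form overhead * weight, where the overhead of a busy server is already about 256 * activeconns; 3000 * 256 * 3000 = 2,304,000,000, which exceeds INT_MAX = 2,147,483,647.]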
In cases where 3000 or more connections are expected, a weight of 3000 * 256 * 3000 connections overflows the 32-bit signed result used to determine if rescheduling is required. On amd64, this merely changes the multiply and comparison instructions to 64-bit. On x86, a 64-bit result is already present from imull, so only a few more comparison instructions are emitted. Signed-off-by: Simon Kirby Acked-by: Julian Anastasov Signed-off-by: Simon Horman --- include/net/ip_vs.h | 2 +- net/netfilter/ipvs/ip_vs_lblc.c | 4 ++-- net/netfilter/ipvs/ip_vs_lblcr.c | 12 ++++++------ net/netfilter/ipvs/ip_vs_nq.c | 8 ++++---- net/netfilter/ipvs/ip_vs_sed.c | 8 ++++---- net/netfilter/ipvs/ip_vs_wlc.c | 6 +++--- 6 files changed, 20 insertions(+), 20 deletions(-) (limited to 'include') diff --git a/include/net/ip_vs.h b/include/net/ip_vs.h index f0d70f066f3d..fe782ed2fe72 100644 --- a/include/net/ip_vs.h +++ b/include/net/ip_vs.h @@ -1649,7 +1649,7 @@ static inline void ip_vs_conn_drop_conntrack(struct ip_vs_conn *cp) /* CONFIG_IP_VS_NFCT */ #endif -static inline unsigned int +static inline int ip_vs_dest_conn_overhead(struct ip_vs_dest *dest) { /* diff --git a/net/netfilter/ipvs/ip_vs_lblc.c b/net/netfilter/ipvs/ip_vs_lblc.c index 1383b0eadc0e..eb814bf74e64 100644 --- a/net/netfilter/ipvs/ip_vs_lblc.c +++ b/net/netfilter/ipvs/ip_vs_lblc.c @@ -443,8 +443,8 @@ __ip_vs_lblc_schedule(struct ip_vs_service *svc) continue; doh = ip_vs_dest_conn_overhead(dest); - if (loh * atomic_read(&dest->weight) > - doh * atomic_read(&least->weight)) { + if ((__s64)loh * atomic_read(&dest->weight) > + (__s64)doh * atomic_read(&least->weight)) { least = dest; loh = doh; } diff --git a/net/netfilter/ipvs/ip_vs_lblcr.c b/net/netfilter/ipvs/ip_vs_lblcr.c index 5199448697f6..e65f7c573090 100644 --- a/net/netfilter/ipvs/ip_vs_lblcr.c +++ b/net/netfilter/ipvs/ip_vs_lblcr.c @@ -200,8 +200,8 @@ static inline struct ip_vs_dest *ip_vs_dest_set_min(struct ip_vs_dest_set *set) continue; doh = ip_vs_dest_conn_overhead(dest); - if ((loh * atomic_read(&dest->weight) > - doh * atomic_read(&least->weight)) + if (((__s64)loh * atomic_read(&dest->weight) > + (__s64)doh * atomic_read(&least->weight)) && (dest->flags & IP_VS_DEST_F_AVAILABLE)) { least = dest; loh = doh; @@ -246,8 +246,8 @@ static inline struct ip_vs_dest *ip_vs_dest_set_max(struct ip_vs_dest_set *set) dest = rcu_dereference_protected(e->dest, 1); doh = ip_vs_dest_conn_overhead(dest); /* moh/mw < doh/dw ==> moh*dw < doh*mw, where mw,dw>0 */ - if ((moh * atomic_read(&dest->weight) < - doh * atomic_read(&most->weight)) + if (((__s64)moh * atomic_read(&dest->weight) < + (__s64)doh * atomic_read(&most->weight)) && (atomic_read(&dest->weight) > 0)) { most = dest; moh = doh; @@ -611,8 +611,8 @@ __ip_vs_lblcr_schedule(struct ip_vs_service *svc) continue; doh = ip_vs_dest_conn_overhead(dest); - if (loh * atomic_read(&dest->weight) > - doh * atomic_read(&least->weight)) { + if ((__s64)loh * atomic_read(&dest->weight) > + (__s64)doh * atomic_read(&least->weight)) { least = dest; loh = doh; } diff --git a/net/netfilter/ipvs/ip_vs_nq.c b/net/netfilter/ipvs/ip_vs_nq.c index d8d9860934fe..961a6de9bb29 100644 --- a/net/netfilter/ipvs/ip_vs_nq.c +++ b/net/netfilter/ipvs/ip_vs_nq.c @@ -40,7 +40,7 @@ #include -static inline unsigned int +static inline int ip_vs_nq_dest_overhead(struct ip_vs_dest *dest) { /* @@ -59,7 +59,7 @@ ip_vs_nq_schedule(struct ip_vs_service *svc, const struct sk_buff *skb, struct ip_vs_iphdr *iph) { struct ip_vs_dest *dest, *least = NULL; - unsigned int loh = 0, doh; + int loh 
= 0, doh; IP_VS_DBG(6, "%s(): Scheduling...\n", __func__); @@ -92,8 +92,8 @@ ip_vs_nq_schedule(struct ip_vs_service *svc, const struct sk_buff *skb, } if (!least || - (loh * atomic_read(&dest->weight) > - doh * atomic_read(&least->weight))) { + ((__s64)loh * atomic_read(&dest->weight) > + (__s64)doh * atomic_read(&least->weight))) { least = dest; loh = doh; } diff --git a/net/netfilter/ipvs/ip_vs_sed.c b/net/netfilter/ipvs/ip_vs_sed.c index a5284cc3d882..e446b9fa7424 100644 --- a/net/netfilter/ipvs/ip_vs_sed.c +++ b/net/netfilter/ipvs/ip_vs_sed.c @@ -44,7 +44,7 @@ #include -static inline unsigned int +static inline int ip_vs_sed_dest_overhead(struct ip_vs_dest *dest) { /* @@ -63,7 +63,7 @@ ip_vs_sed_schedule(struct ip_vs_service *svc, const struct sk_buff *skb, struct ip_vs_iphdr *iph) { struct ip_vs_dest *dest, *least; - unsigned int loh, doh; + int loh, doh; IP_VS_DBG(6, "%s(): Scheduling...\n", __func__); @@ -99,8 +99,8 @@ ip_vs_sed_schedule(struct ip_vs_service *svc, const struct sk_buff *skb, if (dest->flags & IP_VS_DEST_F_OVERLOAD) continue; doh = ip_vs_sed_dest_overhead(dest); - if (loh * atomic_read(&dest->weight) > - doh * atomic_read(&least->weight)) { + if ((__s64)loh * atomic_read(&dest->weight) > + (__s64)doh * atomic_read(&least->weight)) { least = dest; loh = doh; } diff --git a/net/netfilter/ipvs/ip_vs_wlc.c b/net/netfilter/ipvs/ip_vs_wlc.c index 6dc1fa128840..b5b4650d50a9 100644 --- a/net/netfilter/ipvs/ip_vs_wlc.c +++ b/net/netfilter/ipvs/ip_vs_wlc.c @@ -35,7 +35,7 @@ ip_vs_wlc_schedule(struct ip_vs_service *svc, const struct sk_buff *skb, struct ip_vs_iphdr *iph) { struct ip_vs_dest *dest, *least; - unsigned int loh, doh; + int loh, doh; IP_VS_DBG(6, "ip_vs_wlc_schedule(): Scheduling...\n"); @@ -71,8 +71,8 @@ ip_vs_wlc_schedule(struct ip_vs_service *svc, const struct sk_buff *skb, if (dest->flags & IP_VS_DEST_F_OVERLOAD) continue; doh = ip_vs_dest_conn_overhead(dest); - if (loh * atomic_read(&dest->weight) > - doh * atomic_read(&least->weight)) { + if ((__s64)loh * atomic_read(&dest->weight) > + (__s64)doh * atomic_read(&least->weight)) { least = dest; loh = doh; } -- cgit v1.2.3 From bcbde4c0a7556cca72874c5e1efa4dccb5198a2b Mon Sep 17 00:00:00 2001 From: Julian Anastasov Date: Thu, 12 Sep 2013 11:21:07 +0300 Subject: ipvs: make the service replacement more robust commit 578bc3ef1e473a ("ipvs: reorganize dest trash") added an IP_VS_DEST_STATE_REMOVING flag and an RCU callback named ip_vs_dest_wait_readers() to keep dests and services after removal for at least an RCU grace period. But we have the following corner cases: - we cannot reuse the same dest if its service is removed while IP_VS_DEST_STATE_REMOVING is still set, because another dest removal in the first grace period cannot extend this period. It can happen when ipvsadm -C && ipvsadm -R is used. - dest->svc can be replaced but ip_vs_in_stats() and ip_vs_out_stats() have no explicit read memory barriers when accessing dest->svc. It can happen that dest->svc was just freed (replaced) while we use it to update the stats. We solve the problems as follows: - IP_VS_DEST_STATE_REMOVING is removed and we ensure a fixed idle period for the dest (IP_VS_DEST_TRASH_PERIOD). idle_start remembers the first time after deletion that we noticed dest->refcnt=0. Later, connections can grab a reference while in the RCU grace period, but if refcnt becomes 0 we can safely free the dest and its svc. - dest->svc becomes an RCU pointer. As a result, we add explicit RCU locking in ip_vs_in_stats() and ip_vs_out_stats().
- __ip_vs_unbind_svc is renamed to __ip_vs_svc_put(); it can now free the service immediately or after an RCU grace period. dest->svc is not set to NULL anymore. As a result, unlinked dests and their services are always freed after the IP_VS_DEST_TRASH_PERIOD, and unused services are freed after an RCU grace period. Signed-off-by: Julian Anastasov Signed-off-by: Simon Horman --- include/net/ip_vs.h | 7 +--- net/netfilter/ipvs/ip_vs_core.c | 12 +++++- net/netfilter/ipvs/ip_vs_ctl.c | 86 +++++++++++++++++------------------------ 3 files changed, 47 insertions(+), 58 deletions(-) (limited to 'include') diff --git a/include/net/ip_vs.h b/include/net/ip_vs.h index fe782ed2fe72..9c4d37ec45a1 100644 --- a/include/net/ip_vs.h +++ b/include/net/ip_vs.h @@ -723,8 +723,6 @@ struct ip_vs_dest_dst { struct rcu_head rcu_head; }; -/* In grace period after removing */ -#define IP_VS_DEST_STATE_REMOVING 0x01 /* * The real server destination forwarding entry * with ip address, port number, and so on. */ @@ -742,7 +740,7 @@ struct ip_vs_dest { atomic_t refcnt; /* reference counter */ struct ip_vs_stats stats; /* statistics */ - unsigned long state; /* state flags */ + unsigned long idle_start; /* start time, jiffies */ /* connection counters and thresholds */ atomic_t activeconns; /* active connections */ @@ -756,14 +754,13 @@ struct ip_vs_dest { struct ip_vs_dest_dst __rcu *dest_dst; /* cached dst info */ /* for virtual service */ - struct ip_vs_service *svc; /* service it belongs to */ + struct ip_vs_service __rcu *svc; /* service it belongs to */ __u16 protocol; /* which protocol (TCP/UDP) */ __be16 vport; /* virtual port number */ union nf_inet_addr vaddr; /* virtual IP address */ __u32 vfwmark; /* firewall mark of service */ struct list_head t_list; /* in dest_trash */ - struct rcu_head rcu_head; unsigned int in_rs_table:1; /* we are in rs_table */ }; diff --git a/net/netfilter/ipvs/ip_vs_core.c b/net/netfilter/ipvs/ip_vs_core.c index 4f69e83ff836..74fd00c27210 100644 --- a/net/netfilter/ipvs/ip_vs_core.c +++ b/net/netfilter/ipvs/ip_vs_core.c @@ -116,6 +116,7 @@ ip_vs_in_stats(struct ip_vs_conn *cp, struct sk_buff *skb) if (dest && (dest->flags & IP_VS_DEST_F_AVAILABLE)) { struct ip_vs_cpu_stats *s; + struct ip_vs_service *svc; s = this_cpu_ptr(dest->stats.cpustats); s->ustats.inpkts++; @@ -123,11 +124,14 @@ ip_vs_in_stats(struct ip_vs_conn *cp, struct sk_buff *skb) s->ustats.inbytes += skb->len; u64_stats_update_end(&s->syncp); - s = this_cpu_ptr(dest->svc->stats.cpustats); + rcu_read_lock(); + svc = rcu_dereference(dest->svc); + s = this_cpu_ptr(svc->stats.cpustats); s->ustats.inpkts++; u64_stats_update_begin(&s->syncp); s->ustats.inbytes += skb->len; u64_stats_update_end(&s->syncp); + rcu_read_unlock(); s = this_cpu_ptr(ipvs->tot_stats.cpustats); s->ustats.inpkts++; @@ -146,6 +150,7 @@ ip_vs_out_stats(struct ip_vs_conn *cp, struct sk_buff *skb) if (dest && (dest->flags & IP_VS_DEST_F_AVAILABLE)) { struct ip_vs_cpu_stats *s; + struct ip_vs_service *svc; s = this_cpu_ptr(dest->stats.cpustats); s->ustats.outpkts++; @@ -153,11 +158,14 @@ ip_vs_out_stats(struct ip_vs_conn *cp, struct sk_buff *skb) s->ustats.outbytes += skb->len; u64_stats_update_end(&s->syncp); - s = this_cpu_ptr(dest->svc->stats.cpustats); + rcu_read_lock(); + svc = rcu_dereference(dest->svc); + s = this_cpu_ptr(svc->stats.cpustats); s->ustats.outpkts++; u64_stats_update_begin(&s->syncp); s->ustats.outbytes += skb->len; u64_stats_update_end(&s->syncp); + rcu_read_unlock(); s = this_cpu_ptr(ipvs->tot_stats.cpustats); s->ustats.outpkts++; diff
--git a/net/netfilter/ipvs/ip_vs_ctl.c b/net/netfilter/ipvs/ip_vs_ctl.c index c8148e487386..a3df9bddc4f7 100644 --- a/net/netfilter/ipvs/ip_vs_ctl.c +++ b/net/netfilter/ipvs/ip_vs_ctl.c @@ -460,7 +460,7 @@ static inline void __ip_vs_bind_svc(struct ip_vs_dest *dest, struct ip_vs_service *svc) { atomic_inc(&svc->refcnt); - dest->svc = svc; + rcu_assign_pointer(dest->svc, svc); } static void ip_vs_service_free(struct ip_vs_service *svc) @@ -470,18 +470,25 @@ static void ip_vs_service_free(struct ip_vs_service *svc) kfree(svc); } -static void -__ip_vs_unbind_svc(struct ip_vs_dest *dest) +static void ip_vs_service_rcu_free(struct rcu_head *head) { - struct ip_vs_service *svc = dest->svc; + struct ip_vs_service *svc; + + svc = container_of(head, struct ip_vs_service, rcu_head); + ip_vs_service_free(svc); +} - dest->svc = NULL; +static void __ip_vs_svc_put(struct ip_vs_service *svc, bool do_delay) +{ if (atomic_dec_and_test(&svc->refcnt)) { IP_VS_DBG_BUF(3, "Removing service %u/%s:%u\n", svc->fwmark, IP_VS_DBG_ADDR(svc->af, &svc->addr), ntohs(svc->port)); - ip_vs_service_free(svc); + if (do_delay) + call_rcu(&svc->rcu_head, ip_vs_service_rcu_free); + else + ip_vs_service_free(svc); } } @@ -667,11 +674,6 @@ ip_vs_trash_get_dest(struct ip_vs_service *svc, const union nf_inet_addr *daddr, IP_VS_DBG_ADDR(svc->af, &dest->addr), ntohs(dest->port), atomic_read(&dest->refcnt)); - /* We can not reuse dest while in grace period - * because conns still can use dest->svc - */ - if (test_bit(IP_VS_DEST_STATE_REMOVING, &dest->state)) - continue; if (dest->af == svc->af && ip_vs_addr_equal(svc->af, &dest->addr, daddr) && dest->port == dport && @@ -697,8 +699,10 @@ out: static void ip_vs_dest_free(struct ip_vs_dest *dest) { + struct ip_vs_service *svc = rcu_dereference_protected(dest->svc, 1); + __ip_vs_dst_cache_reset(dest); - __ip_vs_unbind_svc(dest); + __ip_vs_svc_put(svc, false); free_percpu(dest->stats.cpustats); kfree(dest); } @@ -771,6 +775,7 @@ __ip_vs_update_dest(struct ip_vs_service *svc, struct ip_vs_dest *dest, struct ip_vs_dest_user_kern *udest, int add) { struct netns_ipvs *ipvs = net_ipvs(svc->net); + struct ip_vs_service *old_svc; struct ip_vs_scheduler *sched; int conn_flags; @@ -792,13 +797,14 @@ __ip_vs_update_dest(struct ip_vs_service *svc, struct ip_vs_dest *dest, atomic_set(&dest->conn_flags, conn_flags); /* bind the service */ - if (!dest->svc) { + old_svc = rcu_dereference_protected(dest->svc, 1); + if (!old_svc) { __ip_vs_bind_svc(dest, svc); } else { - if (dest->svc != svc) { - __ip_vs_unbind_svc(dest); + if (old_svc != svc) { ip_vs_zero_stats(&dest->stats); __ip_vs_bind_svc(dest, svc); + __ip_vs_svc_put(old_svc, true); } } @@ -998,16 +1004,6 @@ ip_vs_edit_dest(struct ip_vs_service *svc, struct ip_vs_dest_user_kern *udest) return 0; } -static void ip_vs_dest_wait_readers(struct rcu_head *head) -{ - struct ip_vs_dest *dest = container_of(head, struct ip_vs_dest, - rcu_head); - - /* End of grace period after unlinking */ - clear_bit(IP_VS_DEST_STATE_REMOVING, &dest->state); -} - - /* * Delete a destination (must be already unlinked from the service) */ @@ -1023,20 +1019,16 @@ static void __ip_vs_del_dest(struct net *net, struct ip_vs_dest *dest, */ ip_vs_rs_unhash(dest); - if (!cleanup) { - set_bit(IP_VS_DEST_STATE_REMOVING, &dest->state); - call_rcu(&dest->rcu_head, ip_vs_dest_wait_readers); - } - spin_lock_bh(&ipvs->dest_trash_lock); IP_VS_DBG_BUF(3, "Moving dest %s:%u into trash, dest->refcnt=%d\n", IP_VS_DBG_ADDR(dest->af, &dest->addr), ntohs(dest->port), atomic_read(&dest->refcnt)); 
if (list_empty(&ipvs->dest_trash) && !cleanup) mod_timer(&ipvs->dest_trash_timer, - jiffies + IP_VS_DEST_TRASH_PERIOD); + jiffies + (IP_VS_DEST_TRASH_PERIOD >> 1)); /* dest lives in trash without reference */ list_add(&dest->t_list, &ipvs->dest_trash); + dest->idle_start = 0; spin_unlock_bh(&ipvs->dest_trash_lock); ip_vs_dest_put(dest); } @@ -1108,24 +1100,30 @@ static void ip_vs_dest_trash_expire(unsigned long data) struct net *net = (struct net *) data; struct netns_ipvs *ipvs = net_ipvs(net); struct ip_vs_dest *dest, *next; + unsigned long now = jiffies; spin_lock(&ipvs->dest_trash_lock); list_for_each_entry_safe(dest, next, &ipvs->dest_trash, t_list) { - /* Skip if dest is in grace period */ - if (test_bit(IP_VS_DEST_STATE_REMOVING, &dest->state)) - continue; if (atomic_read(&dest->refcnt) > 0) continue; + if (dest->idle_start) { + if (time_before(now, dest->idle_start + + IP_VS_DEST_TRASH_PERIOD)) + continue; + } else { + dest->idle_start = max(1UL, now); + continue; + } IP_VS_DBG_BUF(3, "Removing destination %u/%s:%u from trash\n", dest->vfwmark, - IP_VS_DBG_ADDR(dest->svc->af, &dest->addr), + IP_VS_DBG_ADDR(dest->af, &dest->addr), ntohs(dest->port)); list_del(&dest->t_list); ip_vs_dest_free(dest); } if (!list_empty(&ipvs->dest_trash)) mod_timer(&ipvs->dest_trash_timer, - jiffies + IP_VS_DEST_TRASH_PERIOD); + jiffies + (IP_VS_DEST_TRASH_PERIOD >> 1)); spin_unlock(&ipvs->dest_trash_lock); } @@ -1320,14 +1318,6 @@ out: return ret; } -static void ip_vs_service_rcu_free(struct rcu_head *head) -{ - struct ip_vs_service *svc; - - svc = container_of(head, struct ip_vs_service, rcu_head); - ip_vs_service_free(svc); -} - /* * Delete a service from the service list * - The service must be unlinked, unlocked and not referenced! @@ -1376,13 +1366,7 @@ static void __ip_vs_del_service(struct ip_vs_service *svc, bool cleanup) /* * Free the service if nobody refers to it */ - if (atomic_dec_and_test(&svc->refcnt)) { - IP_VS_DBG_BUF(3, "Removing service %u/%s:%u\n", - svc->fwmark, - IP_VS_DBG_ADDR(svc->af, &svc->addr), - ntohs(svc->port)); - call_rcu(&svc->rcu_head, ip_vs_service_rcu_free); - } + __ip_vs_svc_put(svc, true); /* decrease the module use count */ ip_vs_use_count_dec(); -- cgit v1.2.3 From c5ecceefdb840af45db436adc58219ae97b6ef3c Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Fri, 13 Sep 2013 23:39:17 +0200 Subject: perf: Update ABI comment For some mysterious reason the sample_id field of PERF_RECORD_MMAP went AWOL. 
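[Note: a hedged user-space sketch of what the restored comment line implies — assuming attr.sample_id_all is set and `ev` points at a complete PERF_RECORD_MMAP record; the variable names are illustrative:

    struct perf_event_header *hdr = (struct perf_event_header *)ev;
    /* the struct sample_id trailer occupies the tail of the record,
     * which always ends at hdr->size bytes from the header */
    char *record_end = (char *)ev + hdr->size;
]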
Reported-by: Vince Weaver Signed-off-by: Peter Zijlstra Signed-off-by: Ingo Molnar --- include/uapi/linux/perf_event.h | 1 + 1 file changed, 1 insertion(+) (limited to 'include') diff --git a/include/uapi/linux/perf_event.h b/include/uapi/linux/perf_event.h index 40a1fb807396..7f6d584c267b 100644 --- a/include/uapi/linux/perf_event.h +++ b/include/uapi/linux/perf_event.h @@ -528,6 +528,7 @@ enum perf_event_type { * u64 len; * u64 pgoff; * char filename[]; + * struct sample_id sample_id; * }; */ PERF_RECORD_MMAP = 1, -- cgit v1.2.3 From fa7315871046b9a4c48627905691dbde57e51033 Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Thu, 19 Sep 2013 10:16:42 +0200 Subject: perf: Fix capabilities bitfield compatibility in 'struct perf_event_mmap_page' Solve the problems around the broken definition of perf_event_mmap_page:: cap_usr_time and cap_usr_rdpmc fields which used to overlap, partially fixed by: 860f085b74e9 ("perf: Fix broken union in 'struct perf_event_mmap_page'") The problem with the fix (merged in v3.12-rc1 and not yet released officially), noticed by Vince Weaver is that the new behavior is not detectable by new user-space, and that due to the reuse of the field names it's easy to mis-compile a binary if old headers are used on a new kernel or new headers are used on an old kernel. To solve all that make this change explicit, detectable and self-contained, by iterating the ABI the following way: - Always clear bit 0, and rename it to usrpage->cap_bit0, to at least not confuse old user-space binaries. RDPMC will be marked as unavailable to old binaries but that's within the ABI, this is a capability bit. - Rename bit 1 to ->cap_bit0_is_deprecated and always set it to 1, so new libraries can reliably detect that bit 0 is deprecated and perma-zero without having to check the kernel version. - Use bits 2, 3, 4 for the newly defined, correct functionality: cap_user_rdpmc : 1, /* The RDPMC instruction can be used to read counts */ cap_user_time : 1, /* The time_* fields are used */ cap_user_time_zero : 1, /* The time_zero field is used */ - Rename all the bitfield names in perf_event.h to be different from the old names, to make sure it's not possible to mis-compile it accidentally with old assumptions. The 'size' field can then be used in the future to add new fields and it will act as a natural ABI version indicator as well. Also adjust tools/perf/ userspace for the new definitions, noticed by Adrian Hunter. 
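[Note: a minimal user-space detection sketch against the iterated ABI — `pc` is assumed to point at the mapped perf_event_mmap_page:

    bool rdpmc_ok, time_ok;
    if (pc->cap_bit0_is_deprecated) {
            /* new kernel: the renamed capability bits are authoritative */
            rdpmc_ok = pc->cap_user_rdpmc;
            time_ok  = pc->cap_user_time;
    } else {
            /* old kernel: bit 0 was the overlapping cap_usr_time/cap_usr_rdpmc union */
            rdpmc_ok = time_ok = pc->cap_bit0;
    }
]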
Reported-by: Vince Weaver Signed-off-by: Peter Zijlstra Also-Fixed-by: Adrian Hunter Link: http://lkml.kernel.org/n/tip-zr03yxjrpXesOzzupszqglbv@git.kernel.org Signed-off-by: Ingo Molnar --- arch/x86/kernel/cpu/perf_event.c | 10 +++++----- include/uapi/linux/perf_event.h | 14 +++++++++----- kernel/events/core.c | 21 +++++++++++++++++++++ tools/perf/arch/x86/util/tsc.c | 6 +++--- 4 files changed, 38 insertions(+), 13 deletions(-) (limited to 'include') diff --git a/arch/x86/kernel/cpu/perf_event.c b/arch/x86/kernel/cpu/perf_event.c index 8355c84b9729..a9c606bb4945 100644 --- a/arch/x86/kernel/cpu/perf_event.c +++ b/arch/x86/kernel/cpu/perf_event.c @@ -1883,9 +1883,9 @@ static struct pmu pmu = { void arch_perf_update_userpage(struct perf_event_mmap_page *userpg, u64 now) { - userpg->cap_usr_time = 0; - userpg->cap_usr_time_zero = 0; - userpg->cap_usr_rdpmc = x86_pmu.attr_rdpmc; + userpg->cap_user_time = 0; + userpg->cap_user_time_zero = 0; + userpg->cap_user_rdpmc = x86_pmu.attr_rdpmc; userpg->pmc_width = x86_pmu.cntval_bits; if (!boot_cpu_has(X86_FEATURE_CONSTANT_TSC)) @@ -1894,13 +1894,13 @@ void arch_perf_update_userpage(struct perf_event_mmap_page *userpg, u64 now) if (!boot_cpu_has(X86_FEATURE_NONSTOP_TSC)) return; - userpg->cap_usr_time = 1; + userpg->cap_user_time = 1; userpg->time_mult = this_cpu_read(cyc2ns); userpg->time_shift = CYC2NS_SCALE_FACTOR; userpg->time_offset = this_cpu_read(cyc2ns_offset) - now; if (sched_clock_stable && !check_tsc_disabled()) { - userpg->cap_usr_time_zero = 1; + userpg->cap_user_time_zero = 1; userpg->time_zero = this_cpu_read(cyc2ns_offset); } } diff --git a/include/uapi/linux/perf_event.h b/include/uapi/linux/perf_event.h index 7f6d584c267b..009a655a5d35 100644 --- a/include/uapi/linux/perf_event.h +++ b/include/uapi/linux/perf_event.h @@ -380,10 +380,13 @@ struct perf_event_mmap_page { union { __u64 capabilities; struct { - __u64 cap_usr_time : 1, - cap_usr_rdpmc : 1, - cap_usr_time_zero : 1, - cap_____res : 61; + __u64 cap_bit0 : 1, /* Always 0, deprecated, see commit 860f085b74e9 */ + cap_bit0_is_deprecated : 1, /* Always 1, signals that bit 0 is zero */ + + cap_user_rdpmc : 1, /* The RDPMC instruction can be used to read counts */ + cap_user_time : 1, /* The time_* fields are used */ + cap_user_time_zero : 1, /* The time_zero field is used */ + cap_____res : 59; }; }; @@ -442,12 +445,13 @@ struct perf_event_mmap_page { * ((rem * time_mult) >> time_shift); */ __u64 time_zero; + __u32 size; /* Header size up to __reserved[] fields. */ /* * Hole for extension of the self monitor capabilities */ - __u64 __reserved[119]; /* align to 1k */ + __u8 __reserved[118*8+4]; /* align to 1k. */ /* * Control data for the mmap() data buffer. 
diff --git a/kernel/events/core.c b/kernel/events/core.c index dd236b66ca3a..cb4238e85b38 100644 --- a/kernel/events/core.c +++ b/kernel/events/core.c @@ -3660,6 +3660,26 @@ static void calc_timer_values(struct perf_event *event, *running = ctx_time - event->tstamp_running; } +static void perf_event_init_userpage(struct perf_event *event) +{ + struct perf_event_mmap_page *userpg; + struct ring_buffer *rb; + + rcu_read_lock(); + rb = rcu_dereference(event->rb); + if (!rb) + goto unlock; + + userpg = rb->user_page; + + /* Allow new userspace to detect that bit 0 is deprecated */ + userpg->cap_bit0_is_deprecated = 1; + userpg->size = offsetof(struct perf_event_mmap_page, __reserved); + +unlock: + rcu_read_unlock(); +} + void __weak arch_perf_update_userpage(struct perf_event_mmap_page *userpg, u64 now) { } @@ -4044,6 +4064,7 @@ again: ring_buffer_attach(event, rb); rcu_assign_pointer(event->rb, rb); + perf_event_init_userpage(event); perf_event_update_userpage(event); unlock: diff --git a/tools/perf/arch/x86/util/tsc.c b/tools/perf/arch/x86/util/tsc.c index 9570c2b0f83c..b2519e49424f 100644 --- a/tools/perf/arch/x86/util/tsc.c +++ b/tools/perf/arch/x86/util/tsc.c @@ -32,7 +32,7 @@ u64 tsc_to_perf_time(u64 cyc, struct perf_tsc_conversion *tc) int perf_read_tsc_conversion(const struct perf_event_mmap_page *pc, struct perf_tsc_conversion *tc) { - bool cap_usr_time_zero; + bool cap_user_time_zero; u32 seq; int i = 0; @@ -42,7 +42,7 @@ int perf_read_tsc_conversion(const struct perf_event_mmap_page *pc, tc->time_mult = pc->time_mult; tc->time_shift = pc->time_shift; tc->time_zero = pc->time_zero; - cap_usr_time_zero = pc->cap_usr_time_zero; + cap_user_time_zero = pc->cap_user_time_zero; rmb(); if (pc->lock == seq && !(seq & 1)) break; @@ -52,7 +52,7 @@ int perf_read_tsc_conversion(const struct perf_event_mmap_page *pc, } } - if (!cap_usr_time_zero) + if (!cap_user_time_zero) return -EOPNOTSUPP; return 0; -- cgit v1.2.3 From f84cb8a46a771f36a04a02c61ea635c968ed5f6a Mon Sep 17 00:00:00 2001 From: Mike Snitzer Date: Thu, 19 Sep 2013 12:13:58 -0400 Subject: dm mpath: disable WRITE SAME if it fails Work around the SCSI layer's problematic WRITE SAME heuristics by disabling WRITE SAME in the DM multipath device's queue_limits if an underlying device disabled it. The WRITE SAME heuristics, with both the original commit 5db44863b6eb ("[SCSI] sd: Implement support for WRITE SAME") and the updated commit 66c28f971 ("[SCSI] sd: Update WRITE SAME heuristics"), default to enabling WRITE SAME(10) even without successfully determining it is supported. After the first failed WRITE SAME the SCSI layer will disable WRITE SAME for the device (by setting sdkp->device->no_write_same, which results in 'max_write_same_sectors' in the device's queue_limits being set to 0). When a device is stacked on top of such a SCSI device, any changes to that SCSI device's queue_limits do not automatically propagate up the stack. As such, a DM multipath device will not have its WRITE SAME support disabled. This causes the block layer to continue to issue WRITE SAME requests to the mpath device, which causes paths to fail and (if mpath IO isn't configured to queue when no paths are available) results in actual IO errors to the upper layers. This fix doesn't help configurations that have additional devices stacked on top of the mpath device (e.g. LVM-created linear DM devices on top).
A proper fix that restacks all the queue_limits from the bottom of the device stack up will need to be explored if SCSI will continue to use this model of optimistically allowing op codes and then disabling them after they fail for the first time. Before this patch: EXT4-fs (dm-6): mounted filesystem with ordered data mode. Opts: (null) device-mapper: multipath: XXX snitm debugging: got -EREMOTEIO (-121) device-mapper: multipath: XXX snitm debugging: failing WRITE SAME IO with error=-121 end_request: critical target error, dev dm-6, sector 528 dm-6: WRITE SAME failed. Manually zeroing. device-mapper: multipath: Failing path 8:112. end_request: I/O error, dev dm-6, sector 4616 dm-6: WRITE SAME failed. Manually zeroing. end_request: I/O error, dev dm-6, sector 4616 end_request: I/O error, dev dm-6, sector 5640 end_request: I/O error, dev dm-6, sector 6664 end_request: I/O error, dev dm-6, sector 7688 end_request: I/O error, dev dm-6, sector 524288 Buffer I/O error on device dm-6, logical block 65536 lost page write due to I/O error on dm-6 JBD2: Error -5 detected when updating journal superblock for dm-6-8. end_request: I/O error, dev dm-6, sector 524296 Aborting journal on device dm-6-8. end_request: I/O error, dev dm-6, sector 524288 Buffer I/O error on device dm-6, logical block 65536 lost page write due to I/O error on dm-6 JBD2: Error -5 detected when updating journal superblock for dm-6-8. # cat /sys/block/sdh/queue/write_same_max_bytes 0 # cat /sys/block/dm-6/queue/write_same_max_bytes 33553920 After this patch: EXT4-fs (dm-6): mounted filesystem with ordered data mode. Opts: (null) device-mapper: multipath: XXX snitm debugging: got -EREMOTEIO (-121) device-mapper: multipath: XXX snitm debugging: WRITE SAME I/O failed with error=-121 end_request: critical target error, dev dm-6, sector 528 dm-6: WRITE SAME failed. Manually zeroing. # cat /sys/block/sdh/queue/write_same_max_bytes 0 # cat /sys/block/dm-6/queue/write_same_max_bytes 0 It should be noted that WRITE SAME support wasn't enabled in DM multipath until v3.10. Signed-off-by: Mike Snitzer Cc: Martin K. Petersen Cc: Hannes Reinecke Cc: stable@vger.kernel.org # 3.10+ --- drivers/md/dm-mpath.c | 11 ++++++++++- drivers/md/dm.c | 11 +++++++++++ include/linux/device-mapper.h | 3 ++- 3 files changed, 23 insertions(+), 2 deletions(-) (limited to 'include') diff --git a/drivers/md/dm-mpath.c b/drivers/md/dm-mpath.c index 075747e25f77..e08be94959fd 100644 --- a/drivers/md/dm-mpath.c +++ b/drivers/md/dm-mpath.c @@ -1299,8 +1299,17 @@ static int do_end_io(struct multipath *m, struct request *clone, if (!error && !clone->errors) return 0; /* I/O complete */ - if (noretry_error(error)) + if (noretry_error(error)) { + if ((clone->cmd_flags & REQ_WRITE_SAME) && + !clone->q->limits.max_write_same_sectors) { + struct queue_limits *limits; + + /* device doesn't really support WRITE SAME, disable it */ + limits = dm_get_queue_limits(dm_table_get_md(m->ti->table)); + limits->max_write_same_sectors = 0; + } return error; + } if (mpio->pgpath) fail_path(mpio->pgpath); diff --git a/drivers/md/dm.c b/drivers/md/dm.c index 6a5e9ed2fcc3..7b0ccdc5d65c 100644 --- a/drivers/md/dm.c +++ b/drivers/md/dm.c @@ -2277,6 +2277,17 @@ struct target_type *dm_get_immutable_target_type(struct mapped_device *md) return md->immutable_target_type; } +/* + * The queue_limits are only valid as long as you have a reference + * count on 'md'. 
+ */ +struct queue_limits *dm_get_queue_limits(struct mapped_device *md) +{ + BUG_ON(!atomic_read(&md->holders)); + return &md->queue->limits; +} +EXPORT_SYMBOL_GPL(dm_get_queue_limits); + /* * Fully initialize a request-based queue (->elevator, ->request_fn, etc). */ diff --git a/include/linux/device-mapper.h b/include/linux/device-mapper.h index 653073de09e3..ed419c62dde1 100644 --- a/include/linux/device-mapper.h +++ b/include/linux/device-mapper.h @@ -406,13 +406,14 @@ int dm_noflush_suspending(struct dm_target *ti); union map_info *dm_get_mapinfo(struct bio *bio); union map_info *dm_get_rq_mapinfo(struct request *rq); +struct queue_limits *dm_get_queue_limits(struct mapped_device *md); + /* * Geometry functions. */ int dm_get_geometry(struct mapped_device *md, struct hd_geometry *geo); int dm_set_geometry(struct mapped_device *md, struct hd_geometry *geo); - /*----------------------------------------------------------------- * Functions for manipulating device-mapper tables. *---------------------------------------------------------------*/ -- cgit v1.2.3 From c26d436cbf7a9549ec1073480a2e3f0d3f64e02d Mon Sep 17 00:00:00 2001 From: Andre Naujoks Date: Fri, 13 Sep 2013 19:37:12 +0200 Subject: lib: introduce upper case hex ascii helpers To be able to use the hex ascii functions in case sensitive environments the array hex_asc_upper[] and the needed functions for hex_byte_pack_upper() are introduced. Signed-off-by: Andre Naujoks Signed-off-by: David S. Miller --- include/linux/kernel.h | 11 +++++++++++ lib/hexdump.c | 2 ++ 2 files changed, 13 insertions(+) (limited to 'include') diff --git a/include/linux/kernel.h b/include/linux/kernel.h index 482ad2d84a32..672ddc4de4af 100644 --- a/include/linux/kernel.h +++ b/include/linux/kernel.h @@ -439,6 +439,17 @@ static inline char *hex_byte_pack(char *buf, u8 byte) return buf; } +extern const char hex_asc_upper[]; +#define hex_asc_upper_lo(x) hex_asc_upper[((x) & 0x0f)] +#define hex_asc_upper_hi(x) hex_asc_upper[((x) & 0xf0) >> 4] + +static inline char *hex_byte_pack_upper(char *buf, u8 byte) +{ + *buf++ = hex_asc_upper_hi(byte); + *buf++ = hex_asc_upper_lo(byte); + return buf; +} + static inline char * __deprecated pack_hex_byte(char *buf, u8 byte) { return hex_byte_pack(buf, byte); diff --git a/lib/hexdump.c b/lib/hexdump.c index 3f0494c9d57a..8499c810909a 100644 --- a/lib/hexdump.c +++ b/lib/hexdump.c @@ -14,6 +14,8 @@ const char hex_asc[] = "0123456789abcdef"; EXPORT_SYMBOL(hex_asc); +const char hex_asc_upper[] = "0123456789ABCDEF"; +EXPORT_SYMBOL(hex_asc_upper); /** * hex_to_bin - convert a hex digit to its real value -- cgit v1.2.3 From 42baf21d91d4f52f5b9a4d11dc8e7b1e3b93de7c Mon Sep 17 00:00:00 2001 From: Michel Dänzer Date: Wed, 18 Sep 2013 18:23:51 +0200 Subject: drm/radeon/cik: Add tiling mode index for 1D tiled depth/stencil surfaces MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit CIK uses a different index for 1D DST surfaces compared to SI. Expose the new index so libdrm_radeon can use it properly for userspace drivers. 
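[Note: a hedged usage sketch — `chip_is_cik` is illustrative, and SI_TILE_MODE_DEPTH_STENCIL_1D is assumed to be the pre-existing SI define in the same header:

    unsigned tile_index = chip_is_cik ? CIK_TILE_MODE_DEPTH_STENCIL_1D
                                      : SI_TILE_MODE_DEPTH_STENCIL_1D;
]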
Signed-off-by: Michel Dänzer Signed-off-by: Alex Deucher --- include/uapi/drm/radeon_drm.h | 2 ++ 1 file changed, 2 insertions(+) (limited to 'include') diff --git a/include/uapi/drm/radeon_drm.h b/include/uapi/drm/radeon_drm.h index fa8b3adf9ffb..46d41e8b0dcc 100644 --- a/include/uapi/drm/radeon_drm.h +++ b/include/uapi/drm/radeon_drm.h @@ -1007,4 +1007,6 @@ struct drm_radeon_info { #define SI_TILE_MODE_DEPTH_STENCIL_2D_4AA 3 #define SI_TILE_MODE_DEPTH_STENCIL_2D_8AA 2 +#define CIK_TILE_MODE_DEPTH_STENCIL_1D 5 + #endif -- cgit v1.2.3 From 47d06e532e95b71c0db3839ebdef3fe8812fca2c Mon Sep 17 00:00:00 2001 From: Theodore Ts'o Date: Tue, 10 Sep 2013 10:52:35 -0400 Subject: random: run random_int_secret_init() after all late_initcalls Some platforms (e.g., ARM) initialize their clocks as late_initcalls for some unknown reason. So make sure random_int_secret_init() is run after all of the late_initcalls are run. Cc: stable@vger.kernel.org Signed-off-by: "Theodore Ts'o" --- drivers/char/random.c | 3 +-- include/linux/random.h | 1 + init/main.c | 2 ++ 3 files changed, 4 insertions(+), 2 deletions(-) (limited to 'include') diff --git a/drivers/char/random.c b/drivers/char/random.c index 0d91fe52f3f5..92e6c67e1ae6 100644 --- a/drivers/char/random.c +++ b/drivers/char/random.c @@ -1462,12 +1462,11 @@ struct ctl_table random_table[] = { static u32 random_int_secret[MD5_MESSAGE_BYTES / 4] ____cacheline_aligned; -static int __init random_int_secret_init(void) +int random_int_secret_init(void) { get_random_bytes(random_int_secret, sizeof(random_int_secret)); return 0; } -late_initcall(random_int_secret_init); /* * Get a random word for internal kernel use only. Similar to urandom but diff --git a/include/linux/random.h b/include/linux/random.h index 3b9377d6b7a5..6312dd9ba449 100644 --- a/include/linux/random.h +++ b/include/linux/random.h @@ -17,6 +17,7 @@ extern void add_interrupt_randomness(int irq, int irq_flags); extern void get_random_bytes(void *buf, int nbytes); extern void get_random_bytes_arch(void *buf, int nbytes); void generate_random_uuid(unsigned char uuid_out[16]); +extern int random_int_secret_init(void); #ifndef MODULE extern const struct file_operations random_fops, urandom_fops; diff --git a/init/main.c b/init/main.c index d03d2ec2eacf..586cd3359c02 100644 --- a/init/main.c +++ b/init/main.c @@ -75,6 +75,7 @@ #include #include #include +#include #include #include @@ -778,6 +779,7 @@ static void __init do_basic_setup(void) do_ctors(); usermodehelper_enable(); do_initcalls(); + random_int_secret_init(); } static void __init do_pre_smp_initcalls(void) -- cgit v1.2.3 From 9fe34f5d920b183ec063550e0f4ec854aa373316 Mon Sep 17 00:00:00 2001 From: Noel Burton-Krahn Date: Wed, 18 Sep 2013 12:24:40 -0700 Subject: mrp: add periodictimer to allow retries when packets get lost MRP doesn't implement the periodictimer in 802.1Q, so it never retries if packets get lost. I ran into this problem when MRP sent an MVRP JoinIn before the interface was fully up. The JoinIn was lost, MRP didn't retry, and MVRP registration failed. Tested against Juniper QFabric switches. Signed-off-by: Noel Burton-Krahn Acked-by: David Ward Signed-off-by: David S.
Miller --- include/net/mrp.h | 1 + net/802/mrp.c | 27 +++++++++++++++++++++++++++ 2 files changed, 28 insertions(+) (limited to 'include') diff --git a/include/net/mrp.h b/include/net/mrp.h index 4fbf02aa2ec1..0f7558b638ae 100644 --- a/include/net/mrp.h +++ b/include/net/mrp.h @@ -112,6 +112,7 @@ struct mrp_applicant { struct mrp_application *app; struct net_device *dev; struct timer_list join_timer; + struct timer_list periodic_timer; spinlock_t lock; struct sk_buff_head queue; diff --git a/net/802/mrp.c b/net/802/mrp.c index 1eb05d80b07b..3ed616215870 100644 --- a/net/802/mrp.c +++ b/net/802/mrp.c @@ -24,6 +24,11 @@ static unsigned int mrp_join_time __read_mostly = 200; module_param(mrp_join_time, uint, 0644); MODULE_PARM_DESC(mrp_join_time, "Join time in ms (default 200ms)"); + +static unsigned int mrp_periodic_time __read_mostly = 1000; +module_param(mrp_periodic_time, uint, 0644); +MODULE_PARM_DESC(mrp_periodic_time, "Periodic time in ms (default 1s)"); + MODULE_LICENSE("GPL"); static const u8 @@ -595,6 +600,24 @@ static void mrp_join_timer(unsigned long data) mrp_join_timer_arm(app); } +static void mrp_periodic_timer_arm(struct mrp_applicant *app) +{ + mod_timer(&app->periodic_timer, + jiffies + msecs_to_jiffies(mrp_periodic_time)); +} + +static void mrp_periodic_timer(unsigned long data) +{ + struct mrp_applicant *app = (struct mrp_applicant *)data; + + spin_lock(&app->lock); + mrp_mad_event(app, MRP_EVENT_PERIODIC); + mrp_pdu_queue(app); + spin_unlock(&app->lock); + + mrp_periodic_timer_arm(app); +} + static int mrp_pdu_parse_end_mark(struct sk_buff *skb, int *offset) { __be16 endmark; @@ -845,6 +868,9 @@ int mrp_init_applicant(struct net_device *dev, struct mrp_application *appl) rcu_assign_pointer(dev->mrp_port->applicants[appl->type], app); setup_timer(&app->join_timer, mrp_join_timer, (unsigned long)app); mrp_join_timer_arm(app); + setup_timer(&app->periodic_timer, mrp_periodic_timer, + (unsigned long)app); + mrp_periodic_timer_arm(app); return 0; err3: @@ -870,6 +896,7 @@ void mrp_uninit_applicant(struct net_device *dev, struct mrp_application *appl) * all pending messages before the applicant is gone. */ del_timer_sync(&app->join_timer); + del_timer_sync(&app->periodic_timer); spin_lock_bh(&app->lock); mrp_mad_event(app, MRP_EVENT_TX); -- cgit v1.2.3 From 82aeef0bf03684b377678c00c05e613f30dca39c Mon Sep 17 00:00:00 2001 From: "Li, Zhen-Hua" Date: Fri, 13 Sep 2013 14:27:32 +0800 Subject: x86/iommu: correct ICS register offset According to Intel Vt-D specs, the offset of Invalidation complete status register should be 0x9C, not 0x98. 
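[Note: for illustration only — the define is consumed via ordinary MMIO reads in the DMAR code, so the fix changes which register such a read actually hits:

    u32 ics = readl(iommu->reg + DMAR_ICS_REG);    /* now offset 0x9c */
]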
See Intel's VT-d spec, Revision 1.3, Chapter 10.4, Page 98. Signed-off-by: Li, Zhen-Hua Signed-off-by: Joerg Roedel --- include/linux/intel-iommu.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'include') diff --git a/include/linux/intel-iommu.h b/include/linux/intel-iommu.h index 78e2ada50cd5..d380c5e68008 100644 --- a/include/linux/intel-iommu.h +++ b/include/linux/intel-iommu.h @@ -55,7 +55,7 @@ #define DMAR_IQT_REG 0x88 /* Invalidation queue tail register */ #define DMAR_IQ_SHIFT 4 /* Invalidation queue head/tail shift */ #define DMAR_IQA_REG 0x90 /* Invalidation queue addr register */ -#define DMAR_ICS_REG 0x98 /* Invalidation complete status register */ +#define DMAR_ICS_REG 0x9c /* Invalidation complete status register */ #define DMAR_IRTA_REG 0xb8 /* Interrupt remapping table addr register */ #define OFFSET_STRIDE (9) -- cgit v1.2.3 From 9809b18fcf6b8d8ec4d3643677345907e6b50eca Mon Sep 17 00:00:00 2001 From: Michal Hocko Date: Tue, 24 Sep 2013 15:27:30 -0700 Subject: watchdog: update watchdog_thresh properly watchdog_thresh controls how often the NMI perf event counter checks the per-cpu hrtimer_interrupts counter and declares a hard lockup if the counter hasn't changed since the last check. The counter is updated by the per-cpu watchdog_hrtimer hrtimer, which is scheduled with a 2/5 watchdog_thresh period; this guarantees the hrtimer fires 2 times per main period. Both the hrtimer and the perf event are started together when the watchdog is enabled. So far so good. But what happens when watchdog_thresh is updated from the sysctl handler? proc_dowatchdog will set a new sampling period and the hrtimer callback (watchdog_timer_fn) will use the new value in the next round. The problem, however, is that nobody tells the perf event that the sampling period has changed, so it keeps ticking with the period configured when it was set up. This results in an ear-ripping dissonance between the perf and hrtimer parts if watchdog_thresh is increased, and even worse it might lead to a panic if the watchdog is configured to panic on such a spurious lockup. This patch fixes the issue by updating both the NMI perf event counter and the hrtimers if the threshold value has changed. The NMI one is disabled and then reinitialized from scratch. This has the unpleasant side effect that the allocation of the new event might theoretically fail, in which case the hard lockup detector would be disabled for such CPUs. On the other hand, such a memory allocation failure is very unlikely because the original event is deallocated right before. It would be much nicer if we could just change the perf event period, but there doesn't seem to be any API to do that right now. It is also unfortunate that perf_event_alloc uses GFP_KERNEL allocation unconditionally, so we cannot use on_each_cpu() and do the same thing from per-cpu context. The update from the current CPU should be safe because perf_event_disable removes the event atomically before it clears the per-cpu watchdog_ev, so it cannot change anything under the running handler's feet. The hrtimer is simply restarted if it is queued (thanks to Don Zickus for pointing this out) because we cannot rely on it firing and adapting to the new sampling period before a new NMI event triggers (when the threshold is decreased).
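[Note: for reference, the shared sampling period is derived from watchdog_thresh roughly as follows, paraphrasing set_sample_period() in kernel/watchdog.c, where get_softlockup_thresh() is watchdog_thresh * 2:

    /* 2/5 of watchdog_thresh in ns: the hrtimer fires several
     * times within each hard-lockup detection window */
    sample_period = get_softlockup_thresh() * ((u64)NSEC_PER_SEC / 5);
]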
[akpm@linux-foundation.org: the UP version of __smp_call_function_single ended up in the wrong place] Signed-off-by: Michal Hocko Acked-by: Don Zickus Cc: Frederic Weisbecker Cc: Thomas Gleixner Cc: Ingo Molnar Cc: Fabio Estevam Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/smp.h | 6 ++++++ kernel/watchdog.c | 53 ++++++++++++++++++++++++++++++++++++++++++++++++++--- 2 files changed, 56 insertions(+), 3 deletions(-) (limited to 'include') diff --git a/include/linux/smp.h b/include/linux/smp.h index cfb7ca094b38..731f5237d5f4 100644 --- a/include/linux/smp.h +++ b/include/linux/smp.h @@ -155,6 +155,12 @@ smp_call_function_any(const struct cpumask *mask, smp_call_func_t func, static inline void kick_all_cpus_sync(void) { } +static inline void __smp_call_function_single(int cpuid, + struct call_single_data *data, int wait) +{ + on_each_cpu(data->func, data->info, wait); +} + #endif /* !SMP */ /* diff --git a/kernel/watchdog.c b/kernel/watchdog.c index ced7d0609931..4431610f049a 100644 --- a/kernel/watchdog.c +++ b/kernel/watchdog.c @@ -486,7 +486,52 @@ static struct smp_hotplug_thread watchdog_threads = { .unpark = watchdog_enable, }; -static int watchdog_enable_all_cpus(void) +static void restart_watchdog_hrtimer(void *info) +{ + struct hrtimer *hrtimer = &__raw_get_cpu_var(watchdog_hrtimer); + int ret; + + /* + * No need to cancel and restart hrtimer if it is currently executing + * because it will reprogram itself with the new period now. + * We should never see it unqueued here because we are running per-cpu + * with interrupts disabled. + */ + ret = hrtimer_try_to_cancel(hrtimer); + if (ret == 1) + hrtimer_start(hrtimer, ns_to_ktime(sample_period), + HRTIMER_MODE_REL_PINNED); +} + +static void update_timers(int cpu) +{ + struct call_single_data data = {.func = restart_watchdog_hrtimer}; + /* + * Make sure that perf event counter will adopt to a new + * sampling period. Updating the sampling period directly would + * be much nicer but we do not have an API for that now so + * let's use a big hammer. + * Hrtimer will adopt the new period on the next tick but this + * might be late already so we have to restart the timer as well. + */ + watchdog_nmi_disable(cpu); + __smp_call_function_single(cpu, &data, 1); + watchdog_nmi_enable(cpu); +} + +static void update_timers_all_cpus(void) +{ + int cpu; + + get_online_cpus(); + preempt_disable(); + for_each_online_cpu(cpu) + update_timers(cpu); + preempt_enable(); + put_online_cpus(); +} + +static int watchdog_enable_all_cpus(bool sample_period_changed) { int err = 0; @@ -496,6 +541,8 @@ static int watchdog_enable_all_cpus(void) pr_err("Failed to create watchdog threads, disabled\n"); else watchdog_running = 1; + } else if (sample_period_changed) { + update_timers_all_cpus(); } return err; @@ -537,7 +584,7 @@ int proc_dowatchdog(struct ctl_table *table, int write, * watchdog_*_all_cpus() function takes care of this. 
*/ if (watchdog_user_enabled && watchdog_thresh) - err = watchdog_enable_all_cpus(); + err = watchdog_enable_all_cpus(old_thresh != watchdog_thresh); else watchdog_disable_all_cpus(); @@ -557,5 +604,5 @@ void __init lockup_detector_init(void) set_sample_period(); if (watchdog_user_enabled) - watchdog_enable_all_cpus(); + watchdog_enable_all_cpus(false); } -- cgit v1.2.3 From 694fbc0fe78518d06efa63910bf4ecee660e7852 Mon Sep 17 00:00:00 2001 From: Andrew Morton Date: Tue, 24 Sep 2013 15:27:37 -0700 Subject: revert "memcg: enhance memcg iterator to support predicates" Revert commit de57780dc659 ("memcg: enhance memcg iterator to support predicates") I merged this prematurely - Michal and Johannes still disagree about the overall design direction and the future remains unclear. Cc: Michal Hocko Cc: Johannes Weiner Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/memcontrol.h | 49 ++++---------------------------- mm/memcontrol.c | 70 ++++++++++------------------------------------ mm/vmscan.c | 16 +++++++---- 3 files changed, 32 insertions(+), 103 deletions(-) (limited to 'include') diff --git a/include/linux/memcontrol.h b/include/linux/memcontrol.h index 60e95872da29..ef2b9bd7fafa 100644 --- a/include/linux/memcontrol.h +++ b/include/linux/memcontrol.h @@ -53,23 +53,6 @@ struct mem_cgroup_reclaim_cookie { unsigned int generation; }; -enum mem_cgroup_filter_t { - VISIT, /* visit current node */ - SKIP, /* skip the current node and continue traversal */ - SKIP_TREE, /* skip the whole subtree and continue traversal */ -}; - -/* - * mem_cgroup_filter_t predicate might instruct mem_cgroup_iter_cond how to - * iterate through the hierarchy tree. Each tree element is checked by the - * predicate before it is returned by the iterator. If a filter returns - * SKIP or SKIP_TREE then the iterator code continues traversal (with the - * next node down the hierarchy or the next node that doesn't belong under the - * memcg's subtree). 
- */ -typedef enum mem_cgroup_filter_t -(*mem_cgroup_iter_filter)(struct mem_cgroup *memcg, struct mem_cgroup *root); - #ifdef CONFIG_MEMCG /* * All "charge" functions with gfp_mask should use GFP_KERNEL or @@ -137,18 +120,9 @@ mem_cgroup_prepare_migration(struct page *page, struct page *newpage, extern void mem_cgroup_end_migration(struct mem_cgroup *memcg, struct page *oldpage, struct page *newpage, bool migration_ok); -struct mem_cgroup *mem_cgroup_iter_cond(struct mem_cgroup *root, - struct mem_cgroup *prev, - struct mem_cgroup_reclaim_cookie *reclaim, - mem_cgroup_iter_filter cond); - -static inline struct mem_cgroup *mem_cgroup_iter(struct mem_cgroup *root, - struct mem_cgroup *prev, - struct mem_cgroup_reclaim_cookie *reclaim) -{ - return mem_cgroup_iter_cond(root, prev, reclaim, NULL); -} - +struct mem_cgroup *mem_cgroup_iter(struct mem_cgroup *, + struct mem_cgroup *, + struct mem_cgroup_reclaim_cookie *); void mem_cgroup_iter_break(struct mem_cgroup *, struct mem_cgroup *); /* @@ -260,8 +234,7 @@ static inline void mem_cgroup_dec_page_stat(struct page *page, mem_cgroup_update_page_stat(page, idx, -1); } -enum mem_cgroup_filter_t -mem_cgroup_soft_reclaim_eligible(struct mem_cgroup *memcg, +bool mem_cgroup_soft_reclaim_eligible(struct mem_cgroup *memcg, struct mem_cgroup *root); void __mem_cgroup_count_vm_event(struct mm_struct *mm, enum vm_event_item idx); @@ -376,15 +349,6 @@ static inline void mem_cgroup_end_migration(struct mem_cgroup *memcg, struct page *oldpage, struct page *newpage, bool migration_ok) { } -static inline struct mem_cgroup * -mem_cgroup_iter_cond(struct mem_cgroup *root, - struct mem_cgroup *prev, - struct mem_cgroup_reclaim_cookie *reclaim, - mem_cgroup_iter_filter cond) -{ - /* first call must return non-NULL, second return NULL */ - return (struct mem_cgroup *)(unsigned long)!prev; -} static inline struct mem_cgroup * mem_cgroup_iter(struct mem_cgroup *root, @@ -471,11 +435,10 @@ static inline void mem_cgroup_dec_page_stat(struct page *page, } static inline -enum mem_cgroup_filter_t -mem_cgroup_soft_reclaim_eligible(struct mem_cgroup *memcg, +bool mem_cgroup_soft_reclaim_eligible(struct mem_cgroup *memcg, struct mem_cgroup *root) { - return VISIT; + return false; } static inline void mem_cgroup_split_huge_fixup(struct page *head) diff --git a/mm/memcontrol.c b/mm/memcontrol.c index 916892c2b8e0..65e7bec4b0f0 100644 --- a/mm/memcontrol.c +++ b/mm/memcontrol.c @@ -862,15 +862,6 @@ struct mem_cgroup *try_get_mem_cgroup_from_mm(struct mm_struct *mm) return memcg; } -static enum mem_cgroup_filter_t -mem_cgroup_filter(struct mem_cgroup *memcg, struct mem_cgroup *root, - mem_cgroup_iter_filter cond) -{ - if (!cond) - return VISIT; - return cond(memcg, root); -} - /* * Returns a next (in a pre-order walk) alive memcg (with elevated css * ref. count) or NULL if the whole root's subtree has been visited. 
@@ -878,7 +869,7 @@ mem_cgroup_filter(struct mem_cgroup *memcg, struct mem_cgroup *root, * helper function to be used by mem_cgroup_iter */ static struct mem_cgroup *__mem_cgroup_iter_next(struct mem_cgroup *root, - struct mem_cgroup *last_visited, mem_cgroup_iter_filter cond) + struct mem_cgroup *last_visited) { struct cgroup_subsys_state *prev_css, *next_css; @@ -896,31 +887,11 @@ skip_node: if (next_css) { struct mem_cgroup *mem = mem_cgroup_from_css(next_css); - switch (mem_cgroup_filter(mem, root, cond)) { - case SKIP: + if (css_tryget(&mem->css)) + return mem; + else { prev_css = next_css; goto skip_node; - case SKIP_TREE: - if (mem == root) - return NULL; - /* - * css_rightmost_descendant is not an optimal way to - * skip through a subtree (especially for imbalanced - * trees leaning to right) but that's what we have right - * now. More effective solution would be traversing - * right-up for first non-NULL without calling - * css_next_descendant_pre afterwards. - */ - prev_css = css_rightmost_descendant(next_css); - goto skip_node; - case VISIT: - if (css_tryget(&mem->css)) - return mem; - else { - prev_css = next_css; - goto skip_node; - } - break; } } @@ -984,7 +955,6 @@ static void mem_cgroup_iter_update(struct mem_cgroup_reclaim_iter *iter, * @root: hierarchy root * @prev: previously returned memcg, NULL on first invocation * @reclaim: cookie for shared reclaim walks, NULL for full walks - * @cond: filter for visited nodes, NULL for no filter * * Returns references to children of the hierarchy below @root, or * @root itself, or %NULL after a full round-trip. @@ -997,18 +967,15 @@ static void mem_cgroup_iter_update(struct mem_cgroup_reclaim_iter *iter, * divide up the memcgs in the hierarchy among all concurrent * reclaimers operating on the same zone and priority. */ -struct mem_cgroup *mem_cgroup_iter_cond(struct mem_cgroup *root, +struct mem_cgroup *mem_cgroup_iter(struct mem_cgroup *root, struct mem_cgroup *prev, - struct mem_cgroup_reclaim_cookie *reclaim, - mem_cgroup_iter_filter cond) + struct mem_cgroup_reclaim_cookie *reclaim) { struct mem_cgroup *memcg = NULL; struct mem_cgroup *last_visited = NULL; - if (mem_cgroup_disabled()) { - /* first call must return non-NULL, second return NULL */ - return (struct mem_cgroup *)(unsigned long)!prev; - } + if (mem_cgroup_disabled()) + return NULL; if (!root) root = root_mem_cgroup; @@ -1019,9 +986,7 @@ struct mem_cgroup *mem_cgroup_iter_cond(struct mem_cgroup *root, if (!root->use_hierarchy && root != root_mem_cgroup) { if (prev) goto out_css_put; - if (mem_cgroup_filter(root, root, cond) == VISIT) - return root; - return NULL; + return root; } rcu_read_lock(); @@ -1044,7 +1009,7 @@ struct mem_cgroup *mem_cgroup_iter_cond(struct mem_cgroup *root, last_visited = mem_cgroup_iter_load(iter, root, &seq); } - memcg = __mem_cgroup_iter_next(root, last_visited, cond); + memcg = __mem_cgroup_iter_next(root, last_visited); if (reclaim) { mem_cgroup_iter_update(iter, last_visited, memcg, seq); @@ -1055,11 +1020,7 @@ struct mem_cgroup *mem_cgroup_iter_cond(struct mem_cgroup *root, reclaim->generation = iter->generation; } - /* - * We have finished the whole tree walk or no group has been - * visited because filter told us to skip the root node. 
- */ - if (!memcg && (prev || (cond && !last_visited))) + if (prev && !memcg) goto out_unlock; } out_unlock: @@ -1804,14 +1765,13 @@ int mem_cgroup_select_victim_node(struct mem_cgroup *memcg) * a) it is over its soft limit * b) any parent up the hierarchy is over its soft limit */ -enum mem_cgroup_filter_t -mem_cgroup_soft_reclaim_eligible(struct mem_cgroup *memcg, +bool mem_cgroup_soft_reclaim_eligible(struct mem_cgroup *memcg, struct mem_cgroup *root) { struct mem_cgroup *parent = memcg; if (res_counter_soft_limit_excess(&memcg->res)) - return VISIT; + return true; /* * If any parent up to the root in the hierarchy is over its soft limit @@ -1819,12 +1779,12 @@ mem_cgroup_soft_reclaim_eligible(struct mem_cgroup *memcg, */ while ((parent = parent_mem_cgroup(parent))) { if (res_counter_soft_limit_excess(&parent->res)) - return VISIT; + return true; if (parent == root) break; } - return SKIP; + return false; } static DEFINE_SPINLOCK(memcg_oom_lock); diff --git a/mm/vmscan.c b/mm/vmscan.c index cdd300b81485..17a7134de4d7 100644 --- a/mm/vmscan.c +++ b/mm/vmscan.c @@ -2185,16 +2185,21 @@ __shrink_zone(struct zone *zone, struct scan_control *sc, bool soft_reclaim) .zone = zone, .priority = sc->priority, }; - struct mem_cgroup *memcg = NULL; - mem_cgroup_iter_filter filter = (soft_reclaim) ? - mem_cgroup_soft_reclaim_eligible : NULL; + struct mem_cgroup *memcg; nr_reclaimed = sc->nr_reclaimed; nr_scanned = sc->nr_scanned; - while ((memcg = mem_cgroup_iter_cond(root, memcg, &reclaim, filter))) { + memcg = mem_cgroup_iter(root, NULL, &reclaim); + do { struct lruvec *lruvec; + if (soft_reclaim && + !mem_cgroup_soft_reclaim_eligible(memcg, root)) { + memcg = mem_cgroup_iter(root, memcg, &reclaim); + continue; + } + lruvec = mem_cgroup_zone_lruvec(zone, memcg); shrink_lruvec(lruvec, sc); @@ -2214,7 +2219,8 @@ __shrink_zone(struct zone *zone, struct scan_control *sc, bool soft_reclaim) mem_cgroup_iter_break(root, memcg); break; } - } + memcg = mem_cgroup_iter(root, memcg, &reclaim); + } while (memcg); vmpressure(sc->gfp_mask, sc->target_mem_cgroup, sc->nr_scanned - nr_scanned, -- cgit v1.2.3 From b1aff7fcf86c88472b0a70f15d89d7a4adba07bb Mon Sep 17 00:00:00 2001 From: Andrew Morton Date: Tue, 24 Sep 2013 15:27:38 -0700 Subject: revert "vmscan, memcg: do softlimit reclaim also for targeted reclaim" Revert commit a5b7c87f9207 ("vmscan, memcg: do softlimit reclaim also for targeted reclaim") I merged this prematurely - Michal and Johannes still disagree about the overall design direction and the future remains unclear. 
Cc: Michal Hocko Cc: Johannes Weiner Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/memcontrol.h | 6 ++---- mm/memcontrol.c | 14 +++++--------- mm/vmscan.c | 4 ++-- 3 files changed, 9 insertions(+), 15 deletions(-) (limited to 'include') diff --git a/include/linux/memcontrol.h b/include/linux/memcontrol.h index ef2b9bd7fafa..6054c9f3a5e8 100644 --- a/include/linux/memcontrol.h +++ b/include/linux/memcontrol.h @@ -234,8 +234,7 @@ static inline void mem_cgroup_dec_page_stat(struct page *page, mem_cgroup_update_page_stat(page, idx, -1); } -bool mem_cgroup_soft_reclaim_eligible(struct mem_cgroup *memcg, - struct mem_cgroup *root); +bool mem_cgroup_soft_reclaim_eligible(struct mem_cgroup *memcg); void __mem_cgroup_count_vm_event(struct mm_struct *mm, enum vm_event_item idx); static inline void mem_cgroup_count_vm_event(struct mm_struct *mm, @@ -435,8 +434,7 @@ static inline void mem_cgroup_dec_page_stat(struct page *page, } static inline -bool mem_cgroup_soft_reclaim_eligible(struct mem_cgroup *memcg, - struct mem_cgroup *root) +bool mem_cgroup_soft_reclaim_eligible(struct mem_cgroup *memcg) { return false; } diff --git a/mm/memcontrol.c b/mm/memcontrol.c index 65e7bec4b0f0..47cdc7eb1a6b 100644 --- a/mm/memcontrol.c +++ b/mm/memcontrol.c @@ -1760,13 +1760,11 @@ int mem_cgroup_select_victim_node(struct mem_cgroup *memcg) #endif /* - * A group is eligible for the soft limit reclaim under the given root - * hierarchy if - * a) it is over its soft limit + * A group is eligible for the soft limit reclaim if + * a) it is over its soft limit * b) any parent up the hierarchy is over its soft limit */ -bool mem_cgroup_soft_reclaim_eligible(struct mem_cgroup *memcg, - struct mem_cgroup *root) +bool mem_cgroup_soft_reclaim_eligible(struct mem_cgroup *memcg) { struct mem_cgroup *parent = memcg; @@ -1774,14 +1772,12 @@ bool mem_cgroup_soft_reclaim_eligible(struct mem_cgroup *memcg, return true; /* - * If any parent up to the root in the hierarchy is over its soft limit - * then we have to obey and reclaim from this group as well. + * If any parent up the hierarchy is over its soft limit then we + * have to obey and reclaim from this group as well. */ while ((parent = parent_mem_cgroup(parent))) { if (res_counter_soft_limit_excess(&parent->res)) return true; - if (parent == root) - break; } return false; diff --git a/mm/vmscan.c b/mm/vmscan.c index 17a7134de4d7..0e081cada4ba 100644 --- a/mm/vmscan.c +++ b/mm/vmscan.c @@ -142,7 +142,7 @@ static bool global_reclaim(struct scan_control *sc) static bool mem_cgroup_should_soft_reclaim(struct scan_control *sc) { - return !mem_cgroup_disabled(); + return !mem_cgroup_disabled() && global_reclaim(sc); } #else static bool global_reclaim(struct scan_control *sc) @@ -2195,7 +2195,7 @@ __shrink_zone(struct zone *zone, struct scan_control *sc, bool soft_reclaim) struct lruvec *lruvec; if (soft_reclaim && - !mem_cgroup_soft_reclaim_eligible(memcg, root)) { + !mem_cgroup_soft_reclaim_eligible(memcg)) { memcg = mem_cgroup_iter(root, memcg, &reclaim); continue; } -- cgit v1.2.3 From 0608f43da64a1f1c42507304b5f25bc8b1227aa4 Mon Sep 17 00:00:00 2001 From: Andrew Morton Date: Tue, 24 Sep 2013 15:27:41 -0700 Subject: revert "memcg, vmscan: integrate soft reclaim tighter with zone shrinking code" Revert commit 3b38722efd9f ("memcg, vmscan: integrate soft reclaim tighter with zone shrinking code") I merged this prematurely - Michal and Johannes still disagree about the overall design direction and the future remains unclear. 
Cc: Michal Hocko Cc: Johannes Weiner Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/memcontrol.h | 10 ++- mm/memcontrol.c | 163 +++++++++++++++++++++++++++++++++++++++------ mm/vmscan.c | 62 ++++++++--------- 3 files changed, 175 insertions(+), 60 deletions(-) (limited to 'include') diff --git a/include/linux/memcontrol.h b/include/linux/memcontrol.h index 6054c9f3a5e8..ecc82b37c4cc 100644 --- a/include/linux/memcontrol.h +++ b/include/linux/memcontrol.h @@ -234,7 +234,9 @@ static inline void mem_cgroup_dec_page_stat(struct page *page, mem_cgroup_update_page_stat(page, idx, -1); } -bool mem_cgroup_soft_reclaim_eligible(struct mem_cgroup *memcg); +unsigned long mem_cgroup_soft_limit_reclaim(struct zone *zone, int order, + gfp_t gfp_mask, + unsigned long *total_scanned); void __mem_cgroup_count_vm_event(struct mm_struct *mm, enum vm_event_item idx); static inline void mem_cgroup_count_vm_event(struct mm_struct *mm, @@ -434,9 +436,11 @@ static inline void mem_cgroup_dec_page_stat(struct page *page, } static inline -bool mem_cgroup_soft_reclaim_eligible(struct mem_cgroup *memcg) +unsigned long mem_cgroup_soft_limit_reclaim(struct zone *zone, int order, + gfp_t gfp_mask, + unsigned long *total_scanned) { - return false; + return 0; } static inline void mem_cgroup_split_huge_fixup(struct page *head) diff --git a/mm/memcontrol.c b/mm/memcontrol.c index 852dbec07ce6..1c52ddbc839b 100644 --- a/mm/memcontrol.c +++ b/mm/memcontrol.c @@ -1991,28 +1991,57 @@ static bool mem_cgroup_reclaimable(struct mem_cgroup *memcg, bool noswap) } #endif -/* - * A group is eligible for the soft limit reclaim if - * a) it is over its soft limit - * b) any parent up the hierarchy is over its soft limit - */ -bool mem_cgroup_soft_reclaim_eligible(struct mem_cgroup *memcg) -{ - struct mem_cgroup *parent = memcg; - - if (res_counter_soft_limit_excess(&memcg->res)) - return true; - - /* - * If any parent up the hierarchy is over its soft limit then we - * have to obey and reclaim from this group as well. - */ - while ((parent = parent_mem_cgroup(parent))) { - if (res_counter_soft_limit_excess(&parent->res)) - return true; +static int mem_cgroup_soft_reclaim(struct mem_cgroup *root_memcg, + struct zone *zone, + gfp_t gfp_mask, + unsigned long *total_scanned) +{ + struct mem_cgroup *victim = NULL; + int total = 0; + int loop = 0; + unsigned long excess; + unsigned long nr_scanned; + struct mem_cgroup_reclaim_cookie reclaim = { + .zone = zone, + .priority = 0, + }; + + excess = res_counter_soft_limit_excess(&root_memcg->res) >> PAGE_SHIFT; + + while (1) { + victim = mem_cgroup_iter(root_memcg, victim, &reclaim); + if (!victim) { + loop++; + if (loop >= 2) { + /* + * If we have not been able to reclaim + * anything, it might be because there are + * no reclaimable pages under this hierarchy + */ + if (!total) + break; + /* + * We want to do more targeted reclaim.
+ * excess >> 2 is not too excessive, so we do not + * reclaim too much, nor too little, so we do not keep + * coming back to reclaim from this cgroup + */ + if (total >= (excess >> 2) || + (loop > MEM_CGROUP_MAX_RECLAIM_LOOPS)) + break; + } + continue; + } + if (!mem_cgroup_reclaimable(victim, false)) + continue; + total += mem_cgroup_shrink_node_zone(victim, gfp_mask, false, + zone, &nr_scanned); + *total_scanned += nr_scanned; + if (!res_counter_soft_limit_excess(&root_memcg->res)) + break; } - - return false; + mem_cgroup_iter_break(root_memcg, victim); + return total; } static DEFINE_SPINLOCK(memcg_oom_lock); @@ -4761,6 +4790,98 @@ static int mem_cgroup_resize_memsw_limit(struct mem_cgroup *memcg, return ret; } +unsigned long mem_cgroup_soft_limit_reclaim(struct zone *zone, int order, + gfp_t gfp_mask, + unsigned long *total_scanned) +{ + unsigned long nr_reclaimed = 0; + struct mem_cgroup_per_zone *mz, *next_mz = NULL; + unsigned long reclaimed; + int loop = 0; + struct mem_cgroup_tree_per_zone *mctz; + unsigned long long excess; + unsigned long nr_scanned; + + if (order > 0) + return 0; + + mctz = soft_limit_tree_node_zone(zone_to_nid(zone), zone_idx(zone)); + /* + * This loop can run a while, especially if mem cgroups continuously + * keep exceeding their soft limit and putting the system under + * pressure + */ + do { + if (next_mz) + mz = next_mz; + else + mz = mem_cgroup_largest_soft_limit_node(mctz); + if (!mz) + break; + + nr_scanned = 0; + reclaimed = mem_cgroup_soft_reclaim(mz->memcg, zone, + gfp_mask, &nr_scanned); + nr_reclaimed += reclaimed; + *total_scanned += nr_scanned; + spin_lock(&mctz->lock); + + /* + * If we failed to reclaim anything from this memory cgroup + * it is time to move on to the next cgroup + */ + next_mz = NULL; + if (!reclaimed) { + do { + /* + * Loop until we find yet another one. + * + * By the time we get the soft_limit lock + * again, someone might have added the + * group back on the RB tree. Iterate to + * make sure we get a different mem. + * mem_cgroup_largest_soft_limit_node returns + * NULL if no other cgroup is present on + * the tree + */ + next_mz = + __mem_cgroup_largest_soft_limit_node(mctz); + if (next_mz == mz) + css_put(&next_mz->memcg->css); + else /* next_mz == NULL or other memcg */ + break; + } while (1); + } + __mem_cgroup_remove_exceeded(mz->memcg, mz, mctz); + excess = res_counter_soft_limit_excess(&mz->memcg->res); + /* + * One school of thought says that we should not add + * back the node to the tree if reclaim returns 0. + * But our reclaim could return 0, simply because, due + * to priority, we are exposing a smaller subset of + * memory to reclaim from. Consider this as a longer + * term TODO. + */ + /* If excess == 0, no tree ops */ + __mem_cgroup_insert_exceeded(mz->memcg, mz, mctz, excess); + spin_unlock(&mctz->lock); + css_put(&mz->memcg->css); + loop++; + /* + * Could not reclaim anything and there are no more + * mem cgroups to try or we seem to be looping without + * reclaiming anything.
+ */ + if (!nr_reclaimed && + (next_mz == NULL || + loop > MEM_CGROUP_MAX_SOFT_LIMIT_RECLAIM_LOOPS)) + break; + } while (!nr_reclaimed); + if (next_mz) + css_put(&next_mz->memcg->css); + return nr_reclaimed; +} + /** * mem_cgroup_force_empty_list - clears LRU of a group * @memcg: group to clear diff --git a/mm/vmscan.c b/mm/vmscan.c index 0e081cada4ba..beb35778c69f 100644 --- a/mm/vmscan.c +++ b/mm/vmscan.c @@ -139,21 +139,11 @@ static bool global_reclaim(struct scan_control *sc) { return !sc->target_mem_cgroup; } - -static bool mem_cgroup_should_soft_reclaim(struct scan_control *sc) -{ - return !mem_cgroup_disabled() && global_reclaim(sc); -} #else static bool global_reclaim(struct scan_control *sc) { return true; } - -static bool mem_cgroup_should_soft_reclaim(struct scan_control *sc) -{ - return false; -} #endif unsigned long zone_reclaimable_pages(struct zone *zone) @@ -2174,8 +2164,7 @@ static inline bool should_continue_reclaim(struct zone *zone, } } -static void -__shrink_zone(struct zone *zone, struct scan_control *sc, bool soft_reclaim) +static void shrink_zone(struct zone *zone, struct scan_control *sc) { unsigned long nr_reclaimed, nr_scanned; @@ -2194,12 +2183,6 @@ __shrink_zone(struct zone *zone, struct scan_control *sc, bool soft_reclaim) do { struct lruvec *lruvec; - if (soft_reclaim && - !mem_cgroup_soft_reclaim_eligible(memcg)) { - memcg = mem_cgroup_iter(root, memcg, &reclaim); - continue; - } - lruvec = mem_cgroup_zone_lruvec(zone, memcg); shrink_lruvec(lruvec, sc); @@ -2230,24 +2213,6 @@ __shrink_zone(struct zone *zone, struct scan_control *sc, bool soft_reclaim) sc->nr_scanned - nr_scanned, sc)); } - -static void shrink_zone(struct zone *zone, struct scan_control *sc) -{ - bool do_soft_reclaim = mem_cgroup_should_soft_reclaim(sc); - unsigned long nr_scanned = sc->nr_scanned; - - __shrink_zone(zone, sc, do_soft_reclaim); - - /* - * No group is over the soft limit or those that are do not have - * pages in the zone we are reclaiming so we have to reclaim everybody - */ - if (do_soft_reclaim && (sc->nr_scanned == nr_scanned)) { - __shrink_zone(zone, sc, false); - return; - } -} - /* Returns true if compaction should go ahead for a high-order request */ static inline bool compaction_ready(struct zone *zone, struct scan_control *sc) { @@ -2309,6 +2274,8 @@ static bool shrink_zones(struct zonelist *zonelist, struct scan_control *sc) { struct zoneref *z; struct zone *zone; + unsigned long nr_soft_reclaimed; + unsigned long nr_soft_scanned; bool aborted_reclaim = false; /* @@ -2348,6 +2315,18 @@ static bool shrink_zones(struct zonelist *zonelist, struct scan_control *sc) continue; } } + /* + * This steals pages from memory cgroups over softlimit + * and returns the number of reclaimed pages and + * scanned pages. This works for global memory pressure + * and balancing, not for a memcg's limit. + */ + nr_soft_scanned = 0; + nr_soft_reclaimed = mem_cgroup_soft_limit_reclaim(zone, + sc->order, sc->gfp_mask, + &nr_soft_scanned); + sc->nr_reclaimed += nr_soft_reclaimed; + sc->nr_scanned += nr_soft_scanned; /* need some check for avoid more shrink_zone() */ } @@ -2941,6 +2920,8 @@ static unsigned long balance_pgdat(pg_data_t *pgdat, int order, { int i; int end_zone = 0; /* Inclusive. 
0 = ZONE_DMA */ + unsigned long nr_soft_reclaimed; + unsigned long nr_soft_scanned; struct scan_control sc = { .gfp_mask = GFP_KERNEL, .priority = DEF_PRIORITY, @@ -3055,6 +3036,15 @@ static unsigned long balance_pgdat(pg_data_t *pgdat, int order, sc.nr_scanned = 0; + nr_soft_scanned = 0; + /* + * Call soft limit reclaim before calling shrink_zone. + */ + nr_soft_reclaimed = mem_cgroup_soft_limit_reclaim(zone, + order, sc.gfp_mask, + &nr_soft_scanned); + sc.nr_reclaimed += nr_soft_reclaimed; + /* * There should be no need to raise the scanning * priority if enough pages are already being scanned -- cgit v1.2.3 From b0b8c960ffcc5bfc82d1a09ad931673e3a36c15a Mon Sep 17 00:00:00 2001 From: Rob Herring Date: Wed, 4 Sep 2013 10:52:57 -0500 Subject: of: clean-up ifdefs in of_irq.h Much of of_irq.h is needlessly ifdef'ed. Clean this up and minimize the amount of ifdef'ed code. This fixes some build warnings when CONFIG_OF is not enabled (seen on i386 and x86_64): include/linux/of_irq.h:82:7: warning: 'struct device_node' declared inside parameter list [enabled by default] include/linux/of_irq.h:82:7: warning: its scope is only this definition or declaration, which is probably not what you want [enabled by default] include/linux/of_irq.h:87:47: warning: 'struct device_node' declared inside parameter list [enabled by default] Compile tested on i386, sparc and arm. Reported-by: Randy Dunlap Cc: Grant Likely Signed-off-by: Rob Herring --- include/linux/of_irq.h | 20 ++++++++------------ 1 file changed, 8 insertions(+), 12 deletions(-) (limited to 'include') diff --git a/include/linux/of_irq.h b/include/linux/of_irq.h index 535cecf1e02f..fcd63baee5f2 100644 --- a/include/linux/of_irq.h +++ b/include/linux/of_irq.h @@ -1,8 +1,6 @@ #ifndef __OF_IRQ_H #define __OF_IRQ_H -#if defined(CONFIG_OF) -struct of_irq; #include #include #include @@ -10,14 +8,6 @@ struct of_irq; #include #include -/* - * irq_of_parse_and_map() is used by all OF enabled platforms; but SPARC - * implements it differently. However, the prototype is the same for all, - * so declare it here regardless of the CONFIG_OF_IRQ setting. - */ -extern unsigned int irq_of_parse_and_map(struct device_node *node, int index); - -#if defined(CONFIG_OF_IRQ) /** * of_irq - container for device_node/irq_specifier pair for an irq controller * @controller: pointer to interrupt controller device tree node @@ -71,11 +61,17 @@ extern int of_irq_to_resource(struct device_node *dev, int index, extern int of_irq_count(struct device_node *dev); extern int of_irq_to_resource_table(struct device_node *dev, struct resource *res, int nr_irqs); -extern struct device_node *of_irq_find_parent(struct device_node *child); extern void of_irq_init(const struct of_device_id *matches); -#endif /* CONFIG_OF_IRQ */ +#if defined(CONFIG_OF) +/* + * irq_of_parse_and_map() is used by all OF enabled platforms; but SPARC + * implements it differently. However, the prototype is the same for all, + * so declare it here regardless of the CONFIG_OF_IRQ setting.
+ */ +extern unsigned int irq_of_parse_and_map(struct device_node *node, int index); +extern struct device_node *of_irq_find_parent(struct device_node *child); #else /* !CONFIG_OF */ static inline unsigned int irq_of_parse_and_map(struct device_node *dev, -- cgit v1.2.3 From 19872d20c890073c5207d9e02bb8f14d451a11eb Mon Sep 17 00:00:00 2001 From: David Herrmann Date: Mon, 9 Sep 2013 18:33:54 +0200 Subject: HID: uhid: allocate static minor udev has this nice feature of creating "dead" /dev/ device-nodes if it finds a devnode: modalias. Once the node is accessed, the kernel automatically loads the module that provides the node. However, this requires udev to know the major:minor code to use for the node. This feature was introduced by: commit 578454ff7eab61d13a26b568f99a89a2c9edc881 Author: Kay Sievers Date: Thu May 20 18:07:20 2010 +0200 driver core: add devname module aliases to allow module on-demand auto-loading However, uhid uses dynamic minor numbers so this doesn't actually work. We need to load uhid to know which minor it's going to use. Hence, allocate a static minor (just like uinput does) and we're good to go. Reported-by: Tom Gundersen Signed-off-by: David Herrmann Signed-off-by: Jiri Kosina --- drivers/hid/uhid.c | 3 ++- include/linux/miscdevice.h | 1 + 2 files changed, 3 insertions(+), 1 deletion(-) (limited to 'include') diff --git a/drivers/hid/uhid.c b/drivers/hid/uhid.c index 5bf2fb785844..93b00d76374c 100644 --- a/drivers/hid/uhid.c +++ b/drivers/hid/uhid.c @@ -615,7 +615,7 @@ static const struct file_operations uhid_fops = { static struct miscdevice uhid_misc = { .fops = &uhid_fops, - .minor = MISC_DYNAMIC_MINOR, + .minor = UHID_MINOR, .name = UHID_NAME, }; @@ -634,4 +634,5 @@ module_exit(uhid_exit); MODULE_LICENSE("GPL"); MODULE_AUTHOR("David Herrmann "); MODULE_DESCRIPTION("User-space I/O driver support for HID subsystem"); +MODULE_ALIAS_MISCDEV(UHID_MINOR); MODULE_ALIAS("devname:" UHID_NAME); diff --git a/include/linux/miscdevice.h b/include/linux/miscdevice.h index 09c2300ddb37..cb358355ef43 100644 --- a/include/linux/miscdevice.h +++ b/include/linux/miscdevice.h @@ -45,6 +45,7 @@ #define MAPPER_CTRL_MINOR 236 #define LOOP_CTRL_MINOR 237 #define VHOST_NET_MINOR 238 +#define UHID_MINOR 239 #define MISC_DYNAMIC_MINOR 255 struct device; -- cgit v1.2.3 From 5bc2afc2b53fc73f154e6344cd898585628e6d27 Mon Sep 17 00:00:00 2001 From: Trond Myklebust Date: Mon, 23 Sep 2013 18:01:28 -0400 Subject: NFSv4: Honour the 'opened' parameter in the atomic_open() filesystem method Determine if we've created a new file by examining the directory change attribute and/or the O_EXCL flag. This fixes a regression when doing a non-exclusive create of a new file. If the FILE_CREATED flag is not set, the atomic_open() command will perform full file access permissions checks instead of just checking for MAY_OPEN. 
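For illustration, the creation test described above reduces to roughly the following check on the OPEN reply (a hedged sketch in plain C with made-up structure and field names, not the patch's actual code; the real change is in the nfs4proc.c hunk below):

#include <fcntl.h>

/* Hypothetical stand-in for the bits of the OPEN result we care about */
struct open_reply {
	unsigned long long cinfo_before;	/* directory change attr before OPEN */
	unsigned long long cinfo_after;		/* directory change attr after OPEN */
};

static int open_created_file(int open_flags, const struct open_reply *res)
{
	if (!(open_flags & O_CREAT))
		return 0;	/* we never asked to create anything */
	if (open_flags & O_EXCL)
		return 1;	/* exclusive create: success implies a new file */
	/* Non-exclusive create: if the directory change attribute moved
	 * across the OPEN, a new entry must have been added, so the
	 * FILE_CREATED flag should be reported to the VFS.
	 */
	return res->cinfo_before != res->cinfo_after;
}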
Signed-off-by: Trond Myklebust --- fs/nfs/dir.c | 2 +- fs/nfs/nfs4file.c | 3 ++- fs/nfs/nfs4proc.c | 26 +++++++++++++++++++------- include/linux/nfs_xdr.h | 3 ++- 4 files changed, 24 insertions(+), 10 deletions(-) (limited to 'include') diff --git a/fs/nfs/dir.c b/fs/nfs/dir.c index 854a8f05a610..02b0df769e2d 100644 --- a/fs/nfs/dir.c +++ b/fs/nfs/dir.c @@ -1458,7 +1458,7 @@ int nfs_atomic_open(struct inode *dir, struct dentry *dentry, trace_nfs_atomic_open_enter(dir, ctx, open_flags); nfs_block_sillyrename(dentry->d_parent); - inode = NFS_PROTO(dir)->open_context(dir, ctx, open_flags, &attr); + inode = NFS_PROTO(dir)->open_context(dir, ctx, open_flags, &attr, opened); nfs_unblock_sillyrename(dentry->d_parent); if (IS_ERR(inode)) { err = PTR_ERR(inode); diff --git a/fs/nfs/nfs4file.c b/fs/nfs/nfs4file.c index e5b804dd944c..77efaf15ec90 100644 --- a/fs/nfs/nfs4file.c +++ b/fs/nfs/nfs4file.c @@ -19,6 +19,7 @@ nfs4_file_open(struct inode *inode, struct file *filp) struct inode *dir; unsigned openflags = filp->f_flags; struct iattr attr; + int opened = 0; int err; /* @@ -55,7 +56,7 @@ nfs4_file_open(struct inode *inode, struct file *filp) nfs_wb_all(inode); } - inode = NFS_PROTO(dir)->open_context(dir, ctx, openflags, &attr); + inode = NFS_PROTO(dir)->open_context(dir, ctx, openflags, &attr, &opened); if (IS_ERR(inode)) { err = PTR_ERR(inode); switch (err) { diff --git a/fs/nfs/nfs4proc.c b/fs/nfs/nfs4proc.c index 989bb9d3074d..488ef9b5c51a 100644 --- a/fs/nfs/nfs4proc.c +++ b/fs/nfs/nfs4proc.c @@ -912,6 +912,7 @@ struct nfs4_opendata { struct iattr attrs; unsigned long timestamp; unsigned int rpc_done : 1; + unsigned int file_created : 1; unsigned int is_recover : 1; int rpc_status; int cancelled; @@ -1946,8 +1947,13 @@ static int _nfs4_proc_open(struct nfs4_opendata *data) nfs_fattr_map_and_free_names(server, &data->f_attr); - if (o_arg->open_flags & O_CREAT) + if (o_arg->open_flags & O_CREAT) { update_changeattr(dir, &o_res->cinfo); + if (o_arg->open_flags & O_EXCL) + data->file_created = 1; + else if (o_res->cinfo.before != o_res->cinfo.after) + data->file_created = 1; + } if ((o_res->rflags & NFS4_OPEN_RESULT_LOCKTYPE_POSIX) == 0) server->caps &= ~NFS_CAP_POSIX_LOCK; if(o_res->rflags & NFS4_OPEN_RESULT_CONFIRM) { @@ -2191,7 +2197,8 @@ static int _nfs4_do_open(struct inode *dir, struct nfs_open_context *ctx, int flags, struct iattr *sattr, - struct nfs4_label *label) + struct nfs4_label *label, + int *opened) { struct nfs4_state_owner *sp; struct nfs4_state *state = NULL; @@ -2261,6 +2268,8 @@ static int _nfs4_do_open(struct inode *dir, nfs_setsecurity(state->inode, opendata->o_res.f_attr, olabel); } } + if (opendata->file_created) + *opened |= FILE_CREATED; if (pnfs_use_threshold(ctx_th, opendata->f_attr.mdsthreshold, server)) *ctx_th = opendata->f_attr.mdsthreshold; @@ -2289,7 +2298,8 @@ static struct nfs4_state *nfs4_do_open(struct inode *dir, struct nfs_open_context *ctx, int flags, struct iattr *sattr, - struct nfs4_label *label) + struct nfs4_label *label, + int *opened) { struct nfs_server *server = NFS_SERVER(dir); struct nfs4_exception exception = { }; @@ -2297,7 +2307,7 @@ static struct nfs4_state *nfs4_do_open(struct inode *dir, int status; do { - status = _nfs4_do_open(dir, ctx, flags, sattr, label); + status = _nfs4_do_open(dir, ctx, flags, sattr, label, opened); res = ctx->state; trace_nfs4_open_file(ctx, flags, status); if (status == 0) @@ -2659,7 +2669,8 @@ out: } static struct inode * -nfs4_atomic_open(struct inode *dir, struct nfs_open_context *ctx, int open_flags, struct 
iattr *attr) +nfs4_atomic_open(struct inode *dir, struct nfs_open_context *ctx, + int open_flags, struct iattr *attr, int *opened) { struct nfs4_state *state; struct nfs4_label l = {0, 0, 0, NULL}, *label = NULL; @@ -2667,7 +2678,7 @@ nfs4_atomic_open(struct inode *dir, struct nfs_open_context *ctx, int open_flags label = nfs4_label_init_security(dir, ctx->dentry, attr, &l); /* Protect against concurrent sillydeletes */ - state = nfs4_do_open(dir, ctx, open_flags, attr, label); + state = nfs4_do_open(dir, ctx, open_flags, attr, label, opened); nfs4_label_release_security(label); @@ -3332,6 +3343,7 @@ nfs4_proc_create(struct inode *dir, struct dentry *dentry, struct iattr *sattr, struct nfs4_label l, *ilabel = NULL; struct nfs_open_context *ctx; struct nfs4_state *state; + int opened = 0; int status = 0; ctx = alloc_nfs_open_context(dentry, FMODE_READ); @@ -3341,7 +3353,7 @@ nfs4_proc_create(struct inode *dir, struct dentry *dentry, struct iattr *sattr, ilabel = nfs4_label_init_security(dir, dentry, sattr, &l); sattr->ia_mode &= ~current_umask(); - state = nfs4_do_open(dir, ctx, flags, sattr, ilabel); + state = nfs4_do_open(dir, ctx, flags, sattr, ilabel, &opened); if (IS_ERR(state)) { status = PTR_ERR(state); goto out; } diff --git a/include/linux/nfs_xdr.h b/include/linux/nfs_xdr.h index 01fd84b566f7..49f52c8f4422 100644 --- a/include/linux/nfs_xdr.h +++ b/include/linux/nfs_xdr.h @@ -1455,7 +1455,8 @@ struct nfs_rpc_ops { struct inode * (*open_context) (struct inode *dir, struct nfs_open_context *ctx, int open_flags, - struct iattr *iattr); + struct iattr *iattr, + int *); int (*have_delegation)(struct inode *, fmode_t); int (*return_delegation)(struct inode *); struct nfs_client *(*alloc_client) (const struct nfs_client_initdata *); -- cgit v1.2.3 From 2bedea8f26c92e2610f2f67889144990749461e0 Mon Sep 17 00:00:00 2001 From: Arend van Spriel Date: Wed, 25 Sep 2013 12:11:02 +0200 Subject: bcma: make bcma_core_pci_{up,down}() callable from atomic context This patch removes the bcma_core_pci_power_save() call from the bcma_core_pci_{up,down}() functions as it tries to schedule, thus requiring them to be called from non-atomic context. The function bcma_core_pci_power_save() is now exported so the calling module can explicitly use it in non-atomic context. This fixes the 'scheduling while atomic' issue reported by Tod Jackson and Joe Perches. [ 13.210710] BUG: scheduling while atomic: dhcpcd/1800/0x00000202 [ 13.210718] Modules linked in: brcmsmac nouveau coretemp kvm_intel kvm cordic brcmutil bcma dell_wmi atl1c ttm mxm_wmi wmi [ 13.210756] CPU: 2 PID: 1800 Comm: dhcpcd Not tainted 3.11.0-wl #1 [ 13.210762] Hardware name: Alienware M11x R2/M11x R2, BIOS A04 11/23/2010 [ 13.210767] ffff880177c92c40 ffff880170fd1948 ffffffff8169af5b 0000000000000007 [ 13.210777] ffff880170fd1ab0 ffff880170fd1958 ffffffff81697ee2 ffff880170fd19d8 [ 13.210785] ffffffff816a19f5 00000000000f4240 000000000000d080 ffff880170fd1fd8 [ 13.210794] Call Trace: [ 13.210813] [] dump_stack+0x4f/0x84 [ 13.210826] [] __schedule_bug+0x43/0x51 [ 13.210837] [] __schedule+0x6e5/0x810 [ 13.210845] [] schedule+0x24/0x70 [ 13.210855] [] schedule_hrtimeout_range_clock+0x10c/0x150 [ 13.210867] [] ? update_rmtp+0x60/0x60 [ 13.210877] [] ?
hrtimer_start_range_ns+0xf/0x20 [ 13.210887] [] schedule_hrtimeout_range+0xe/0x10 [ 13.210897] [] usleep_range+0x3b/0x40 [ 13.210910] [] bcma_pcie_mdio_set_phy.isra.3+0x4f/0x80 [bcma] [ 13.210921] [] bcma_pcie_mdio_write.isra.4+0xbf/0xd0 [bcma] [ 13.210932] [] bcma_pcie_mdio_writeread.isra.6.constprop.13+0x18/0x30 [bcma] [ 13.210942] [] bcma_core_pci_power_save+0x3e/0x80 [bcma] [ 13.210953] [] bcma_core_pci_up+0x2d/0x60 [bcma] [ 13.210975] [] brcms_c_up+0xfc/0x430 [brcmsmac] [ 13.210989] [] brcms_up+0x1d/0x20 [brcmsmac] [ 13.211003] [] brcms_ops_start+0x298/0x340 [brcmsmac] [ 13.211020] [] ? cfg80211_netdev_notifier_call+0xd2/0x5f0 [ 13.211030] [] ? packet_notifier+0xad/0x1d0 [ 13.211064] [] ieee80211_do_open+0x325/0xf80 [ 13.211076] [] ? __raw_notifier_call_chain+0x9/0x10 [ 13.211086] [] ieee80211_open+0x71/0x80 [ 13.211101] [] __dev_open+0x87/0xe0 [ 13.211109] [] __dev_change_flags+0x9c/0x180 [ 13.211117] [] dev_change_flags+0x23/0x70 [ 13.211127] [] devinet_ioctl+0x5b8/0x6a0 [ 13.211136] [] inet_ioctl+0x75/0x90 [ 13.211147] [] sock_do_ioctl+0x2b/0x70 [ 13.211155] [] sock_ioctl+0x71/0x2a0 [ 13.211169] [] do_vfs_ioctl+0x87/0x520 [ 13.211180] [] ? ____fput+0x9/0x10 [ 13.211198] [] ? task_work_run+0x9c/0xd0 [ 13.211202] [] SyS_ioctl+0x91/0xb0 [ 13.211208] [] system_call_fastpath+0x16/0x1b [ 13.211217] NOHZ: local_softirq_pending 202 The issue was introduced in v3.11 kernel by following commit: commit aa51e598d04c6acf5477934cd6383f5a17ce9029 Author: Hauke Mehrtens Date: Sat Aug 24 00:32:31 2013 +0200 brcmsmac: use bcma PCIe up and down functions replace the calls to bcma_core_pci_extend_L1timer() by calls to the newly introduced bcma_core_pci_up() and bcma_core_pci_down() Signed-off-by: Hauke Mehrtens Cc: Arend van Spriel Signed-off-by: John W. Linville This fix has been discussed with Hauke Mehrtens [1], selecting option 3), and is intended for v3.12. Ref: [1] http://mid.gmane.org/5239B12D.3040206@hauke-m.de Cc: # 3.11.x Cc: Tod Jackson Cc: Joe Perches Cc: Rafal Milecki Cc: Hauke Mehrtens Reviewed-by: Hante Meuleman Signed-off-by: Arend van Spriel Signed-off-by: John W. Linville --- drivers/bcma/driver_pci.c | 49 +++++++++++++++++++----------------- include/linux/bcma/bcma_driver_pci.h | 1 + 2 files changed, 27 insertions(+), 23 deletions(-) (limited to 'include') diff --git a/drivers/bcma/driver_pci.c b/drivers/bcma/driver_pci.c index c9fd6943ce45..50329d1057ed 100644 --- a/drivers/bcma/driver_pci.c +++ b/drivers/bcma/driver_pci.c @@ -210,25 +210,6 @@ static void bcma_core_pci_config_fixup(struct bcma_drv_pci *pc) } } -static void bcma_core_pci_power_save(struct bcma_drv_pci *pc, bool up) -{ - u16 data; - - if (pc->core->id.rev >= 15 && pc->core->id.rev <= 20) { - data = up ? 0x74 : 0x7C; - bcma_pcie_mdio_writeread(pc, BCMA_CORE_PCI_MDIO_BLK1, - BCMA_CORE_PCI_MDIO_BLK1_MGMT1, 0x7F64); - bcma_pcie_mdio_writeread(pc, BCMA_CORE_PCI_MDIO_BLK1, - BCMA_CORE_PCI_MDIO_BLK1_MGMT3, data); - } else if (pc->core->id.rev >= 21 && pc->core->id.rev <= 22) { - data = up ? 0x75 : 0x7D; - bcma_pcie_mdio_writeread(pc, BCMA_CORE_PCI_MDIO_BLK1, - BCMA_CORE_PCI_MDIO_BLK1_MGMT1, 0x7E65); - bcma_pcie_mdio_writeread(pc, BCMA_CORE_PCI_MDIO_BLK1, - BCMA_CORE_PCI_MDIO_BLK1_MGMT3, data); - } -} - /************************************************** * Init.
**************************************************/ @@ -255,6 +236,32 @@ void bcma_core_pci_init(struct bcma_drv_pci *pc) bcma_core_pci_clientmode_init(pc); } +void bcma_core_pci_power_save(struct bcma_bus *bus, bool up) +{ + struct bcma_drv_pci *pc; + u16 data; + + if (bus->hosttype != BCMA_HOSTTYPE_PCI) + return; + + pc = &bus->drv_pci[0]; + + if (pc->core->id.rev >= 15 && pc->core->id.rev <= 20) { + data = up ? 0x74 : 0x7C; + bcma_pcie_mdio_writeread(pc, BCMA_CORE_PCI_MDIO_BLK1, + BCMA_CORE_PCI_MDIO_BLK1_MGMT1, 0x7F64); + bcma_pcie_mdio_writeread(pc, BCMA_CORE_PCI_MDIO_BLK1, + BCMA_CORE_PCI_MDIO_BLK1_MGMT3, data); + } else if (pc->core->id.rev >= 21 && pc->core->id.rev <= 22) { + data = up ? 0x75 : 0x7D; + bcma_pcie_mdio_writeread(pc, BCMA_CORE_PCI_MDIO_BLK1, + BCMA_CORE_PCI_MDIO_BLK1_MGMT1, 0x7E65); + bcma_pcie_mdio_writeread(pc, BCMA_CORE_PCI_MDIO_BLK1, + BCMA_CORE_PCI_MDIO_BLK1_MGMT3, data); + } +} +EXPORT_SYMBOL_GPL(bcma_core_pci_power_save); + int bcma_core_pci_irq_ctl(struct bcma_drv_pci *pc, struct bcma_device *core, bool enable) { @@ -310,8 +317,6 @@ void bcma_core_pci_up(struct bcma_bus *bus) pc = &bus->drv_pci[0]; - bcma_core_pci_power_save(pc, true); - bcma_core_pci_extend_L1timer(pc, true); } EXPORT_SYMBOL_GPL(bcma_core_pci_up); @@ -326,7 +331,5 @@ void bcma_core_pci_down(struct bcma_bus *bus) pc = &bus->drv_pci[0]; bcma_core_pci_extend_L1timer(pc, false); - - bcma_core_pci_power_save(pc, false); } EXPORT_SYMBOL_GPL(bcma_core_pci_down); diff --git a/include/linux/bcma/bcma_driver_pci.h b/include/linux/bcma/bcma_driver_pci.h index d66033f418c9..0333e605ea0d 100644 --- a/include/linux/bcma/bcma_driver_pci.h +++ b/include/linux/bcma/bcma_driver_pci.h @@ -242,6 +242,7 @@ extern int bcma_core_pci_irq_ctl(struct bcma_drv_pci *pc, struct bcma_device *core, bool enable); extern void bcma_core_pci_up(struct bcma_bus *bus); extern void bcma_core_pci_down(struct bcma_bus *bus); +extern void bcma_core_pci_power_save(struct bcma_bus *bus, bool up); extern int bcma_core_pci_pcibios_map_irq(const struct pci_dev *dev); extern int bcma_core_pci_plat_dev_init(struct pci_dev *dev); -- cgit v1.2.3 From 3a4916050ba2e0f1d114ef540abdf02b2b173e61 Mon Sep 17 00:00:00 2001 From: "K. Y. Srinivasan" Date: Fri, 6 Sep 2013 11:49:56 -0700 Subject: Drivers: hv: util: Correctly support ws2008R2 and earlier The current code does not correctly negotiate the version numbers for the util driver when hosted on earlier hosts. The version numbers presented by this driver were not compatible with the version numbers supported by Windows Server 2008. Fix this problem. I would like to thank Olaf Hering (ohering@suse.com) for identifying the problem. Reported-by: Olaf Hering Signed-off-by: K. Y. 
Srinivasan Signed-off-by: Greg Kroah-Hartman --- drivers/hv/hv_kvp.c | 38 ++++++++++++++++++-------- drivers/hv/hv_snapshot.c | 6 ++-- drivers/hv/hv_util.c | 71 ++++++++++++++++++++++++++++++++++++------------ include/linux/hyperv.h | 7 +++-- 4 files changed, 88 insertions(+), 34 deletions(-) (limited to 'include') diff --git a/drivers/hv/hv_kvp.c b/drivers/hv/hv_kvp.c index 28b03325b872..09988b289622 100644 --- a/drivers/hv/hv_kvp.c +++ b/drivers/hv/hv_kvp.c @@ -32,13 +32,17 @@ /* * Pre win8 version numbers used in ws2008 and ws 2008 r2 (win7) */ +#define WS2008_SRV_MAJOR 1 +#define WS2008_SRV_MINOR 0 +#define WS2008_SRV_VERSION (WS2008_SRV_MAJOR << 16 | WS2008_SRV_MINOR) + #define WIN7_SRV_MAJOR 3 #define WIN7_SRV_MINOR 0 -#define WIN7_SRV_MAJOR_MINOR (WIN7_SRV_MAJOR << 16 | WIN7_SRV_MINOR) +#define WIN7_SRV_VERSION (WIN7_SRV_MAJOR << 16 | WIN7_SRV_MINOR) #define WIN8_SRV_MAJOR 4 #define WIN8_SRV_MINOR 0 -#define WIN8_SRV_MAJOR_MINOR (WIN8_SRV_MAJOR << 16 | WIN8_SRV_MINOR) +#define WIN8_SRV_VERSION (WIN8_SRV_MAJOR << 16 | WIN8_SRV_MINOR) /* * Global state maintained for transaction that is being processed. @@ -587,6 +591,8 @@ void hv_kvp_onchannelcallback(void *context) struct icmsg_hdr *icmsghdrp; struct icmsg_negotiate *negop = NULL; + int util_fw_version; + int kvp_srv_version; if (kvp_transaction.active) { /* @@ -606,17 +612,26 @@ void hv_kvp_onchannelcallback(void *context) if (icmsghdrp->icmsgtype == ICMSGTYPE_NEGOTIATE) { /* - * We start with win8 version and if the host cannot - * support that we use the previous version. + * Based on the host, select appropriate + * framework and service versions we will + * negotiate. */ - if (vmbus_prep_negotiate_resp(icmsghdrp, negop, - recv_buffer, UTIL_FW_MAJOR_MINOR, - WIN8_SRV_MAJOR_MINOR)) - goto done; - + switch (vmbus_proto_version) { + case (VERSION_WS2008): + util_fw_version = UTIL_WS2K8_FW_VERSION; + kvp_srv_version = WS2008_SRV_VERSION; + break; + case (VERSION_WIN7): + util_fw_version = UTIL_FW_VERSION; + kvp_srv_version = WIN7_SRV_VERSION; + break; + default: + util_fw_version = UTIL_FW_VERSION; + kvp_srv_version = WIN8_SRV_VERSION; + } vmbus_prep_negotiate_resp(icmsghdrp, negop, - recv_buffer, UTIL_FW_MAJOR_MINOR, - WIN7_SRV_MAJOR_MINOR); + recv_buffer, util_fw_version, + kvp_srv_version); } else { kvp_msg = (struct hv_kvp_msg *)&recv_buffer[ @@ -649,7 +664,6 @@ void hv_kvp_onchannelcallback(void *context) return; } -done: icmsghdrp->icflags = ICMSGHDRFLAG_TRANSACTION | ICMSGHDRFLAG_RESPONSE; diff --git a/drivers/hv/hv_snapshot.c b/drivers/hv/hv_snapshot.c index e4572f3f2834..0c3546224376 100644 --- a/drivers/hv/hv_snapshot.c +++ b/drivers/hv/hv_snapshot.c @@ -26,7 +26,7 @@ #define VSS_MAJOR 5 #define VSS_MINOR 0 -#define VSS_MAJOR_MINOR (VSS_MAJOR << 16 | VSS_MINOR) +#define VSS_VERSION (VSS_MAJOR << 16 | VSS_MINOR) @@ -190,8 +190,8 @@ void hv_vss_onchannelcallback(void *context) if (icmsghdrp->icmsgtype == ICMSGTYPE_NEGOTIATE) { vmbus_prep_negotiate_resp(icmsghdrp, negop, - recv_buffer, UTIL_FW_MAJOR_MINOR, - VSS_MAJOR_MINOR); + recv_buffer, UTIL_FW_VERSION, + VSS_VERSION); } else { vss_msg = (struct hv_vss_msg *)&recv_buffer[ sizeof(struct vmbuspipe_hdr) + diff --git a/drivers/hv/hv_util.c b/drivers/hv/hv_util.c index cb82233541b1..273e3ddb3a20 100644 --- a/drivers/hv/hv_util.c +++ b/drivers/hv/hv_util.c @@ -28,17 +28,32 @@ #include #include -#define SHUTDOWN_MAJOR 3 -#define SHUTDOWN_MINOR 0 -#define SHUTDOWN_MAJOR_MINOR (SHUTDOWN_MAJOR << 16 | SHUTDOWN_MINOR) -#define TIMESYNCH_MAJOR 3 -#define TIMESYNCH_MINOR 0 -#define 
TIMESYNCH_MAJOR_MINOR (TIMESYNCH_MAJOR << 16 | TIMESYNCH_MINOR) +#define SD_MAJOR 3 +#define SD_MINOR 0 +#define SD_VERSION (SD_MAJOR << 16 | SD_MINOR) -#define HEARTBEAT_MAJOR 3 -#define HEARTBEAT_MINOR 0 -#define HEARTBEAT_MAJOR_MINOR (HEARTBEAT_MAJOR << 16 | HEARTBEAT_MINOR) +#define SD_WS2008_MAJOR 1 +#define SD_WS2008_VERSION (SD_WS2008_MAJOR << 16 | SD_MINOR) + +#define TS_MAJOR 3 +#define TS_MINOR 0 +#define TS_VERSION (TS_MAJOR << 16 | TS_MINOR) + +#define TS_WS2008_MAJOR 1 +#define TS_WS2008_VERSION (TS_WS2008_MAJOR << 16 | TS_MINOR) + +#define HB_MAJOR 3 +#define HB_MINOR 0 +#define HB_VERSION (HB_MAJOR << 16 | HB_MINOR) + +#define HB_WS2008_MAJOR 1 +#define HB_WS2008_VERSION (HB_WS2008_MAJOR << 16 | HB_MINOR) + +static int sd_srv_version; +static int ts_srv_version; +static int hb_srv_version; +static int util_fw_version; static void shutdown_onchannelcallback(void *context); static struct hv_util_service util_shutdown = { @@ -99,8 +114,8 @@ static void shutdown_onchannelcallback(void *context) if (icmsghdrp->icmsgtype == ICMSGTYPE_NEGOTIATE) { vmbus_prep_negotiate_resp(icmsghdrp, negop, - shut_txf_buf, UTIL_FW_MAJOR_MINOR, - SHUTDOWN_MAJOR_MINOR); + shut_txf_buf, util_fw_version, + sd_srv_version); } else { shutdown_msg = (struct shutdown_msg_data *)&shut_txf_buf[ @@ -216,6 +231,7 @@ static void timesync_onchannelcallback(void *context) struct icmsg_hdr *icmsghdrp; struct ictimesync_data *timedatap; u8 *time_txf_buf = util_timesynch.recv_buffer; + struct icmsg_negotiate *negop = NULL; vmbus_recvpacket(channel, time_txf_buf, PAGE_SIZE, &recvlen, &requestid); @@ -225,9 +241,10 @@ static void timesync_onchannelcallback(void *context) sizeof(struct vmbuspipe_hdr)]; if (icmsghdrp->icmsgtype == ICMSGTYPE_NEGOTIATE) { - vmbus_prep_negotiate_resp(icmsghdrp, NULL, time_txf_buf, - UTIL_FW_MAJOR_MINOR, - TIMESYNCH_MAJOR_MINOR); + vmbus_prep_negotiate_resp(icmsghdrp, negop, + time_txf_buf, + util_fw_version, + ts_srv_version); } else { timedatap = (struct ictimesync_data *)&time_txf_buf[ sizeof(struct vmbuspipe_hdr) + @@ -257,6 +274,7 @@ static void heartbeat_onchannelcallback(void *context) struct icmsg_hdr *icmsghdrp; struct heartbeat_msg_data *heartbeat_msg; u8 *hbeat_txf_buf = util_heartbeat.recv_buffer; + struct icmsg_negotiate *negop = NULL; vmbus_recvpacket(channel, hbeat_txf_buf, PAGE_SIZE, &recvlen, &requestid); @@ -266,9 +284,9 @@ static void heartbeat_onchannelcallback(void *context) sizeof(struct vmbuspipe_hdr)]; if (icmsghdrp->icmsgtype == ICMSGTYPE_NEGOTIATE) { - vmbus_prep_negotiate_resp(icmsghdrp, NULL, - hbeat_txf_buf, UTIL_FW_MAJOR_MINOR, - HEARTBEAT_MAJOR_MINOR); + vmbus_prep_negotiate_resp(icmsghdrp, negop, + hbeat_txf_buf, util_fw_version, + hb_srv_version); } else { heartbeat_msg = (struct heartbeat_msg_data *)&hbeat_txf_buf[ @@ -321,6 +339,25 @@ static int util_probe(struct hv_device *dev, goto error; hv_set_drvdata(dev, srv); + /* + * Based on the host; initialize the framework and + * service version numbers we will negotiate. 
+ */ + switch (vmbus_proto_version) { + case (VERSION_WS2008): + util_fw_version = UTIL_WS2K8_FW_VERSION; + sd_srv_version = SD_WS2008_VERSION; + ts_srv_version = TS_WS2008_VERSION; + hb_srv_version = HB_WS2008_VERSION; + break; + + default: + util_fw_version = UTIL_FW_VERSION; + sd_srv_version = SD_VERSION; + ts_srv_version = TS_VERSION; + hb_srv_version = HB_VERSION; + } + return 0; error: diff --git a/include/linux/hyperv.h b/include/linux/hyperv.h index a3b8b2e2d244..d98503bde7e9 100644 --- a/include/linux/hyperv.h +++ b/include/linux/hyperv.h @@ -30,10 +30,13 @@ /* * Framework version for util services. */ +#define UTIL_FW_MINOR 0 + +#define UTIL_WS2K8_FW_MAJOR 1 +#define UTIL_WS2K8_FW_VERSION (UTIL_WS2K8_FW_MAJOR << 16 | UTIL_FW_MINOR) #define UTIL_FW_MAJOR 3 -#define UTIL_FW_MINOR 0 -#define UTIL_FW_MAJOR_MINOR (UTIL_FW_MAJOR << 16 | UTIL_FW_MINOR) +#define UTIL_FW_VERSION (UTIL_FW_MAJOR << 16 | UTIL_FW_MINOR) /* -- cgit v1.2.3 From 083986e8248d978b6c961d3da6beb0c921c68220 Mon Sep 17 00:00:00 2001 From: Heiko Carstens Date: Sat, 28 Sep 2013 11:23:59 +0200 Subject: mutex: replace CONFIG_HAVE_ARCH_MUTEX_CPU_RELAX with simple ifdef Linus suggested to replace #ifndef CONFIG_HAVE_ARCH_MUTEX_CPU_RELAX #define arch_mutex_cpu_relax() cpu_relax() #endif with just a simple #ifndef arch_mutex_cpu_relax # define arch_mutex_cpu_relax() cpu_relax() #endif to get rid of CONFIG_HAVE_CPU_RELAX_SIMPLE. So architectures can simply define arch_mutex_cpu_relax if they want an architecture specific function instead of having to add a select statement in their Kconfig in addition. Suggested-by: Linus Torvalds Signed-off-by: Heiko Carstens --- arch/Kconfig | 3 --- arch/s390/Kconfig | 1 - arch/s390/include/asm/mutex.h | 2 -- arch/s390/include/asm/processor.h | 2 ++ include/linux/mutex.h | 6 +++--- 5 files changed, 5 insertions(+), 9 deletions(-) (limited to 'include') diff --git a/arch/Kconfig b/arch/Kconfig index 1feb169274fe..af2cc6eabcc7 100644 --- a/arch/Kconfig +++ b/arch/Kconfig @@ -286,9 +286,6 @@ config HAVE_PERF_USER_STACK_DUMP config HAVE_ARCH_JUMP_LABEL bool -config HAVE_ARCH_MUTEX_CPU_RELAX - bool - config HAVE_RCU_TABLE_FREE bool diff --git a/arch/s390/Kconfig b/arch/s390/Kconfig index dcc6ac2d8026..d3fa84070b82 100644 --- a/arch/s390/Kconfig +++ b/arch/s390/Kconfig @@ -102,7 +102,6 @@ config S390 select GENERIC_TIME_VSYSCALL_OLD select HAVE_ALIGNED_STRUCT_PAGE if SLUB select HAVE_ARCH_JUMP_LABEL if !MARCH_G5 - select HAVE_ARCH_MUTEX_CPU_RELAX select HAVE_ARCH_SECCOMP_FILTER select HAVE_ARCH_TRACEHOOK select HAVE_ARCH_TRANSPARENT_HUGEPAGE if 64BIT diff --git a/arch/s390/include/asm/mutex.h b/arch/s390/include/asm/mutex.h index 688271f5f2e4..458c1f7fbc18 100644 --- a/arch/s390/include/asm/mutex.h +++ b/arch/s390/include/asm/mutex.h @@ -7,5 +7,3 @@ */ #include - -#define arch_mutex_cpu_relax() barrier() diff --git a/arch/s390/include/asm/processor.h b/arch/s390/include/asm/processor.h index 0eb37505cab1..ca7821f07260 100644 --- a/arch/s390/include/asm/processor.h +++ b/arch/s390/include/asm/processor.h @@ -198,6 +198,8 @@ static inline void cpu_relax(void) barrier(); } +#define arch_mutex_cpu_relax() barrier() + static inline void psw_set_key(unsigned int key) { asm volatile("spka 0(%0)" : : "d" (key)); diff --git a/include/linux/mutex.h b/include/linux/mutex.h index ccd4260834c5..bab49da8a0f0 100644 --- a/include/linux/mutex.h +++ b/include/linux/mutex.h @@ -15,8 +15,8 @@ #include #include #include - #include +#include /* * Simple, straightforward mutexes with strict semantics: @@ -175,8 +175,8 
@@ extern void mutex_unlock(struct mutex *lock); extern int atomic_dec_and_mutex_lock(atomic_t *cnt, struct mutex *lock); -#ifndef CONFIG_HAVE_ARCH_MUTEX_CPU_RELAX -#define arch_mutex_cpu_relax() cpu_relax() +#ifndef arch_mutex_cpu_relax +# define arch_mutex_cpu_relax() cpu_relax() #endif #endif -- cgit v1.2.3 From 60e453a940ac678565b6641d65f8c18541bb9f28 Mon Sep 17 00:00:00 2001 From: Ming Lei Date: Mon, 23 Sep 2013 20:59:35 +0800 Subject: USBNET: fix handling padding packet Commit 638c5115a7949 (USBNET: support DMA SG) introduces DMA SG support if the USB host controller is capable of building packets from discontinuous buffers, but missed handling the padding packet when building the DMA SG list. This patch attaches the pre-allocated padding packet at the end of the sg list, so the padding packet can be sent to the device if drivers require that. Reported-by: David Laight Acked-by: Oliver Neukum Signed-off-by: Ming Lei Signed-off-by: David S. Miller --- drivers/net/usb/usbnet.c | 27 +++++++++++++++++++++------ include/linux/usb/usbnet.h | 1 + 2 files changed, 22 insertions(+), 6 deletions(-) (limited to 'include') diff --git a/drivers/net/usb/usbnet.c b/drivers/net/usb/usbnet.c index 7b331e613e02..bf94e10a37c8 100644 --- a/drivers/net/usb/usbnet.c +++ b/drivers/net/usb/usbnet.c @@ -1241,7 +1241,9 @@ static int build_dma_sg(const struct sk_buff *skb, struct urb *urb) if (num_sgs == 1) return 0; - urb->sg = kmalloc(num_sgs * sizeof(struct scatterlist), GFP_ATOMIC); + /* reserve one for zero packet */ + urb->sg = kmalloc((num_sgs + 1) * sizeof(struct scatterlist), + GFP_ATOMIC); if (!urb->sg) return -ENOMEM; @@ -1305,7 +1307,7 @@ netdev_tx_t usbnet_start_xmit (struct sk_buff *skb, if (build_dma_sg(skb, urb) < 0) goto drop; } - entry->length = length = urb->transfer_buffer_length; + length = urb->transfer_buffer_length; /* don't assume the hardware handles USB_ZERO_PACKET * NOTE: strictly conforming cdc-ether devices should expect @@ -1317,15 +1319,18 @@ netdev_tx_t usbnet_start_xmit (struct sk_buff *skb, if (length % dev->maxpacket == 0) { if (!(info->flags & FLAG_SEND_ZLP)) { if (!(info->flags & FLAG_MULTI_PACKET)) { - urb->transfer_buffer_length++; - if (skb_tailroom(skb)) { + length++; + if (skb_tailroom(skb) && !urb->num_sgs) { skb->data[skb->len] = 0; __skb_put(skb, 1); - } + } else if (urb->num_sgs) + sg_set_buf(&urb->sg[urb->num_sgs++], + dev->padding_pkt, 1); } } else urb->transfer_flags |= URB_ZERO_PACKET; } + entry->length = urb->transfer_buffer_length = length; spin_lock_irqsave(&dev->txq.lock, flags); retval = usb_autopm_get_interface_async(dev->intf); @@ -1509,6 +1514,7 @@ void usbnet_disconnect (struct usb_interface *intf) usb_kill_urb(dev->interrupt); usb_free_urb(dev->interrupt); + kfree(dev->padding_pkt); free_netdev(net); } @@ -1679,9 +1685,16 @@ usbnet_probe (struct usb_interface *udev, const struct usb_device_id *prod) /* initialize max rx_qlen and tx_qlen */ usbnet_update_max_qlen(dev); + if (dev->can_dma_sg && !(info->flags & FLAG_SEND_ZLP) && + !(info->flags & FLAG_MULTI_PACKET)) { + dev->padding_pkt = kzalloc(1, GFP_KERNEL); + if (!dev->padding_pkt) + goto out4; + } + status = register_netdev (net); if (status) - goto out4; + goto out5; netif_info(dev, probe, dev->net, "register '%s' at usb-%s-%s, %s, %pM\n", udev->dev.driver->name, @@ -1699,6 +1712,8 @@ usbnet_probe (struct usb_interface *udev, const struct usb_device_id *prod) return 0; +out5: + kfree(dev->padding_pkt); out4: usb_free_urb(dev->interrupt); out3: diff --git a/include/linux/usb/usbnet.h b/include/linux/usb/usbnet.h index
9cb2fe8ca944..e303eef94dd5 100644 --- a/include/linux/usb/usbnet.h +++ b/include/linux/usb/usbnet.h @@ -42,6 +42,7 @@ struct usbnet { struct usb_host_endpoint *status; unsigned maxpacket; struct timer_list delay; + const char *padding_pkt; /* protocol/interface state */ struct net_device *net; -- cgit v1.2.3 From 7df37ff33dc122f7bd0614d707939fe84322d264 Mon Sep 17 00:00:00 2001 From: "Catalin\(ux\) M. BOIE" Date: Mon, 23 Sep 2013 23:04:19 +0300 Subject: IPv6 NAT: Do not drop DNATed 6to4/6rd packets When a router is doing DNAT for 6to4/6rd packets, the latest anti-spoofing commit 218774dc ("ipv6: add anti-spoofing checks for 6to4 and 6rd") will drop them because the embedded IPv6 address does not match the IPv4 destination. This patch will allow them to pass by testing if we have an address that matches on the 6to4/6rd interface. I have been hit by this problem using Fedora and IPV6TO4_IPV4ADDR. Also, log the dropped packets (with rate limit). Signed-off-by: Catalin(ux) M. BOIE Acked-by: Hannes Frederic Sowa Signed-off-by: David S. Miller --- include/net/addrconf.h | 4 +++ net/ipv6/addrconf.c | 27 ++++++++++++++++ net/ipv6/sit.c | 84 +++++++++++++++++++++++++++++++++++++++++--------- 3 files changed, 100 insertions(+), 15 deletions(-) (limited to 'include') diff --git a/include/net/addrconf.h b/include/net/addrconf.h index fb314de2b61b..86505bfa5d2c 100644 --- a/include/net/addrconf.h +++ b/include/net/addrconf.h @@ -67,6 +67,10 @@ int ipv6_chk_addr(struct net *net, const struct in6_addr *addr, int ipv6_chk_home_addr(struct net *net, const struct in6_addr *addr); #endif +bool ipv6_chk_custom_prefix(const struct in6_addr *addr, + const unsigned int prefix_len, + struct net_device *dev); + int ipv6_chk_prefix(const struct in6_addr *addr, struct net_device *dev); struct inet6_ifaddr *ipv6_get_ifaddr(struct net *net, diff --git a/net/ipv6/addrconf.c b/net/ipv6/addrconf.c index d6ff12617f36..a0c3abe72461 100644 --- a/net/ipv6/addrconf.c +++ b/net/ipv6/addrconf.c @@ -1499,6 +1499,33 @@ static bool ipv6_chk_same_addr(struct net *net, const struct in6_addr *addr, return false; } +/* Compares an address/prefix_len with addresses on device @dev. + * If one is found it returns true. + */ +bool ipv6_chk_custom_prefix(const struct in6_addr *addr, + const unsigned int prefix_len, struct net_device *dev) +{ + struct inet6_dev *idev; + struct inet6_ifaddr *ifa; + bool ret = false; + + rcu_read_lock(); + idev = __in6_dev_get(dev); + if (idev) { + read_lock_bh(&idev->lock); + list_for_each_entry(ifa, &idev->addr_list, if_list) { + ret = ipv6_prefix_equal(addr, &ifa->addr, prefix_len); + if (ret) + break; + } + read_unlock_bh(&idev->lock); + } + rcu_read_unlock(); + + return ret; +} +EXPORT_SYMBOL(ipv6_chk_custom_prefix); + int ipv6_chk_prefix(const struct in6_addr *addr, struct net_device *dev) { struct inet6_dev *idev; diff --git a/net/ipv6/sit.c b/net/ipv6/sit.c index 7ee5cb96db34..afd5605aea7c 100644 --- a/net/ipv6/sit.c +++ b/net/ipv6/sit.c @@ -566,6 +566,70 @@ static inline bool is_spoofed_6rd(struct ip_tunnel *tunnel, const __be32 v4addr, return false; } +/* Checks if an address matches an address on the tunnel interface. + * Used to detect the NAT of proto 41 packets and let them pass spoofing test. + * Long story: + * This function is called after we considered the packet as spoofed + * in is_spoofed_6rd. + * We may have a router that is doing NAT for proto 41 packets + * for an internal station. Destination a.a.a.a/PREFIX:bbbb:bbbb + * will be translated to n.n.n.n/PREFIX:bbbb:bbbb. And is_spoofed_6rd
And is_spoofed_6rd + * function will return true, dropping the packet. + * But, we can still check if is spoofed against the IP + * addresses associated with the interface. + */ +static bool only_dnatted(const struct ip_tunnel *tunnel, + const struct in6_addr *v6dst) +{ + int prefix_len; + +#ifdef CONFIG_IPV6_SIT_6RD + prefix_len = tunnel->ip6rd.prefixlen + 32 + - tunnel->ip6rd.relay_prefixlen; +#else + prefix_len = 48; +#endif + return ipv6_chk_custom_prefix(v6dst, prefix_len, tunnel->dev); +} + +/* Returns true if a packet is spoofed */ +static bool packet_is_spoofed(struct sk_buff *skb, + const struct iphdr *iph, + struct ip_tunnel *tunnel) +{ + const struct ipv6hdr *ipv6h; + + if (tunnel->dev->priv_flags & IFF_ISATAP) { + if (!isatap_chksrc(skb, iph, tunnel)) + return true; + + return false; + } + + if (tunnel->dev->flags & IFF_POINTOPOINT) + return false; + + ipv6h = ipv6_hdr(skb); + + if (unlikely(is_spoofed_6rd(tunnel, iph->saddr, &ipv6h->saddr))) { + net_warn_ratelimited("Src spoofed %pI4/%pI6c -> %pI4/%pI6c\n", + &iph->saddr, &ipv6h->saddr, + &iph->daddr, &ipv6h->daddr); + return true; + } + + if (likely(!is_spoofed_6rd(tunnel, iph->daddr, &ipv6h->daddr))) + return false; + + if (only_dnatted(tunnel, &ipv6h->daddr)) + return false; + + net_warn_ratelimited("Dst spoofed %pI4/%pI6c -> %pI4/%pI6c\n", + &iph->saddr, &ipv6h->saddr, + &iph->daddr, &ipv6h->daddr); + return true; +} + static int ipip6_rcv(struct sk_buff *skb) { const struct iphdr *iph = ip_hdr(skb); @@ -586,19 +650,9 @@ static int ipip6_rcv(struct sk_buff *skb) IPCB(skb)->flags = 0; skb->protocol = htons(ETH_P_IPV6); - if (tunnel->dev->priv_flags & IFF_ISATAP) { - if (!isatap_chksrc(skb, iph, tunnel)) { - tunnel->dev->stats.rx_errors++; - goto out; - } - } else if (!(tunnel->dev->flags&IFF_POINTOPOINT)) { - if (is_spoofed_6rd(tunnel, iph->saddr, - &ipv6_hdr(skb)->saddr) || - is_spoofed_6rd(tunnel, iph->daddr, - &ipv6_hdr(skb)->daddr)) { - tunnel->dev->stats.rx_errors++; - goto out; - } + if (packet_is_spoofed(skb, iph, tunnel)) { + tunnel->dev->stats.rx_errors++; + goto out; } __skb_tunnel_rx(skb, tunnel->dev, tunnel->net); @@ -748,7 +802,7 @@ static netdev_tx_t ipip6_tunnel_xmit(struct sk_buff *skb, neigh = dst_neigh_lookup(skb_dst(skb), &iph6->daddr); if (neigh == NULL) { - net_dbg_ratelimited("sit: nexthop == NULL\n"); + net_dbg_ratelimited("nexthop == NULL\n"); goto tx_error; } @@ -777,7 +831,7 @@ static netdev_tx_t ipip6_tunnel_xmit(struct sk_buff *skb, neigh = dst_neigh_lookup(skb_dst(skb), &iph6->daddr); if (neigh == NULL) { - net_dbg_ratelimited("sit: nexthop == NULL\n"); + net_dbg_ratelimited("nexthop == NULL\n"); goto tx_error; } -- cgit v1.2.3 From 50624c934db18ab90aaea4908f60dd39aab4e6e5 Mon Sep 17 00:00:00 2001 From: "Eric W. Biederman" Date: Mon, 23 Sep 2013 21:19:49 -0700 Subject: net: Delay default_device_exit_batch until no devices are unregistering v2 There is currently serialization network namespaces exiting and network devices exiting as the final part of netdev_run_todo does not happen under the rtnl_lock. This is compounded by the fact that the only list of devices unregistering in netdev_run_todo is local to the netdev_run_todo. This lack of serialization in extreme cases results in network devices unregistering in netdev_run_todo after the loopback device of their network namespace has been freed (making dst_ifdown unsafe), and after the their network namespace has exited (making the NETDEV_UNREGISTER, and NETDEV_UNREGISTER_FINAL callbacks unsafe). 
Add the missing serialization with a per network namespace count of how many network devices are unregistering and a wait queue that is woken up whenever the count is decreased. The count and wait queue allow default_device_exit_batch to wait until all of the unregistration activity for a network namespace has finished before proceeding to unregister the loopback device and then allowing the network namespace to exit. Only a single global wait queue is used because there is a single global lock and a single waiter; per network namespace wait queues would be a waste of resources. The per network namespace count of unregistering devices gives a progress guarantee because the number of network devices unregistering in an exiting network namespace must ultimately drop to zero (assuming network device unregistration completes). The basic logic remains the same as in v1. This patch is now half comment, and rtnl_lock_unregistering, an expanded version of wait_event, performs no extra work in the common case where no network devices are unregistering when we get to default_device_exit_batch. Reported-by: Francesco Ruggeri Signed-off-by: "Eric W. Biederman" Signed-off-by: David S. Miller --- include/net/net_namespace.h | 1 + net/core/dev.c | 49 ++++++++++++++++++++++++++++++++++++++++++++- 2 files changed, 49 insertions(+), 1 deletion(-) (limited to 'include') diff --git a/include/net/net_namespace.h b/include/net/net_namespace.h index 1313456a0994..9d22f08896c6 100644 --- a/include/net/net_namespace.h +++ b/include/net/net_namespace.h @@ -74,6 +74,7 @@ struct net { struct hlist_head *dev_index_head; unsigned int dev_base_seq; /* protected by rtnl_mutex */ int ifindex; + unsigned int dev_unreg_count; /* core fib_rules */ struct list_head rules_ops; diff --git a/net/core/dev.c b/net/core/dev.c index 5c713f2239cc..65f829cfd928 100644 --- a/net/core/dev.c +++ b/net/core/dev.c @@ -5247,10 +5247,12 @@ static int dev_new_index(struct net *net) /* Delayed registration/unregisteration */ static LIST_HEAD(net_todo_list); +static DECLARE_WAIT_QUEUE_HEAD(netdev_unregistering_wq); static void net_set_todo(struct net_device *dev) { list_add_tail(&dev->todo_list, &net_todo_list); + dev_net(dev)->dev_unreg_count++; } static void rollback_registered_many(struct list_head *head) @@ -5918,6 +5920,12 @@ void netdev_run_todo(void) if (dev->destructor) dev->destructor(dev); + /* Report a network device has been unregistered */ + rtnl_lock(); + dev_net(dev)->dev_unreg_count--; + __rtnl_unlock(); + wake_up(&netdev_unregistering_wq); + /* Free network device */ kobject_put(&dev->dev.kobj); } @@ -6603,6 +6611,34 @@ static void __net_exit default_device_exit(struct net *net) rtnl_unlock(); } +static void __net_exit rtnl_lock_unregistering(struct list_head *net_list) +{ + /* Return with the rtnl_lock held when there are no network + * devices unregistering in any network namespace in net_list.
+ */ + struct net *net; + bool unregistering; + DEFINE_WAIT(wait); + + for (;;) { + prepare_to_wait(&netdev_unregistering_wq, &wait, + TASK_UNINTERRUPTIBLE); + unregistering = false; + rtnl_lock(); + list_for_each_entry(net, net_list, exit_list) { + if (net->dev_unreg_count > 0) { + unregistering = true; + break; + } + } + if (!unregistering) + break; + __rtnl_unlock(); + schedule(); + } + finish_wait(&netdev_unregistering_wq, &wait); +} + static void __net_exit default_device_exit_batch(struct list_head *net_list) { /* At exit all network devices must be removed from a network @@ -6614,7 +6650,18 @@ static void __net_exit default_device_exit_batch(struct list_head *net_list) struct net *net; LIST_HEAD(dev_kill_list); - rtnl_lock(); + /* To prevent network device cleanup code from dereferencing + * loopback devices or network devices that have been freed, + * wait here for all pending unregistrations to complete + * before unregistering the loopback device and allowing the + * network namespace to be freed. + * + * The netdev todo list containing all network devices + * unregistrations that happen in default_device_exit_batch + * will run in the rtnl_unlock() at the end of + * default_device_exit_batch. + */ + rtnl_lock_unregistering(net_list); list_for_each_entry(net, net_list, exit_list) { for_each_netdev_reverse(net, dev) { if (dev->rtnl_link_ops) -- cgit v1.2.3 From 9a3bab6b05383f1e4c3716b3615500c51285959e Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Tue, 24 Sep 2013 06:19:57 -0700 Subject: net: net_secret should not depend on TCP A host might need net_secret[] and never open a single socket. Problem added in commit aebda156a570782 ("net: defer net_secret[] initialization"). Based on prior patch from Hannes Frederic Sowa. Reported-by: Hannes Frederic Sowa Signed-off-by: Eric Dumazet Acked-by: Hannes Frederic Sowa Signed-off-by: David S.
Miller --- include/net/secure_seq.h | 1 - net/core/secure_seq.c | 27 ++++++++++++++++++++++++--- net/ipv4/af_inet.c | 4 +--- 3 files changed, 25 insertions(+), 7 deletions(-) (limited to 'include') diff --git a/include/net/secure_seq.h b/include/net/secure_seq.h index 6ca975bebd37..c2e542b27a5a 100644 --- a/include/net/secure_seq.h +++ b/include/net/secure_seq.h @@ -3,7 +3,6 @@ #include -extern void net_secret_init(void); extern __u32 secure_ip_id(__be32 daddr); extern __u32 secure_ipv6_id(const __be32 daddr[4]); extern u32 secure_ipv4_port_ephemeral(__be32 saddr, __be32 daddr, __be16 dport); diff --git a/net/core/secure_seq.c b/net/core/secure_seq.c index 6a2f13cee86a..3f1ec1586ae1 100644 --- a/net/core/secure_seq.c +++ b/net/core/secure_seq.c @@ -10,11 +10,24 @@ #include -static u32 net_secret[MD5_MESSAGE_BYTES / 4] ____cacheline_aligned; +#define NET_SECRET_SIZE (MD5_MESSAGE_BYTES / 4) -void net_secret_init(void) +static u32 net_secret[NET_SECRET_SIZE] ____cacheline_aligned; + +static void net_secret_init(void) { - get_random_bytes(net_secret, sizeof(net_secret)); + u32 tmp; + int i; + + if (likely(net_secret[0])) + return; + + for (i = NET_SECRET_SIZE; i > 0;) { + do { + get_random_bytes(&tmp, sizeof(tmp)); + } while (!tmp); + cmpxchg(&net_secret[--i], 0, tmp); + } } #ifdef CONFIG_INET @@ -42,6 +55,7 @@ __u32 secure_tcpv6_sequence_number(const __be32 *saddr, const __be32 *daddr, u32 hash[MD5_DIGEST_WORDS]; u32 i; + net_secret_init(); memcpy(hash, saddr, 16); for (i = 0; i < 4; i++) secret[i] = net_secret[i] + (__force u32)daddr[i]; @@ -63,6 +77,7 @@ u32 secure_ipv6_port_ephemeral(const __be32 *saddr, const __be32 *daddr, u32 hash[MD5_DIGEST_WORDS]; u32 i; + net_secret_init(); memcpy(hash, saddr, 16); for (i = 0; i < 4; i++) secret[i] = net_secret[i] + (__force u32) daddr[i]; @@ -82,6 +97,7 @@ __u32 secure_ip_id(__be32 daddr) { u32 hash[MD5_DIGEST_WORDS]; + net_secret_init(); hash[0] = (__force __u32) daddr; hash[1] = net_secret[13]; hash[2] = net_secret[14]; @@ -96,6 +112,7 @@ __u32 secure_ipv6_id(const __be32 daddr[4]) { __u32 hash[4]; + net_secret_init(); memcpy(hash, daddr, 16); md5_transform(hash, net_secret); @@ -107,6 +124,7 @@ __u32 secure_tcp_sequence_number(__be32 saddr, __be32 daddr, { u32 hash[MD5_DIGEST_WORDS]; + net_secret_init(); hash[0] = (__force u32)saddr; hash[1] = (__force u32)daddr; hash[2] = ((__force u16)sport << 16) + (__force u16)dport; @@ -121,6 +139,7 @@ u32 secure_ipv4_port_ephemeral(__be32 saddr, __be32 daddr, __be16 dport) { u32 hash[MD5_DIGEST_WORDS]; + net_secret_init(); hash[0] = (__force u32)saddr; hash[1] = (__force u32)daddr; hash[2] = (__force u32)dport ^ net_secret[14]; @@ -140,6 +159,7 @@ u64 secure_dccp_sequence_number(__be32 saddr, __be32 daddr, u32 hash[MD5_DIGEST_WORDS]; u64 seq; + net_secret_init(); hash[0] = (__force u32)saddr; hash[1] = (__force u32)daddr; hash[2] = ((__force u16)sport << 16) + (__force u16)dport; @@ -164,6 +184,7 @@ u64 secure_dccpv6_sequence_number(__be32 *saddr, __be32 *daddr, u64 seq; u32 i; + net_secret_init(); memcpy(hash, saddr, 16); for (i = 0; i < 4; i++) secret[i] = net_secret[i] + daddr[i]; diff --git a/net/ipv4/af_inet.c b/net/ipv4/af_inet.c index 7a1874b7b8fd..cfeb85cff4f0 100644 --- a/net/ipv4/af_inet.c +++ b/net/ipv4/af_inet.c @@ -263,10 +263,8 @@ void build_ehash_secret(void) get_random_bytes(&rnd, sizeof(rnd)); } while (rnd == 0); - if (cmpxchg(&inet_ehash_secret, 0, rnd) == 0) { + if (cmpxchg(&inet_ehash_secret, 0, rnd) == 0) get_random_bytes(&ipv6_hash_secret, sizeof(ipv6_hash_secret)); - 
net_secret_init(); - } } EXPORT_SYMBOL(build_ehash_secret); -- cgit v1.2.3 From f4a87e7bd2eaef26a3ca25437ce8b807de2966ad Mon Sep 17 00:00:00 2001 From: Patrick McHardy Date: Mon, 30 Sep 2013 08:51:46 +0100 Subject: netfilter: synproxy: fix BUG_ON triggered by corrupt TCP packets TCP packets hitting the SYN proxy through the SYNPROXY target are not validated by TCP conntrack. When th->doff is below 5, an underflow happens when calculating the options length, causing skb_header_pointer() to return NULL and triggering the BUG_ON(). Handle this case gracefully by checking for NULL instead of using BUG_ON(). Reported-by: Martin Topholm Tested-by: Martin Topholm Signed-off-by: Patrick McHardy Signed-off-by: Pablo Neira Ayuso --- include/net/netfilter/nf_conntrack_synproxy.h | 2 +- net/ipv4/netfilter/ipt_SYNPROXY.c | 10 +++++++--- net/ipv6/netfilter/ip6t_SYNPROXY.c | 10 +++++++--- net/netfilter/nf_synproxy_core.c | 12 +++++++----- 4 files changed, 22 insertions(+), 12 deletions(-) (limited to 'include') diff --git a/include/net/netfilter/nf_conntrack_synproxy.h b/include/net/netfilter/nf_conntrack_synproxy.h index 806f54a290d6..f572f313d6f1 100644 --- a/include/net/netfilter/nf_conntrack_synproxy.h +++ b/include/net/netfilter/nf_conntrack_synproxy.h @@ -56,7 +56,7 @@ struct synproxy_options { struct tcphdr; struct xt_synproxy_info; -extern void synproxy_parse_options(const struct sk_buff *skb, unsigned int doff, +extern bool synproxy_parse_options(const struct sk_buff *skb, unsigned int doff, const struct tcphdr *th, struct synproxy_options *opts); extern unsigned int synproxy_options_size(const struct synproxy_options *opts); diff --git a/net/ipv4/netfilter/ipt_SYNPROXY.c b/net/ipv4/netfilter/ipt_SYNPROXY.c index 67e17dcda65e..b6346bf2fde3 100644 --- a/net/ipv4/netfilter/ipt_SYNPROXY.c +++ b/net/ipv4/netfilter/ipt_SYNPROXY.c @@ -267,7 +267,8 @@ synproxy_tg4(struct sk_buff *skb, const struct xt_action_param *par) if (th == NULL) return NF_DROP; - synproxy_parse_options(skb, par->thoff, th, &opts); + if (!synproxy_parse_options(skb, par->thoff, th, &opts)) + return NF_DROP; if (th->syn && !(th->ack || th->fin || th->rst)) { /* Initial SYN from client */ @@ -350,7 +351,8 @@ static unsigned int ipv4_synproxy_hook(unsigned int hooknum, /* fall through */ case TCP_CONNTRACK_SYN_SENT: - synproxy_parse_options(skb, thoff, th, &opts); + if (!synproxy_parse_options(skb, thoff, th, &opts)) + return NF_DROP; if (!th->syn && th->ack && CTINFO2DIR(ctinfo) == IP_CT_DIR_ORIGINAL) { @@ -373,7 +375,9 @@ static unsigned int ipv4_synproxy_hook(unsigned int hooknum, if (!th->syn || !th->ack) break; - synproxy_parse_options(skb, thoff, th, &opts); + if (!synproxy_parse_options(skb, thoff, th, &opts)) + return NF_DROP; + if (opts.options & XT_SYNPROXY_OPT_TIMESTAMP) synproxy->tsoff = opts.tsval - synproxy->its; diff --git a/net/ipv6/netfilter/ip6t_SYNPROXY.c b/net/ipv6/netfilter/ip6t_SYNPROXY.c index 19cfea8dbcaa..2748b042da72 100644 --- a/net/ipv6/netfilter/ip6t_SYNPROXY.c +++ b/net/ipv6/netfilter/ip6t_SYNPROXY.c @@ -282,7 +282,8 @@ synproxy_tg6(struct sk_buff *skb, const struct xt_action_param *par) if (th == NULL) return NF_DROP; - synproxy_parse_options(skb, par->thoff, th, &opts); + if (!synproxy_parse_options(skb, par->thoff, th, &opts)) + return NF_DROP; if (th->syn && !(th->ack || th->fin || th->rst)) { /* Initial SYN from client */ @@ -372,7 +373,8 @@ static unsigned int ipv6_synproxy_hook(unsigned int hooknum, /* fall through */ case TCP_CONNTRACK_SYN_SENT: - synproxy_parse_options(skb, thoff, th, &opts); + 
if (!synproxy_parse_options(skb, thoff, th, &opts)) + return NF_DROP; if (!th->syn && th->ack && CTINFO2DIR(ctinfo) == IP_CT_DIR_ORIGINAL) { @@ -395,7 +397,9 @@ static unsigned int ipv6_synproxy_hook(unsigned int hooknum, if (!th->syn || !th->ack) break; - synproxy_parse_options(skb, thoff, th, &opts); + if (!synproxy_parse_options(skb, thoff, th, &opts)) + return NF_DROP; + if (opts.options & XT_SYNPROXY_OPT_TIMESTAMP) synproxy->tsoff = opts.tsval - synproxy->its; diff --git a/net/netfilter/nf_synproxy_core.c b/net/netfilter/nf_synproxy_core.c index 6fd967c6278c..cdf4567ba9b3 100644 --- a/net/netfilter/nf_synproxy_core.c +++ b/net/netfilter/nf_synproxy_core.c @@ -24,7 +24,7 @@ int synproxy_net_id; EXPORT_SYMBOL_GPL(synproxy_net_id); -void +bool synproxy_parse_options(const struct sk_buff *skb, unsigned int doff, const struct tcphdr *th, struct synproxy_options *opts) { @@ -32,7 +32,8 @@ synproxy_parse_options(const struct sk_buff *skb, unsigned int doff, u8 buf[40], *ptr; ptr = skb_header_pointer(skb, doff + sizeof(*th), length, buf); - BUG_ON(ptr == NULL); + if (ptr == NULL) + return false; opts->options = 0; while (length > 0) { @@ -41,16 +42,16 @@ synproxy_parse_options(const struct sk_buff *skb, unsigned int doff, switch (opcode) { case TCPOPT_EOL: - return; + return true; case TCPOPT_NOP: length--; continue; default: opsize = *ptr++; if (opsize < 2) - return; + return true; if (opsize > length) - return; + return true; switch (opcode) { case TCPOPT_MSS: @@ -84,6 +85,7 @@ synproxy_parse_options(const struct sk_buff *skb, unsigned int doff, length -= opsize; } } + return true; } EXPORT_SYMBOL_GPL(synproxy_parse_options); -- cgit v1.2.3 From 559835ea7292e2f09304d81eda16f4209433245e Mon Sep 17 00:00:00 2001 From: Pravin B Shelar Date: Tue, 24 Sep 2013 10:25:40 -0700 Subject: vxlan: Use RCU apis to access sk_user_data. Use of the RCU API makes the vxlan code easier to understand. It also fixes a bug due to a missing ACCESS_ONCE() on the sk_user_data dereference. In rare cases, without ACCESS_ONCE() the compiler can treat vs as an alias for sk->sk_user_data, resulting in multiple sk_user_data dereferences in an RCU read context where the value could change. CC: Jesse Gross Signed-off-by: Pravin B Shelar Signed-off-by: David S.
Miller --- drivers/net/vxlan.c | 9 +++------ include/net/sock.h | 5 +++++ 2 files changed, 8 insertions(+), 6 deletions(-) (limited to 'include') diff --git a/drivers/net/vxlan.c b/drivers/net/vxlan.c index d1292fe746bc..2ef5b6219f3f 100644 --- a/drivers/net/vxlan.c +++ b/drivers/net/vxlan.c @@ -952,8 +952,7 @@ void vxlan_sock_release(struct vxlan_sock *vs) spin_lock(&vn->sock_lock); hlist_del_rcu(&vs->hlist); - smp_wmb(); - vs->sock->sk->sk_user_data = NULL; + rcu_assign_sk_user_data(vs->sock->sk, NULL); vxlan_notify_del_rx_port(sk); spin_unlock(&vn->sock_lock); @@ -1048,8 +1047,7 @@ static int vxlan_udp_encap_recv(struct sock *sk, struct sk_buff *skb) port = inet_sk(sk)->inet_sport; - smp_read_barrier_depends(); - vs = (struct vxlan_sock *)sk->sk_user_data; + vs = rcu_dereference_sk_user_data(sk); if (!vs) goto drop; @@ -2302,8 +2300,7 @@ static struct vxlan_sock *vxlan_socket_create(struct net *net, __be16 port, atomic_set(&vs->refcnt, 1); vs->rcv = rcv; vs->data = data; - smp_wmb(); - vs->sock->sk->sk_user_data = vs; + rcu_assign_sk_user_data(vs->sock->sk, vs); spin_lock(&vn->sock_lock); hlist_add_head_rcu(&vs->hlist, vs_head(net, port)); diff --git a/include/net/sock.h b/include/net/sock.h index 6ba2e7b0e2b1..1d37a8086bed 100644 --- a/include/net/sock.h +++ b/include/net/sock.h @@ -409,6 +409,11 @@ struct sock { void (*sk_destruct)(struct sock *sk); }; +#define __sk_user_data(sk) ((*((void __rcu **)&(sk)->sk_user_data))) + +#define rcu_dereference_sk_user_data(sk) rcu_dereference(__sk_user_data((sk))) +#define rcu_assign_sk_user_data(sk, ptr) rcu_assign_pointer(__sk_user_data((sk)), ptr) + /* * SK_CAN_REUSE and SK_NO_REUSE on a socket mean that the socket is OK * or not whether his port will be reused by someone else. SK_FORCE_REUSE -- cgit v1.2.3 From 2a156a6b52f45355d999b58b745eef93021cc81a Mon Sep 17 00:00:00 2001 From: Andrew Morton Date: Mon, 30 Sep 2013 13:45:13 -0700 Subject: include/asm-generic/vtime.h: avoid zero-length file patch(1) can't handle zero-length files - it appears to simply not create the file, so my powerpc build fails. Put something in here to make life easier. Cc: Hugh Dickins Cc: Frederic Weisbecker Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/asm-generic/vtime.h | 1 + 1 file changed, 1 insertion(+) (limited to 'include') diff --git a/include/asm-generic/vtime.h b/include/asm-generic/vtime.h index e69de29bb2d1..b1a49677fe25 100644 --- a/include/asm-generic/vtime.h +++ b/include/asm-generic/vtime.h @@ -0,0 +1 @@ +/* no content, but patch(1) dislikes empty files */ -- cgit v1.2.3 From 117aad1e9e4d97448d1df3f84b08bd65811e6d6a Mon Sep 17 00:00:00 2001 From: Rafael Aquini Date: Mon, 30 Sep 2013 13:45:16 -0700 Subject: mm: avoid reinserting isolated balloon pages into LRU lists Isolated balloon pages can wrongly end up in LRU lists when migrate_pages() finishes its round without fully draining the isolated page list. The same issue can happen when reclaim_clean_pages_from_list() tries to reclaim pages from an isolated page list, before migration, in the CMA path.
Such a balloon page leak opens a race window against LRU list shrinkers, which leads us to the following kernel panic: BUG: unable to handle kernel NULL pointer dereference at 0000000000000028 IP: [] shrink_page_list+0x24e/0x897 PGD 3cda2067 PUD 3d713067 PMD 0 Oops: 0000 [#1] SMP CPU: 0 PID: 340 Comm: kswapd0 Not tainted 3.12.0-rc1-22626-g4367597 #87 Hardware name: Bochs Bochs, BIOS Bochs 01/01/2011 RIP: shrink_page_list+0x24e/0x897 RSP: 0000:ffff88003da499b8 EFLAGS: 00010286 RAX: 0000000000000000 RBX: ffff88003e82bd60 RCX: 00000000000657d5 RDX: 0000000000000000 RSI: 000000000000031f RDI: ffff88003e82bd40 RBP: ffff88003da49ab0 R08: 0000000000000001 R09: 0000000081121a45 R10: ffffffff81121a45 R11: ffff88003c4a9a28 R12: ffff88003e82bd40 R13: ffff88003da0e800 R14: 0000000000000001 R15: ffff88003da49d58 FS: 0000000000000000(0000) GS:ffff88003fc00000(0000) knlGS:0000000000000000 CS: 0010 DS: 0000 ES: 0000 CR0: 0000000080050033 CR2: 00000000067d9000 CR3: 000000003ace5000 CR4: 00000000000407b0 Call Trace: shrink_inactive_list+0x240/0x3de shrink_lruvec+0x3e0/0x566 __shrink_zone+0x94/0x178 shrink_zone+0x3a/0x82 balance_pgdat+0x32a/0x4c2 kswapd+0x2f0/0x372 kthread+0xa2/0xaa ret_from_fork+0x7c/0xb0 Code: 80 7d 8f 01 48 83 95 68 ff ff ff 00 4c 89 e7 e8 5a 7b 00 00 48 85 c0 49 89 c5 75 08 80 7d 8f 00 74 3e eb 31 48 8b 80 18 01 00 00 <48> 8b 74 0d 48 8b 78 30 be 02 00 00 00 ff d2 eb RIP [] shrink_page_list+0x24e/0x897 RSP CR2: 0000000000000028 ---[ end trace 703d2451af6ffbfd ]--- Kernel panic - not syncing: Fatal exception This patch fixes the issue by ensuring the proper tests are made at putback_movable_pages() & reclaim_clean_pages_from_list() to avoid isolated balloon pages being wrongly reinserted in LRU lists. [akpm@linux-foundation.org: clarify awkward comment text] Signed-off-by: Rafael Aquini Reported-by: Luiz Capitulino Tested-by: Luiz Capitulino Cc: Mel Gorman Cc: Rik van Riel Cc: Hugh Dickins Cc: Johannes Weiner Cc: Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/balloon_compaction.h | 25 +++++++++++++++++++++++++ mm/migrate.c | 2 +- mm/vmscan.c | 4 +++- 3 files changed, 29 insertions(+), 2 deletions(-) (limited to 'include') diff --git a/include/linux/balloon_compaction.h b/include/linux/balloon_compaction.h index f7f1d7169b11..089743ade734 100644 --- a/include/linux/balloon_compaction.h +++ b/include/linux/balloon_compaction.h @@ -158,6 +158,26 @@ static inline bool balloon_page_movable(struct page *page) return false; } +/* + * isolated_balloon_page - identify an isolated balloon page on private + * compaction/migration page lists. + * + * After a compaction thread isolates a balloon page for migration, it raises + * the page refcount to prevent concurrent compaction threads from re-isolating + * the same page. For that reason putback_movable_pages(), or other routines + * that need to identify isolated balloon pages on private pagelists, cannot + * rely on balloon_page_movable() to accomplish the task. + */ +static inline bool isolated_balloon_page(struct page *page) +{ + /* Already isolated balloon pages, by default, have a raised refcount */ + if (page_flags_cleared(page) && !page_mapped(page) && + page_count(page) >= 2) + return __is_movable_balloon_page(page); + + return false; +} + /* * balloon_page_insert - insert a page into the balloon's page list and make * the page->mapping assignment accordingly.
@@ -243,6 +263,11 @@ static inline bool balloon_page_movable(struct page *page) return false; } +static inline bool isolated_balloon_page(struct page *page) +{ + return false; +} + static inline bool balloon_page_isolate(struct page *page) { return false; diff --git a/mm/migrate.c b/mm/migrate.c index 9c8d5f59d30b..a26bccd44ccb 100644 --- a/mm/migrate.c +++ b/mm/migrate.c @@ -107,7 +107,7 @@ void putback_movable_pages(struct list_head *l) list_del(&page->lru); dec_zone_page_state(page, NR_ISOLATED_ANON + page_is_file_cache(page)); - if (unlikely(balloon_page_movable(page))) + if (unlikely(isolated_balloon_page(page))) balloon_page_putback(page); else putback_lru_page(page); diff --git a/mm/vmscan.c b/mm/vmscan.c index beb35778c69f..53f2f82f83ae 100644 --- a/mm/vmscan.c +++ b/mm/vmscan.c @@ -48,6 +48,7 @@ #include #include +#include #include "internal.h" @@ -1113,7 +1114,8 @@ unsigned long reclaim_clean_pages_from_list(struct zone *zone, LIST_HEAD(clean_pages); list_for_each_entry_safe(page, next, page_list, lru) { - if (page_is_file_cache(page) && !PageDirty(page)) { + if (page_is_file_cache(page) && !PageDirty(page) && + !isolated_balloon_page(page)) { ClearPageActive(page); list_move(&page->lru, &clean_pages); } -- cgit v1.2.3 From 45906723578f768d029ba7774cd97b435f2c5125 Mon Sep 17 00:00:00 2001 From: Nicolas Dichtel Date: Mon, 30 Sep 2013 14:16:41 +0200 Subject: skbuff: size of hole is wrong in a comment Since commit c93bdd0e03e8 ("netvm: allow skb allocation to use PFMEMALLOC reserves"), the hole size is one bit less than what is written in the comment. Signed-off-by: Nicolas Dichtel Signed-off-by: David S. Miller --- include/linux/skbuff.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'include') diff --git a/include/linux/skbuff.h b/include/linux/skbuff.h index 2ddb48d9312c..c2d89335f637 100644 --- a/include/linux/skbuff.h +++ b/include/linux/skbuff.h @@ -498,7 +498,7 @@ struct sk_buff { * headers if needed */ __u8 encapsulation:1; - /* 7/9 bit hole (depending on ndisc_nodetype presence) */ + /* 6/8 bit hole (depending on ndisc_nodetype presence) */ kmemcheck_bitfield_end(flags2); #if defined CONFIG_NET_DMA || defined CONFIG_NET_RX_BUSY_POLL -- cgit v1.2.3 From 5bc3db5c9ca8407f52918b6504d3b27230defedc Mon Sep 17 00:00:00 2001 From: stephen hemminger Date: Mon, 30 Sep 2013 21:30:22 -0700 Subject: tc: export tc_defact.h to userspace Jamal sent a patch to add tc user simple actions to iproute2, but the required header was not being exported. Signed-off-by: Stephen Hemminger Signed-off-by: David S.
Miller --- include/linux/tc_act/tc_defact.h | 19 ------------------- include/uapi/linux/tc_act/Kbuild | 1 + include/uapi/linux/tc_act/tc_defact.h | 19 +++++++++++++++++++ 3 files changed, 20 insertions(+), 19 deletions(-) delete mode 100644 include/linux/tc_act/tc_defact.h create mode 100644 include/uapi/linux/tc_act/tc_defact.h (limited to 'include') diff --git a/include/linux/tc_act/tc_defact.h b/include/linux/tc_act/tc_defact.h deleted file mode 100644 index 6f65d07c7ce2..000000000000 --- a/include/linux/tc_act/tc_defact.h +++ /dev/null @@ -1,19 +0,0 @@ -#ifndef __LINUX_TC_DEF_H -#define __LINUX_TC_DEF_H - -#include - -struct tc_defact { - tc_gen; -}; - -enum { - TCA_DEF_UNSPEC, - TCA_DEF_TM, - TCA_DEF_PARMS, - TCA_DEF_DATA, - __TCA_DEF_MAX -}; -#define TCA_DEF_MAX (__TCA_DEF_MAX - 1) - -#endif diff --git a/include/uapi/linux/tc_act/Kbuild b/include/uapi/linux/tc_act/Kbuild index 0623ec4e728f..56f121605c99 100644 --- a/include/uapi/linux/tc_act/Kbuild +++ b/include/uapi/linux/tc_act/Kbuild @@ -1,5 +1,6 @@ # UAPI Header export list header-y += tc_csum.h +header-y += tc_defact.h header-y += tc_gact.h header-y += tc_ipt.h header-y += tc_mirred.h diff --git a/include/uapi/linux/tc_act/tc_defact.h b/include/uapi/linux/tc_act/tc_defact.h new file mode 100644 index 000000000000..17dddb40f740 --- /dev/null +++ b/include/uapi/linux/tc_act/tc_defact.h @@ -0,0 +1,19 @@ +#ifndef __LINUX_TC_DEF_H +#define __LINUX_TC_DEF_H + +#include + +struct tc_defact { + tc_gen; +}; + +enum { + TCA_DEF_UNSPEC, + TCA_DEF_TM, + TCA_DEF_PARMS, + TCA_DEF_DATA, + __TCA_DEF_MAX +}; +#define TCA_DEF_MAX (__TCA_DEF_MAX - 1) + +#endif -- cgit v1.2.3 From 26794942461f438a6bc725ec7294b08a6bd782c4 Mon Sep 17 00:00:00 2001 From: David Miller Date: Wed, 2 Oct 2013 14:25:09 -0400 Subject: mm: Fix generic hugetlb pte check return type. The include/asm-generic/hugetlb.h stubs that just vector huge_pte_*() calls to the pte_*() implementations won't work in certain situations. x86 and sparc, for example, return "unsigned long" from the bit checks, and just go "return pte_val(pte) & PTE_BIT_FOO;" But since huge_pte_*() returns 'int', if any high bits on 64-bit are relevant, they get chopped off. The net effect is that we can loop forever trying to COW a huge page, because the huge_pte_write() check signals false all the time. Reported-by: Gurudas Pai Tested-by: Gurudas Pai Signed-off-by: David S. Miller Acked-by: David Rientjes --- include/asm-generic/hugetlb.h | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'include') diff --git a/include/asm-generic/hugetlb.h b/include/asm-generic/hugetlb.h index d06079c774a0..99b490b4d05a 100644 --- a/include/asm-generic/hugetlb.h +++ b/include/asm-generic/hugetlb.h @@ -6,12 +6,12 @@ static inline pte_t mk_huge_pte(struct page *page, pgprot_t pgprot) return mk_pte(page, pgprot); } -static inline int huge_pte_write(pte_t pte) +static inline unsigned long huge_pte_write(pte_t pte) { return pte_write(pte); } -static inline int huge_pte_dirty(pte_t pte) +static inline unsigned long huge_pte_dirty(pte_t pte) { return pte_dirty(pte); } -- cgit v1.2.3 From 9886167d20c0720dcfb01e62cdff4d906b226f43 Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Thu, 3 Oct 2013 16:02:23 +0200 Subject: perf: Fix perf_pmu_migrate_context While auditing the list_entry usage due to a trinity bug I found that perf_pmu_migrate_context violates the rules for perf_event::event_entry. 
The problem is that perf_event::event_entry is an RCU list element, and hence we must wait for a full RCU grace period before re-using the element after deletion. Therefore the usage in perf_pmu_migrate_context(), which re-uses the entry immediately, is broken. For now introduce another list_head into perf_event for this specific usage. This doesn't actually fix the trinity report because that never goes through this code. Signed-off-by: Peter Zijlstra Link: http://lkml.kernel.org/n/tip-mkj72lxagw1z8fvjm648iznw@git.kernel.org Signed-off-by: Ingo Molnar --- include/linux/perf_event.h | 24 +++++++++++++++++++++++- kernel/events/core.c | 6 +++--- 2 files changed, 26 insertions(+), 4 deletions(-) (limited to 'include') diff --git a/include/linux/perf_event.h b/include/linux/perf_event.h index 866e85c5eb94..c8ba627c1d60 100644 --- a/include/linux/perf_event.h +++ b/include/linux/perf_event.h @@ -294,9 +294,31 @@ struct ring_buffer; */ struct perf_event { #ifdef CONFIG_PERF_EVENTS - struct list_head group_entry; + /* + * entry onto perf_event_context::event_list; + * modifications require ctx->lock + * RCU safe iterations. + */ struct list_head event_entry; + + /* + * XXX: group_entry and sibling_list should be mutually exclusive; + * either you're a sibling on a group, or you're the group leader. + * Rework the code to always use the same list element. + * + * Locked for modification by both ctx->mutex and ctx->lock; holding + * either suffices for read. + */ struct list_head group_entry; struct list_head sibling_list; + + /* + * We need storage to track the entries in perf_pmu_migrate_context; we + * cannot use the event_entry because of RCU and we want to keep the + * group intact which avoids us using the other two entries. + */ + struct list_head migrate_entry; + struct hlist_node hlist_entry; int nr_siblings; int group_flags; diff --git a/kernel/events/core.c b/kernel/events/core.c index cb4238e85b38..d49a9d29334c 100644 --- a/kernel/events/core.c +++ b/kernel/events/core.c @@ -7234,15 +7234,15 @@ void perf_pmu_migrate_context(struct pmu *pmu, int src_cpu, int dst_cpu) perf_remove_from_context(event); unaccount_event_cpu(event, src_cpu); put_ctx(src_ctx); - list_add(&event->event_entry, &events); + list_add(&event->migrate_entry, &events); } mutex_unlock(&src_ctx->mutex); synchronize_rcu(); mutex_lock(&dst_ctx->mutex); - list_for_each_entry_safe(event, tmp, &events, event_entry) { - list_del(&event->event_entry); + list_for_each_entry_safe(event, tmp, &events, migrate_entry) { + list_del(&event->migrate_entry); if (event->state >= PERF_EVENT_STATE_OFF) event->state = PERF_EVENT_STATE_INACTIVE; account_event_cpu(event, dst_cpu); -- cgit v1.2.3 From 3573540cafa4296dd60f8be02f2aecaa31047525 Mon Sep 17 00:00:00 2001 From: "Michael S. Tsirkin" Date: Wed, 2 Oct 2013 09:14:06 +0300 Subject: netif_set_xps_queue: make cpu mask const virtio wants to pass in cpumask_of(cpu), so make the parameter const to avoid build warnings. Signed-off-by: Michael S. Tsirkin Signed-off-by: David S.
Miller --- include/linux/netdevice.h | 5 +++-- net/core/dev.c | 3 ++- 2 files changed, 5 insertions(+), 3 deletions(-) (limited to 'include') diff --git a/include/linux/netdevice.h b/include/linux/netdevice.h index 3de49aca4519..25f5d2d11e7c 100644 --- a/include/linux/netdevice.h +++ b/include/linux/netdevice.h @@ -2264,11 +2264,12 @@ static inline void netif_wake_subqueue(struct net_device *dev, u16 queue_index) } #ifdef CONFIG_XPS -extern int netif_set_xps_queue(struct net_device *dev, struct cpumask *mask, +extern int netif_set_xps_queue(struct net_device *dev, + const struct cpumask *mask, u16 index); #else static inline int netif_set_xps_queue(struct net_device *dev, - struct cpumask *mask, + const struct cpumask *mask, u16 index) { return 0; diff --git a/net/core/dev.c b/net/core/dev.c index 65f829cfd928..3430b1ed12e5 100644 --- a/net/core/dev.c +++ b/net/core/dev.c @@ -1917,7 +1917,8 @@ static struct xps_map *expand_xps_map(struct xps_map *map, return new_map; } -int netif_set_xps_queue(struct net_device *dev, struct cpumask *mask, u16 index) +int netif_set_xps_queue(struct net_device *dev, const struct cpumask *mask, + u16 index) { struct xps_dev_maps *dev_maps, *new_dev_maps = NULL; struct xps_map *map, *new_map; -- cgit v1.2.3 From d45ed4a4e33ae103053c0a53d280014e7101bb5c Mon Sep 17 00:00:00 2001 From: Alexei Starovoitov Date: Fri, 4 Oct 2013 00:14:06 -0700 Subject: net: fix unsafe set_memory_rw from softirq on x86 system with net.core.bpf_jit_enable = 1 sudo tcpdump -i eth1 'tcp port 22' causes the warning: [ 56.766097] Possible unsafe locking scenario: [ 56.766097] [ 56.780146] CPU0 [ 56.786807] ---- [ 56.793188] lock(&(&vb->lock)->rlock); [ 56.799593] [ 56.805889] lock(&(&vb->lock)->rlock); [ 56.812266] [ 56.812266] *** DEADLOCK *** [ 56.812266] [ 56.830670] 1 lock held by ksoftirqd/1/13: [ 56.836838] #0: (rcu_read_lock){.+.+..}, at: [] vm_unmap_aliases+0x8c/0x380 [ 56.849757] [ 56.849757] stack backtrace: [ 56.862194] CPU: 1 PID: 13 Comm: ksoftirqd/1 Not tainted 3.12.0-rc3+ #45 [ 56.868721] Hardware name: System manufacturer System Product Name/P8Z77 WS, BIOS 3007 07/26/2012 [ 56.882004] ffffffff821944c0 ffff88080bbdb8c8 ffffffff8175a145 0000000000000007 [ 56.895630] ffff88080bbd5f40 ffff88080bbdb928 ffffffff81755b14 0000000000000001 [ 56.909313] ffff880800000001 ffff880800000000 ffffffff8101178f 0000000000000001 [ 56.923006] Call Trace: [ 56.929532] [] dump_stack+0x55/0x76 [ 56.936067] [] print_usage_bug+0x1f7/0x208 [ 56.942445] [] ? save_stack_trace+0x2f/0x50 [ 56.948932] [] ? check_usage_backwards+0x150/0x150 [ 56.955470] [] mark_lock+0x282/0x2c0 [ 56.961945] [] __lock_acquire+0x45d/0x1d50 [ 56.968474] [] ? __lock_acquire+0x2de/0x1d50 [ 56.975140] [] ? cpumask_next_and+0x55/0x90 [ 56.981942] [] lock_acquire+0x92/0x1d0 [ 56.988745] [] ? vm_unmap_aliases+0x16a/0x380 [ 56.995619] [] _raw_spin_lock+0x41/0x50 [ 57.002493] [] ? vm_unmap_aliases+0x16a/0x380 [ 57.009447] [] vm_unmap_aliases+0x16a/0x380 [ 57.016477] [] ? vm_unmap_aliases+0x8c/0x380 [ 57.023607] [] change_page_attr_set_clr+0xc0/0x460 [ 57.030818] [] ? trace_hardirqs_on+0xd/0x10 [ 57.037896] [] ? kmem_cache_free+0xb0/0x2b0 [ 57.044789] [] ? 
free_object_rcu+0x93/0xa0 [ 57.051720] [] set_memory_rw+0x2f/0x40 [ 57.058727] [] bpf_jit_free+0x2c/0x40 [ 57.065577] [] sk_filter_release_rcu+0x1a/0x30 [ 57.072338] [] rcu_process_callbacks+0x202/0x7c0 [ 57.078962] [] __do_softirq+0xf7/0x3f0 [ 57.085373] [] run_ksoftirqd+0x35/0x70 We cannot reuse the jited filter memory, since it's read-only, so use the original bpf insns memory to hold the work_struct. Defer the kfree of sk_filter until the jit has completed freeing. Tested on x86_64 and i386. Signed-off-by: Alexei Starovoitov Acked-by: Eric Dumazet Signed-off-by: David S. Miller --- arch/arm/net/bpf_jit_32.c | 1 + arch/powerpc/net/bpf_jit_comp.c | 1 + arch/s390/net/bpf_jit_comp.c | 4 +++- arch/sparc/net/bpf_jit_comp.c | 1 + arch/x86/net/bpf_jit_comp.c | 18 +++++++++++++----- include/linux/filter.h | 15 +++++++++++---- include/net/sock.h | 6 ++---- net/core/filter.c | 8 ++++---- 8 files changed, 36 insertions(+), 18 deletions(-) (limited to 'include') diff --git a/arch/arm/net/bpf_jit_32.c b/arch/arm/net/bpf_jit_32.c index f50d223a0bd3..99b44e0e8d86 100644 --- a/arch/arm/net/bpf_jit_32.c +++ b/arch/arm/net/bpf_jit_32.c @@ -930,4 +930,5 @@ void bpf_jit_free(struct sk_filter *fp) { if (fp->bpf_func != sk_run_filter) module_free(NULL, fp->bpf_func); + kfree(fp); } diff --git a/arch/powerpc/net/bpf_jit_comp.c b/arch/powerpc/net/bpf_jit_comp.c index bf56e33f8257..2345bdb4d917 100644 --- a/arch/powerpc/net/bpf_jit_comp.c +++ b/arch/powerpc/net/bpf_jit_comp.c @@ -691,4 +691,5 @@ void bpf_jit_free(struct sk_filter *fp) { if (fp->bpf_func != sk_run_filter) module_free(NULL, fp->bpf_func); + kfree(fp); } diff --git a/arch/s390/net/bpf_jit_comp.c b/arch/s390/net/bpf_jit_comp.c index 709239285869..a5df511e27a2 100644 --- a/arch/s390/net/bpf_jit_comp.c +++ b/arch/s390/net/bpf_jit_comp.c @@ -881,7 +881,9 @@ void bpf_jit_free(struct sk_filter *fp) struct bpf_binary_header *header = (void *)addr; if (fp->bpf_func == sk_run_filter) - return; + goto free_filter; set_memory_rw(addr, header->pages); module_free(NULL, header); +free_filter: + kfree(fp); } diff --git a/arch/sparc/net/bpf_jit_comp.c b/arch/sparc/net/bpf_jit_comp.c index 9c7be59e6f5a..218b6b23c378 100644 --- a/arch/sparc/net/bpf_jit_comp.c +++ b/arch/sparc/net/bpf_jit_comp.c @@ -808,4 +808,5 @@ void bpf_jit_free(struct sk_filter *fp) { if (fp->bpf_func != sk_run_filter) module_free(NULL, fp->bpf_func); + kfree(fp); } diff --git a/arch/x86/net/bpf_jit_comp.c b/arch/x86/net/bpf_jit_comp.c index 79c216aa0e2b..516593e1ce33 100644 --- a/arch/x86/net/bpf_jit_comp.c +++ b/arch/x86/net/bpf_jit_comp.c @@ -772,13 +772,21 @@ out: return; } +static void bpf_jit_free_deferred(struct work_struct *work) +{ + struct sk_filter *fp = container_of(work, struct sk_filter, work); + unsigned long addr = (unsigned long)fp->bpf_func & PAGE_MASK; + struct bpf_binary_header *header = (void *)addr; + + set_memory_rw(addr, header->pages); + module_free(NULL, header); + kfree(fp); +} + void bpf_jit_free(struct sk_filter *fp) { if (fp->bpf_func != sk_run_filter) { - unsigned long addr = (unsigned long)fp->bpf_func & PAGE_MASK; - struct bpf_binary_header *header = (void *)addr; - - set_memory_rw(addr, header->pages); - module_free(NULL, header); + INIT_WORK(&fp->work, bpf_jit_free_deferred); + schedule_work(&fp->work); } } diff --git a/include/linux/filter.h b/include/linux/filter.h index a6ac84871d6d..ff4e40cd45b1 100644 --- a/include/linux/filter.h +++ b/include/linux/filter.h @@ -6,6 +6,7 @@ #include #include +#include #include #ifdef CONFIG_COMPAT @@ -25,15 +26,19 @@ struct sk_filter { atomic_t
refcnt; unsigned int len; /* Number of filter blocks */ + struct rcu_head rcu; unsigned int (*bpf_func)(const struct sk_buff *skb, const struct sock_filter *filter); - struct rcu_head rcu; - struct sock_filter insns[0]; + union { + struct sock_filter insns[0]; + struct work_struct work; + }; }; -static inline unsigned int sk_filter_len(const struct sk_filter *fp) +static inline unsigned int sk_filter_size(unsigned int proglen) { - return fp->len * sizeof(struct sock_filter) + sizeof(*fp); + return max(sizeof(struct sk_filter), + offsetof(struct sk_filter, insns[proglen])); } extern int sk_filter(struct sock *sk, struct sk_buff *skb); @@ -67,11 +72,13 @@ static inline void bpf_jit_dump(unsigned int flen, unsigned int proglen, } #define SK_RUN_FILTER(FILTER, SKB) (*FILTER->bpf_func)(SKB, FILTER->insns) #else +#include static inline void bpf_jit_compile(struct sk_filter *fp) { } static inline void bpf_jit_free(struct sk_filter *fp) { + kfree(fp); } #define SK_RUN_FILTER(FILTER, SKB) sk_run_filter(SKB, FILTER->insns) #endif diff --git a/include/net/sock.h b/include/net/sock.h index 1d37a8086bed..808cbc2ec6c1 100644 --- a/include/net/sock.h +++ b/include/net/sock.h @@ -1630,16 +1630,14 @@ static inline void sk_filter_release(struct sk_filter *fp) static inline void sk_filter_uncharge(struct sock *sk, struct sk_filter *fp) { - unsigned int size = sk_filter_len(fp); - - atomic_sub(size, &sk->sk_omem_alloc); + atomic_sub(sk_filter_size(fp->len), &sk->sk_omem_alloc); sk_filter_release(fp); } static inline void sk_filter_charge(struct sock *sk, struct sk_filter *fp) { atomic_inc(&fp->refcnt); - atomic_add(sk_filter_len(fp), &sk->sk_omem_alloc); + atomic_add(sk_filter_size(fp->len), &sk->sk_omem_alloc); } /* diff --git a/net/core/filter.c b/net/core/filter.c index 6438f29ff266..01b780856db2 100644 --- a/net/core/filter.c +++ b/net/core/filter.c @@ -644,7 +644,6 @@ void sk_filter_release_rcu(struct rcu_head *rcu) struct sk_filter *fp = container_of(rcu, struct sk_filter, rcu); bpf_jit_free(fp); - kfree(fp); } EXPORT_SYMBOL(sk_filter_release_rcu); @@ -683,7 +682,7 @@ int sk_unattached_filter_create(struct sk_filter **pfp, if (fprog->filter == NULL) return -EINVAL; - fp = kmalloc(fsize + sizeof(*fp), GFP_KERNEL); + fp = kmalloc(sk_filter_size(fprog->len), GFP_KERNEL); if (!fp) return -ENOMEM; memcpy(fp->insns, fprog->filter, fsize); @@ -723,6 +722,7 @@ int sk_attach_filter(struct sock_fprog *fprog, struct sock *sk) { struct sk_filter *fp, *old_fp; unsigned int fsize = sizeof(struct sock_filter) * fprog->len; + unsigned int sk_fsize = sk_filter_size(fprog->len); int err; if (sock_flag(sk, SOCK_FILTER_LOCKED)) @@ -732,11 +732,11 @@ int sk_attach_filter(struct sock_fprog *fprog, struct sock *sk) if (fprog->filter == NULL) return -EINVAL; - fp = sock_kmalloc(sk, fsize+sizeof(*fp), GFP_KERNEL); + fp = sock_kmalloc(sk, sk_fsize, GFP_KERNEL); if (!fp) return -ENOMEM; if (copy_from_user(fp->insns, fprog->filter, fsize)) { - sock_kfree_s(sk, fp, fsize+sizeof(*fp)); + sock_kfree_s(sk, fp, sk_fsize); return -EFAULT; } -- cgit v1.2.3 From d623a0e19dcbc4e44a8db047158815d7f8c2b839 Mon Sep 17 00:00:00 2001 From: Tony Lindgren Date: Mon, 7 Oct 2013 10:22:01 -0700 Subject: ARM: dts: Fix pinctrl mask for omap3 MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit The wake-up interrupt bit is available on omap3/4/5 processors unlike what we claim. Without fixing it we cannot use it on omap3 and the system configured for wake-up events will just hang on wake-up. 
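As a minimal illustration of why the wider mask matters (hypothetical helper, not part of this patch): pinctrl-single only ever touches the bits covered by pinctrl-single,function-mask, so a mask of 0x7f1f leaves bit 15, the wake-up event bit, permanently outside driver control.

#define OMAP_WAKEUP_EVENT	(1 << 15)	/* the bit excluded by the old 0x7f1f mask */

/* sketch of a pinctrl-single style masked register update */
static u16 pcs_update(u16 reg, u16 function_mask, u16 val)
{
	/* only bits inside function_mask are ever read or written */
	return (reg & ~function_mask) | (val & function_mask);
}

With function_mask = 0x7f1f, (val & function_mask) can never set or clear OMAP_WAKEUP_EVENT; with 0xff1f the wake-up bit is handled like the rest of the padconf bits.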
Cc: Grygorii Strashko Cc: Benoît Cousson Cc: devicetree@vger.kernel.org Signed-off-by: Tony Lindgren --- arch/arm/boot/dts/omap3.dtsi | 4 ++-- arch/arm/mach-omap2/mux.h | 4 +--- include/dt-bindings/pinctrl/omap.h | 4 +--- 3 files changed, 4 insertions(+), 8 deletions(-) (limited to 'include') diff --git a/arch/arm/boot/dts/omap3.dtsi b/arch/arm/boot/dts/omap3.dtsi index 7d95cda1fae4..b41bd57f4328 100644 --- a/arch/arm/boot/dts/omap3.dtsi +++ b/arch/arm/boot/dts/omap3.dtsi @@ -108,7 +108,7 @@ #address-cells = <1>; #size-cells = <0>; pinctrl-single,register-width = <16>; - pinctrl-single,function-mask = <0x7f1f>; + pinctrl-single,function-mask = <0xff1f>; }; omap3_pmx_wkup: pinmux@0x48002a00 { @@ -117,7 +117,7 @@ #address-cells = <1>; #size-cells = <0>; pinctrl-single,register-width = <16>; - pinctrl-single,function-mask = <0x7f1f>; + pinctrl-single,function-mask = <0xff1f>; }; gpio1: gpio@48310000 { diff --git a/arch/arm/mach-omap2/mux.h b/arch/arm/mach-omap2/mux.h index 5d2080ef7923..16f78a990d04 100644 --- a/arch/arm/mach-omap2/mux.h +++ b/arch/arm/mach-omap2/mux.h @@ -28,7 +28,7 @@ #define OMAP_PULL_UP (1 << 4) #define OMAP_ALTELECTRICALSEL (1 << 5) -/* 34xx specific mux bit defines */ +/* omap3/4/5 specific mux bit defines */ #define OMAP_INPUT_EN (1 << 8) #define OMAP_OFF_EN (1 << 9) #define OMAP_OFFOUT_EN (1 << 10) @@ -36,8 +36,6 @@ #define OMAP_OFF_PULL_EN (1 << 12) #define OMAP_OFF_PULL_UP (1 << 13) #define OMAP_WAKEUP_EN (1 << 14) - -/* 44xx specific mux bit defines */ #define OMAP_WAKEUP_EVENT (1 << 15) /* Active pin states */ diff --git a/include/dt-bindings/pinctrl/omap.h b/include/dt-bindings/pinctrl/omap.h index edbd250809cb..bed35e36fd27 100644 --- a/include/dt-bindings/pinctrl/omap.h +++ b/include/dt-bindings/pinctrl/omap.h @@ -23,7 +23,7 @@ #define PULL_UP (1 << 4) #define ALTELECTRICALSEL (1 << 5) -/* 34xx specific mux bit defines */ +/* omap3/4/5 specific mux bit defines */ #define INPUT_EN (1 << 8) #define OFF_EN (1 << 9) #define OFFOUT_EN (1 << 10) @@ -31,8 +31,6 @@ #define OFF_PULL_EN (1 << 12) #define OFF_PULL_UP (1 << 13) #define WAKEUP_EN (1 << 14) - -/* 44xx specific mux bit defines */ #define WAKEUP_EVENT (1 << 15) /* Active pin states */ -- cgit v1.2.3 From 2053a1db41193c2b5e1f47a91aaba0fd63ba7102 Mon Sep 17 00:00:00 2001 From: Roland Dreier Date: Tue, 8 Oct 2013 09:47:22 -0700 Subject: target: Fix assignment of LUN in tracepoints The unpacked_lun field in the SCSI target tracepoints should be initialized with cmd->orig_fe_lun rather than cmd->se_lun->unpacked_lun for two reasons: - most importantly, if we are in the cmd_complete tracepoint returning a check condition due to no LUN found, cmd->se_lun will be NULL and we'll crash trying to dereference it. - also, in any case, cmd->se_lun->unpacked_lun is an internal index into the target's internal set of LUNs; cmd->orig_fe_lun is much more useful and interesting, since it's the value the initiator actually sent. 
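A minimal sketch of the rule being applied (illustrative only, not from the patch): a tracepoint assignment must not chase pointers that can be NULL on error paths, and here a value that needs no dereference at all is available.

	/* unsafe: se_lun is NULL when the LUN lookup failed */
	__entry->unpacked_lun = cmd->se_lun->unpacked_lun;

	/* safe, and more meaningful: the LUN exactly as the initiator sent it */
	__entry->unpacked_lun = cmd->orig_fe_lun;

A defensive "cmd->se_lun ? cmd->se_lun->unpacked_lun : 0" would also avoid the crash, but would still record an internal index rather than the wire value.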
Signed-off-by: Roland Dreier Cc: # 3.11+ Signed-off-by: Nicholas Bellinger --- include/trace/events/target.h | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'include') diff --git a/include/trace/events/target.h b/include/trace/events/target.h index aef8fc354025..da9cc0f05c93 100644 --- a/include/trace/events/target.h +++ b/include/trace/events/target.h @@ -144,7 +144,7 @@ TRACE_EVENT(target_sequencer_start, ), TP_fast_assign( - __entry->unpacked_lun = cmd->se_lun->unpacked_lun; + __entry->unpacked_lun = cmd->orig_fe_lun; __entry->opcode = cmd->t_task_cdb[0]; __entry->data_length = cmd->data_length; __entry->task_attribute = cmd->sam_task_attr; @@ -182,7 +182,7 @@ TRACE_EVENT(target_cmd_complete, ), TP_fast_assign( - __entry->unpacked_lun = cmd->se_lun->unpacked_lun; + __entry->unpacked_lun = cmd->orig_fe_lun; __entry->opcode = cmd->t_task_cdb[0]; __entry->data_length = cmd->data_length; __entry->task_attribute = cmd->sam_task_attr; -- cgit v1.2.3 From c1868b822515313c72445e70b7d9e47d8815bc52 Mon Sep 17 00:00:00 2001 From: Eli Cohen Date: Wed, 11 Sep 2013 16:35:25 +0300 Subject: mlx5: Remove checksum on command interface commands Checksum calculations consume CPU resources and can significantly affect the rate of resource creation/destruction. Signed-off-by: Eli Cohen Signed-off-by: Roland Dreier --- drivers/net/ethernet/mellanox/mlx5/core/cmd.c | 26 ++++++++++++++------------ drivers/net/ethernet/mellanox/mlx5/core/main.c | 21 +++++---------------- include/linux/mlx5/device.h | 2 +- include/linux/mlx5/driver.h | 4 +--- 4 files changed, 21 insertions(+), 32 deletions(-) (limited to 'include') diff --git a/drivers/net/ethernet/mellanox/mlx5/core/cmd.c b/drivers/net/ethernet/mellanox/mlx5/core/cmd.c index 5472cbd34028..db3a6584939f 100644 --- a/drivers/net/ethernet/mellanox/mlx5/core/cmd.c +++ b/drivers/net/ethernet/mellanox/mlx5/core/cmd.c @@ -180,28 +180,32 @@ static int verify_block_sig(struct mlx5_cmd_prot_block *block) return 0; } -static void calc_block_sig(struct mlx5_cmd_prot_block *block, u8 token) +static void calc_block_sig(struct mlx5_cmd_prot_block *block, u8 token, + int csum) { block->token = token; - block->ctrl_sig = ~xor8_buf(block->rsvd0, sizeof(*block) - sizeof(block->data) - 2); - block->sig = ~xor8_buf(block, sizeof(*block) - 1); + if (csum) { + block->ctrl_sig = ~xor8_buf(block->rsvd0, sizeof(*block) - + sizeof(block->data) - 2); + block->sig = ~xor8_buf(block, sizeof(*block) - 1); + } } -static void calc_chain_sig(struct mlx5_cmd_msg *msg, u8 token) +static void calc_chain_sig(struct mlx5_cmd_msg *msg, u8 token, int csum) { struct mlx5_cmd_mailbox *next = msg->next; while (next) { - calc_block_sig(next->buf, token); + calc_block_sig(next->buf, token, csum); next = next->next; } } -static void set_signature(struct mlx5_cmd_work_ent *ent) +static void set_signature(struct mlx5_cmd_work_ent *ent, int csum) { ent->lay->sig = ~xor8_buf(ent->lay, sizeof(*ent->lay)); - calc_chain_sig(ent->in, ent->token); - calc_chain_sig(ent->out, ent->token); + calc_chain_sig(ent->in, ent->token, csum); + calc_chain_sig(ent->out, ent->token, csum); } static void poll_timeout(struct mlx5_cmd_work_ent *ent) @@ -539,8 +543,7 @@ static void cmd_work_handler(struct work_struct *work) lay->type = MLX5_PCI_CMD_XPORT; lay->token = ent->token; lay->status_own = CMD_OWNER_HW; - if (!cmd->checksum_disabled) - set_signature(ent); + set_signature(ent, !cmd->checksum_disabled); dump_command(dev, ent, 1); ktime_get_ts(&ent->ts1); @@ -773,8 +776,6 @@ static int mlx5_copy_from_msg(void
*to, struct mlx5_cmd_msg *from, int size) copy = min_t(int, size, MLX5_CMD_DATA_BLOCK_SIZE); block = next->buf; - if (xor8_buf(block, sizeof(*block)) != 0xff) - return -EINVAL; memcpy(to, block->data, copy); to += copy; @@ -1361,6 +1362,7 @@ int mlx5_cmd_init(struct mlx5_core_dev *dev) goto err_map; } + cmd->checksum_disabled = 1; cmd->max_reg_cmds = (1 << cmd->log_sz) - 1; cmd->bitmask = (1 << cmd->max_reg_cmds) - 1; diff --git a/drivers/net/ethernet/mellanox/mlx5/core/main.c b/drivers/net/ethernet/mellanox/mlx5/core/main.c index b47739b0b5f6..bc0f5fb66e24 100644 --- a/drivers/net/ethernet/mellanox/mlx5/core/main.c +++ b/drivers/net/ethernet/mellanox/mlx5/core/main.c @@ -165,9 +165,7 @@ static int handle_hca_cap(struct mlx5_core_dev *dev) struct mlx5_cmd_set_hca_cap_mbox_in *set_ctx = NULL; struct mlx5_cmd_query_hca_cap_mbox_in query_ctx; struct mlx5_cmd_set_hca_cap_mbox_out set_out; - struct mlx5_profile *prof = dev->profile; u64 flags; - int csum = 1; int err; memset(&query_ctx, 0, sizeof(query_ctx)); @@ -197,20 +195,14 @@ static int handle_hca_cap(struct mlx5_core_dev *dev) memcpy(&set_ctx->hca_cap, &query_out->hca_cap, sizeof(set_ctx->hca_cap)); - if (prof->mask & MLX5_PROF_MASK_CMDIF_CSUM) { - csum = !!prof->cmdif_csum; - flags = be64_to_cpu(set_ctx->hca_cap.flags); - if (csum) - flags |= MLX5_DEV_CAP_FLAG_CMDIF_CSUM; - else - flags &= ~MLX5_DEV_CAP_FLAG_CMDIF_CSUM; - - set_ctx->hca_cap.flags = cpu_to_be64(flags); - } - if (dev->profile->mask & MLX5_PROF_MASK_QP_SIZE) set_ctx->hca_cap.log_max_qp = dev->profile->log_max_qp; + flags = be64_to_cpu(query_out->hca_cap.flags); + /* disable checksum */ + flags &= ~MLX5_DEV_CAP_FLAG_CMDIF_CSUM; + + set_ctx->hca_cap.flags = cpu_to_be64(flags); memset(&set_out, 0, sizeof(set_out)); set_ctx->hca_cap.log_uar_page_sz = cpu_to_be16(PAGE_SHIFT - 12); set_ctx->hdr.opcode = cpu_to_be16(MLX5_CMD_OP_SET_HCA_CAP); @@ -225,9 +217,6 @@ static int handle_hca_cap(struct mlx5_core_dev *dev) if (err) goto query_ex; - if (!csum) - dev->cmd.checksum_disabled = 1; - query_ex: kfree(query_out); kfree(set_ctx); diff --git a/include/linux/mlx5/device.h b/include/linux/mlx5/device.h index 68029b30c3dc..770e3b448b3b 100644 --- a/include/linux/mlx5/device.h +++ b/include/linux/mlx5/device.h @@ -181,7 +181,7 @@ enum { MLX5_DEV_CAP_FLAG_TLP_HINTS = 1LL << 39, MLX5_DEV_CAP_FLAG_SIG_HAND_OVER = 1LL << 40, MLX5_DEV_CAP_FLAG_DCT = 1LL << 41, - MLX5_DEV_CAP_FLAG_CMDIF_CSUM = 1LL << 46, + MLX5_DEV_CAP_FLAG_CMDIF_CSUM = 3LL << 46, }; enum { diff --git a/include/linux/mlx5/driver.h b/include/linux/mlx5/driver.h index 8888381fc150..2cfc4309d45f 100644 --- a/include/linux/mlx5/driver.h +++ b/include/linux/mlx5/driver.h @@ -747,8 +747,7 @@ static inline u32 mlx5_idx_to_mkey(u32 mkey_idx) enum { MLX5_PROF_MASK_QP_SIZE = (u64)1 << 0, - MLX5_PROF_MASK_CMDIF_CSUM = (u64)1 << 1, - MLX5_PROF_MASK_MR_CACHE = (u64)1 << 2, + MLX5_PROF_MASK_MR_CACHE = (u64)1 << 1, }; enum { @@ -758,7 +757,6 @@ enum { struct mlx5_profile { u64 mask; u32 log_max_qp; - int cmdif_csum; struct { int size; int limit; -- cgit v1.2.3 From 2f6daec14d02deb84e7896a93196d78fbe9956a2 Mon Sep 17 00:00:00 2001 From: Eli Cohen Date: Wed, 11 Sep 2013 16:35:29 +0300 Subject: mlx5: Fix layout of struct mlx5_init_seg The layout of struct health_buffer was not according to firmware specification. Fix it to comply. 
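Layout bugs of this kind can be caught at build time with offset assertions against the firmware specification; a minimal sketch (the expected-offset constant is hypothetical, standing in for whatever the spec mandates):

#include <linux/bug.h>
#include <linux/stddef.h>

static inline void mlx5_init_seg_layout_check(void)
{
	/* rsvd3[1019] instead of [1023]: four fewer __be32 entries, so
	 * every field after the pad moves 16 bytes earlier, as the
	 * firmware spec requires. */
	BUILD_BUG_ON(offsetof(struct mlx5_init_seg, ieee1588_clk) !=
		     MLX5_SPEC_IEEE1588_CLK_OFFSET /* hypothetical constant */);
}

With the old rsvd3[1023] pad, ieee1588_clk and everything after it sat 16 bytes past their specified offsets.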
Signed-off-by: Eli Cohen Signed-off-by: Roland Dreier --- include/linux/mlx5/device.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'include') diff --git a/include/linux/mlx5/device.h b/include/linux/mlx5/device.h index 770e3b448b3b..5eb4e31af22b 100644 --- a/include/linux/mlx5/device.h +++ b/include/linux/mlx5/device.h @@ -417,7 +417,7 @@ struct mlx5_init_seg { struct health_buffer health; __be32 rsvd2[884]; __be32 health_counter; - __be32 rsvd3[1023]; + __be32 rsvd3[1019]; __be64 ieee1588_clk; __be32 ieee1588_clk_type; __be32 clr_intx; -- cgit v1.2.3 From ada9f5d007971a71d619e2abf66ebd3a9a399413 Mon Sep 17 00:00:00 2001 From: Sagi Grimberg Date: Wed, 11 Sep 2013 16:35:34 +0300 Subject: IB/mlx5: Fix eq names to display nicely in /proc/interrupts It's helpful for a driver to put the pci slot name in its interrupt names, so /proc/interrupts will show the pci slot of the device. Signed-off-by: Sagi Grimberg Signed-off-by: Eli Cohen Signed-off-by: Roland Dreier --- drivers/infiniband/hw/mlx5/main.c | 6 +++--- drivers/net/ethernet/mellanox/mlx5/core/eq.c | 4 +++- include/linux/mlx5/driver.h | 2 +- 3 files changed, 7 insertions(+), 5 deletions(-) (limited to 'include') diff --git a/drivers/infiniband/hw/mlx5/main.c b/drivers/infiniband/hw/mlx5/main.c index b267c65261c0..b1a6cb3a2809 100644 --- a/drivers/infiniband/hw/mlx5/main.c +++ b/drivers/infiniband/hw/mlx5/main.c @@ -164,6 +164,7 @@ int mlx5_vector2eqn(struct mlx5_ib_dev *dev, int vector, int *eqn, int *irqn) static int alloc_comp_eqs(struct mlx5_ib_dev *dev) { struct mlx5_eq_table *table = &dev->mdev.priv.eq_table; + char name[MLX5_MAX_EQ_NAME]; struct mlx5_eq *eq, *n; int ncomp_vec; int nent; @@ -180,11 +181,10 @@ static int alloc_comp_eqs(struct mlx5_ib_dev *dev) goto clean; } - snprintf(eq->name, MLX5_MAX_EQ_NAME, "mlx5_comp%d", i); + snprintf(name, MLX5_MAX_EQ_NAME, "mlx5_comp%d", i); err = mlx5_create_map_eq(&dev->mdev, eq, i + MLX5_EQ_VEC_COMP_BASE, nent, 0, - eq->name, - &dev->mdev.priv.uuari.uars[0]); + name, &dev->mdev.priv.uuari.uars[0]); if (err) { kfree(eq); goto clean; diff --git a/drivers/net/ethernet/mellanox/mlx5/core/eq.c b/drivers/net/ethernet/mellanox/mlx5/core/eq.c index 443cc4d7b024..2231d93cc7ad 100644 --- a/drivers/net/ethernet/mellanox/mlx5/core/eq.c +++ b/drivers/net/ethernet/mellanox/mlx5/core/eq.c @@ -366,9 +366,11 @@ int mlx5_create_map_eq(struct mlx5_core_dev *dev, struct mlx5_eq *eq, u8 vecidx, goto err_in; } + snprintf(eq->name, MLX5_MAX_EQ_NAME, "%s@pci:%s", + name, pci_name(dev->pdev)); eq->eqn = out.eq_number; err = request_irq(table->msix_arr[vecidx].vector, mlx5_msix_handler, 0, - name, eq); + eq->name, eq); if (err) goto err_eq; diff --git a/include/linux/mlx5/driver.h b/include/linux/mlx5/driver.h index 2cfc4309d45f..6b8c496572c8 100644 --- a/include/linux/mlx5/driver.h +++ b/include/linux/mlx5/driver.h @@ -82,7 +82,7 @@ enum { }; enum { - MLX5_MAX_EQ_NAME = 20 + MLX5_MAX_EQ_NAME = 32 }; enum { -- cgit v1.2.3 From 61875f30daf60305712e25b209ef41ced2635bad Mon Sep 17 00:00:00 2001 From: Theodore Ts'o Date: Sat, 21 Sep 2013 13:58:22 -0400 Subject: random: allow architectures to optionally define random_get_entropy() Allow architectures which have a disabled get_cycles() function to provide a random_get_entropy() function which provides a fine-grained, rapidly changing counter that can be used by the /dev/random driver. 
For example, an architecture might have a rapidly changing register used to control random TLB cache eviction, or DRAM refresh that doesn't meet the requirements of get_cycles(), but which is good enough for the needs of the random driver. Signed-off-by: "Theodore Ts'o" Cc: stable@vger.kernel.org --- drivers/char/random.c | 8 ++++---- include/linux/timex.h | 14 ++++++++++++++ 2 files changed, 18 insertions(+), 4 deletions(-) (limited to 'include') diff --git a/drivers/char/random.c b/drivers/char/random.c index 92e6c67e1ae6..2d5daf9b58e9 100644 --- a/drivers/char/random.c +++ b/drivers/char/random.c @@ -643,7 +643,7 @@ struct timer_rand_state { */ void add_device_randomness(const void *buf, unsigned int size) { - unsigned long time = get_cycles() ^ jiffies; + unsigned long time = random_get_entropy() ^ jiffies; mix_pool_bytes(&input_pool, buf, size, NULL); mix_pool_bytes(&input_pool, &time, sizeof(time), NULL); @@ -680,7 +680,7 @@ static void add_timer_randomness(struct timer_rand_state *state, unsigned num) goto out; sample.jiffies = jiffies; - sample.cycles = get_cycles(); + sample.cycles = random_get_entropy(); sample.num = num; mix_pool_bytes(&input_pool, &sample, sizeof(sample), NULL); @@ -747,7 +747,7 @@ void add_interrupt_randomness(int irq, int irq_flags) struct fast_pool *fast_pool = &__get_cpu_var(irq_randomness); struct pt_regs *regs = get_irq_regs(); unsigned long now = jiffies; - __u32 input[4], cycles = get_cycles(); + __u32 input[4], cycles = random_get_entropy(); input[0] = cycles ^ jiffies; input[1] = irq; @@ -1485,7 +1485,7 @@ unsigned int get_random_int(void) hash = get_cpu_var(get_random_int_hash); - hash[0] += current->pid + jiffies + get_cycles(); + hash[0] += current->pid + jiffies + random_get_entropy(); md5_transform(hash, random_int_secret); ret = hash[0]; put_cpu_var(get_random_int_hash); diff --git a/include/linux/timex.h b/include/linux/timex.h index b3726e61368e..da4c32dbb2aa 100644 --- a/include/linux/timex.h +++ b/include/linux/timex.h @@ -64,6 +64,20 @@ #include +#ifndef random_get_entropy +/* + * The random_get_entropy() function is used by the /dev/random driver + * in order to extract entropy via the relative unpredictability of + * when an interrupt takes place versus a high speed, fine-grained + * timing source or cycle counter. Since it will be called on every + * single interrupt, it must have a very low cost/overhead. + * + * By default we use get_cycles() for this purpose, but individual + * architectures may override this in their asm/timex.h header file. + */ +#define random_get_entropy() get_cycles() +#endif + /* * SHIFT_PLL is used as a dampening factor to define how much we * adjust the frequency correction for a given offset in PLL mode. -- cgit v1.2.3 From ebff5fa9d545574324095d9c6a3cb80c9157abc5 Mon Sep 17 00:00:00 2001 From: Dave Airlie Date: Fri, 11 Oct 2013 15:12:04 +1000 Subject: Revert "i915: Update VGA arbiter support for newer devices" MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit This reverts commit 81b5c7bc8de3e6f63419139c2fc91bf81dea8a7d. Adding drm/i915 into the vga arbiter chain means that X (in a piece of well-meant paranoia) will do a get/put on the vga decoding around _every_ accel call down into the ddx, which results in some nice performance disasters [1].
This really breaks userspace: it disables DRI for everyone and stops OpenGL from working. This isn't limited to just the i915; it affects both the integrated and discrete GPUs on multi-gpu systems. In other words, this causes untold worlds of pain. Ville tried to come up with a Great Hack to fiddle the required VGA I/O ops behind everyone's back using stop_machine, but that didn't really work out [2]. Given that we're fairly late in the -rc stage for such games, let's just revert this all. One thing we might want to keep is to delay the disabling of the vga decoding until the fbdev emulation and the fbcon screen are set up. If we kill vga mem decoding beforehand, fbcon can end up with a white square in the top-left corner that it tried to save from the vga memory for a seamless transition. And we have bug reports on older platforms which seem to match these symptoms. But again that's something to play around with in -next. References: [1] http://lists.x.org/archives/xorg-devel/2013-September/037763.html References: [2] http://www.spinics.net/lists/intel-gfx/msg34062.html Cc: Alex Williamson Cc: Ville Syrjälä Cc: Chris Wilson Signed-off-by: Daniel Vetter Signed-off-by: Dave Airlie --- drivers/gpu/drm/i915/i915_dma.c | 9 +++------ drivers/gpu/drm/i915/intel_display.c | 25 ------------------------- include/linux/vgaarb.h | 7 ------- 3 files changed, 3 insertions(+), 38 deletions(-) (limited to 'include') diff --git a/drivers/gpu/drm/i915/i915_dma.c b/drivers/gpu/drm/i915/i915_dma.c index 52f5ad8037cc..d5c784d48671 100644 --- a/drivers/gpu/drm/i915/i915_dma.c +++ b/drivers/gpu/drm/i915/i915_dma.c @@ -1290,12 +1290,9 @@ static int i915_load_modeset_init(struct drm_device *dev) * then we do not take part in VGA arbitration and the * vga_client_register() fails with -ENODEV.
*/ - if (!HAS_PCH_SPLIT(dev)) { - ret = vga_client_register(dev->pdev, dev, NULL, - i915_vga_set_decode); - if (ret && ret != -ENODEV) - goto out; - } + ret = vga_client_register(dev->pdev, dev, NULL, i915_vga_set_decode); + if (ret && ret != -ENODEV) + goto out; intel_register_dsm_handler(); diff --git a/drivers/gpu/drm/i915/intel_display.c b/drivers/gpu/drm/i915/intel_display.c index aaea3ec811ed..581fb4b2f766 100644 --- a/drivers/gpu/drm/i915/intel_display.c +++ b/drivers/gpu/drm/i915/intel_display.c @@ -10038,15 +10038,6 @@ static void i915_disable_vga(struct drm_device *dev) outb(SR01, VGA_SR_INDEX); sr1 = inb(VGA_SR_DATA); outb(sr1 | 1<<5, VGA_SR_DATA); - - /* Disable VGA memory on Intel HD */ - if (HAS_PCH_SPLIT(dev)) { - outb(inb(VGA_MSR_READ) & ~VGA_MSR_MEM_EN, VGA_MSR_WRITE); - vga_set_legacy_decoding(dev->pdev, VGA_RSRC_LEGACY_IO | - VGA_RSRC_NORMAL_IO | - VGA_RSRC_NORMAL_MEM); - } - vga_put(dev->pdev, VGA_RSRC_LEGACY_IO); udelay(300); @@ -10054,20 +10045,6 @@ static void i915_disable_vga(struct drm_device *dev) POSTING_READ(vga_reg); } -static void i915_enable_vga(struct drm_device *dev) -{ - /* Enable VGA memory on Intel HD */ - if (HAS_PCH_SPLIT(dev)) { - vga_get_uninterruptible(dev->pdev, VGA_RSRC_LEGACY_IO); - outb(inb(VGA_MSR_READ) | VGA_MSR_MEM_EN, VGA_MSR_WRITE); - vga_set_legacy_decoding(dev->pdev, VGA_RSRC_LEGACY_IO | - VGA_RSRC_LEGACY_MEM | - VGA_RSRC_NORMAL_IO | - VGA_RSRC_NORMAL_MEM); - vga_put(dev->pdev, VGA_RSRC_LEGACY_IO); - } -} - void intel_modeset_init_hw(struct drm_device *dev) { intel_init_power_well(dev); @@ -10559,8 +10536,6 @@ void intel_modeset_cleanup(struct drm_device *dev) intel_disable_fbc(dev); - i915_enable_vga(dev); - intel_disable_gt_powersave(dev); ironlake_teardown_rc6(dev); diff --git a/include/linux/vgaarb.h b/include/linux/vgaarb.h index 80cf8173a65b..2c02f3a8d2ba 100644 --- a/include/linux/vgaarb.h +++ b/include/linux/vgaarb.h @@ -65,15 +65,8 @@ struct pci_dev; * out of the arbitration process (and can be safe to take * interrupts at any time. */ -#if defined(CONFIG_VGA_ARB) extern void vga_set_legacy_decoding(struct pci_dev *pdev, unsigned int decodes); -#else -static inline void vga_set_legacy_decoding(struct pci_dev *pdev, - unsigned int decodes) -{ -} -#endif /** * vga_get - acquire & locks VGA resources -- cgit v1.2.3 From 3f0116c3238a96bc18ad4b4acefe4e7be32fa861 Mon Sep 17 00:00:00 2001 From: Ingo Molnar Date: Thu, 10 Oct 2013 10:16:30 +0200 Subject: compiler/gcc4: Add quirk for 'asm goto' miscompilation bug Fengguang Wu, Oleg Nesterov and Peter Zijlstra tracked down a kernel crash to a GCC bug: GCC miscompiles certain 'asm goto' constructs, as outlined here: http://gcc.gnu.org/bugzilla/show_bug.cgi?id=58670 Implement a workaround suggested by Jakub Jelinek. 
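What the quirk boils down to, as a standalone sketch (illustrative, mirroring the compiler-gcc4.h hunk below): for affected GCC versions an empty asm("") is placed right after the asm goto, acting as a compiler barrier on the fallthrough path so the miscompiled code motion from PR58670 cannot happen.

#define asm_volatile_goto(x...)	do { asm goto(x); asm (""); } while (0)

static inline bool example_static_branch(void)
{
	asm_volatile_goto("nop" : : : : l_yes);	/* patched to a jump at runtime */
	return false;	/* fallthrough path, now fenced by the asm("") barrier */
l_yes:
	return true;
}

Call sites then spell asm_volatile_goto() instead of asm goto(); the name also documents that asm goto is implicitly volatile.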
Reported-and-tested-by: Fengguang Wu Reported-by: Oleg Nesterov Reported-by: Peter Zijlstra Suggested-by: Jakub Jelinek Reviewed-by: Richard Henderson Cc: Linus Torvalds Cc: Andrew Morton Cc: Signed-off-by: Ingo Molnar --- arch/arm/include/asm/jump_label.h | 2 +- arch/mips/include/asm/jump_label.h | 2 +- arch/powerpc/include/asm/jump_label.h | 2 +- arch/s390/include/asm/jump_label.h | 2 +- arch/sparc/include/asm/jump_label.h | 2 +- arch/x86/include/asm/cpufeature.h | 6 +++--- arch/x86/include/asm/jump_label.h | 2 +- arch/x86/include/asm/mutex_64.h | 4 ++-- include/linux/compiler-gcc4.h | 15 +++++++++++++++ 9 files changed, 26 insertions(+), 11 deletions(-) (limited to 'include') diff --git a/arch/arm/include/asm/jump_label.h b/arch/arm/include/asm/jump_label.h index bfc198c75913..863c892b4aaa 100644 --- a/arch/arm/include/asm/jump_label.h +++ b/arch/arm/include/asm/jump_label.h @@ -16,7 +16,7 @@ static __always_inline bool arch_static_branch(struct static_key *key) { - asm goto("1:\n\t" + asm_volatile_goto("1:\n\t" JUMP_LABEL_NOP "\n\t" ".pushsection __jump_table, \"aw\"\n\t" ".word 1b, %l[l_yes], %c0\n\t" diff --git a/arch/mips/include/asm/jump_label.h b/arch/mips/include/asm/jump_label.h index 4d6d77ed9b9d..e194f957ca8c 100644 --- a/arch/mips/include/asm/jump_label.h +++ b/arch/mips/include/asm/jump_label.h @@ -22,7 +22,7 @@ static __always_inline bool arch_static_branch(struct static_key *key) { - asm goto("1:\tnop\n\t" + asm_volatile_goto("1:\tnop\n\t" "nop\n\t" ".pushsection __jump_table, \"aw\"\n\t" WORD_INSN " 1b, %l[l_yes], %0\n\t" diff --git a/arch/powerpc/include/asm/jump_label.h b/arch/powerpc/include/asm/jump_label.h index ae098c438f00..f016bb699b5f 100644 --- a/arch/powerpc/include/asm/jump_label.h +++ b/arch/powerpc/include/asm/jump_label.h @@ -19,7 +19,7 @@ static __always_inline bool arch_static_branch(struct static_key *key) { - asm goto("1:\n\t" + asm_volatile_goto("1:\n\t" "nop\n\t" ".pushsection __jump_table, \"aw\"\n\t" JUMP_ENTRY_TYPE "1b, %l[l_yes], %c0\n\t" diff --git a/arch/s390/include/asm/jump_label.h b/arch/s390/include/asm/jump_label.h index 6c32190dc73e..346b1c85ffb4 100644 --- a/arch/s390/include/asm/jump_label.h +++ b/arch/s390/include/asm/jump_label.h @@ -15,7 +15,7 @@ static __always_inline bool arch_static_branch(struct static_key *key) { - asm goto("0: brcl 0,0\n" + asm_volatile_goto("0: brcl 0,0\n" ".pushsection __jump_table, \"aw\"\n" ASM_ALIGN "\n" ASM_PTR " 0b, %l[label], %0\n" diff --git a/arch/sparc/include/asm/jump_label.h b/arch/sparc/include/asm/jump_label.h index 5080d16a832f..ec2e2e2aba7d 100644 --- a/arch/sparc/include/asm/jump_label.h +++ b/arch/sparc/include/asm/jump_label.h @@ -9,7 +9,7 @@ static __always_inline bool arch_static_branch(struct static_key *key) { - asm goto("1:\n\t" + asm_volatile_goto("1:\n\t" "nop\n\t" "nop\n\t" ".pushsection __jump_table, \"aw\"\n\t" diff --git a/arch/x86/include/asm/cpufeature.h b/arch/x86/include/asm/cpufeature.h index d3f5c63078d8..89270b4318db 100644 --- a/arch/x86/include/asm/cpufeature.h +++ b/arch/x86/include/asm/cpufeature.h @@ -374,7 +374,7 @@ static __always_inline __pure bool __static_cpu_has(u16 bit) * Catch too early usage of this before alternatives * have run. 
*/ - asm goto("1: jmp %l[t_warn]\n" + asm_volatile_goto("1: jmp %l[t_warn]\n" "2:\n" ".section .altinstructions,\"a\"\n" " .long 1b - .\n" @@ -388,7 +388,7 @@ static __always_inline __pure bool __static_cpu_has(u16 bit) #endif - asm goto("1: jmp %l[t_no]\n" + asm_volatile_goto("1: jmp %l[t_no]\n" "2:\n" ".section .altinstructions,\"a\"\n" " .long 1b - .\n" @@ -453,7 +453,7 @@ static __always_inline __pure bool _static_cpu_has_safe(u16 bit) * have. Thus, we force the jump to the widest, 4-byte, signed relative * offset even though the last would often fit in less bytes. */ - asm goto("1: .byte 0xe9\n .long %l[t_dynamic] - 2f\n" + asm_volatile_goto("1: .byte 0xe9\n .long %l[t_dynamic] - 2f\n" "2:\n" ".section .altinstructions,\"a\"\n" " .long 1b - .\n" /* src offset */ diff --git a/arch/x86/include/asm/jump_label.h b/arch/x86/include/asm/jump_label.h index 64507f35800c..6a2cefb4395a 100644 --- a/arch/x86/include/asm/jump_label.h +++ b/arch/x86/include/asm/jump_label.h @@ -18,7 +18,7 @@ static __always_inline bool arch_static_branch(struct static_key *key) { - asm goto("1:" + asm_volatile_goto("1:" ".byte " __stringify(STATIC_KEY_INIT_NOP) "\n\t" ".pushsection __jump_table, \"aw\" \n\t" _ASM_ALIGN "\n\t" diff --git a/arch/x86/include/asm/mutex_64.h b/arch/x86/include/asm/mutex_64.h index e7e6751648ed..07537a44216e 100644 --- a/arch/x86/include/asm/mutex_64.h +++ b/arch/x86/include/asm/mutex_64.h @@ -20,7 +20,7 @@ static inline void __mutex_fastpath_lock(atomic_t *v, void (*fail_fn)(atomic_t *)) { - asm volatile goto(LOCK_PREFIX " decl %0\n" + asm_volatile_goto(LOCK_PREFIX " decl %0\n" " jns %l[exit]\n" : : "m" (v->counter) : "memory", "cc" @@ -75,7 +75,7 @@ static inline int __mutex_fastpath_lock_retval(atomic_t *count) static inline void __mutex_fastpath_unlock(atomic_t *v, void (*fail_fn)(atomic_t *)) { - asm volatile goto(LOCK_PREFIX " incl %0\n" + asm_volatile_goto(LOCK_PREFIX " incl %0\n" " jg %l[exit]\n" : : "m" (v->counter) : "memory", "cc" diff --git a/include/linux/compiler-gcc4.h b/include/linux/compiler-gcc4.h index 842de225055f..ded429966c1f 100644 --- a/include/linux/compiler-gcc4.h +++ b/include/linux/compiler-gcc4.h @@ -65,6 +65,21 @@ #define __visible __attribute__((externally_visible)) #endif +/* + * GCC 'asm goto' miscompiles certain code sequences: + * + * http://gcc.gnu.org/bugzilla/show_bug.cgi?id=58670 + * + * Work it around via a compiler barrier quirk suggested by Jakub Jelinek. + * Fixed in GCC 4.8.2 and later versions. + * + * (asm goto is automatically volatile - the naming reflects this.) + */ +#if GCC_VERSION <= 40801 +# define asm_volatile_goto(x...) do { asm goto(x); asm (""); } while (0) +#else +# define asm_volatile_goto(x...) do { asm goto(x); } while (0) +#endif #ifdef CONFIG_ARCH_USE_BUILTIN_BSWAP #if GCC_VERSION >= 40400 -- cgit v1.2.3 From c5d5a58d7ff977289c4bba8eae447c9afa66516b Mon Sep 17 00:00:00 2001 From: Kuninori Morimoto Date: Fri, 11 Oct 2013 00:07:01 -0700 Subject: ASoC: rcar: fixup generation checker Current rcar is using rsnd_is_gen1/gen2() to checking its IP generation, but it needs data mask. This patch fixes it up. 
Signed-off-by: Kuninori Morimoto Signed-off-by: Mark Brown --- include/sound/rcar_snd.h | 1 + sound/soc/sh/rcar/rsnd.h | 4 ++-- 2 files changed, 3 insertions(+), 2 deletions(-) (limited to 'include') diff --git a/include/sound/rcar_snd.h b/include/sound/rcar_snd.h index fe66533e9b7a..fb0a312bcb81 100644 --- a/include/sound/rcar_snd.h +++ b/include/sound/rcar_snd.h @@ -68,6 +68,7 @@ struct rsnd_scu_platform_info { * * A : generation */ +#define RSND_GEN_MASK (0xF << 0) #define RSND_GEN1 (1 << 0) /* fixme */ #define RSND_GEN2 (2 << 0) /* fixme */ diff --git a/sound/soc/sh/rcar/rsnd.h b/sound/soc/sh/rcar/rsnd.h index 9cc6986a8cfb..5dd87f4c919e 100644 --- a/sound/soc/sh/rcar/rsnd.h +++ b/sound/soc/sh/rcar/rsnd.h @@ -220,8 +220,8 @@ int rsnd_gen_path_exit(struct rsnd_priv *priv, void __iomem *rsnd_gen_reg_get(struct rsnd_priv *priv, struct rsnd_mod *mod, enum rsnd_reg reg); -#define rsnd_is_gen1(s) ((s)->info->flags & RSND_GEN1) -#define rsnd_is_gen2(s) ((s)->info->flags & RSND_GEN2) +#define rsnd_is_gen1(s) (((s)->info->flags & RSND_GEN_MASK) == RSND_GEN1) +#define rsnd_is_gen2(s) (((s)->info->flags & RSND_GEN_MASK) == RSND_GEN2) /* * R-Car ADG -- cgit v1.2.3 From 1931ee143b0ab72924944bc06e363d837ba05063 Mon Sep 17 00:00:00 2001 From: Marek Szyprowski Date: Fri, 11 Oct 2013 09:27:28 +0200 Subject: Revert "drivers: of: add initialization code for dma reserved memory" This reverts commit 9d8eab7af79cb4ce2de5de39f82c455b1f796963. There is still no consensus on the bindings for the reserved memory, and various drawbacks of the proposed solution have been shown, so the best option for now is to revert it completely and start again from scratch later. Signed-off-by: Marek Szyprowski Signed-off-by: Grant Likely --- Documentation/devicetree/bindings/memory.txt | 168 -------------------------- drivers/of/Kconfig | 6 - drivers/of/Makefile | 1 - drivers/of/of_reserved_mem.c | 173 --------------------- drivers/of/platform.c | 4 - include/linux/of_reserved_mem.h | 14 --- 6 files changed, 366 deletions(-) delete mode 100644 Documentation/devicetree/bindings/memory.txt delete mode 100644 drivers/of/of_reserved_mem.c delete mode 100644 include/linux/of_reserved_mem.h (limited to 'include') diff --git a/Documentation/devicetree/bindings/memory.txt b/Documentation/devicetree/bindings/memory.txt deleted file mode 100644 index eb2469365593..000000000000 --- a/Documentation/devicetree/bindings/memory.txt +++ /dev/null @@ -1,168 +0,0 @@ -*** Memory binding *** - -The /memory node provides basic information about the address and size -of the physical memory. This node is usually filled or updated by the -bootloader, depending on the actual memory configuration of the given -hardware. - -The memory layout is described by the following node: - -/ { - #address-cells = <(n)>; - #size-cells = <(m)>; - memory { - device_type = "memory"; - reg = <(baseaddr1) (size1) - (baseaddr2) (size2) - ... - (baseaddrN) (sizeN)>; - }; - ... -}; - -A memory node follows the typical device tree rules for "reg" property: -n: number of cells used to store base address value -m: number of cells used to store size value -baseaddrX: defines a base address of the defined memory bank -sizeX: the size of the defined memory bank - - -More than one memory bank can be defined. - - -*** Reserved memory regions *** - -In /memory/reserved-memory node one can create child nodes describing -particular reserved (excluded from normal use) memory regions. Such -memory regions are usually designed for the special usage by various -device drivers.
A good example are contiguous memory allocations or -memory sharing with other operating system on the same hardware board. -Those special memory regions might depend on the board configuration and -devices used on the target system. - -Parameters for each memory region can be encoded into the device tree -with the following convention: - -[(label):] (name) { - compatible = "linux,contiguous-memory-region", "reserved-memory-region"; - reg = <(address) (size)>; - (linux,default-contiguous-region); -}; - -compatible: one or more of: - - "linux,contiguous-memory-region" - enables binding of this - region to Contiguous Memory Allocator (special region for - contiguous memory allocations, shared with movable system - memory, Linux kernel-specific). - - "reserved-memory-region" - compatibility is defined, given - region is assigned for exclusive usage for by the respective - devices. - -reg: standard property defining the base address and size of - the memory region - -linux,default-contiguous-region: property indicating that the region - is the default region for all contiguous memory - allocations, Linux specific (optional) - -It is optional to specify the base address, so if one wants to use -autoconfiguration of the base address, '0' can be specified as a base -address in the 'reg' property. - -The /memory/reserved-memory node must contain the same #address-cells -and #size-cells value as the root node. - - -*** Device node's properties *** - -Once regions in the /memory/reserved-memory node have been defined, they -may be referenced by other device nodes. Bindings that wish to reference -memory regions should explicitly document their use of the following -property: - -memory-region = <&phandle_to_defined_region>; - -This property indicates that the device driver should use the memory -region pointed by the given phandle. - - -*** Example *** - -This example defines a memory consisting of 4 memory banks. 3 contiguous -regions are defined for Linux kernel, one default of all device drivers -(named contig_mem, placed at 0x72000000, 64MiB), one dedicated to the -framebuffer device (labelled display_mem, placed at 0x78000000, 8MiB) -and one for multimedia processing (labelled multimedia_mem, placed at -0x77000000, 64MiB). 'display_mem' region is then assigned to fb@12300000 -device for DMA memory allocations (Linux kernel drivers will use CMA is -available or dma-exclusive usage otherwise). 'multimedia_mem' is -assigned to scaler@12500000 and codec@12600000 devices for contiguous -memory allocations when CMA driver is enabled. - -The reason for creating a separate region for framebuffer device is to -match the framebuffer base address to the one configured by bootloader, -so once Linux kernel drivers starts no glitches on the displayed boot -logo appears. Scaller and codec drivers should share the memory -allocations. - -/ { - #address-cells = <1>; - #size-cells = <1>; - - /* ... 
*/ - - memory { - reg = <0x40000000 0x10000000 - 0x50000000 0x10000000 - 0x60000000 0x10000000 - 0x70000000 0x10000000>; - - reserved-memory { - #address-cells = <1>; - #size-cells = <1>; - - /* - * global autoconfigured region for contiguous allocations - * (used only with Contiguous Memory Allocator) - */ - contig_region@0 { - compatible = "linux,contiguous-memory-region"; - reg = <0x0 0x4000000>; - linux,default-contiguous-region; - }; - - /* - * special region for framebuffer - */ - display_region: region@78000000 { - compatible = "linux,contiguous-memory-region", "reserved-memory-region"; - reg = <0x78000000 0x800000>; - }; - - /* - * special region for multimedia processing devices - */ - multimedia_region: region@77000000 { - compatible = "linux,contiguous-memory-region"; - reg = <0x77000000 0x4000000>; - }; - }; - }; - - /* ... */ - - fb0: fb@12300000 { - status = "okay"; - memory-region = <&display_region>; - }; - - scaler: scaler@12500000 { - status = "okay"; - memory-region = <&multimedia_region>; - }; - - codec: codec@12600000 { - status = "okay"; - memory-region = <&multimedia_region>; - }; -}; diff --git a/drivers/of/Kconfig b/drivers/of/Kconfig index 9d2009a9004d..78cc76053328 100644 --- a/drivers/of/Kconfig +++ b/drivers/of/Kconfig @@ -74,10 +74,4 @@ config OF_MTD depends on MTD def_bool y -config OF_RESERVED_MEM - depends on OF_FLATTREE && (DMA_CMA || (HAVE_GENERIC_DMA_COHERENT && HAVE_MEMBLOCK)) - def_bool y - help - Initialization code for DMA reserved memory - endmenu # OF diff --git a/drivers/of/Makefile b/drivers/of/Makefile index ed9660adad77..efd05102c405 100644 --- a/drivers/of/Makefile +++ b/drivers/of/Makefile @@ -9,4 +9,3 @@ obj-$(CONFIG_OF_MDIO) += of_mdio.o obj-$(CONFIG_OF_PCI) += of_pci.o obj-$(CONFIG_OF_PCI_IRQ) += of_pci_irq.o obj-$(CONFIG_OF_MTD) += of_mtd.o -obj-$(CONFIG_OF_RESERVED_MEM) += of_reserved_mem.o diff --git a/drivers/of/of_reserved_mem.c b/drivers/of/of_reserved_mem.c deleted file mode 100644 index 0fe40c7d6904..000000000000 --- a/drivers/of/of_reserved_mem.c +++ /dev/null @@ -1,173 +0,0 @@ -/* - * Device tree based initialization code for reserved memory. - * - * Copyright (c) 2013 Samsung Electronics Co., Ltd. - * http://www.samsung.com - * Author: Marek Szyprowski - * - * This program is free software; you can redistribute it and/or - * modify it under the terms of the GNU General Public License as - * published by the Free Software Foundation; either version 2 of the - * License or (at your optional) any later version of the license. 
- */ - -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include - -#define MAX_RESERVED_REGIONS 16 -struct reserved_mem { - phys_addr_t base; - unsigned long size; - struct cma *cma; - char name[32]; -}; -static struct reserved_mem reserved_mem[MAX_RESERVED_REGIONS]; -static int reserved_mem_count; - -static int __init fdt_scan_reserved_mem(unsigned long node, const char *uname, - int depth, void *data) -{ - struct reserved_mem *rmem = &reserved_mem[reserved_mem_count]; - phys_addr_t base, size; - int is_cma, is_reserved; - unsigned long len; - const char *status; - __be32 *prop; - - is_cma = IS_ENABLED(CONFIG_DMA_CMA) && - of_flat_dt_is_compatible(node, "linux,contiguous-memory-region"); - is_reserved = of_flat_dt_is_compatible(node, "reserved-memory-region"); - - if (!is_reserved && !is_cma) { - /* ignore node and scan next one */ - return 0; - } - - status = of_get_flat_dt_prop(node, "status", &len); - if (status && strcmp(status, "okay") != 0) { - /* ignore disabled node nad scan next one */ - return 0; - } - - prop = of_get_flat_dt_prop(node, "reg", &len); - if (!prop || (len < (dt_root_size_cells + dt_root_addr_cells) * - sizeof(__be32))) { - pr_err("Reserved mem: node %s, incorrect \"reg\" property\n", - uname); - /* ignore node and scan next one */ - return 0; - } - base = dt_mem_next_cell(dt_root_addr_cells, &prop); - size = dt_mem_next_cell(dt_root_size_cells, &prop); - - if (!size) { - /* ignore node and scan next one */ - return 0; - } - - pr_info("Reserved mem: found %s, memory base %lx, size %ld MiB\n", - uname, (unsigned long)base, (unsigned long)size / SZ_1M); - - if (reserved_mem_count == ARRAY_SIZE(reserved_mem)) - return -ENOSPC; - - rmem->base = base; - rmem->size = size; - strlcpy(rmem->name, uname, sizeof(rmem->name)); - - if (is_cma) { - struct cma *cma; - if (dma_contiguous_reserve_area(size, base, 0, &cma) == 0) { - rmem->cma = cma; - reserved_mem_count++; - if (of_get_flat_dt_prop(node, - "linux,default-contiguous-region", - NULL)) - dma_contiguous_set_default(cma); - } - } else if (is_reserved) { - if (memblock_remove(base, size) == 0) - reserved_mem_count++; - else - pr_err("Failed to reserve memory for %s\n", uname); - } - - return 0; -} - -static struct reserved_mem *get_dma_memory_region(struct device *dev) -{ - struct device_node *node; - const char *name; - int i; - - node = of_parse_phandle(dev->of_node, "memory-region", 0); - if (!node) - return NULL; - - name = kbasename(node->full_name); - for (i = 0; i < reserved_mem_count; i++) - if (strcmp(name, reserved_mem[i].name) == 0) - return &reserved_mem[i]; - return NULL; -} - -/** - * of_reserved_mem_device_init() - assign reserved memory region to given device - * - * This function assign memory region pointed by "memory-region" device tree - * property to the given device. 
- */ -void of_reserved_mem_device_init(struct device *dev) -{ - struct reserved_mem *region = get_dma_memory_region(dev); - if (!region) - return; - - if (region->cma) { - dev_set_cma_area(dev, region->cma); - pr_info("Assigned CMA %s to %s device\n", region->name, - dev_name(dev)); - } else { - if (dma_declare_coherent_memory(dev, region->base, region->base, - region->size, DMA_MEMORY_MAP | DMA_MEMORY_EXCLUSIVE) != 0) - pr_info("Declared reserved memory %s to %s device\n", - region->name, dev_name(dev)); - } -} - -/** - * of_reserved_mem_device_release() - release reserved memory device structures - * - * This function releases structures allocated for memory region handling for - * the given device. - */ -void of_reserved_mem_device_release(struct device *dev) -{ - struct reserved_mem *region = get_dma_memory_region(dev); - if (!region && !region->cma) - dma_release_declared_memory(dev); -} - -/** - * early_init_dt_scan_reserved_mem() - create reserved memory regions - * - * This function grabs memory from early allocator for device exclusive use - * defined in device tree structures. It should be called by arch specific code - * once the early allocator (memblock) has been activated and all other - * subsystems have already allocated/reserved memory. - */ -void __init early_init_dt_scan_reserved_mem(void) -{ - of_scan_flat_dt_by_path("/memory/reserved-memory", - fdt_scan_reserved_mem, NULL); -} diff --git a/drivers/of/platform.c b/drivers/of/platform.c index 9b439ac63d8e..f6dcde220821 100644 --- a/drivers/of/platform.c +++ b/drivers/of/platform.c @@ -21,7 +21,6 @@ #include #include #include -#include #include const struct of_device_id of_default_bus_match_table[] = { @@ -219,8 +218,6 @@ static struct platform_device *of_platform_device_create_pdata( dev->dev.bus = &platform_bus_type; dev->dev.platform_data = platform_data; - of_reserved_mem_device_init(&dev->dev); - /* We do not fill the DMA ops for platform devices by default. * This is currently the responsibility of the platform code * to do such, possibly using a device notifier @@ -228,7 +225,6 @@ static struct platform_device *of_platform_device_create_pdata( if (of_device_add(dev) != 0) { platform_device_put(dev); - of_reserved_mem_device_release(&dev->dev); return NULL; } diff --git a/include/linux/of_reserved_mem.h b/include/linux/of_reserved_mem.h deleted file mode 100644 index c84128255814..000000000000 --- a/include/linux/of_reserved_mem.h +++ /dev/null @@ -1,14 +0,0 @@ -#ifndef __OF_RESERVED_MEM_H -#define __OF_RESERVED_MEM_H - -#ifdef CONFIG_OF_RESERVED_MEM -void of_reserved_mem_device_init(struct device *dev); -void of_reserved_mem_device_release(struct device *dev); -void early_init_dt_scan_reserved_mem(void); -#else -static inline void of_reserved_mem_device_init(struct device *dev) { } -static inline void of_reserved_mem_device_release(struct device *dev) { } -static inline void early_init_dt_scan_reserved_mem(void) { } -#endif - -#endif /* __OF_RESERVED_MEM_H */ -- cgit v1.2.3 From 32c37fc30c52508711ea6a108cfd5855b8a07176 Mon Sep 17 00:00:00 2001 From: Oliver Neukum Date: Mon, 14 Oct 2013 15:24:55 +0200 Subject: usb-storage: add quirk for mandatory READ_CAPACITY_16 Some USB drive enclosures do not correctly report an overflow condition if they hold a drive with a capacity over 2TB and are confronted with a READ_CAPACITY_10. They answer with their capacity modulo 2TB. The generic layer cannot cope with that. It must be told to use READ_CAPACITY_16 from the beginning. 
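The "capacity modulo 2TB" behaviour falls straight out of the 10-byte command's 32-bit last-LBA field; a compliant device would return 0xFFFFFFFF there to make the host retry with READ_CAPACITY_16, while these enclosures silently truncate. A small sketch of the arithmetic, assuming 512-byte sectors and an arbitrary 3 TiB drive:

/* How a 32-bit last-LBA field wraps for a drive past 2 TiB.
 * The 3 TiB drive size is an arbitrary example. */
#include <stdint.h>
#include <stdio.h>

int main(void)
{
	const uint64_t sector_size = 512;
	const uint64_t real_sectors = 6ULL << 30;	/* 3 TiB / 512 */

	/* a broken enclosure truncates the last LBA instead of clamping to 0xFFFFFFFF */
	uint32_t reported_last_lba = (uint32_t)(real_sectors - 1);
	uint64_t reported_bytes = ((uint64_t)reported_last_lba + 1) * sector_size;

	printf("real:     %llu bytes\n",
	       (unsigned long long)(real_sectors * sector_size));	/* 3298534883328 */
	printf("reported: %llu bytes\n",
	       (unsigned long long)reported_bytes);	/* 1099511627776: 3 TiB mod 2 TiB */
	return 0;
}

The host sees a plausible 1 TiB disk with no hint that anything overflowed, which is why the quirk flag has to force READ_CAPACITY_16 from the first command.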
Signed-off-by: Oliver Neukum Cc: stable Signed-off-by: Greg Kroah-Hartman --- drivers/usb/storage/scsiglue.c | 5 ++++- drivers/usb/storage/unusual_devs.h | 7 +++++++ include/linux/usb_usual.h | 4 +++- 3 files changed, 14 insertions(+), 2 deletions(-) (limited to 'include') diff --git a/drivers/usb/storage/scsiglue.c b/drivers/usb/storage/scsiglue.c index 94d75edef77f..18509e6c21ab 100644 --- a/drivers/usb/storage/scsiglue.c +++ b/drivers/usb/storage/scsiglue.c @@ -211,8 +211,11 @@ static int slave_configure(struct scsi_device *sdev) /* * Many devices do not respond properly to READ_CAPACITY_16. * Tell the SCSI layer to try READ_CAPACITY_10 first. + * However some USB 3.0 drive enclosures return capacity + * modulo 2TB. Those must use READ_CAPACITY_16 */ - sdev->try_rc_10_first = 1; + if (!(us->fflags & US_FL_NEEDS_CAP16)) + sdev->try_rc_10_first = 1; /* assume SPC3 or latter devices support sense size > 18 */ if (sdev->scsi_level > SCSI_SPC_2) diff --git a/drivers/usb/storage/unusual_devs.h b/drivers/usb/storage/unusual_devs.h index c015f2c16729..de32cfa5bfa6 100644 --- a/drivers/usb/storage/unusual_devs.h +++ b/drivers/usb/storage/unusual_devs.h @@ -1925,6 +1925,13 @@ UNUSUAL_DEV( 0x1652, 0x6600, 0x0201, 0x0201, USB_SC_DEVICE, USB_PR_DEVICE, NULL, US_FL_IGNORE_RESIDUE ), +/* Reported by Oliver Neukum */ +UNUSUAL_DEV( 0x174c, 0x55aa, 0x0100, 0x0100, + "ASMedia", + "AS2105", + USB_SC_DEVICE, USB_PR_DEVICE, NULL, + US_FL_NEEDS_CAP16), + /* Reported by Jesse Feddema */ UNUSUAL_DEV( 0x177f, 0x0400, 0x0000, 0x0000, "Yarvik", diff --git a/include/linux/usb_usual.h b/include/linux/usb_usual.h index bf99cd01be20..630356866030 100644 --- a/include/linux/usb_usual.h +++ b/include/linux/usb_usual.h @@ -66,7 +66,9 @@ US_FLAG(INITIAL_READ10, 0x00100000) \ /* Initial READ(10) (and others) must be retried */ \ US_FLAG(WRITE_CACHE, 0x00200000) \ - /* Write Cache status is not available */ + /* Write Cache status is not available */ \ + US_FLAG(NEEDS_CAP16, 0x00400000) + /* cannot handle READ_CAPACITY_10 */ #define US_FLAG(name, value) US_FL_##name = value , enum { US_DO_ALL_FLAGS }; -- cgit v1.2.3 From 4942642080ea82d99ab5b653abb9a12b7ba31f4a Mon Sep 17 00:00:00 2001 From: Johannes Weiner Date: Wed, 16 Oct 2013 13:46:59 -0700 Subject: mm: memcg: handle non-error OOM situations more gracefully Commit 3812c8c8f395 ("mm: memcg: do not trap chargers with full callstack on OOM") assumed that only a few places that can trigger a memcg OOM situation do not return VM_FAULT_OOM, like optional page cache readahead. But there are many more and it's impractical to annotate them all. First of all, we don't want to invoke the OOM killer when the failed allocation is gracefully handled, so defer the actual kill to the end of the fault handling as well. This simplifies the code quite a bit for added bonus. Second, since a failed allocation might not be the abrupt end of the fault, the memcg OOM handler needs to be re-entrant until the fault finishes for subsequent allocation attempts. If an allocation is attempted after the task already OOMed, allow it to bypass the limit so that it can quickly finish the fault and invoke the OOM killer. 
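The shape of the fix is "record the OOM context now, act on it at fault exit". A condensed sketch of that pattern follows; struct task_ctx, oom_kill_or_wait() and the constant are illustrative stand-ins, not the kernel's API:

/* stand-in types so the sketch compiles on its own */
struct memcg { int id; };
struct task_ctx { struct memcg *oom_memcg; };
#define VM_FAULT_OOM 0x0001

static void oom_kill_or_wait(struct memcg *m) { (void)m; /* kill or sleep here */ }

/* charge path: may sit on fs/mm locks, so never block or kill here */
static void charge_failed_oom(struct task_ctx *t, struct memcg *m)
{
	t->oom_memcg = m;	/* just remember the context */
}

/* page-fault exit: stack unwound, locks released, outcome known */
static void fault_exit(struct task_ctx *t, int fault_ret)
{
	if (!t->oom_memcg)
		return;
	if (fault_ret & VM_FAULT_OOM)
		oom_kill_or_wait(t->oom_memcg);	/* safe to block now */
	/* graceful failure (e.g. readahead): nothing to kill, just clean up */
	t->oom_memcg = NULL;
}

This mirrors how, in the diff below, mem_cgroup_oom() only records state while mem_cgroup_oom_synchronize() does the killing or waiting once the fault handler has unwound.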
Reported-by: azurIt Signed-off-by: Johannes Weiner Cc: Michal Hocko Cc: Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/memcontrol.h | 50 ++++------------ include/linux/sched.h | 7 +-- mm/filemap.c | 11 +--- mm/memcontrol.c | 139 +++++++++++++++++---------------------------- mm/memory.c | 18 ++++-- mm/oom_kill.c | 2 +- 6 files changed, 79 insertions(+), 148 deletions(-) (limited to 'include') diff --git a/include/linux/memcontrol.h b/include/linux/memcontrol.h index ecc82b37c4cc..b3e7a667e03c 100644 --- a/include/linux/memcontrol.h +++ b/include/linux/memcontrol.h @@ -137,47 +137,24 @@ extern void mem_cgroup_print_oom_info(struct mem_cgroup *memcg, extern void mem_cgroup_replace_page_cache(struct page *oldpage, struct page *newpage); -/** - * mem_cgroup_toggle_oom - toggle the memcg OOM killer for the current task - * @new: true to enable, false to disable - * - * Toggle whether a failed memcg charge should invoke the OOM killer - * or just return -ENOMEM. Returns the previous toggle state. - * - * NOTE: Any path that enables the OOM killer before charging must - * call mem_cgroup_oom_synchronize() afterward to finalize the - * OOM handling and clean up. - */ -static inline bool mem_cgroup_toggle_oom(bool new) +static inline void mem_cgroup_oom_enable(void) { - bool old; - - old = current->memcg_oom.may_oom; - current->memcg_oom.may_oom = new; - - return old; + WARN_ON(current->memcg_oom.may_oom); + current->memcg_oom.may_oom = 1; } -static inline void mem_cgroup_enable_oom(void) +static inline void mem_cgroup_oom_disable(void) { - bool old = mem_cgroup_toggle_oom(true); - - WARN_ON(old == true); -} - -static inline void mem_cgroup_disable_oom(void) -{ - bool old = mem_cgroup_toggle_oom(false); - - WARN_ON(old == false); + WARN_ON(!current->memcg_oom.may_oom); + current->memcg_oom.may_oom = 0; } static inline bool task_in_memcg_oom(struct task_struct *p) { - return p->memcg_oom.in_memcg_oom; + return p->memcg_oom.memcg; } -bool mem_cgroup_oom_synchronize(void); +bool mem_cgroup_oom_synchronize(bool wait); #ifdef CONFIG_MEMCG_SWAP extern int do_swap_account; @@ -402,16 +379,11 @@ static inline void mem_cgroup_end_update_page_stat(struct page *page, { } -static inline bool mem_cgroup_toggle_oom(bool new) -{ - return false; -} - -static inline void mem_cgroup_enable_oom(void) +static inline void mem_cgroup_oom_enable(void) { } -static inline void mem_cgroup_disable_oom(void) +static inline void mem_cgroup_oom_disable(void) { } @@ -420,7 +392,7 @@ static inline bool task_in_memcg_oom(struct task_struct *p) return false; } -static inline bool mem_cgroup_oom_synchronize(void) +static inline bool mem_cgroup_oom_synchronize(bool wait) { return false; } diff --git a/include/linux/sched.h b/include/linux/sched.h index 6682da36b293..e27baeeda3f4 100644 --- a/include/linux/sched.h +++ b/include/linux/sched.h @@ -1394,11 +1394,10 @@ struct task_struct { } memcg_batch; unsigned int memcg_kmem_skip_account; struct memcg_oom_info { + struct mem_cgroup *memcg; + gfp_t gfp_mask; + int order; unsigned int may_oom:1; - unsigned int in_memcg_oom:1; - unsigned int oom_locked:1; - int wakeups; - struct mem_cgroup *wait_on_memcg; } memcg_oom; #endif #ifdef CONFIG_UPROBES diff --git a/mm/filemap.c b/mm/filemap.c index 1e6aec4a2d2e..ae4846ff4849 100644 --- a/mm/filemap.c +++ b/mm/filemap.c @@ -1616,7 +1616,6 @@ int filemap_fault(struct vm_area_struct *vma, struct vm_fault *vmf) struct inode *inode = mapping->host; pgoff_t offset = vmf->pgoff; struct page *page; - bool memcg_oom; pgoff_t 
size; int ret = 0; @@ -1625,11 +1624,7 @@ int filemap_fault(struct vm_area_struct *vma, struct vm_fault *vmf) return VM_FAULT_SIGBUS; /* - * Do we have something in the page cache already? Either - * way, try readahead, but disable the memcg OOM killer for it - * as readahead is optional and no errors are propagated up - * the fault stack. The OOM killer is enabled while trying to - * instantiate the faulting page individually below. + * Do we have something in the page cache already? */ page = find_get_page(mapping, offset); if (likely(page) && !(vmf->flags & FAULT_FLAG_TRIED)) { @@ -1637,14 +1632,10 @@ int filemap_fault(struct vm_area_struct *vma, struct vm_fault *vmf) * We found the page, so try async readahead before * waiting for the lock. */ - memcg_oom = mem_cgroup_toggle_oom(false); do_async_mmap_readahead(vma, ra, file, page, offset); - mem_cgroup_toggle_oom(memcg_oom); } else if (!page) { /* No page in the page cache at all */ - memcg_oom = mem_cgroup_toggle_oom(false); do_sync_mmap_readahead(vma, ra, file, offset); - mem_cgroup_toggle_oom(memcg_oom); count_vm_event(PGMAJFAULT); mem_cgroup_count_vm_event(vma->vm_mm, PGMAJFAULT); ret = VM_FAULT_MAJOR; diff --git a/mm/memcontrol.c b/mm/memcontrol.c index 5335b2b6be77..65fc6a449841 100644 --- a/mm/memcontrol.c +++ b/mm/memcontrol.c @@ -2161,110 +2161,59 @@ static void memcg_oom_recover(struct mem_cgroup *memcg) memcg_wakeup_oom(memcg); } -/* - * try to call OOM killer - */ static void mem_cgroup_oom(struct mem_cgroup *memcg, gfp_t mask, int order) { - bool locked; - int wakeups; - if (!current->memcg_oom.may_oom) return; - - current->memcg_oom.in_memcg_oom = 1; - /* - * As with any blocking lock, a contender needs to start - * listening for wakeups before attempting the trylock, - * otherwise it can miss the wakeup from the unlock and sleep - * indefinitely. This is just open-coded because our locking - * is so particular to memcg hierarchies. + * We are in the middle of the charge context here, so we + * don't want to block when potentially sitting on a callstack + * that holds all kinds of filesystem and mm locks. + * + * Also, the caller may handle a failed allocation gracefully + * (like optional page cache readahead) and so an OOM killer + * invocation might not even be necessary. + * + * That's why we don't do anything here except remember the + * OOM context and then deal with it at the end of the page + * fault when the stack is unwound, the locks are released, + * and when we know whether the fault was overall successful. */ - wakeups = atomic_read(&memcg->oom_wakeups); - mem_cgroup_mark_under_oom(memcg); - - locked = mem_cgroup_oom_trylock(memcg); - - if (locked) - mem_cgroup_oom_notify(memcg); - - if (locked && !memcg->oom_kill_disable) { - mem_cgroup_unmark_under_oom(memcg); - mem_cgroup_out_of_memory(memcg, mask, order); - mem_cgroup_oom_unlock(memcg); - /* - * There is no guarantee that an OOM-lock contender - * sees the wakeups triggered by the OOM kill - * uncharges. Wake any sleepers explicitely. - */ - memcg_oom_recover(memcg); - } else { - /* - * A system call can just return -ENOMEM, but if this - * is a page fault and somebody else is handling the - * OOM already, we need to sleep on the OOM waitqueue - * for this memcg until the situation is resolved. - * Which can take some time because it might be - * handled by a userspace task. - * - * However, this is the charge context, which means - * that we may sit on a large call stack and hold - * various filesystem locks, the mmap_sem etc. 
and we - * don't want the OOM handler to deadlock on them - * while we sit here and wait. Store the current OOM - * context in the task_struct, then return -ENOMEM. - * At the end of the page fault handler, with the - * stack unwound, pagefault_out_of_memory() will check - * back with us by calling - * mem_cgroup_oom_synchronize(), possibly putting the - * task to sleep. - */ - current->memcg_oom.oom_locked = locked; - current->memcg_oom.wakeups = wakeups; - css_get(&memcg->css); - current->memcg_oom.wait_on_memcg = memcg; - } + css_get(&memcg->css); + current->memcg_oom.memcg = memcg; + current->memcg_oom.gfp_mask = mask; + current->memcg_oom.order = order; } /** * mem_cgroup_oom_synchronize - complete memcg OOM handling + * @handle: actually kill/wait or just clean up the OOM state * - * This has to be called at the end of a page fault if the the memcg - * OOM handler was enabled and the fault is returning %VM_FAULT_OOM. + * This has to be called at the end of a page fault if the memcg OOM + * handler was enabled. * - * Memcg supports userspace OOM handling, so failed allocations must + * Memcg supports userspace OOM handling where failed allocations must * sleep on a waitqueue until the userspace task resolves the * situation. Sleeping directly in the charge context with all kinds * of locks held is not a good idea, instead we remember an OOM state * in the task and mem_cgroup_oom_synchronize() has to be called at - * the end of the page fault to put the task to sleep and clean up the - * OOM state. + * the end of the page fault to complete the OOM handling. * * Returns %true if an ongoing memcg OOM situation was detected and - * finalized, %false otherwise. + * completed, %false otherwise. */ -bool mem_cgroup_oom_synchronize(void) +bool mem_cgroup_oom_synchronize(bool handle) { + struct mem_cgroup *memcg = current->memcg_oom.memcg; struct oom_wait_info owait; - struct mem_cgroup *memcg; + bool locked; /* OOM is global, do not handle */ - if (!current->memcg_oom.in_memcg_oom) - return false; - - /* - * We invoked the OOM killer but there is a chance that a kill - * did not free up any charges. Everybody else might already - * be sleeping, so restart the fault and keep the rampage - * going until some charges are released. 
- */ - memcg = current->memcg_oom.wait_on_memcg; if (!memcg) - goto out; + return false; - if (test_thread_flag(TIF_MEMDIE) || fatal_signal_pending(current)) - goto out_memcg; + if (!handle) + goto cleanup; owait.memcg = memcg; owait.wait.flags = 0; @@ -2273,13 +2222,25 @@ bool mem_cgroup_oom_synchronize(void) INIT_LIST_HEAD(&owait.wait.task_list); prepare_to_wait(&memcg_oom_waitq, &owait.wait, TASK_KILLABLE); - /* Only sleep if we didn't miss any wakeups since OOM */ - if (atomic_read(&memcg->oom_wakeups) == current->memcg_oom.wakeups) + mem_cgroup_mark_under_oom(memcg); + + locked = mem_cgroup_oom_trylock(memcg); + + if (locked) + mem_cgroup_oom_notify(memcg); + + if (locked && !memcg->oom_kill_disable) { + mem_cgroup_unmark_under_oom(memcg); + finish_wait(&memcg_oom_waitq, &owait.wait); + mem_cgroup_out_of_memory(memcg, current->memcg_oom.gfp_mask, + current->memcg_oom.order); + } else { schedule(); - finish_wait(&memcg_oom_waitq, &owait.wait); -out_memcg: - mem_cgroup_unmark_under_oom(memcg); - if (current->memcg_oom.oom_locked) { + mem_cgroup_unmark_under_oom(memcg); + finish_wait(&memcg_oom_waitq, &owait.wait); + } + + if (locked) { mem_cgroup_oom_unlock(memcg); /* * There is no guarantee that an OOM-lock contender @@ -2288,10 +2249,9 @@ out_memcg: */ memcg_oom_recover(memcg); } +cleanup: + current->memcg_oom.memcg = NULL; css_put(&memcg->css); - current->memcg_oom.wait_on_memcg = NULL; -out: - current->memcg_oom.in_memcg_oom = 0; return true; } @@ -2705,6 +2665,9 @@ static int __mem_cgroup_try_charge(struct mm_struct *mm, || fatal_signal_pending(current))) goto bypass; + if (unlikely(task_in_memcg_oom(current))) + goto bypass; + /* * We always charge the cgroup the mm_struct belongs to. * The mm_struct's mem_cgroup changes on task migration if the diff --git a/mm/memory.c b/mm/memory.c index f7b7692c05ed..1311f26497e6 100644 --- a/mm/memory.c +++ b/mm/memory.c @@ -3865,15 +3865,21 @@ int handle_mm_fault(struct mm_struct *mm, struct vm_area_struct *vma, * space. Kernel faults are handled more gracefully. */ if (flags & FAULT_FLAG_USER) - mem_cgroup_enable_oom(); + mem_cgroup_oom_enable(); ret = __handle_mm_fault(mm, vma, address, flags); - if (flags & FAULT_FLAG_USER) - mem_cgroup_disable_oom(); - - if (WARN_ON(task_in_memcg_oom(current) && !(ret & VM_FAULT_OOM))) - mem_cgroup_oom_synchronize(); + if (flags & FAULT_FLAG_USER) { + mem_cgroup_oom_disable(); + /* + * The task may have entered a memcg OOM situation but + * if the allocation error was handled gracefully (no + * VM_FAULT_OOM), there is no need to kill anything. + * Just clean up the OOM state peacefully. + */ + if (task_in_memcg_oom(current) && !(ret & VM_FAULT_OOM)) + mem_cgroup_oom_synchronize(false); + } return ret; } diff --git a/mm/oom_kill.c b/mm/oom_kill.c index 314e9d274381..6738c47f1f72 100644 --- a/mm/oom_kill.c +++ b/mm/oom_kill.c @@ -680,7 +680,7 @@ void pagefault_out_of_memory(void) { struct zonelist *zonelist; - if (mem_cgroup_oom_synchronize()) + if (mem_cgroup_oom_synchronize(true)) return; zonelist = node_zonelist(first_online_node, GFP_KERNEL); -- cgit v1.2.3 From 2421ad48f4aed63bc890e8f3c53ed581a542fb66 Mon Sep 17 00:00:00 2001 From: "Rafael J. Wysocki" Date: Thu, 17 Oct 2013 15:44:48 +0200 Subject: ACPI / PM: Drop two functions that are not used any more Two functions defined in device_pm.c, acpi_dev_pm_add_dependent() and acpi_dev_pm_remove_dependent(), have no callers and may be dropped, so drop them. 
Moreover, they are the only functions adding entries to and removing entries from the power_dependent list in struct acpi_device, so drop that list too. Signed-off-by: Rafael J. Wysocki --- drivers/acpi/device_pm.c | 56 ------------------------------------------------ drivers/acpi/scan.c | 1 - include/acpi/acpi_bus.h | 7 ------ 3 files changed, 64 deletions(-) (limited to 'include') diff --git a/drivers/acpi/device_pm.c b/drivers/acpi/device_pm.c index 59d3202f6b36..a94383d1f350 100644 --- a/drivers/acpi/device_pm.c +++ b/drivers/acpi/device_pm.c @@ -1025,60 +1025,4 @@ void acpi_dev_pm_detach(struct device *dev, bool power_off) } } EXPORT_SYMBOL_GPL(acpi_dev_pm_detach); - -/** - * acpi_dev_pm_add_dependent - Add physical device depending for PM. - * @handle: Handle of ACPI device node. - * @depdev: Device depending on that node for PM. - */ -void acpi_dev_pm_add_dependent(acpi_handle handle, struct device *depdev) -{ - struct acpi_device_physical_node *dep; - struct acpi_device *adev; - - if (!depdev || acpi_bus_get_device(handle, &adev)) - return; - - mutex_lock(&adev->physical_node_lock); - - list_for_each_entry(dep, &adev->power_dependent, node) - if (dep->dev == depdev) - goto out; - - dep = kzalloc(sizeof(*dep), GFP_KERNEL); - if (dep) { - dep->dev = depdev; - list_add_tail(&dep->node, &adev->power_dependent); - } - - out: - mutex_unlock(&adev->physical_node_lock); -} -EXPORT_SYMBOL_GPL(acpi_dev_pm_add_dependent); - -/** - * acpi_dev_pm_remove_dependent - Remove physical device depending for PM. - * @handle: Handle of ACPI device node. - * @depdev: Device depending on that node for PM. - */ -void acpi_dev_pm_remove_dependent(acpi_handle handle, struct device *depdev) -{ - struct acpi_device_physical_node *dep; - struct acpi_device *adev; - - if (!depdev || acpi_bus_get_device(handle, &adev)) - return; - - mutex_lock(&adev->physical_node_lock); - - list_for_each_entry(dep, &adev->power_dependent, node) - if (dep->dev == depdev) { - list_del(&dep->node); - kfree(dep); - break; - } - - mutex_unlock(&adev->physical_node_lock); -} -EXPORT_SYMBOL_GPL(acpi_dev_pm_remove_dependent); #endif /* CONFIG_PM */ diff --git a/drivers/acpi/scan.c b/drivers/acpi/scan.c index 407ad13cac2f..fee8a297c7d9 100644 --- a/drivers/acpi/scan.c +++ b/drivers/acpi/scan.c @@ -999,7 +999,6 @@ int acpi_device_add(struct acpi_device *device, INIT_LIST_HEAD(&device->wakeup_list); INIT_LIST_HEAD(&device->physical_node_list); mutex_init(&device->physical_node_lock); - INIT_LIST_HEAD(&device->power_dependent); new_bus_id = kzalloc(sizeof(struct acpi_device_bus_id), GFP_KERNEL); if (!new_bus_id) { diff --git a/include/acpi/acpi_bus.h b/include/acpi/acpi_bus.h index 02e113bb8b7d..d9019821aa60 100644 --- a/include/acpi/acpi_bus.h +++ b/include/acpi/acpi_bus.h @@ -311,7 +311,6 @@ struct acpi_device { unsigned int physical_node_count; struct list_head physical_node_list; struct mutex physical_node_lock; - struct list_head power_dependent; void (*remove)(struct acpi_device *); }; @@ -456,8 +455,6 @@ acpi_status acpi_add_pm_notifier(struct acpi_device *adev, acpi_status acpi_remove_pm_notifier(struct acpi_device *adev, acpi_notify_handler handler); int acpi_pm_device_sleep_state(struct device *, int *, int); -void acpi_dev_pm_add_dependent(acpi_handle handle, struct device *depdev); -void acpi_dev_pm_remove_dependent(acpi_handle handle, struct device *depdev); #else static inline acpi_status acpi_add_pm_notifier(struct acpi_device *adev, acpi_notify_handler handler, @@ -478,10 +475,6 @@ static inline int 
acpi_pm_device_sleep_state(struct device *d, int *p, int m) return (m >= ACPI_STATE_D0 && m <= ACPI_STATE_D3_COLD) ? m : ACPI_STATE_D0; } -static inline void acpi_dev_pm_add_dependent(acpi_handle handle, - struct device *depdev) {} -static inline void acpi_dev_pm_remove_dependent(acpi_handle handle, - struct device *depdev) {} #endif #ifdef CONFIG_PM_RUNTIME -- cgit v1.2.3 From 94468783cd960aa14b22503dd59afd14efb785aa Mon Sep 17 00:00:00 2001 From: Guenter Roeck Date: Wed, 16 Oct 2013 19:18:41 -0700 Subject: usb: usb_phy_gen: refine conditional declaration of usb_nop_xceiv_register Commit 3fa4d734 (usb: phy: rename nop_usb_xceiv => usb_phy_gen_xceiv) changed the conditional around the declaration of usb_nop_xceiv_register from #if defined(CONFIG_NOP_USB_XCEIV) || (defined(CONFIG_NOP_USB_XCEIV_MODULE) && defined(MODULE)) to #if IS_ENABLED(CONFIG_NOP_USB_XCEIV) While that looks the same, it is semantically different. The first expression is true if CONFIG_NOP_USB_XCEIV is built as a module and the including code is also built as a module. The second expression is true if code depending on CONFIG_NOP_USB_XCEIV is built as a module or into the kernel. As a result, the arm:allmodconfig build fails with arch/arm/mach-omap2/built-in.o: In function `omap3_evm_init': arch/arm/mach-omap2/board-omap3evm.c:703: undefined reference to `usb_nop_xceiv_register' Fix the problem by reverting to the old conditional. Cc: Josh Boyer Signed-off-by: Guenter Roeck Signed-off-by: Greg Kroah-Hartman --- include/linux/usb/usb_phy_gen_xceiv.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'include') diff --git a/include/linux/usb/usb_phy_gen_xceiv.h b/include/linux/usb/usb_phy_gen_xceiv.h index f9a7e7bc925b..11d85b9c1b08 100644 --- a/include/linux/usb/usb_phy_gen_xceiv.h +++ b/include/linux/usb/usb_phy_gen_xceiv.h @@ -12,7 +12,7 @@ struct usb_phy_gen_xceiv_platform_data { unsigned int needs_reset:1; }; -#if IS_ENABLED(CONFIG_NOP_USB_XCEIV) +#if defined(CONFIG_NOP_USB_XCEIV) || (defined(CONFIG_NOP_USB_XCEIV_MODULE) && defined(MODULE)) /* sometimes transceivers are accessed only through e.g. ULPI */ extern void usb_nop_xceiv_register(void); extern void usb_nop_xceiv_unregister(void); -- cgit v1.2.3 From e87b3998d795123b4139bc3f25490dd236f68212 Mon Sep 17 00:00:00 2001 From: Vlad Yasevich Date: Tue, 15 Oct 2013 22:01:29 -0400 Subject: net: dst: provide accessor function to dst->xfrm dst->xfrm is conditionally defined. Provide an accessor function that is always available. Signed-off-by: Vlad Yasevich Acked-by: Neil Horman Signed-off-by: David S.
Miller --- include/net/dst.h | 12 ++++++++++++ 1 file changed, 12 insertions(+) (limited to 'include') diff --git a/include/net/dst.h b/include/net/dst.h index 3bc4865f8267..3c4c944096c9 100644 --- a/include/net/dst.h +++ b/include/net/dst.h @@ -479,10 +479,22 @@ static inline struct dst_entry *xfrm_lookup(struct net *net, { return dst_orig; } + +static inline struct xfrm_state *dst_xfrm(const struct dst_entry *dst) +{ + return NULL; +} + #else extern struct dst_entry *xfrm_lookup(struct net *net, struct dst_entry *dst_orig, const struct flowi *fl, struct sock *sk, int flags); + +/* skb attached with this dst needs transformation if dst->xfrm is valid */ +static inline struct xfrm_state *dst_xfrm(const struct dst_entry *dst) +{ + return dst->xfrm; +} #endif #endif /* _NET_DST_H */ -- cgit v1.2.3 From 9e5f1721907fcfbd4b575bcafa0314188f7330a5 Mon Sep 17 00:00:00 2001 From: Dan Carpenter Date: Mon, 14 Oct 2013 15:28:38 +0300 Subject: yam: integer underflow in yam_ioctl() We cap bitrate at YAM_MAXBITRATE in yam_ioctl(), but it could also be negative. I don't know the impact of using a negative bitrate, but let's prevent it. Signed-off-by: Dan Carpenter Signed-off-by: David S. Miller --- include/linux/yam.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'include') diff --git a/include/linux/yam.h b/include/linux/yam.h index 7fe28228b274..512cdc2fb80f 100644 --- a/include/linux/yam.h +++ b/include/linux/yam.h @@ -77,6 +77,6 @@ struct yamdrv_ioctl_cfg { struct yamdrv_ioctl_mcs { int cmd; - int bitrate; + unsigned int bitrate; unsigned char bits[YAM_FPGA_SIZE]; }; -- cgit v1.2.3 From bc5bd37ce48c66e9192ad2e7231e9678880f6f8e Mon Sep 17 00:00:00 2001 From: Chris Wilson Date: Wed, 16 Oct 2013 09:49:02 +0100 Subject: drm: Pad drm_mode_get_connector to 64-bit boundary Pavel Roskin reported that DRM_IOCTL_MODE_GETCONNECTOR was overwriting the 4 bytes beyond the end of its structure when a 32-bit userspace runs on a 64-bit kernel. This is due to the padding gcc inserts, as the drm_mode_get_connector struct includes a u64 and its size is not a natural multiple of u64s.

64-bit kernel:
 sizeof(drm_mode_get_connector)=80, alignof=8
 sizeof(drm_mode_get_encoder)=20, alignof=4
 sizeof(drm_mode_modeinfo)=68, alignof=4

32-bit userspace:
 sizeof(drm_mode_get_connector)=76, alignof=4
 sizeof(drm_mode_get_encoder)=20, alignof=4
 sizeof(drm_mode_modeinfo)=68, alignof=4

Fortuitously we can insert explicit padding to the tail of our structures without breaking ABI.
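The sizeof/alignof mismatch is easy to reproduce with any structure whose tail is not a multiple of 8 bytes after a u64 member; struct demo below is an illustrative stand-in, not the real connector struct:

#include <stdint.h>
#include <stdio.h>

struct demo {
	uint64_t props_ptr;	/* forces 8-byte struct alignment on 64-bit ABIs */
	uint32_t a, b, c;	/* 12 trailing bytes: not a multiple of 8 */
};

int main(void)
{
	/* On x86-64 the struct is 8-byte aligned, so its size rounds up to 24.
	 * On i386 a uint64_t struct member is only 4-byte aligned, so the
	 * size stays 20. A 64-bit kernel that copies sizeof(struct demo)
	 * bytes therefore writes 4 bytes past the end of a 32-bit caller's
	 * buffer -- exactly the overrun the explicit pad field closes. */
	printf("sizeof=%zu alignof=%zu\n", sizeof(struct demo), _Alignof(struct demo));
	return 0;
}

Adding an explicit trailing __u32 pad makes both ABIs agree on the size, which is why the fix is backwards-compatible.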
Reported-by: Pavel Roskin Signed-off-by: Chris Wilson Cc: Dave Airlie Cc: dri-devel@lists.freedesktop.org Cc: stable@vger.kernel.org Signed-off-by: Dave Airlie --- include/uapi/drm/drm_mode.h | 2 ++ 1 file changed, 2 insertions(+) (limited to 'include') diff --git a/include/uapi/drm/drm_mode.h b/include/uapi/drm/drm_mode.h index 550811712f78..28acbaf4a81e 100644 --- a/include/uapi/drm/drm_mode.h +++ b/include/uapi/drm/drm_mode.h @@ -223,6 +223,8 @@ struct drm_mode_get_connector { __u32 connection; __u32 mm_width, mm_height; /**< HxW in millimeters */ __u32 subpixel; + + __u32 pad; }; #define DRM_MODE_PROP_PENDING (1<<0) -- cgit v1.2.3 From f2e5ddcc0d12f9c4c7b254358ad245c9dddce13b Mon Sep 17 00:00:00 2001 From: Seif Mazareeb Date: Thu, 17 Oct 2013 20:33:21 -0700 Subject: net: fix cipso packet validation when !NETLABEL When CONFIG_NETLABEL is disabled, the cipso_v4_validate() function could loop forever in the main loop if opt[opt_iter +1] == 0, causing a kernel crash in an SMP system, since the CPU executing this function will stall and not respond to IPIs. This problem can be reproduced by running the IP Stack Integrity Checker (http://isic.sourceforge.net) using the following command on a Linux machine connected to the DUT: "icmpsic -s rand -d -r 123456", then waiting 1-2 minutes. Signed-off-by: Seif Mazareeb Acked-by: Paul Moore Signed-off-by: David S. Miller --- include/net/cipso_ipv4.h | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) (limited to 'include') diff --git a/include/net/cipso_ipv4.h b/include/net/cipso_ipv4.h index a7a683e30b64..a8c2ef6d3b93 100644 --- a/include/net/cipso_ipv4.h +++ b/include/net/cipso_ipv4.h @@ -290,6 +290,7 @@ static inline int cipso_v4_validate(const struct sk_buff *skb, unsigned char err_offset = 0; u8 opt_len = opt[1]; u8 opt_iter; + u8 tag_len; if (opt_len < 8) { err_offset = 1; @@ -302,11 +303,12 @@ static inline int cipso_v4_validate(const struct sk_buff *skb, } for (opt_iter = 6; opt_iter < opt_len;) { - if (opt[opt_iter + 1] > (opt_len - opt_iter)) { + tag_len = opt[opt_iter + 1]; + if ((tag_len == 0) || (opt[opt_iter + 1] > (opt_len - opt_iter))) { err_offset = opt_iter + 1; goto out; } - opt_iter += opt[opt_iter + 1]; + opt_iter += tag_len; } out: -- cgit v1.2.3 From 7afbddfae9931bf113c01bc5c6780dda3602ef6c Mon Sep 17 00:00:00 2001 From: Yann Droneaud Date: Thu, 10 Oct 2013 11:10:55 +0200 Subject: IB/core: Temporarily disable create_flow/destroy_flow uverbs The create_flow/destroy_flow uverbs and the associated extensions to the user-kernel verbs ABI are under review and are too experimental to freeze at this point. So that userspace is not exposed to experimental features and an unstable ABI, temporarily disable this for v3.12 (with a Kconfig option behind staging to reenable it if desired). The feature will be enabled after proper cleanup for v3.13. Signed-off-by: Yann Droneaud Link: http://marc.info/?i=cover.1381351016.git.ydroneaud@opteya.com Link: http://marc.info/?i=cover.1381177342.git.ydroneaud@opteya.com [ Add a Kconfig option to reenable these verbs.
- Roland ] Signed-off-by: Roland Dreier --- drivers/infiniband/Kconfig | 11 +++++++++++ drivers/infiniband/core/uverbs.h | 2 ++ drivers/infiniband/core/uverbs_cmd.c | 4 ++++ drivers/infiniband/core/uverbs_main.c | 6 ++++++ drivers/infiniband/hw/mlx4/main.c | 2 ++ include/uapi/rdma/ib_user_verbs.h | 6 ++++++ 6 files changed, 31 insertions(+) (limited to 'include') diff --git a/drivers/infiniband/Kconfig b/drivers/infiniband/Kconfig index 5ceda710f516..b84791f03a27 100644 --- a/drivers/infiniband/Kconfig +++ b/drivers/infiniband/Kconfig @@ -31,6 +31,17 @@ config INFINIBAND_USER_ACCESS libibverbs, libibcm and a hardware driver library from . +config INFINIBAND_EXPERIMENTAL_UVERBS_FLOW_STEERING + bool "Experimental and unstable ABI for userspace access to flow steering verbs" + depends on INFINIBAND_USER_ACCESS + depends on STAGING + ---help--- + The final ABI for userspace access to flow steering verbs + has not been defined. To use the current ABI, *WHICH WILL + CHANGE IN THE FUTURE*, say Y here. + + If unsure, say N. + config INFINIBAND_USER_MEM bool depends on INFINIBAND_USER_ACCESS != n diff --git a/drivers/infiniband/core/uverbs.h b/drivers/infiniband/core/uverbs.h index d040b877475f..d8f9c6c272d7 100644 --- a/drivers/infiniband/core/uverbs.h +++ b/drivers/infiniband/core/uverbs.h @@ -217,7 +217,9 @@ IB_UVERBS_DECLARE_CMD(destroy_srq); IB_UVERBS_DECLARE_CMD(create_xsrq); IB_UVERBS_DECLARE_CMD(open_xrcd); IB_UVERBS_DECLARE_CMD(close_xrcd); +#ifdef CONFIG_INFINIBAND_EXPERIMENTAL_UVERBS_FLOW_STEERING IB_UVERBS_DECLARE_CMD(create_flow); IB_UVERBS_DECLARE_CMD(destroy_flow); +#endif /* CONFIG_INFINIBAND_EXPERIMENTAL_UVERBS_FLOW_STEERING */ #endif /* UVERBS_H */ diff --git a/drivers/infiniband/core/uverbs_cmd.c b/drivers/infiniband/core/uverbs_cmd.c index f2b81b9ee0d6..2f0f01b70e3b 100644 --- a/drivers/infiniband/core/uverbs_cmd.c +++ b/drivers/infiniband/core/uverbs_cmd.c @@ -54,7 +54,9 @@ static struct uverbs_lock_class qp_lock_class = { .name = "QP-uobj" }; static struct uverbs_lock_class ah_lock_class = { .name = "AH-uobj" }; static struct uverbs_lock_class srq_lock_class = { .name = "SRQ-uobj" }; static struct uverbs_lock_class xrcd_lock_class = { .name = "XRCD-uobj" }; +#ifdef CONFIG_INFINIBAND_EXPERIMENTAL_UVERBS_FLOW_STEERING static struct uverbs_lock_class rule_lock_class = { .name = "RULE-uobj" }; +#endif /* CONFIG_INFINIBAND_EXPERIMENTAL_UVERBS_FLOW_STEERING */ #define INIT_UDATA(udata, ibuf, obuf, ilen, olen) \ do { \ @@ -2599,6 +2601,7 @@ out_put: return ret ? ret : in_len; } +#ifdef CONFIG_INFINIBAND_EXPERIMENTAL_UVERBS_FLOW_STEERING static int kern_spec_to_ib_spec(struct ib_kern_spec *kern_spec, union ib_flow_spec *ib_spec) { @@ -2824,6 +2827,7 @@ ssize_t ib_uverbs_destroy_flow(struct ib_uverbs_file *file, return ret ? 
ret : in_len; } +#endif /* CONFIG_INFINIBAND_EXPERIMENTAL_UVERBS_FLOW_STEERING */ static int __uverbs_create_xsrq(struct ib_uverbs_file *file, struct ib_uverbs_create_xsrq *cmd, diff --git a/drivers/infiniband/core/uverbs_main.c b/drivers/infiniband/core/uverbs_main.c index 75ad86c4abf8..2df31f68ea09 100644 --- a/drivers/infiniband/core/uverbs_main.c +++ b/drivers/infiniband/core/uverbs_main.c @@ -115,8 +115,10 @@ static ssize_t (*uverbs_cmd_table[])(struct ib_uverbs_file *file, [IB_USER_VERBS_CMD_CLOSE_XRCD] = ib_uverbs_close_xrcd, [IB_USER_VERBS_CMD_CREATE_XSRQ] = ib_uverbs_create_xsrq, [IB_USER_VERBS_CMD_OPEN_QP] = ib_uverbs_open_qp, +#ifdef CONFIG_INFINIBAND_EXPERIMENTAL_UVERBS_FLOW_STEERING [IB_USER_VERBS_CMD_CREATE_FLOW] = ib_uverbs_create_flow, [IB_USER_VERBS_CMD_DESTROY_FLOW] = ib_uverbs_destroy_flow +#endif /* CONFIG_INFINIBAND_EXPERIMENTAL_UVERBS_FLOW_STEERING */ }; static void ib_uverbs_add_one(struct ib_device *device); @@ -605,6 +607,7 @@ static ssize_t ib_uverbs_write(struct file *filp, const char __user *buf, if (!(file->device->ib_dev->uverbs_cmd_mask & (1ull << hdr.command))) return -ENOSYS; +#ifdef CONFIG_INFINIBAND_EXPERIMENTAL_UVERBS_FLOW_STEERING if (hdr.command >= IB_USER_VERBS_CMD_THRESHOLD) { struct ib_uverbs_cmd_hdr_ex hdr_ex; @@ -621,6 +624,7 @@ static ssize_t ib_uverbs_write(struct file *filp, const char __user *buf, (hdr_ex.out_words + hdr_ex.provider_out_words) * 4); } else { +#endif /* CONFIG_INFINIBAND_EXPERIMENTAL_UVERBS_FLOW_STEERING */ if (hdr.in_words * 4 != count) return -EINVAL; @@ -628,7 +632,9 @@ static ssize_t ib_uverbs_write(struct file *filp, const char __user *buf, buf + sizeof(hdr), hdr.in_words * 4, hdr.out_words * 4); +#ifdef CONFIG_INFINIBAND_EXPERIMENTAL_UVERBS_FLOW_STEERING } +#endif /* CONFIG_INFINIBAND_EXPERIMENTAL_UVERBS_FLOW_STEERING */ } static int ib_uverbs_mmap(struct file *filp, struct vm_area_struct *vma) diff --git a/drivers/infiniband/hw/mlx4/main.c b/drivers/infiniband/hw/mlx4/main.c index d6c5a73becf4..f0612645de99 100644 --- a/drivers/infiniband/hw/mlx4/main.c +++ b/drivers/infiniband/hw/mlx4/main.c @@ -1691,9 +1691,11 @@ static void *mlx4_ib_add(struct mlx4_dev *dev) ibdev->ib_dev.create_flow = mlx4_ib_create_flow; ibdev->ib_dev.destroy_flow = mlx4_ib_destroy_flow; +#ifdef CONFIG_INFINIBAND_EXPERIMENTAL_UVERBS_FLOW_STEERING ibdev->ib_dev.uverbs_cmd_mask |= (1ull << IB_USER_VERBS_CMD_CREATE_FLOW) | (1ull << IB_USER_VERBS_CMD_DESTROY_FLOW); +#endif /* CONFIG_INFINIBAND_EXPERIMENTAL_UVERBS_FLOW_STEERING */ } mlx4_ib_alloc_eqs(dev, ibdev); diff --git a/include/uapi/rdma/ib_user_verbs.h b/include/uapi/rdma/ib_user_verbs.h index 0b233c56b0e4..e3ddd86c90a6 100644 --- a/include/uapi/rdma/ib_user_verbs.h +++ b/include/uapi/rdma/ib_user_verbs.h @@ -87,8 +87,10 @@ enum { IB_USER_VERBS_CMD_CLOSE_XRCD, IB_USER_VERBS_CMD_CREATE_XSRQ, IB_USER_VERBS_CMD_OPEN_QP, +#ifdef CONFIG_INFINIBAND_EXPERIMENTAL_UVERBS_FLOW_STEERING IB_USER_VERBS_CMD_CREATE_FLOW = IB_USER_VERBS_CMD_THRESHOLD, IB_USER_VERBS_CMD_DESTROY_FLOW +#endif /* CONFIG_INFINIBAND_EXPERIMENTAL_UVERBS_FLOW_STEERING */ }; /* @@ -126,6 +128,7 @@ struct ib_uverbs_cmd_hdr { __u16 out_words; }; +#ifdef CONFIG_INFINIBAND_EXPERIMENTAL_UVERBS_FLOW_STEERING struct ib_uverbs_cmd_hdr_ex { __u32 command; __u16 in_words; @@ -134,6 +137,7 @@ struct ib_uverbs_cmd_hdr_ex { __u16 provider_out_words; __u32 cmd_hdr_reserved; }; +#endif /* CONFIG_INFINIBAND_EXPERIMENTAL_UVERBS_FLOW_STEERING */ struct ib_uverbs_get_context { __u64 response; @@ -696,6 +700,7 @@ struct ib_uverbs_detach_mcast { __u64 
driver_data[0]; }; +#ifdef CONFIG_INFINIBAND_EXPERIMENTAL_UVERBS_FLOW_STEERING struct ib_kern_eth_filter { __u8 dst_mac[6]; __u8 src_mac[6]; @@ -780,6 +785,7 @@ struct ib_uverbs_destroy_flow { __u32 comp_mask; __u32 flow_handle; }; +#endif /* CONFIG_INFINIBAND_EXPERIMENTAL_UVERBS_FLOW_STEERING */ struct ib_uverbs_create_srq { __u64 response; -- cgit v1.2.3 From 96dc809514fb2328605198a0602b67554d8cce7b Mon Sep 17 00:00:00 2001 From: Julian Anastasov Date: Sun, 20 Oct 2013 15:43:03 +0300 Subject: ipv6: always prefer rt6i_gateway if present In v3.9, commit 6fd6ce2056de2709 ("ipv6: Do not depend on rt->n in ip6_finish_output2().") changed the behaviour of ip6_finish_output2() such that the recently introduced rt6_nexthop() is used instead of an assigned neighbor. As rt6_nexthop() prefers rt6i_gateway only for gatewayed routes, this causes a problem for users like IPVS, xt_TEE and RAW(hdrincl) if they want to use a different address for routing than the destination address. Another case is when a redirect creates an RTF_DYNAMIC route without the RTF_GATEWAY flag; rt6_nexthop() then ignores the rt6i_gateway. Fix the above problems by considering the rt6i_gateway if present, so that traffic routed to an address on the local subnet is not wrongly diverted to the destination address. Thanks to Simon Horman and Phil Oester for spotting the problematic commit. Thanks to Hannes Frederic Sowa for his review and help in testing. Reported-by: Phil Oester Reported-by: Mark Brooks Signed-off-by: Julian Anastasov Acked-by: Hannes Frederic Sowa Signed-off-by: David S. Miller --- include/net/ip6_route.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'include') diff --git a/include/net/ip6_route.h b/include/net/ip6_route.h index f525e7038cca..481404abdf65 100644 --- a/include/net/ip6_route.h +++ b/include/net/ip6_route.h @@ -196,7 +196,7 @@ static inline int ip6_skb_dst_mtu(struct sk_buff *skb) static inline struct in6_addr *rt6_nexthop(struct rt6_info *rt, struct in6_addr *dest) { - if (rt->rt6i_flags & RTF_GATEWAY) + if (rt->rt6i_flags & RTF_GATEWAY || !ipv6_addr_any(&rt->rt6i_gateway)) return &rt->rt6i_gateway; return dest; } -- cgit v1.2.3 From 550bab42f83308c9d6ab04a980cc4333cef1c8fa Mon Sep 17 00:00:00 2001 From: Julian Anastasov Date: Sun, 20 Oct 2013 15:43:04 +0300 Subject: ipv6: fill rt6i_gateway with nexthop address Make sure rt6i_gateway contains nexthop information in all routes returned from lookup or when routes are directly attached to skb for generated ICMP packets. The effect of this patch should be a faster version of rt6_nexthop() and the consideration of local addresses as nexthop. Signed-off-by: Julian Anastasov Acked-by: Hannes Frederic Sowa Signed-off-by: David S.
Miller --- include/net/ip6_route.h | 6 ++---- net/ipv6/ip6_output.c | 4 ++-- net/ipv6/route.c | 8 ++++++-- 3 files changed, 10 insertions(+), 8 deletions(-) (limited to 'include') diff --git a/include/net/ip6_route.h b/include/net/ip6_route.h index 481404abdf65..2b786b7e3585 100644 --- a/include/net/ip6_route.h +++ b/include/net/ip6_route.h @@ -194,11 +194,9 @@ static inline int ip6_skb_dst_mtu(struct sk_buff *skb) skb_dst(skb)->dev->mtu : dst_mtu(skb_dst(skb)); } -static inline struct in6_addr *rt6_nexthop(struct rt6_info *rt, struct in6_addr *dest) +static inline struct in6_addr *rt6_nexthop(struct rt6_info *rt) { - if (rt->rt6i_flags & RTF_GATEWAY || !ipv6_addr_any(&rt->rt6i_gateway)) - return &rt->rt6i_gateway; - return dest; + return &rt->rt6i_gateway; } #endif diff --git a/net/ipv6/ip6_output.c b/net/ipv6/ip6_output.c index 975624b8d2ea..91fb4e8212f5 100644 --- a/net/ipv6/ip6_output.c +++ b/net/ipv6/ip6_output.c @@ -105,7 +105,7 @@ static int ip6_finish_output2(struct sk_buff *skb) } rcu_read_lock_bh(); - nexthop = rt6_nexthop((struct rt6_info *)dst, &ipv6_hdr(skb)->daddr); + nexthop = rt6_nexthop((struct rt6_info *)dst); neigh = __ipv6_neigh_lookup_noref(dst->dev, nexthop); if (unlikely(!neigh)) neigh = __neigh_create(&nd_tbl, nexthop, dst->dev, false); @@ -874,7 +874,7 @@ static int ip6_dst_lookup_tail(struct sock *sk, */ rt = (struct rt6_info *) *dst; rcu_read_lock_bh(); - n = __ipv6_neigh_lookup_noref(rt->dst.dev, rt6_nexthop(rt, &fl6->daddr)); + n = __ipv6_neigh_lookup_noref(rt->dst.dev, rt6_nexthop(rt)); err = n && !(n->nud_state & NUD_VALID) ? -EINVAL : 0; rcu_read_unlock_bh(); diff --git a/net/ipv6/route.c b/net/ipv6/route.c index c979dd96d82a..c1ee3813e1ae 100644 --- a/net/ipv6/route.c +++ b/net/ipv6/route.c @@ -851,7 +851,6 @@ static struct rt6_info *rt6_alloc_cow(struct rt6_info *ort, if (ort->rt6i_dst.plen != 128 && ipv6_addr_equal(&ort->rt6i_dst.addr, daddr)) rt->rt6i_flags |= RTF_ANYCAST; - rt->rt6i_gateway = *daddr; } rt->rt6i_flags |= RTF_CACHE; @@ -1338,6 +1337,7 @@ struct dst_entry *icmp6_dst_alloc(struct net_device *dev, rt->dst.flags |= DST_HOST; rt->dst.output = ip6_output; atomic_set(&rt->dst.__refcnt, 1); + rt->rt6i_gateway = fl6->daddr; rt->rt6i_dst.addr = fl6->daddr; rt->rt6i_dst.plen = 128; rt->rt6i_idev = idev; @@ -1873,7 +1873,10 @@ static struct rt6_info *ip6_rt_copy(struct rt6_info *ort, in6_dev_hold(rt->rt6i_idev); rt->dst.lastuse = jiffies; - rt->rt6i_gateway = ort->rt6i_gateway; + if (ort->rt6i_flags & RTF_GATEWAY) + rt->rt6i_gateway = ort->rt6i_gateway; + else + rt->rt6i_gateway = *dest; rt->rt6i_flags = ort->rt6i_flags; if ((ort->rt6i_flags & (RTF_DEFAULT | RTF_ADDRCONF)) == (RTF_DEFAULT | RTF_ADDRCONF)) @@ -2160,6 +2163,7 @@ struct rt6_info *addrconf_dst_alloc(struct inet6_dev *idev, else rt->rt6i_flags |= RTF_LOCAL; + rt->rt6i_gateway = *addr; rt->rt6i_dst.addr = *addr; rt->rt6i_dst.plen = 128; rt->rt6i_table = fib6_get_table(net, RT6_TABLE_LOCAL); -- cgit v1.2.3 From 7e4d8a193f2f1b551dc8d10acca6aea4356f3b86 Mon Sep 17 00:00:00 2001 From: Alexandre Belloni Date: Mon, 21 Oct 2013 19:09:58 +0200 Subject: mac802154: correct a typo in ieee802154_alloc_device() prototype This has no other impact than a cosmetic one. Signed-off-by: Alexandre Belloni Signed-off-by: David S. 
Miller --- include/net/mac802154.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'include') diff --git a/include/net/mac802154.h b/include/net/mac802154.h index d0d11df9cba1..807d6b7a943f 100644 --- a/include/net/mac802154.h +++ b/include/net/mac802154.h @@ -133,7 +133,7 @@ struct ieee802154_ops { /* Basic interface to register ieee802154 device */ struct ieee802154_dev * -ieee802154_alloc_device(size_t priv_data_lex, struct ieee802154_ops *ops); +ieee802154_alloc_device(size_t priv_data_len, struct ieee802154_ops *ops); void ieee802154_free_device(struct ieee802154_dev *dev); int ieee802154_register_device(struct ieee802154_dev *dev); void ieee802154_unregister_device(struct ieee802154_dev *dev); -- cgit v1.2.3 From bf378d341e4873ed928dc3c636252e6895a21f50 Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Mon, 28 Oct 2013 13:55:29 +0100 Subject: perf: Fix perf ring buffer memory ordering The PPC64 people noticed a missing memory barrier and crufty old comments in the perf ring buffer code. So update all the comments and add the missing barrier. When the architecture implements local_t using atomic_long_t there will be double barriers issued; but short of introducing more conditional barrier primitives this is the best we can do. Reported-by: Victor Kaplansky Tested-by: Victor Kaplansky Signed-off-by: Peter Zijlstra Cc: Mathieu Desnoyers Cc: michael@ellerman.id.au Cc: Paul McKenney Cc: Michael Neuling Cc: Frederic Weisbecker Cc: anton@samba.org Cc: benh@kernel.crashing.org Link: http://lkml.kernel.org/r/20131025173749.GG19466@laptop.lan Signed-off-by: Ingo Molnar --- include/uapi/linux/perf_event.h | 12 +++++++----- kernel/events/ring_buffer.c | 31 +++++++++++++++++++++++++++---- 2 files changed, 34 insertions(+), 9 deletions(-) (limited to 'include') diff --git a/include/uapi/linux/perf_event.h b/include/uapi/linux/perf_event.h index 009a655a5d35..2fc1602e23bb 100644 --- a/include/uapi/linux/perf_event.h +++ b/include/uapi/linux/perf_event.h @@ -456,13 +456,15 @@ struct perf_event_mmap_page { /* * Control data for the mmap() data buffer. * - * User-space reading the @data_head value should issue an rmb(), on - * SMP capable platforms, after reading this value -- see - * perf_event_wakeup(). + * User-space reading the @data_head value should issue an smp_rmb(), + * after reading this value. * * When the mapping is PROT_WRITE the @data_tail value should be - * written by userspace to reflect the last read data. In this case - * the kernel will not over-write unread data. + * written by userspace to reflect the last read data, after issueing + * an smp_mb() to separate the data read from the ->data_tail store. + * In this case the kernel will not over-write unread data. + * + * See perf_output_put_handle() for the data ordering. */ __u64 data_head; /* head in the data section */ __u64 data_tail; /* user-space written tail */ diff --git a/kernel/events/ring_buffer.c b/kernel/events/ring_buffer.c index cd55144270b5..9c2ddfbf4525 100644 --- a/kernel/events/ring_buffer.c +++ b/kernel/events/ring_buffer.c @@ -87,10 +87,31 @@ again: goto out; /* - * Publish the known good head. Rely on the full barrier implied - * by atomic_dec_and_test() order the rb->head read and this - * write. 
From bf378d341e4873ed928dc3c636252e6895a21f50 Mon Sep 17 00:00:00 2001
From: Peter Zijlstra
Date: Mon, 28 Oct 2013 13:55:29 +0100
Subject: perf: Fix perf ring buffer memory ordering

The PPC64 people noticed a missing memory barrier and crufty old
comments in the perf ring buffer code. So update all the comments and
add the missing barrier.

When the architecture implements local_t using atomic_long_t there
will be double barriers issued; but short of introducing more
conditional barrier primitives this is the best we can do.

Reported-by: Victor Kaplansky
Tested-by: Victor Kaplansky
Signed-off-by: Peter Zijlstra
Cc: Mathieu Desnoyers
Cc: michael@ellerman.id.au
Cc: Paul McKenney
Cc: Michael Neuling
Cc: Frederic Weisbecker
Cc: anton@samba.org
Cc: benh@kernel.crashing.org
Link: http://lkml.kernel.org/r/20131025173749.GG19466@laptop.lan
Signed-off-by: Ingo Molnar
---
 include/uapi/linux/perf_event.h | 12 +++++++-----
 kernel/events/ring_buffer.c     | 31 +++++++++++++++++++++++++++----
 2 files changed, 34 insertions(+), 9 deletions(-)

(limited to 'include')

diff --git a/include/uapi/linux/perf_event.h b/include/uapi/linux/perf_event.h
index 009a655a5d35..2fc1602e23bb 100644
--- a/include/uapi/linux/perf_event.h
+++ b/include/uapi/linux/perf_event.h
@@ -456,13 +456,15 @@ struct perf_event_mmap_page {
 	/*
 	 * Control data for the mmap() data buffer.
 	 *
-	 * User-space reading the @data_head value should issue an rmb(), on
-	 * SMP capable platforms, after reading this value -- see
-	 * perf_event_wakeup().
+	 * User-space reading the @data_head value should issue an smp_rmb(),
+	 * after reading this value.
 	 *
 	 * When the mapping is PROT_WRITE the @data_tail value should be
-	 * written by userspace to reflect the last read data. In this case
-	 * the kernel will not over-write unread data.
+	 * written by userspace to reflect the last read data, after issuing
+	 * an smp_mb() to separate the data read from the ->data_tail store.
+	 * In this case the kernel will not over-write unread data.
+	 *
+	 * See perf_output_put_handle() for the data ordering.
 	 */
 	__u64	data_head;		/* head in the data section */
 	__u64	data_tail;		/* user-space written tail */
diff --git a/kernel/events/ring_buffer.c b/kernel/events/ring_buffer.c
index cd55144270b5..9c2ddfbf4525 100644
--- a/kernel/events/ring_buffer.c
+++ b/kernel/events/ring_buffer.c
@@ -87,10 +87,31 @@ again:
 		goto out;
 
 	/*
-	 * Publish the known good head. Rely on the full barrier implied
-	 * by atomic_dec_and_test() order the rb->head read and this
-	 * write.
+	 * Since the mmap() consumer (userspace) can run on a different CPU:
+	 *
+	 *   kernel				user
+	 *
+	 *   READ ->data_tail			READ ->data_head
+	 *   smp_mb()	(A)			smp_rmb()	(C)
+	 *   WRITE $data			READ $data
+	 *   smp_wmb()	(B)			smp_mb()	(D)
+	 *   STORE ->data_head			WRITE ->data_tail
+	 *
+	 * Where A pairs with D, and B pairs with C.
+	 *
+	 * I don't think A needs to be a full barrier because we won't in fact
+	 * write data until we see the store from userspace. So we simply don't
+	 * issue the data WRITE until we observe it. Be conservative for now.
+	 *
+	 * OTOH, D needs to be a full barrier since it separates the data READ
+	 * from the tail WRITE.
+	 *
+	 * For B a WMB is sufficient since it separates two WRITEs, and for C
+	 * an RMB is sufficient since it separates two READs.
+	 *
+	 * See perf_output_begin().
 	 */
+	smp_wmb();
 	rb->user_page->data_head = head;
 
 	/*
@@ -154,9 +175,11 @@ int perf_output_begin(struct perf_output_handle *handle,
 	 * Userspace could choose to issue a mb() before updating the
 	 * tail pointer. So that all reads will be completed before the
 	 * write is issued.
+	 *
+	 * See perf_output_put_handle().
 	 */
 	tail = ACCESS_ONCE(rb->user_page->data_tail);
-	smp_rmb();
+	smp_mb();
 	offset = head = local_read(&rb->head);
 	head += size;
 	if (unlikely(!perf_output_space(rb, tail, offset, head)))
-- cgit v1.2.3
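The pairing is easiest to see from the consumer side. Below is a hedged sketch of a userspace reader honoring barriers (C) and (D); struct ring is a simplified stand-in for the real perf_event_mmap_page layout, and __sync_synchronize() serves as a conservative, portable substitute for both smp_rmb() and smp_mb() (a tuned reader would use weaker per-architecture barriers):

	#include <stdint.h>
	#include <stddef.h>

	/* Simplified, hypothetical view of the mmap'ed control page + data area. */
	struct ring {
		volatile uint64_t data_head;	/* written by the kernel */
		volatile uint64_t data_tail;	/* written by us, the user */
		uint8_t data[4096];		/* power-of-two sized buffer */
	};

	static size_t ring_read(struct ring *r, uint8_t *out, size_t max)
	{
		uint64_t head = r->data_head;
		__sync_synchronize();		/* (C) order the head read before the data reads */

		uint64_t tail = r->data_tail;	/* only we store this; no race */
		size_t avail = head - tail;
		if (avail > max)
			avail = max;

		for (size_t i = 0; i < avail; i++)	/* READ $data */
			out[i] = r->data[(tail + i) % sizeof(r->data)];

		__sync_synchronize();		/* (D) full barrier: data reads before the tail store */
		r->data_tail = tail + avail;	/* tell the kernel the space is reusable */
		return avail;
	}

Barrier (C) pairs with the kernel's smp_wmb() (B) so the reader never consumes bytes published before the head it observed; barrier (D) pairs with the kernel's (A) so the kernel never overwrites bytes the reader has not finished copying.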
From bd09d9a35111b6ffc0c7585d3853d0ec7f9f1eb4 Mon Sep 17 00:00:00 2001
From: Greg Thelen
Date: Wed, 30 Oct 2013 13:56:20 -0700
Subject: percpu: fix this_cpu_sub() subtrahend casting for unsigneds

this_cpu_sub() is implemented as negation and addition.

This patch casts the adjustment to the counter type before negation to
sign extend the adjustment. This helps in cases where the counter type
is wider than an unsigned adjustment. An alternative to this patch is
to declare such operations unsupported, but it seemed useful to avoid
surprises.

This patch specifically helps the following example:

	unsigned int delta = 1
	preempt_disable()
	this_cpu_write(long_counter, 0)
	this_cpu_sub(long_counter, delta)
	preempt_enable()

Before this change long_counter on a 64 bit machine ends with value
0xffffffff, rather than 0xffffffffffffffff. This is because
this_cpu_sub(pcp, delta) boils down to this_cpu_add(pcp, -delta),
which is basically:

	long_counter = 0 + 0xffffffff

Also apply the same cast to:

	__this_cpu_sub()
	__this_cpu_sub_return()
	this_cpu_sub_return()

All percpu_test.ko cases pass, especially the following, which
previously failed:

	l -= ui_one;
	__this_cpu_sub(long_counter, ui_one);
	CHECK(l, long_counter, -1);

	l -= ui_one;
	this_cpu_sub(long_counter, ui_one);
	CHECK(l, long_counter, -1);
	CHECK(l, long_counter, 0xffffffffffffffff);

	ul -= ui_one;
	__this_cpu_sub(ulong_counter, ui_one);
	CHECK(ul, ulong_counter, -1);
	CHECK(ul, ulong_counter, 0xffffffffffffffff);

	ul = this_cpu_sub_return(ulong_counter, ui_one);
	CHECK(ul, ulong_counter, 2);

	ul = __this_cpu_sub_return(ulong_counter, ui_one);
	CHECK(ul, ulong_counter, 1);

Signed-off-by: Greg Thelen
Acked-by: Tejun Heo
Acked-by: Johannes Weiner
Signed-off-by: Andrew Morton
Signed-off-by: Linus Torvalds
---
 arch/x86/include/asm/percpu.h | 3 ++-
 include/linux/percpu.h        | 8 ++++----
 2 files changed, 6 insertions(+), 5 deletions(-)

(limited to 'include')

diff --git a/arch/x86/include/asm/percpu.h b/arch/x86/include/asm/percpu.h
index 0da5200ee79d..b3e18f800302 100644
--- a/arch/x86/include/asm/percpu.h
+++ b/arch/x86/include/asm/percpu.h
@@ -128,7 +128,8 @@ do {							\
 do {							\
 	typedef typeof(var) pao_T__;			\
 	const int pao_ID__ = (__builtin_constant_p(val) && \
-			      ((val) == 1 || (val) == -1)) ? (val) : 0; \
+			      ((val) == 1 || (val) == -1)) ? \
+				(int)(val) : 0;		\
 	if (0) {					\
 		pao_T__ pao_tmp__;			\
 		pao_tmp__ = (val);			\
diff --git a/include/linux/percpu.h b/include/linux/percpu.h
index cc88172c7d9a..c74088ab103b 100644
--- a/include/linux/percpu.h
+++ b/include/linux/percpu.h
@@ -332,7 +332,7 @@ do {							\
 #endif
 
 #ifndef this_cpu_sub
-# define this_cpu_sub(pcp, val)		this_cpu_add((pcp), -(val))
+# define this_cpu_sub(pcp, val)		this_cpu_add((pcp), -(typeof(pcp))(val))
 #endif
 
 #ifndef this_cpu_inc
@@ -418,7 +418,7 @@ do {							\
 # define this_cpu_add_return(pcp, val)	__pcpu_size_call_return2(this_cpu_add_return_, pcp, val)
 #endif
 
-#define this_cpu_sub_return(pcp, val)	this_cpu_add_return(pcp, -(val))
+#define this_cpu_sub_return(pcp, val)	this_cpu_add_return(pcp, -(typeof(pcp))(val))
 #define this_cpu_inc_return(pcp)	this_cpu_add_return(pcp, 1)
 #define this_cpu_dec_return(pcp)	this_cpu_add_return(pcp, -1)
 
@@ -586,7 +586,7 @@ do {							\
 #endif
 
 #ifndef __this_cpu_sub
-# define __this_cpu_sub(pcp, val)	__this_cpu_add((pcp), -(val))
+# define __this_cpu_sub(pcp, val)	__this_cpu_add((pcp), -(typeof(pcp))(val))
 #endif
 
 #ifndef __this_cpu_inc
@@ -668,7 +668,7 @@ do {							\
 	__pcpu_size_call_return2(__this_cpu_add_return_, pcp, val)
 #endif
 
-#define __this_cpu_sub_return(pcp, val)	__this_cpu_add_return(pcp, -(val))
+#define __this_cpu_sub_return(pcp, val)	__this_cpu_add_return(pcp, -(typeof(pcp))(val))
 #define __this_cpu_inc_return(pcp)	__this_cpu_add_return(pcp, 1)
 #define __this_cpu_dec_return(pcp)	__this_cpu_add_return(pcp, -1)
 
-- cgit v1.2.3
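The pitfall being fixed is ordinary C integer promotion, not anything percpu-specific. A standalone userspace sketch of the same bug and the same fix, assuming an LP64 machine (32-bit unsigned int, 64-bit long):

	#include <stdio.h>

	int main(void)
	{
		long counter = 0;
		unsigned int delta = 1;

		/* -delta is evaluated as unsigned int, yielding 0xffffffff,
		 * which then zero-extends to long: counter becomes 0xffffffff. */
		counter += -delta;
		printf("without cast: 0x%lx\n", counter);

		/* Casting to the counter's type before negating, as the patch
		 * does with -(typeof(pcp))(val), sign-extends: counter becomes -1. */
		counter = 0;
		counter += -(long)delta;
		printf("with cast:    0x%lx\n", counter);
		return 0;
	}

On LP64 this prints 0xffffffff for the first case and 0xffffffffffffffff for the second, matching the before/after behaviour described in the commit message.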
From 9bf76ca325d5e9208eb343f7bd4cc666f703ed30 Mon Sep 17 00:00:00 2001
From: Mathias Krause
Date: Sun, 3 Nov 2013 12:36:28 +0100
Subject: ipc, msg: forbid negative values for "msg{max,mnb,mni}"

Negative message lengths make no sense -- so don't allow negative queue
lengths or identifier counts either. Prevent them from going negative.

Also change the underlying data types to be unsigned to avoid hairy
surprises with sign extensions in cases where those variables get
evaluated in unsigned expressions with bigger data types, e.g. size_t.

In case a user still wants "unlimited" sizes, she can just use INT_MAX
instead.

Signed-off-by: Mathias Krause
Cc: Andrew Morton
Signed-off-by: Linus Torvalds
---
 include/linux/ipc_namespace.h |  6 +++---
 ipc/ipc_sysctl.c              | 20 ++++++++++++--------
 2 files changed, 15 insertions(+), 11 deletions(-)

(limited to 'include')

diff --git a/include/linux/ipc_namespace.h b/include/linux/ipc_namespace.h
index 19c19a5eee29..f6c82de12541 100644
--- a/include/linux/ipc_namespace.h
+++ b/include/linux/ipc_namespace.h
@@ -34,9 +34,9 @@ struct ipc_namespace {
 	int		sem_ctls[4];
 	int		used_sems;
 
-	int		msg_ctlmax;
-	int		msg_ctlmnb;
-	int		msg_ctlmni;
+	unsigned int	msg_ctlmax;
+	unsigned int	msg_ctlmnb;
+	unsigned int	msg_ctlmni;
 	atomic_t	msg_bytes;
 	atomic_t	msg_hdrs;
 	int		auto_msgmni;
diff --git a/ipc/ipc_sysctl.c b/ipc/ipc_sysctl.c
index 130dfece27ac..b0e99deb6d05 100644
--- a/ipc/ipc_sysctl.c
+++ b/ipc/ipc_sysctl.c
@@ -62,7 +62,7 @@ static int proc_ipc_dointvec_minmax_orphans(ctl_table *table, int write,
 	return err;
 }
 
-static int proc_ipc_callback_dointvec(ctl_table *table, int write,
+static int proc_ipc_callback_dointvec_minmax(ctl_table *table, int write,
 	void __user *buffer, size_t *lenp, loff_t *ppos)
 {
 	struct ctl_table ipc_table;
@@ -72,7 +72,7 @@ static int proc_ipc_callback_dointvec(ctl_table *table, int write,
 	memcpy(&ipc_table, table, sizeof(ipc_table));
 	ipc_table.data = get_ipc(table);
 
-	rc = proc_dointvec(&ipc_table, write, buffer, lenp, ppos);
+	rc = proc_dointvec_minmax(&ipc_table, write, buffer, lenp, ppos);
 
 	if (write && !rc && lenp_bef == *lenp)
 		/*
@@ -152,15 +152,13 @@ static int proc_ipcauto_dointvec_minmax(ctl_table *table, int write,
 #define proc_ipc_dointvec		NULL
 #define proc_ipc_dointvec_minmax	NULL
 #define proc_ipc_dointvec_minmax_orphans	NULL
-#define proc_ipc_callback_dointvec	NULL
+#define proc_ipc_callback_dointvec_minmax	NULL
 #define proc_ipcauto_dointvec_minmax	NULL
 #endif
 
 static int zero;
 static int one = 1;
-#ifdef CONFIG_CHECKPOINT_RESTORE
 static int int_max = INT_MAX;
-#endif
 
 static struct ctl_table ipc_kern_table[] = {
 	{
@@ -198,21 +196,27 @@ static struct ctl_table ipc_kern_table[] = {
 		.data		= &init_ipc_ns.msg_ctlmax,
 		.maxlen		= sizeof (init_ipc_ns.msg_ctlmax),
 		.mode		= 0644,
-		.proc_handler	= proc_ipc_dointvec,
+		.proc_handler	= proc_ipc_dointvec_minmax,
+		.extra1		= &zero,
+		.extra2		= &int_max,
 	},
 	{
 		.procname	= "msgmni",
 		.data		= &init_ipc_ns.msg_ctlmni,
 		.maxlen		= sizeof (init_ipc_ns.msg_ctlmni),
 		.mode		= 0644,
-		.proc_handler	= proc_ipc_callback_dointvec,
+		.proc_handler	= proc_ipc_callback_dointvec_minmax,
+		.extra1		= &zero,
+		.extra2		= &int_max,
 	},
 	{
 		.procname	= "msgmnb",
 		.data		= &init_ipc_ns.msg_ctlmnb,
 		.maxlen		= sizeof (init_ipc_ns.msg_ctlmnb),
 		.mode		= 0644,
-		.proc_handler	= proc_ipc_dointvec,
+		.proc_handler	= proc_ipc_dointvec_minmax,
+		.extra1		= &zero,
+		.extra2		= &int_max,
 	},
 	{
 		.procname	= "sem",
-- cgit v1.2.3
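The "hairy surprises" mentioned above come from C's usual arithmetic conversions. A standalone userspace sketch (msg_ctlmax and msgsz here are illustrative locals, not the kernel variables):

	#include <stdio.h>
	#include <stddef.h>

	int main(void)
	{
		int msg_ctlmax = -1;	/* a value the old sysctl handler accepted */
		size_t msgsz = 100;	/* candidate message length */

		/* Comparing a signed int against a size_t converts the int to
		 * size_t, so -1 becomes SIZE_MAX and the limit never rejects;
		 * the cast below just makes that implicit conversion visible. */
		if (msgsz > (size_t)msg_ctlmax)
			printf("message rejected\n");
		else
			printf("message accepted: the -1 limit disables the check\n");
		return 0;
	}

Storing the limits as unsigned int and clamping sysctl writes to [0, INT_MAX] via proc_dointvec_minmax removes this class of comparison bug at the source.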