From c6e666345e1b79c62ba82339cc7d55a89cb73f88 Mon Sep 17 00:00:00 2001 From: Paolo Bonzini Date: Thu, 2 Aug 2012 09:48:50 +0200 Subject: block: split discard into aligned requests When a disk has large discard_granularity and small max_discard_sectors, discards are not split with optimal alignment. In the limit case of discard_granularity == max_discard_sectors, no request could be aligned correctly, so in fact you might end up with no discarded logical blocks at all. Another example that helps showing the condition in the patch is with discard_granularity == 64, max_discard_sectors == 128. A request that is submitted for 256 sectors 2..257 will be split in two: 2..129, 130..257. However, only 2 aligned blocks out of 3 are included in the request; 128..191 may be left intact and not discarded. With this patch, the first request will be truncated to ensure good alignment of what's left, and the split will be 2..127, 128..255, 256..257. The patch will also take into account the discard_alignment. At most one extra request will be introduced, because the first request will be reduced by at most granularity-1 sectors, and granularity must be less than max_discard_sectors. Subsequent requests will run on round_down(max_discard_sectors, granularity) sectors, as in the current code. Signed-off-by: Paolo Bonzini Acked-by: Vivek Goyal Tested-by: Mike Snitzer Signed-off-by: Jens Axboe --- include/linux/blkdev.h | 10 ++++++++++ 1 file changed, 10 insertions(+) (limited to 'include') diff --git a/include/linux/blkdev.h b/include/linux/blkdev.h index 4e72a9d48232..281516ae8b4e 100644 --- a/include/linux/blkdev.h +++ b/include/linux/blkdev.h @@ -1139,6 +1139,16 @@ static inline int queue_limit_discard_alignment(struct queue_limits *lim, sector & (lim->discard_granularity - 1); } +static inline int bdev_discard_alignment(struct block_device *bdev) +{ + struct request_queue *q = bdev_get_queue(bdev); + + if (bdev != bdev->bd_contains) + return bdev->bd_part->discard_alignment; + + return q->limits.discard_alignment; +} + static inline unsigned int queue_discard_zeroes_data(struct request_queue *q) { if (q->limits.max_discard_sectors && q->limits.discard_zeroes_data == 1) -- cgit v1.2.3 From f6166384095b7ecf77752b5e9096e6d03d75f7ae Mon Sep 17 00:00:00 2001 From: Peng Tao Date: Thu, 2 Aug 2012 15:36:09 +0300 Subject: NFS41: add pg_layout_private to nfs_pageio_descriptor To allow layout driver to pass private information around pg_init/pg_doio. 
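As a rough illustration of the new hook, the sketch below shows how a pNFS layout driver could stash per-I/O state in the descriptor and read it back at completion time; the my_* names and the stripe_index payload are hypothetical and not part of the patch, only pg_layout_private and layout_private come from it:

#include <linux/slab.h>
#include <linux/nfs_page.h>
#include <linux/nfs_xdr.h>

/* Hypothetical per-I/O state a layout driver might want to carry around. */
struct my_io_state {
        u64 stripe_index;
};

static void my_pg_init(struct nfs_pageio_descriptor *pgio, struct nfs_page *req)
{
        struct my_io_state *state = kzalloc(sizeof(*state), GFP_NOFS);

        if (state)
                state->stripe_index = req_offset(req) >> 20;
        /* Copied into hdr->layout_private by nfs_pgheader_init(). */
        pgio->pg_layout_private = state;
}

static void my_io_done(struct nfs_pgio_header *hdr)
{
        struct my_io_state *state = hdr->layout_private;

        /* The layout driver owns the lifetime of its private data. */
        kfree(state);
}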
Signed-off-by: Peng Tao Signed-off-by: Boaz Harrosh Signed-off-by: Trond Myklebust --- fs/nfs/pagelist.c | 2 ++ include/linux/nfs_page.h | 1 + include/linux/nfs_xdr.h | 1 + 3 files changed, 4 insertions(+) (limited to 'include') diff --git a/fs/nfs/pagelist.c b/fs/nfs/pagelist.c index 1a6732ed04a4..311a79681e2b 100644 --- a/fs/nfs/pagelist.c +++ b/fs/nfs/pagelist.c @@ -49,6 +49,7 @@ void nfs_pgheader_init(struct nfs_pageio_descriptor *desc, hdr->io_start = req_offset(hdr->req); hdr->good_bytes = desc->pg_count; hdr->dreq = desc->pg_dreq; + hdr->layout_private = desc->pg_layout_private; hdr->release = release; hdr->completion_ops = desc->pg_completion_ops; if (hdr->completion_ops->init_hdr) @@ -268,6 +269,7 @@ void nfs_pageio_init(struct nfs_pageio_descriptor *desc, desc->pg_error = 0; desc->pg_lseg = NULL; desc->pg_dreq = NULL; + desc->pg_layout_private = NULL; } EXPORT_SYMBOL_GPL(nfs_pageio_init); diff --git a/include/linux/nfs_page.h b/include/linux/nfs_page.h index 880805774f9f..92ce5783b707 100644 --- a/include/linux/nfs_page.h +++ b/include/linux/nfs_page.h @@ -69,6 +69,7 @@ struct nfs_pageio_descriptor { const struct nfs_pgio_completion_ops *pg_completion_ops; struct pnfs_layout_segment *pg_lseg; struct nfs_direct_req *pg_dreq; + void *pg_layout_private; }; #define NFS_WBACK_BUSY(req) (test_bit(PG_BUSY,&(req)->wb_flags)) diff --git a/include/linux/nfs_xdr.h b/include/linux/nfs_xdr.h index 00485e084394..ac7c8ae254f2 100644 --- a/include/linux/nfs_xdr.h +++ b/include/linux/nfs_xdr.h @@ -1248,6 +1248,7 @@ struct nfs_pgio_header { void (*release) (struct nfs_pgio_header *hdr); const struct nfs_pgio_completion_ops *completion_ops; struct nfs_direct_req *dreq; + void *layout_private; spinlock_t lock; /* fields protected by lock */ int pnfs_error; -- cgit v1.2.3 From 85b9f66a41eb8ee3f1dfc95707412705463cdd97 Mon Sep 17 00:00:00 2001 From: Asias He Date: Thu, 2 Aug 2012 23:42:04 +0200 Subject: block: Add blk_bio_map_sg() helper Add a helper to map a bio to a scatterlist, modelled after blk_rq_map_sg. This helper is useful for any driver that wants to create a scatterlist from its ->make_request_fn method. Changes in v2: - Use __blk_segment_map_sg to avoid duplicated code - Add cocbook style function comment Cc: Rusty Russell Cc: Christoph Hellwig Cc: Tejun Heo Cc: Shaohua Li Cc: "Michael S. 
Tsirkin" Cc: kvm@vger.kernel.org Cc: linux-kernel@vger.kernel.org Cc: virtualization@lists.linux-foundation.org Signed-off-by: Christoph Hellwig Signed-off-by: Minchan Kim Signed-off-by: Asias He Signed-off-by: Jens Axboe --- block/blk-merge.c | 37 +++++++++++++++++++++++++++++++++++++ include/linux/blkdev.h | 2 ++ 2 files changed, 39 insertions(+) (limited to 'include') diff --git a/block/blk-merge.c b/block/blk-merge.c index 576b68e79248..e76279e41162 100644 --- a/block/blk-merge.c +++ b/block/blk-merge.c @@ -209,6 +209,43 @@ int blk_rq_map_sg(struct request_queue *q, struct request *rq, } EXPORT_SYMBOL(blk_rq_map_sg); +/** + * blk_bio_map_sg - map a bio to a scatterlist + * @q: request_queue in question + * @bio: bio being mapped + * @sglist: scatterlist being mapped + * + * Note: + * Caller must make sure sg can hold bio->bi_phys_segments entries + * + * Will return the number of sg entries setup + */ +int blk_bio_map_sg(struct request_queue *q, struct bio *bio, + struct scatterlist *sglist) +{ + struct bio_vec *bvec, *bvprv; + struct scatterlist *sg; + int nsegs, cluster; + unsigned long i; + + nsegs = 0; + cluster = blk_queue_cluster(q); + + bvprv = NULL; + sg = NULL; + bio_for_each_segment(bvec, bio, i) { + __blk_segment_map_sg(q, bvec, sglist, &bvprv, &sg, + &nsegs, &cluster); + } /* segments in bio */ + + if (sg) + sg_mark_end(sg); + + BUG_ON(bio->bi_phys_segments && nsegs > bio->bi_phys_segments); + return nsegs; +} +EXPORT_SYMBOL(blk_bio_map_sg); + static inline int ll_new_hw_segment(struct request_queue *q, struct request *req, struct bio *bio) diff --git a/include/linux/blkdev.h b/include/linux/blkdev.h index 281516ae8b4e..dc632975d54f 100644 --- a/include/linux/blkdev.h +++ b/include/linux/blkdev.h @@ -894,6 +894,8 @@ extern void blk_queue_flush_queueable(struct request_queue *q, bool queueable); extern struct backing_dev_info *blk_get_backing_dev_info(struct block_device *bdev); extern int blk_rq_map_sg(struct request_queue *, struct request *, struct scatterlist *); +extern int blk_bio_map_sg(struct request_queue *q, struct bio *bio, + struct scatterlist *sglist); extern void blk_dump_rq_flags(struct request *, char *); extern long nr_blockdev_pages(void); -- cgit v1.2.3 From d796c52ef0b71a988364f6109aeb63d79c5b116b Mon Sep 17 00:00:00 2001 From: Theodore Ts'o Date: Sun, 5 Aug 2012 19:04:57 -0400 Subject: ext4: make sure the journal sb is written in ext4_clear_journal_err() After we transfer set the EXT4_ERROR_FS bit in the file system superblock, it's not enough to call jbd2_journal_clear_err() to clear the error indication from journal superblock --- we need to call jbd2_journal_update_sb_errno() as well. Otherwise, when the root file system is mounted read-only, the journal is replayed, and the error indicator is transferred to the superblock --- but the s_errno field in the jbd2 superblock is left set (since although we cleared it in memory, we never flushed it out to disk). This can end up confusing e2fsck. We should make e2fsck more robust in this case, but the kernel shouldn't be leaving things in this confused state, either. 
Signed-off-by: "Theodore Ts'o" Cc: stable@kernel.org --- fs/ext4/super.c | 1 + fs/jbd2/journal.c | 3 ++- include/linux/jbd2.h | 1 + 3 files changed, 4 insertions(+), 1 deletion(-) (limited to 'include') diff --git a/fs/ext4/super.c b/fs/ext4/super.c index d76ec8277d3f..ccc4bcad5616 100644 --- a/fs/ext4/super.c +++ b/fs/ext4/super.c @@ -4430,6 +4430,7 @@ static void ext4_clear_journal_err(struct super_block *sb, ext4_commit_super(sb, 1); jbd2_journal_clear_err(journal); + jbd2_journal_update_sb_errno(journal); } } diff --git a/fs/jbd2/journal.c b/fs/jbd2/journal.c index e9a3c4c85594..bd23f2ebaa67 100644 --- a/fs/jbd2/journal.c +++ b/fs/jbd2/journal.c @@ -1377,7 +1377,7 @@ static void jbd2_mark_journal_empty(journal_t *journal) * Update a journal's errno. Write updated superblock to disk waiting for IO * to complete. */ -static void jbd2_journal_update_sb_errno(journal_t *journal) +void jbd2_journal_update_sb_errno(journal_t *journal) { journal_superblock_t *sb = journal->j_superblock; @@ -1390,6 +1390,7 @@ static void jbd2_journal_update_sb_errno(journal_t *journal) jbd2_write_superblock(journal, WRITE_SYNC); } +EXPORT_SYMBOL(jbd2_journal_update_sb_errno); /* * Read the superblock for a given journal, performing initial diff --git a/include/linux/jbd2.h b/include/linux/jbd2.h index f334c7fab967..3efc43f3f162 100644 --- a/include/linux/jbd2.h +++ b/include/linux/jbd2.h @@ -1125,6 +1125,7 @@ extern int jbd2_journal_destroy (journal_t *); extern int jbd2_journal_recover (journal_t *journal); extern int jbd2_journal_wipe (journal_t *, int); extern int jbd2_journal_skip_recovery (journal_t *); +extern void jbd2_journal_update_sb_errno(journal_t *); extern void jbd2_journal_update_sb_log_tail (journal_t *, tid_t, unsigned long, int); extern void __jbd2_journal_abort_hard (journal_t *); -- cgit v1.2.3 From 276f0f5d157bb4a816053f4f3a941dbcd4f76556 Mon Sep 17 00:00:00 2001 From: Shaohua Li Date: Thu, 9 Aug 2012 15:20:23 +0200 Subject: block: disable discard request merge temporarily The SCSI discard request merge never worked, and looks no solution for in future, let's disable it temporarily. Signed-off-by: Shaohua Li Reviewed-by: Christoph Hellwig Signed-off-by: Jens Axboe --- include/linux/blkdev.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'include') diff --git a/include/linux/blkdev.h b/include/linux/blkdev.h index dc632975d54f..4a2ab7c85393 100644 --- a/include/linux/blkdev.h +++ b/include/linux/blkdev.h @@ -601,7 +601,7 @@ static inline void blk_clear_rl_full(struct request_list *rl, bool sync) * it already be started by driver. */ #define RQ_NOMERGE_FLAGS \ - (REQ_NOMERGE | REQ_STARTED | REQ_SOFTBARRIER | REQ_FLUSH | REQ_FUA) + (REQ_NOMERGE | REQ_STARTED | REQ_SOFTBARRIER | REQ_FLUSH | REQ_FUA | REQ_DISCARD) #define rq_mergeable(rq) \ (!((rq)->cmd_flags & RQ_NOMERGE_FLAGS) && \ (((rq)->cmd_flags & REQ_DISCARD) || \ -- cgit v1.2.3 From 02b69cbdc2fb2e1bfbfd9ac0c246d7be1b08d3cd Mon Sep 17 00:00:00 2001 From: Patrick McHardy Date: Thu, 9 Aug 2012 10:08:46 +0000 Subject: netfilter: nf_ct_sip: fix IPv6 address parsing Within SIP messages IPv6 addresses are enclosed in square brackets in most cases, with the exception of the "received=" header parameter. Currently the helper fails to parse enclosed addresses. This patch: - changes the SIP address parsing function to enforce square brackets when required, and accept them when not required but present, as recommended by RFC 5118. 
- adds a new SDP address parsing function that never accepts square brackets since SDP doesn't use them. With these changes, the SIP helper correctly parses all test messages from RFC 5118 (Session Initiation Protocol (SIP) Torture Test Messages for Internet Protocol Version 6 (IPv6)). Signed-off-by: Patrick McHardy Signed-off-by: Pablo Neira Ayuso --- include/linux/netfilter/nf_conntrack_sip.h | 2 +- net/ipv4/netfilter/nf_nat_sip.c | 4 +- net/netfilter/nf_conntrack_sip.c | 87 ++++++++++++++++++++++++------ 3 files changed, 73 insertions(+), 20 deletions(-) (limited to 'include') diff --git a/include/linux/netfilter/nf_conntrack_sip.h b/include/linux/netfilter/nf_conntrack_sip.h index 0dfc8b7210a3..89f2a627f3f0 100644 --- a/include/linux/netfilter/nf_conntrack_sip.h +++ b/include/linux/netfilter/nf_conntrack_sip.h @@ -164,7 +164,7 @@ extern int ct_sip_parse_address_param(const struct nf_conn *ct, const char *dptr unsigned int dataoff, unsigned int datalen, const char *name, unsigned int *matchoff, unsigned int *matchlen, - union nf_inet_addr *addr); + union nf_inet_addr *addr, bool delim); extern int ct_sip_parse_numerical_param(const struct nf_conn *ct, const char *dptr, unsigned int off, unsigned int datalen, const char *name, diff --git a/net/ipv4/netfilter/nf_nat_sip.c b/net/ipv4/netfilter/nf_nat_sip.c index ea4a23813d26..eef8f29e8bf8 100644 --- a/net/ipv4/netfilter/nf_nat_sip.c +++ b/net/ipv4/netfilter/nf_nat_sip.c @@ -173,7 +173,7 @@ static unsigned int ip_nat_sip(struct sk_buff *skb, unsigned int dataoff, * the reply. */ if (ct_sip_parse_address_param(ct, *dptr, matchend, *datalen, "maddr=", &poff, &plen, - &addr) > 0 && + &addr, true) > 0 && addr.ip == ct->tuplehash[dir].tuple.src.u3.ip && addr.ip != ct->tuplehash[!dir].tuple.dst.u3.ip) { buflen = sprintf(buffer, "%pI4", @@ -187,7 +187,7 @@ static unsigned int ip_nat_sip(struct sk_buff *skb, unsigned int dataoff, * from which the server received the request. 
*/ if (ct_sip_parse_address_param(ct, *dptr, matchend, *datalen, "received=", &poff, &plen, - &addr) > 0 && + &addr, false) > 0 && addr.ip == ct->tuplehash[dir].tuple.dst.u3.ip && addr.ip != ct->tuplehash[!dir].tuple.src.u3.ip) { buflen = sprintf(buffer, "%pI4", diff --git a/net/netfilter/nf_conntrack_sip.c b/net/netfilter/nf_conntrack_sip.c index 2fb666920cc9..5c0a112aeee6 100644 --- a/net/netfilter/nf_conntrack_sip.c +++ b/net/netfilter/nf_conntrack_sip.c @@ -183,12 +183,12 @@ static int media_len(const struct nf_conn *ct, const char *dptr, return len + digits_len(ct, dptr, limit, shift); } -static int parse_addr(const struct nf_conn *ct, const char *cp, - const char **endp, union nf_inet_addr *addr, - const char *limit) +static int sip_parse_addr(const struct nf_conn *ct, const char *cp, + const char **endp, union nf_inet_addr *addr, + const char *limit, bool delim) { const char *end; - int ret = 0; + int ret; if (!ct) return 0; @@ -197,16 +197,28 @@ static int parse_addr(const struct nf_conn *ct, const char *cp, switch (nf_ct_l3num(ct)) { case AF_INET: ret = in4_pton(cp, limit - cp, (u8 *)&addr->ip, -1, &end); + if (ret == 0) + return 0; break; case AF_INET6: + if (cp < limit && *cp == '[') + cp++; + else if (delim) + return 0; + ret = in6_pton(cp, limit - cp, (u8 *)&addr->ip6, -1, &end); + if (ret == 0) + return 0; + + if (end < limit && *end == ']') + end++; + else if (delim) + return 0; break; default: BUG(); } - if (ret == 0 || end == cp) - return 0; if (endp) *endp = end; return 1; @@ -219,7 +231,7 @@ static int epaddr_len(const struct nf_conn *ct, const char *dptr, union nf_inet_addr addr; const char *aux = dptr; - if (!parse_addr(ct, dptr, &dptr, &addr, limit)) { + if (!sip_parse_addr(ct, dptr, &dptr, &addr, limit, true)) { pr_debug("ip: %s parse failed.!\n", dptr); return 0; } @@ -296,7 +308,7 @@ int ct_sip_parse_request(const struct nf_conn *ct, return 0; dptr += shift; - if (!parse_addr(ct, dptr, &end, addr, limit)) + if (!sip_parse_addr(ct, dptr, &end, addr, limit, true)) return -1; if (end < limit && *end == ':') { end++; @@ -550,7 +562,7 @@ int ct_sip_parse_header_uri(const struct nf_conn *ct, const char *dptr, if (ret == 0) return ret; - if (!parse_addr(ct, dptr + *matchoff, &c, addr, limit)) + if (!sip_parse_addr(ct, dptr + *matchoff, &c, addr, limit, true)) return -1; if (*c == ':') { c++; @@ -599,7 +611,7 @@ int ct_sip_parse_address_param(const struct nf_conn *ct, const char *dptr, unsigned int dataoff, unsigned int datalen, const char *name, unsigned int *matchoff, unsigned int *matchlen, - union nf_inet_addr *addr) + union nf_inet_addr *addr, bool delim) { const char *limit = dptr + datalen; const char *start, *end; @@ -613,7 +625,7 @@ int ct_sip_parse_address_param(const struct nf_conn *ct, const char *dptr, return 0; start += strlen(name); - if (!parse_addr(ct, start, &end, addr, limit)) + if (!sip_parse_addr(ct, start, &end, addr, limit, delim)) return 0; *matchoff = start - dptr; *matchlen = end - start; @@ -675,6 +687,47 @@ static int ct_sip_parse_transport(struct nf_conn *ct, const char *dptr, return 1; } +static int sdp_parse_addr(const struct nf_conn *ct, const char *cp, + const char **endp, union nf_inet_addr *addr, + const char *limit) +{ + const char *end; + int ret; + + memset(addr, 0, sizeof(*addr)); + switch (nf_ct_l3num(ct)) { + case AF_INET: + ret = in4_pton(cp, limit - cp, (u8 *)&addr->ip, -1, &end); + break; + case AF_INET6: + ret = in6_pton(cp, limit - cp, (u8 *)&addr->ip6, -1, &end); + break; + default: + BUG(); + } + + if (ret == 0) + return 0; + 
if (endp) + *endp = end; + return 1; +} + +/* skip ip address. returns its length. */ +static int sdp_addr_len(const struct nf_conn *ct, const char *dptr, + const char *limit, int *shift) +{ + union nf_inet_addr addr; + const char *aux = dptr; + + if (!sdp_parse_addr(ct, dptr, &dptr, &addr, limit)) { + pr_debug("ip: %s parse failed.!\n", dptr); + return 0; + } + + return dptr - aux; +} + /* SDP header parsing: a SDP session description contains an ordered set of * headers, starting with a section containing general session parameters, * optionally followed by multiple media descriptions. @@ -686,10 +739,10 @@ static int ct_sip_parse_transport(struct nf_conn *ct, const char *dptr, */ static const struct sip_header ct_sdp_hdrs[] = { [SDP_HDR_VERSION] = SDP_HDR("v=", NULL, digits_len), - [SDP_HDR_OWNER_IP4] = SDP_HDR("o=", "IN IP4 ", epaddr_len), - [SDP_HDR_CONNECTION_IP4] = SDP_HDR("c=", "IN IP4 ", epaddr_len), - [SDP_HDR_OWNER_IP6] = SDP_HDR("o=", "IN IP6 ", epaddr_len), - [SDP_HDR_CONNECTION_IP6] = SDP_HDR("c=", "IN IP6 ", epaddr_len), + [SDP_HDR_OWNER_IP4] = SDP_HDR("o=", "IN IP4 ", sdp_addr_len), + [SDP_HDR_CONNECTION_IP4] = SDP_HDR("c=", "IN IP4 ", sdp_addr_len), + [SDP_HDR_OWNER_IP6] = SDP_HDR("o=", "IN IP6 ", sdp_addr_len), + [SDP_HDR_CONNECTION_IP6] = SDP_HDR("c=", "IN IP6 ", sdp_addr_len), + [SDP_HDR_MEDIA] = SDP_HDR("m=", NULL, media_len), }; @@ -775,8 +828,8 @@ static int ct_sip_parse_sdp_addr(const struct nf_conn *ct, const char *dptr, if (ret <= 0) return ret; - if (!parse_addr(ct, dptr + *matchoff, NULL, addr, - dptr + *matchoff + *matchlen)) + if (!sdp_parse_addr(ct, dptr + *matchoff, NULL, addr, + dptr + *matchoff + *matchlen)) return -1; return 1; } -- cgit v1.2.3 From 0bce9c46bf3b15f485d82d7e81dabed6ebcc24b1 Mon Sep 17 00:00:00 2001 From: Will Deacon Date: Fri, 10 Aug 2012 15:22:09 +0100 Subject: mutex: Place lock in contended state after fastpath_lock failure ARM recently moved to asm-generic/mutex-xchg.h for its mutex implementation after the previous implementation was found to be missing some crucial memory barriers. However, this has revealed some problems running hackbench on SMP platforms due to the way in which the MUTEX_SPIN_ON_OWNER code operates. The symptoms are that a bunch of hackbench tasks are left waiting on an unlocked mutex and therefore never get woken up to claim it. This boils down to the following sequence of events:

             Task A         Task B         Task C         Lock value
 0                                                             1
 1           lock()                                            0
 2                          lock()                             0
 3                          spin(A)                            0
 4           unlock()                                          1
 5                                         lock()              0
 6                          cmpxchg(1,0)                       0
 7                          contended()                       -1
 8           lock()                                            0
 9           spin(C)                                           0
10                                         unlock()            1
11           cmpxchg(1,0)                                      0
12           unlock()                                          1

At this point, the lock is unlocked, but Task B is in an uninterruptible sleep with nobody to wake it up. This patch fixes the problem by ensuring we put the lock into the contended state if we fail to acquire it on the fastpath, ensuring that any blocked waiters are woken up when the mutex is released.
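A minimal user-space model of the fixed fastpath, using C11 atomics in place of the kernel's atomic_xchg(); the count convention (1 unlocked, 0 locked, -1 contended) follows the trace above, and slowpath_lock() is an illustrative stub rather than the kernel's slowpath:

#include <stdatomic.h>

/* 1 = unlocked, 0 = locked with no waiters, -1 = contended (waiters queued) */
static void slowpath_lock(atomic_int *count)
{
        /* stub: queue the caller and sleep until the unlocker wakes it */
}

static void fastpath_lock(atomic_int *count)
{
        if (atomic_exchange(count, 0) != 1) {
                /*
                 * The lock was not free.  Mark it contended *before* blocking
                 * so the eventual unlock sees a negative count, takes its
                 * slowpath and issues a wakeup.  Without this second exchange
                 * the xchg above can silently overwrite -1 with 0, which is
                 * how Task B's wakeup is lost in the trace.
                 */
                if (atomic_exchange(count, -1) != 1)
                        slowpath_lock(count);
        }
}

int main(void)
{
        atomic_int count = 1;           /* start unlocked */

        fastpath_lock(&count);          /* acquires: count is now 0 */
        return 0;
}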
Signed-off-by: Will Deacon Cc: Arnd Bergmann Cc: Chris Mason Cc: Ingo Molnar Cc: Reviewed-by: Nicolas Pitre Signed-off-by: Peter Zijlstra Link: http://lkml.kernel.org/n/tip-6e9lrw2avczr0617fzl5vqb8@git.kernel.org Signed-off-by: Thomas Gleixner --- include/asm-generic/mutex-xchg.h | 11 +++++++++-- 1 file changed, 9 insertions(+), 2 deletions(-) (limited to 'include') diff --git a/include/asm-generic/mutex-xchg.h b/include/asm-generic/mutex-xchg.h index 580a6d35c700..c04e0db8a2d6 100644 --- a/include/asm-generic/mutex-xchg.h +++ b/include/asm-generic/mutex-xchg.h @@ -26,7 +26,13 @@ static inline void __mutex_fastpath_lock(atomic_t *count, void (*fail_fn)(atomic_t *)) { if (unlikely(atomic_xchg(count, 0) != 1)) - fail_fn(count); + /* + * We failed to acquire the lock, so mark it contended + * to ensure that any waiting tasks are woken up by the + * unlock slow path. + */ + if (likely(atomic_xchg(count, -1) != 1)) + fail_fn(count); } /** @@ -43,7 +49,8 @@ static inline int __mutex_fastpath_lock_retval(atomic_t *count, int (*fail_fn)(atomic_t *)) { if (unlikely(atomic_xchg(count, 0) != 1)) - return fail_fn(count); + if (likely(atomic_xchg(count, -1) != 1)) + return fail_fn(count); return 0; } -- cgit v1.2.3 From f026cfa82f628db24b8cea41b9d6202af104cecb Mon Sep 17 00:00:00 2001 From: "H. Peter Anvin" Date: Tue, 14 Aug 2012 09:53:38 -0700 Subject: Revert "x86-64/efi: Use EFI to deal with platform wall clock" MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit This reverts commit bacef661acdb634170a8faddbc1cf28e8f8b9eee. This commit has been found to cause serious regressions on a number of ASUS machines at the least. We probably need to provide a 1:1 map in addition to the EFI virtual memory map in order for this to work. Signed-off-by: H. Peter Anvin Reported-and-bisected-by: Jérôme Carretero Cc: Jan Beulich Cc: Matt Fleming Cc: Matthew Garrett Cc: Linus Torvalds Cc: Andrew Morton Cc: Peter Zijlstra Link: http://lkml.kernel.org/r/20120805172903.5f8bb24c@zougloub.eu --- arch/x86/mm/pageattr.c | 10 ++++------ arch/x86/platform/efi/efi.c | 30 ++++++++++++++++++++++++++---- include/linux/efi.h | 2 ++ init/main.c | 8 ++++---- 4 files changed, 36 insertions(+), 14 deletions(-) (limited to 'include') diff --git a/arch/x86/mm/pageattr.c b/arch/x86/mm/pageattr.c index 931930a96160..a718e0d23503 100644 --- a/arch/x86/mm/pageattr.c +++ b/arch/x86/mm/pageattr.c @@ -919,13 +919,11 @@ static int change_page_attr_set_clr(unsigned long *addr, int numpages, /* * On success we use clflush, when the CPU supports it to - * avoid the wbindv. If the CPU does not support it, in the - * error case, and during early boot (for EFI) we fall back - * to cpa_flush_all (which uses wbinvd): + * avoid the wbindv. 
If the CPU does not support it and in the + * error case we fall back to cpa_flush_all (which uses + * wbindv): */ - if (early_boot_irqs_disabled) - __cpa_flush_all((void *)(long)cache); - else if (!ret && cpu_has_clflush) { + if (!ret && cpu_has_clflush) { if (cpa.flags & (CPA_PAGES_ARRAY | CPA_ARRAY)) { cpa_flush_array(addr, numpages, cache, cpa.flags, pages); diff --git a/arch/x86/platform/efi/efi.c b/arch/x86/platform/efi/efi.c index 2dc29f51e75a..92660edaa1e7 100644 --- a/arch/x86/platform/efi/efi.c +++ b/arch/x86/platform/efi/efi.c @@ -234,7 +234,22 @@ static efi_status_t __init phys_efi_set_virtual_address_map( return status; } -static int efi_set_rtc_mmss(unsigned long nowtime) +static efi_status_t __init phys_efi_get_time(efi_time_t *tm, + efi_time_cap_t *tc) +{ + unsigned long flags; + efi_status_t status; + + spin_lock_irqsave(&rtc_lock, flags); + efi_call_phys_prelog(); + status = efi_call_phys2(efi_phys.get_time, virt_to_phys(tm), + virt_to_phys(tc)); + efi_call_phys_epilog(); + spin_unlock_irqrestore(&rtc_lock, flags); + return status; +} + +int efi_set_rtc_mmss(unsigned long nowtime) { int real_seconds, real_minutes; efi_status_t status; @@ -263,7 +278,7 @@ static int efi_set_rtc_mmss(unsigned long nowtime) return 0; } -static unsigned long efi_get_time(void) +unsigned long efi_get_time(void) { efi_status_t status; efi_time_t eft; @@ -606,13 +621,18 @@ static int __init efi_runtime_init(void) } /* * We will only need *early* access to the following - * EFI runtime service before set_virtual_address_map + * two EFI runtime services before set_virtual_address_map * is invoked. */ + efi_phys.get_time = (efi_get_time_t *)runtime->get_time; efi_phys.set_virtual_address_map = (efi_set_virtual_address_map_t *) runtime->set_virtual_address_map; - + /* + * Make efi_get_time can be called before entering + * virtual mode. 
+ */ + efi.get_time = phys_efi_get_time; early_iounmap(runtime, sizeof(efi_runtime_services_t)); return 0; @@ -700,10 +720,12 @@ void __init efi_init(void) efi_enabled = 0; return; } +#ifdef CONFIG_X86_32 if (efi_native) { x86_platform.get_wallclock = efi_get_time; x86_platform.set_wallclock = efi_set_rtc_mmss; } +#endif #if EFI_DEBUG print_efi_memmap(); diff --git a/include/linux/efi.h b/include/linux/efi.h index 103adc6d7e3a..ec45ccd8708a 100644 --- a/include/linux/efi.h +++ b/include/linux/efi.h @@ -503,6 +503,8 @@ extern u64 efi_mem_attribute (unsigned long phys_addr, unsigned long size); extern int __init efi_uart_console_only (void); extern void efi_initialize_iomem_resources(struct resource *code_resource, struct resource *data_resource, struct resource *bss_resource); +extern unsigned long efi_get_time(void); +extern int efi_set_rtc_mmss(unsigned long nowtime); extern void efi_reserve_boot_services(void); extern struct efi_memory_map memmap; diff --git a/init/main.c b/init/main.c index e60679de61c3..b28673087ac0 100644 --- a/init/main.c +++ b/init/main.c @@ -461,10 +461,6 @@ static void __init mm_init(void) percpu_init_late(); pgtable_cache_init(); vmalloc_init(); -#ifdef CONFIG_X86 - if (efi_enabled) - efi_enter_virtual_mode(); -#endif } asmlinkage void __init start_kernel(void) @@ -606,6 +602,10 @@ asmlinkage void __init start_kernel(void) calibrate_delay(); pidmap_init(); anon_vma_init(); +#ifdef CONFIG_X86 + if (efi_enabled) + efi_enter_virtual_mode(); +#endif thread_info_cache_init(); cred_init(); fork_init(totalram_pages); -- cgit v1.2.3 From 47be03a28cc6c80e3aa2b3e8ed6d960ff0c5c0af Mon Sep 17 00:00:00 2001 From: Amerigo Wang Date: Fri, 10 Aug 2012 01:24:37 +0000 Subject: netpoll: use GFP_ATOMIC in slave_enable_netpoll() and __netpoll_setup() slave_enable_netpoll() and __netpoll_setup() may be called with read_lock() held, so should use GFP_ATOMIC to allocate memory. Eric suggested to pass gfp flags to __netpoll_setup(). Cc: Eric Dumazet Cc: "David S. Miller" Reported-by: Dan Carpenter Signed-off-by: Eric Dumazet Signed-off-by: Cong Wang Signed-off-by: David S. 
Miller --- drivers/net/bonding/bond_main.c | 6 +++--- drivers/net/team/team.c | 16 +++++++++------- include/linux/netdevice.h | 3 ++- include/linux/netpoll.h | 2 +- net/8021q/vlan_dev.c | 7 ++++--- net/bridge/br_device.c | 12 ++++++------ net/bridge/br_if.c | 2 +- net/bridge/br_private.h | 4 ++-- net/core/netpoll.c | 8 ++++---- 9 files changed, 32 insertions(+), 28 deletions(-) (limited to 'include') diff --git a/drivers/net/bonding/bond_main.c b/drivers/net/bonding/bond_main.c index 6fae5f3ec7f6..8697136e27c0 100644 --- a/drivers/net/bonding/bond_main.c +++ b/drivers/net/bonding/bond_main.c @@ -1235,12 +1235,12 @@ static inline int slave_enable_netpoll(struct slave *slave) struct netpoll *np; int err = 0; - np = kzalloc(sizeof(*np), GFP_KERNEL); + np = kzalloc(sizeof(*np), GFP_ATOMIC); err = -ENOMEM; if (!np) goto out; - err = __netpoll_setup(np, slave->dev); + err = __netpoll_setup(np, slave->dev, GFP_ATOMIC); if (err) { kfree(np); goto out; @@ -1292,7 +1292,7 @@ static void bond_netpoll_cleanup(struct net_device *bond_dev) read_unlock(&bond->lock); } -static int bond_netpoll_setup(struct net_device *dev, struct netpoll_info *ni) +static int bond_netpoll_setup(struct net_device *dev, struct netpoll_info *ni, gfp_t gfp) { struct bonding *bond = netdev_priv(dev); struct slave *slave; diff --git a/drivers/net/team/team.c b/drivers/net/team/team.c index 87707ab39430..341b65dbbcd3 100644 --- a/drivers/net/team/team.c +++ b/drivers/net/team/team.c @@ -795,16 +795,17 @@ static void team_port_leave(struct team *team, struct team_port *port) } #ifdef CONFIG_NET_POLL_CONTROLLER -static int team_port_enable_netpoll(struct team *team, struct team_port *port) +static int team_port_enable_netpoll(struct team *team, struct team_port *port, + gfp_t gfp) { struct netpoll *np; int err; - np = kzalloc(sizeof(*np), GFP_KERNEL); + np = kzalloc(sizeof(*np), gfp); if (!np) return -ENOMEM; - err = __netpoll_setup(np, port->dev); + err = __netpoll_setup(np, port->dev, gfp); if (err) { kfree(np); return err; @@ -833,7 +834,8 @@ static struct netpoll_info *team_netpoll_info(struct team *team) } #else -static int team_port_enable_netpoll(struct team *team, struct team_port *port) +static int team_port_enable_netpoll(struct team *team, struct team_port *port, + gfp_t gfp) { return 0; } @@ -913,7 +915,7 @@ static int team_port_add(struct team *team, struct net_device *port_dev) } if (team_netpoll_info(team)) { - err = team_port_enable_netpoll(team, port); + err = team_port_enable_netpoll(team, port, GFP_KERNEL); if (err) { netdev_err(dev, "Failed to enable netpoll on device %s\n", portname); @@ -1443,7 +1445,7 @@ static void team_netpoll_cleanup(struct net_device *dev) } static int team_netpoll_setup(struct net_device *dev, - struct netpoll_info *npifo) + struct netpoll_info *npifo, gfp_t gfp) { struct team *team = netdev_priv(dev); struct team_port *port; @@ -1451,7 +1453,7 @@ static int team_netpoll_setup(struct net_device *dev, mutex_lock(&team->lock); list_for_each_entry(port, &team->port_list, list) { - err = team_port_enable_netpoll(team, port); + err = team_port_enable_netpoll(team, port, gfp); if (err) { __team_netpoll_cleanup(team); break; diff --git a/include/linux/netdevice.h b/include/linux/netdevice.h index a9db4f33407f..3560d688161e 100644 --- a/include/linux/netdevice.h +++ b/include/linux/netdevice.h @@ -953,7 +953,8 @@ struct net_device_ops { #ifdef CONFIG_NET_POLL_CONTROLLER void (*ndo_poll_controller)(struct net_device *dev); int (*ndo_netpoll_setup)(struct net_device *dev, - struct netpoll_info 
*info); + struct netpoll_info *info, + gfp_t gfp); void (*ndo_netpoll_cleanup)(struct net_device *dev); #endif int (*ndo_set_vf_mac)(struct net_device *dev, diff --git a/include/linux/netpoll.h b/include/linux/netpoll.h index 28f5389c924b..bf2d51eec0f3 100644 --- a/include/linux/netpoll.h +++ b/include/linux/netpoll.h @@ -43,7 +43,7 @@ struct netpoll_info { void netpoll_send_udp(struct netpoll *np, const char *msg, int len); void netpoll_print_options(struct netpoll *np); int netpoll_parse_options(struct netpoll *np, char *opt); -int __netpoll_setup(struct netpoll *np, struct net_device *ndev); +int __netpoll_setup(struct netpoll *np, struct net_device *ndev, gfp_t gfp); int netpoll_setup(struct netpoll *np); int netpoll_trap(void); void netpoll_set_trap(int trap); diff --git a/net/8021q/vlan_dev.c b/net/8021q/vlan_dev.c index 73a2a83ee2da..ee4ae0944cef 100644 --- a/net/8021q/vlan_dev.c +++ b/net/8021q/vlan_dev.c @@ -669,19 +669,20 @@ static void vlan_dev_poll_controller(struct net_device *dev) return; } -static int vlan_dev_netpoll_setup(struct net_device *dev, struct netpoll_info *npinfo) +static int vlan_dev_netpoll_setup(struct net_device *dev, struct netpoll_info *npinfo, + gfp_t gfp) { struct vlan_dev_priv *info = vlan_dev_priv(dev); struct net_device *real_dev = info->real_dev; struct netpoll *netpoll; int err = 0; - netpoll = kzalloc(sizeof(*netpoll), GFP_KERNEL); + netpoll = kzalloc(sizeof(*netpoll), gfp); err = -ENOMEM; if (!netpoll) goto out; - err = __netpoll_setup(netpoll, real_dev); + err = __netpoll_setup(netpoll, real_dev, gfp); if (err) { kfree(netpoll); goto out; diff --git a/net/bridge/br_device.c b/net/bridge/br_device.c index 333484537600..ed0e0f9dc788 100644 --- a/net/bridge/br_device.c +++ b/net/bridge/br_device.c @@ -213,7 +213,8 @@ static void br_netpoll_cleanup(struct net_device *dev) } } -static int br_netpoll_setup(struct net_device *dev, struct netpoll_info *ni) +static int br_netpoll_setup(struct net_device *dev, struct netpoll_info *ni, + gfp_t gfp) { struct net_bridge *br = netdev_priv(dev); struct net_bridge_port *p, *n; @@ -222,8 +223,7 @@ static int br_netpoll_setup(struct net_device *dev, struct netpoll_info *ni) list_for_each_entry_safe(p, n, &br->port_list, list) { if (!p->dev) continue; - - err = br_netpoll_enable(p); + err = br_netpoll_enable(p, gfp); if (err) goto fail; } @@ -236,17 +236,17 @@ fail: goto out; } -int br_netpoll_enable(struct net_bridge_port *p) +int br_netpoll_enable(struct net_bridge_port *p, gfp_t gfp) { struct netpoll *np; int err = 0; - np = kzalloc(sizeof(*p->np), GFP_KERNEL); + np = kzalloc(sizeof(*p->np), gfp); err = -ENOMEM; if (!np) goto out; - err = __netpoll_setup(np, p->dev); + err = __netpoll_setup(np, p->dev, gfp); if (err) { kfree(np); goto out; diff --git a/net/bridge/br_if.c b/net/bridge/br_if.c index e1144e1617be..171fd6b9bfe6 100644 --- a/net/bridge/br_if.c +++ b/net/bridge/br_if.c @@ -361,7 +361,7 @@ int br_add_if(struct net_bridge *br, struct net_device *dev) if (err) goto err2; - if (br_netpoll_info(br) && ((err = br_netpoll_enable(p)))) + if (br_netpoll_info(br) && ((err = br_netpoll_enable(p, GFP_KERNEL)))) goto err3; err = netdev_set_master(dev, br->dev); diff --git a/net/bridge/br_private.h b/net/bridge/br_private.h index a768b2408edf..f507d2af9646 100644 --- a/net/bridge/br_private.h +++ b/net/bridge/br_private.h @@ -316,7 +316,7 @@ static inline void br_netpoll_send_skb(const struct net_bridge_port *p, netpoll_send_skb(np, skb); } -extern int br_netpoll_enable(struct net_bridge_port *p); +extern int 
br_netpoll_enable(struct net_bridge_port *p, gfp_t gfp); extern void br_netpoll_disable(struct net_bridge_port *p); #else static inline struct netpoll_info *br_netpoll_info(struct net_bridge *br) @@ -329,7 +329,7 @@ static inline void br_netpoll_send_skb(const struct net_bridge_port *p, { } -static inline int br_netpoll_enable(struct net_bridge_port *p) +static inline int br_netpoll_enable(struct net_bridge_port *p, gfp_t gfp) { return 0; } diff --git a/net/core/netpoll.c b/net/core/netpoll.c index b4c90e42b443..37cc854774a4 100644 --- a/net/core/netpoll.c +++ b/net/core/netpoll.c @@ -715,7 +715,7 @@ int netpoll_parse_options(struct netpoll *np, char *opt) } EXPORT_SYMBOL(netpoll_parse_options); -int __netpoll_setup(struct netpoll *np, struct net_device *ndev) +int __netpoll_setup(struct netpoll *np, struct net_device *ndev, gfp_t gfp) { struct netpoll_info *npinfo; const struct net_device_ops *ops; @@ -734,7 +734,7 @@ int __netpoll_setup(struct netpoll *np, struct net_device *ndev) } if (!ndev->npinfo) { - npinfo = kmalloc(sizeof(*npinfo), GFP_KERNEL); + npinfo = kmalloc(sizeof(*npinfo), gfp); if (!npinfo) { err = -ENOMEM; goto out; @@ -752,7 +752,7 @@ int __netpoll_setup(struct netpoll *np, struct net_device *ndev) ops = np->dev->netdev_ops; if (ops->ndo_netpoll_setup) { - err = ops->ndo_netpoll_setup(ndev, npinfo); + err = ops->ndo_netpoll_setup(ndev, npinfo, gfp); if (err) goto free_npinfo; } @@ -857,7 +857,7 @@ int netpoll_setup(struct netpoll *np) refill_skbs(); rtnl_lock(); - err = __netpoll_setup(np, ndev); + err = __netpoll_setup(np, ndev, GFP_KERNEL); rtnl_unlock(); if (err) -- cgit v1.2.3 From 38e6bc185d9544dfad1774b3f8902a0b061aea25 Mon Sep 17 00:00:00 2001 From: Amerigo Wang Date: Fri, 10 Aug 2012 01:24:38 +0000 Subject: netpoll: make __netpoll_cleanup non-block Like the previous patch, slave_disable_netpoll() and __netpoll_cleanup() may be called with read_lock() held too, so we should make them non-block, by moving the cleanup and kfree() to call_rcu_bh() callbacks. Cc: "David S. Miller" Signed-off-by: Cong Wang Signed-off-by: David S. 
Miller --- drivers/net/bonding/bond_main.c | 4 +--- include/linux/netpoll.h | 3 +++ net/8021q/vlan_dev.c | 6 +----- net/bridge/br_device.c | 6 +----- net/core/netpoll.c | 42 +++++++++++++++++++++++++++++++---------- 5 files changed, 38 insertions(+), 23 deletions(-) (limited to 'include') diff --git a/drivers/net/bonding/bond_main.c b/drivers/net/bonding/bond_main.c index 8697136e27c0..e42891683e3b 100644 --- a/drivers/net/bonding/bond_main.c +++ b/drivers/net/bonding/bond_main.c @@ -1257,9 +1257,7 @@ static inline void slave_disable_netpoll(struct slave *slave) return; slave->np = NULL; - synchronize_rcu_bh(); - __netpoll_cleanup(np); - kfree(np); + __netpoll_free_rcu(np); } static inline bool slave_dev_support_netpoll(struct net_device *slave_dev) { diff --git a/include/linux/netpoll.h b/include/linux/netpoll.h index bf2d51eec0f3..907812efb4d9 100644 --- a/include/linux/netpoll.h +++ b/include/linux/netpoll.h @@ -23,6 +23,7 @@ struct netpoll { u8 remote_mac[ETH_ALEN]; struct list_head rx; /* rx_np list element */ + struct rcu_head rcu; }; struct netpoll_info { @@ -38,6 +39,7 @@ struct netpoll_info { struct delayed_work tx_work; struct netpoll *netpoll; + struct rcu_head rcu; }; void netpoll_send_udp(struct netpoll *np, const char *msg, int len); @@ -48,6 +50,7 @@ int netpoll_setup(struct netpoll *np); int netpoll_trap(void); void netpoll_set_trap(int trap); void __netpoll_cleanup(struct netpoll *np); +void __netpoll_free_rcu(struct netpoll *np); void netpoll_cleanup(struct netpoll *np); int __netpoll_rx(struct sk_buff *skb); void netpoll_send_skb_on_dev(struct netpoll *np, struct sk_buff *skb, diff --git a/net/8021q/vlan_dev.c b/net/8021q/vlan_dev.c index ee4ae0944cef..b65623f90660 100644 --- a/net/8021q/vlan_dev.c +++ b/net/8021q/vlan_dev.c @@ -704,11 +704,7 @@ static void vlan_dev_netpoll_cleanup(struct net_device *dev) info->netpoll = NULL; - /* Wait for transmitting packets to finish before freeing. */ - synchronize_rcu_bh(); - - __netpoll_cleanup(netpoll); - kfree(netpoll); + __netpoll_free_rcu(netpoll); } #endif /* CONFIG_NET_POLL_CONTROLLER */ diff --git a/net/bridge/br_device.c b/net/bridge/br_device.c index ed0e0f9dc788..f41ba4048c9a 100644 --- a/net/bridge/br_device.c +++ b/net/bridge/br_device.c @@ -267,11 +267,7 @@ void br_netpoll_disable(struct net_bridge_port *p) p->np = NULL; - /* Wait for transmitting packets to finish before freeing. 
*/ - synchronize_rcu_bh(); - - __netpoll_cleanup(np); - kfree(np); + __netpoll_free_rcu(np); } #endif diff --git a/net/core/netpoll.c b/net/core/netpoll.c index 37cc854774a4..dc17f1db1479 100644 --- a/net/core/netpoll.c +++ b/net/core/netpoll.c @@ -878,6 +878,24 @@ static int __init netpoll_init(void) } core_initcall(netpoll_init); +static void rcu_cleanup_netpoll_info(struct rcu_head *rcu_head) +{ + struct netpoll_info *npinfo = + container_of(rcu_head, struct netpoll_info, rcu); + + skb_queue_purge(&npinfo->arp_tx); + skb_queue_purge(&npinfo->txq); + + /* we can't call cancel_delayed_work_sync here, as we are in softirq */ + cancel_delayed_work(&npinfo->tx_work); + + /* clean after last, unfinished work */ + __skb_queue_purge(&npinfo->txq); + /* now cancel it again */ + cancel_delayed_work(&npinfo->tx_work); + kfree(npinfo); +} + void __netpoll_cleanup(struct netpoll *np) { struct netpoll_info *npinfo; @@ -903,20 +921,24 @@ void __netpoll_cleanup(struct netpoll *np) ops->ndo_netpoll_cleanup(np->dev); RCU_INIT_POINTER(np->dev->npinfo, NULL); + call_rcu_bh(&npinfo->rcu, rcu_cleanup_netpoll_info); + } +} +EXPORT_SYMBOL_GPL(__netpoll_cleanup); - /* avoid racing with NAPI reading npinfo */ - synchronize_rcu_bh(); +static void rcu_cleanup_netpoll(struct rcu_head *rcu_head) +{ + struct netpoll *np = container_of(rcu_head, struct netpoll, rcu); - skb_queue_purge(&npinfo->arp_tx); - skb_queue_purge(&npinfo->txq); - cancel_delayed_work_sync(&npinfo->tx_work); + __netpoll_cleanup(np); + kfree(np); +} - /* clean after last, unfinished work */ - __skb_queue_purge(&npinfo->txq); - kfree(npinfo); - } +void __netpoll_free_rcu(struct netpoll *np) +{ + call_rcu_bh(&np->rcu, rcu_cleanup_netpoll); } -EXPORT_SYMBOL_GPL(__netpoll_cleanup); +EXPORT_SYMBOL_GPL(__netpoll_free_rcu); void netpoll_cleanup(struct netpoll *np) { -- cgit v1.2.3 From 57c5d46191e75312934c00eba65b13a31ca95120 Mon Sep 17 00:00:00 2001 From: Amerigo Wang Date: Fri, 10 Aug 2012 01:24:40 +0000 Subject: netpoll: take rcu_read_lock_bh() in netpoll_rx() In __netpoll_rx(), it dereferences ->npinfo without rcu_dereference_bh(), this patch fixes it by using the 'npinfo' passed from netpoll_rx() where it is already dereferenced with rcu_dereference_bh(). Cc: "David S. Miller" Signed-off-by: Cong Wang Signed-off-by: David S. 
Miller --- include/linux/netpoll.h | 4 ++-- net/core/netpoll.c | 3 +-- 2 files changed, 3 insertions(+), 4 deletions(-) (limited to 'include') diff --git a/include/linux/netpoll.h b/include/linux/netpoll.h index 907812efb4d9..5d881c388273 100644 --- a/include/linux/netpoll.h +++ b/include/linux/netpoll.h @@ -52,7 +52,7 @@ void netpoll_set_trap(int trap); void __netpoll_cleanup(struct netpoll *np); void __netpoll_free_rcu(struct netpoll *np); void netpoll_cleanup(struct netpoll *np); -int __netpoll_rx(struct sk_buff *skb); +int __netpoll_rx(struct sk_buff *skb, struct netpoll_info *npinfo); void netpoll_send_skb_on_dev(struct netpoll *np, struct sk_buff *skb, struct net_device *dev); static inline void netpoll_send_skb(struct netpoll *np, struct sk_buff *skb) @@ -77,7 +77,7 @@ static inline bool netpoll_rx(struct sk_buff *skb) spin_lock(&npinfo->rx_lock); /* check rx_flags again with the lock held */ - if (npinfo->rx_flags && __netpoll_rx(skb)) + if (npinfo->rx_flags && __netpoll_rx(skb, npinfo)) ret = true; spin_unlock(&npinfo->rx_lock); diff --git a/net/core/netpoll.c b/net/core/netpoll.c index dc17f1db1479..d055bb01328b 100644 --- a/net/core/netpoll.c +++ b/net/core/netpoll.c @@ -543,13 +543,12 @@ static void arp_reply(struct sk_buff *skb) spin_unlock_irqrestore(&npinfo->rx_lock, flags); } -int __netpoll_rx(struct sk_buff *skb) +int __netpoll_rx(struct sk_buff *skb, struct netpoll_info *npinfo) { int proto, len, ulen; int hits = 0; const struct iphdr *iph; struct udphdr *uh; - struct netpoll_info *npinfo = skb->dev->npinfo; struct netpoll *np, *tmp; if (list_empty(&npinfo->rx_np)) -- cgit v1.2.3 From 91fe4a4b9e490a24f6702dd8afe72d8afab6fcdb Mon Sep 17 00:00:00 2001 From: Amerigo Wang Date: Fri, 10 Aug 2012 01:24:41 +0000 Subject: netpoll: use netpoll_rx_on() in netpoll_rx() The logic of the code is same, just call netpoll_rx_on(). Cc: "David S. Miller" Signed-off-by: Cong Wang Signed-off-by: David S. 
Miller --- include/linux/netpoll.h | 18 +++++++++--------- 1 file changed, 9 insertions(+), 9 deletions(-) (limited to 'include') diff --git a/include/linux/netpoll.h b/include/linux/netpoll.h index 5d881c388273..2d178baa49df 100644 --- a/include/linux/netpoll.h +++ b/include/linux/netpoll.h @@ -63,6 +63,13 @@ static inline void netpoll_send_skb(struct netpoll *np, struct sk_buff *skb) #ifdef CONFIG_NETPOLL +static inline int netpoll_rx_on(struct sk_buff *skb) +{ + struct netpoll_info *npinfo = rcu_dereference_bh(skb->dev->npinfo); + + return npinfo && (!list_empty(&npinfo->rx_np) || npinfo->rx_flags); +} + static inline bool netpoll_rx(struct sk_buff *skb) { struct netpoll_info *npinfo; @@ -70,11 +77,11 @@ static inline bool netpoll_rx(struct sk_buff *skb) bool ret = false; local_irq_save(flags); - npinfo = rcu_dereference_bh(skb->dev->npinfo); - if (!npinfo || (list_empty(&npinfo->rx_np) && !npinfo->rx_flags)) + if (!netpoll_rx_on(skb)) goto out; + npinfo = rcu_dereference_bh(skb->dev->npinfo); spin_lock(&npinfo->rx_lock); /* check rx_flags again with the lock held */ if (npinfo->rx_flags && __netpoll_rx(skb, npinfo)) @@ -86,13 +93,6 @@ out: return ret; } -static inline int netpoll_rx_on(struct sk_buff *skb) -{ - struct netpoll_info *npinfo = rcu_dereference_bh(skb->dev->npinfo); - - return npinfo && (!list_empty(&npinfo->rx_np) || npinfo->rx_flags); -} - static inline int netpoll_receive_skb(struct sk_buff *skb) { if (!list_empty(&skb->dev->napi_list)) -- cgit v1.2.3 From 2899656b494dcd118123af1126826b115c8ea6f9 Mon Sep 17 00:00:00 2001 From: Amerigo Wang Date: Fri, 10 Aug 2012 01:24:42 +0000 Subject: netpoll: take rcu_read_lock_bh() in netpoll_send_skb_on_dev() This patch fixes several problems in the call path of netpoll_send_skb_on_dev(): 1. Disable IRQ's before calling netpoll_send_skb_on_dev(). 2. All the callees of netpoll_send_skb_on_dev() should use rcu_dereference_bh() to dereference ->npinfo. 3. Rename arp_reply() to netpoll_arp_reply(), the former is too generic. Cc: "David S. Miller" Signed-off-by: Cong Wang Signed-off-by: David S. 
Miller --- include/linux/netpoll.h | 3 +++ net/core/netpoll.c | 31 +++++++++++++++++-------------- 2 files changed, 20 insertions(+), 14 deletions(-) (limited to 'include') diff --git a/include/linux/netpoll.h b/include/linux/netpoll.h index 2d178baa49df..61aee86cf21d 100644 --- a/include/linux/netpoll.h +++ b/include/linux/netpoll.h @@ -57,7 +57,10 @@ void netpoll_send_skb_on_dev(struct netpoll *np, struct sk_buff *skb, struct net_device *dev); static inline void netpoll_send_skb(struct netpoll *np, struct sk_buff *skb) { + unsigned long flags; + local_irq_save(flags); netpoll_send_skb_on_dev(np, skb, np->dev); + local_irq_restore(flags); } diff --git a/net/core/netpoll.c b/net/core/netpoll.c index d055bb01328b..174346ac15a0 100644 --- a/net/core/netpoll.c +++ b/net/core/netpoll.c @@ -54,7 +54,7 @@ static atomic_t trapped; MAX_UDP_CHUNK) static void zap_completion_queue(void); -static void arp_reply(struct sk_buff *skb); +static void netpoll_arp_reply(struct sk_buff *skb, struct netpoll_info *npinfo); static unsigned int carrier_timeout = 4; module_param(carrier_timeout, uint, 0644); @@ -170,7 +170,8 @@ static void poll_napi(struct net_device *dev) list_for_each_entry(napi, &dev->napi_list, dev_list) { if (napi->poll_owner != smp_processor_id() && spin_trylock(&napi->poll_lock)) { - budget = poll_one_napi(dev->npinfo, napi, budget); + budget = poll_one_napi(rcu_dereference_bh(dev->npinfo), + napi, budget); spin_unlock(&napi->poll_lock); if (!budget) @@ -185,13 +186,14 @@ static void service_arp_queue(struct netpoll_info *npi) struct sk_buff *skb; while ((skb = skb_dequeue(&npi->arp_tx))) - arp_reply(skb); + netpoll_arp_reply(skb, npi); } } static void netpoll_poll_dev(struct net_device *dev) { const struct net_device_ops *ops; + struct netpoll_info *ni = rcu_dereference_bh(dev->npinfo); if (!dev || !netif_running(dev)) return; @@ -206,17 +208,18 @@ static void netpoll_poll_dev(struct net_device *dev) poll_napi(dev); if (dev->flags & IFF_SLAVE) { - if (dev->npinfo) { + if (ni) { struct net_device *bond_dev = dev->master; struct sk_buff *skb; - while ((skb = skb_dequeue(&dev->npinfo->arp_tx))) { + struct netpoll_info *bond_ni = rcu_dereference_bh(bond_dev->npinfo); + while ((skb = skb_dequeue(&ni->arp_tx))) { skb->dev = bond_dev; - skb_queue_tail(&bond_dev->npinfo->arp_tx, skb); + skb_queue_tail(&bond_ni->arp_tx, skb); } } } - service_arp_queue(dev->npinfo); + service_arp_queue(ni); zap_completion_queue(); } @@ -302,6 +305,7 @@ static int netpoll_owner_active(struct net_device *dev) return 0; } +/* call with IRQ disabled */ void netpoll_send_skb_on_dev(struct netpoll *np, struct sk_buff *skb, struct net_device *dev) { @@ -309,8 +313,11 @@ void netpoll_send_skb_on_dev(struct netpoll *np, struct sk_buff *skb, unsigned long tries; const struct net_device_ops *ops = dev->netdev_ops; /* It is up to the caller to keep npinfo alive. 
*/ - struct netpoll_info *npinfo = np->dev->npinfo; + struct netpoll_info *npinfo; + + WARN_ON_ONCE(!irqs_disabled()); + npinfo = rcu_dereference_bh(np->dev->npinfo); if (!npinfo || !netif_running(dev) || !netif_device_present(dev)) { __kfree_skb(skb); return; @@ -319,11 +326,9 @@ void netpoll_send_skb_on_dev(struct netpoll *np, struct sk_buff *skb, /* don't get messages out of order, and no recursion */ if (skb_queue_len(&npinfo->txq) == 0 && !netpoll_owner_active(dev)) { struct netdev_queue *txq; - unsigned long flags; txq = netdev_get_tx_queue(dev, skb_get_queue_mapping(skb)); - local_irq_save(flags); /* try until next clock tick */ for (tries = jiffies_to_usecs(1)/USEC_PER_POLL; tries > 0; --tries) { @@ -347,10 +352,9 @@ void netpoll_send_skb_on_dev(struct netpoll *np, struct sk_buff *skb, } WARN_ONCE(!irqs_disabled(), - "netpoll_send_skb(): %s enabled interrupts in poll (%pF)\n", + "netpoll_send_skb_on_dev(): %s enabled interrupts in poll (%pF)\n", dev->name, ops->ndo_start_xmit); - local_irq_restore(flags); } if (status != NETDEV_TX_OK) { @@ -423,9 +427,8 @@ void netpoll_send_udp(struct netpoll *np, const char *msg, int len) } EXPORT_SYMBOL(netpoll_send_udp); -static void arp_reply(struct sk_buff *skb) +static void netpoll_arp_reply(struct sk_buff *skb, struct netpoll_info *npinfo) { - struct netpoll_info *npinfo = skb->dev->npinfo; struct arphdr *arp; unsigned char *arp_ptr; int size, type = ARPOP_REPLY, ptype = ETH_P_ARP; -- cgit v1.2.3 From e15c3c2294605f09f9b336b2f3b97086ab4b8145 Mon Sep 17 00:00:00 2001 From: Amerigo Wang Date: Fri, 10 Aug 2012 01:24:45 +0000 Subject: netpoll: check netpoll tx status on the right device Although this doesn't matter actually, because netpoll_tx_running() doesn't use the parameter, the code will be more readable. For team_dev_queue_xmit() we have to move it down to avoid compile errors. Cc: David Miller Signed-off-by: Jiri Pirko Signed-off-by: Cong Wang Signed-off-by: David S. 
Miller --- drivers/net/bonding/bond_main.c | 2 +- include/linux/if_team.h | 30 +++++++++++++++--------------- net/bridge/br_forward.c | 2 +- 3 files changed, 17 insertions(+), 17 deletions(-) (limited to 'include') diff --git a/drivers/net/bonding/bond_main.c b/drivers/net/bonding/bond_main.c index e42891683e3b..d688a8af432c 100644 --- a/drivers/net/bonding/bond_main.c +++ b/drivers/net/bonding/bond_main.c @@ -398,7 +398,7 @@ int bond_dev_queue_xmit(struct bonding *bond, struct sk_buff *skb, sizeof(qdisc_skb_cb(skb)->slave_dev_queue_mapping)); skb->queue_mapping = qdisc_skb_cb(skb)->slave_dev_queue_mapping; - if (unlikely(netpoll_tx_running(slave_dev))) + if (unlikely(netpoll_tx_running(bond->dev))) bond_netpoll_send_skb(bond_get_slave_by_dev(bond, slave_dev), skb); else dev_queue_xmit(skb); diff --git a/include/linux/if_team.h b/include/linux/if_team.h index 6960fc1841a7..aa2e167e1ef4 100644 --- a/include/linux/if_team.h +++ b/include/linux/if_team.h @@ -96,21 +96,6 @@ static inline void team_netpoll_send_skb(struct team_port *port, } #endif -static inline int team_dev_queue_xmit(struct team *team, struct team_port *port, - struct sk_buff *skb) -{ - BUILD_BUG_ON(sizeof(skb->queue_mapping) != - sizeof(qdisc_skb_cb(skb)->slave_dev_queue_mapping)); - skb_set_queue_mapping(skb, qdisc_skb_cb(skb)->slave_dev_queue_mapping); - - skb->dev = port->dev; - if (unlikely(netpoll_tx_running(port->dev))) { - team_netpoll_send_skb(port, skb); - return 0; - } - return dev_queue_xmit(skb); -} - struct team_mode_ops { int (*init)(struct team *team); void (*exit)(struct team *team); @@ -200,6 +185,21 @@ struct team { long mode_priv[TEAM_MODE_PRIV_LONGS]; }; +static inline int team_dev_queue_xmit(struct team *team, struct team_port *port, + struct sk_buff *skb) +{ + BUILD_BUG_ON(sizeof(skb->queue_mapping) != + sizeof(qdisc_skb_cb(skb)->slave_dev_queue_mapping)); + skb_set_queue_mapping(skb, qdisc_skb_cb(skb)->slave_dev_queue_mapping); + + skb->dev = port->dev; + if (unlikely(netpoll_tx_running(team->dev))) { + team_netpoll_send_skb(port, skb); + return 0; + } + return dev_queue_xmit(skb); +} + static inline struct hlist_head *team_port_index_hash(struct team *team, int port_index) { diff --git a/net/bridge/br_forward.c b/net/bridge/br_forward.c index e9466d412707..02015a505d2a 100644 --- a/net/bridge/br_forward.c +++ b/net/bridge/br_forward.c @@ -65,7 +65,7 @@ static void __br_deliver(const struct net_bridge_port *to, struct sk_buff *skb) { skb->dev = to->dev; - if (unlikely(netpoll_tx_running(to->dev))) { + if (unlikely(netpoll_tx_running(to->br->dev))) { if (packet_length(skb) > skb->dev->mtu && !skb_is_gso(skb)) kfree_skb(skb); else { -- cgit v1.2.3 From 77ab8a54d9a8dcc4a46484a04133314f33f2aba6 Mon Sep 17 00:00:00 2001 From: Amerigo Wang Date: Fri, 10 Aug 2012 01:24:46 +0000 Subject: netpoll: convert several functions to bool These functions are just boolean, let them return bool instead of int. Cc: David Miller Signed-off-by: Cong Wang Signed-off-by: David S. 
Miller --- include/linux/netpoll.h | 14 +++++++------- 1 file changed, 7 insertions(+), 7 deletions(-) (limited to 'include') diff --git a/include/linux/netpoll.h b/include/linux/netpoll.h index 61aee86cf21d..66d5379c305e 100644 --- a/include/linux/netpoll.h +++ b/include/linux/netpoll.h @@ -66,7 +66,7 @@ static inline void netpoll_send_skb(struct netpoll *np, struct sk_buff *skb) #ifdef CONFIG_NETPOLL -static inline int netpoll_rx_on(struct sk_buff *skb) +static inline bool netpoll_rx_on(struct sk_buff *skb) { struct netpoll_info *npinfo = rcu_dereference_bh(skb->dev->npinfo); @@ -125,7 +125,7 @@ static inline void netpoll_poll_unlock(void *have) } } -static inline int netpoll_tx_running(struct net_device *dev) +static inline bool netpoll_tx_running(struct net_device *dev) { return irqs_disabled(); } @@ -133,11 +133,11 @@ static inline int netpoll_tx_running(struct net_device *dev) #else static inline bool netpoll_rx(struct sk_buff *skb) { - return 0; + return false; } -static inline int netpoll_rx_on(struct sk_buff *skb) +static inline bool netpoll_rx_on(struct sk_buff *skb) { - return 0; + return false; } static inline int netpoll_receive_skb(struct sk_buff *skb) { @@ -153,9 +153,9 @@ static inline void netpoll_poll_unlock(void *have) static inline void netpoll_netdev_init(struct net_device *dev) { } -static inline int netpoll_tx_running(struct net_device *dev) +static inline bool netpoll_tx_running(struct net_device *dev) { - return 0; + return false; } #endif -- cgit v1.2.3 From 6024935f5ff5f1646bce8404416318e5fd4a0c4a Mon Sep 17 00:00:00 2001 From: Ben Hutchings Date: Mon, 13 Aug 2012 02:49:59 +0000 Subject: llc2: Fix silent failure of llc_station_init() llc_station_init() creates and processes an event skb with no effect other than to change the state from DOWN to UP. Allocation failure is reported, but then ignored by its caller, llc2_init(). Remove this possibility by simply initialising the state as UP. Signed-off-by: Ben Hutchings Signed-off-by: David S. 
Miller --- include/net/llc.h | 2 +- net/llc/llc_station.c | 19 ++----------------- 2 files changed, 3 insertions(+), 18 deletions(-) (limited to 'include') diff --git a/include/net/llc.h b/include/net/llc.h index 226c846cab08..f2d0fc570527 100644 --- a/include/net/llc.h +++ b/include/net/llc.h @@ -133,7 +133,7 @@ extern int llc_build_and_send_ui_pkt(struct llc_sap *sap, struct sk_buff *skb, extern void llc_sap_handler(struct llc_sap *sap, struct sk_buff *skb); extern void llc_conn_handler(struct llc_sap *sap, struct sk_buff *skb); -extern int llc_station_init(void); +extern void llc_station_init(void); extern void llc_station_exit(void); #ifdef CONFIG_PROC_FS diff --git a/net/llc/llc_station.c b/net/llc/llc_station.c index 6828e39ec2ec..45ddbb93c5d0 100644 --- a/net/llc/llc_station.c +++ b/net/llc/llc_station.c @@ -687,12 +687,8 @@ static void llc_station_rcv(struct sk_buff *skb) llc_station_state_process(skb); } -int __init llc_station_init(void) +void __init llc_station_init(void) { - int rc = -ENOBUFS; - struct sk_buff *skb; - struct llc_station_state_ev *ev; - skb_queue_head_init(&llc_main_station.mac_pdu_q); skb_queue_head_init(&llc_main_station.ev_q.list); spin_lock_init(&llc_main_station.ev_q.lock); @@ -700,20 +696,9 @@ int __init llc_station_init(void) (unsigned long)&llc_main_station); llc_main_station.ack_timer.expires = jiffies + sysctl_llc_station_ack_timeout; - skb = alloc_skb(0, GFP_ATOMIC); - if (!skb) - goto out; - rc = 0; llc_set_station_handler(llc_station_rcv); - ev = llc_station_ev(skb); - memset(ev, 0, sizeof(*ev)); llc_main_station.maximum_retry = 1; - llc_main_station.state = LLC_STATION_STATE_DOWN; - ev->type = LLC_STATION_EV_TYPE_SIMPLE; - ev->prim_type = LLC_STATION_EV_ENABLE_WITHOUT_DUP_ADDR_CHECK; - rc = llc_station_next_state(skb); -out: - return rc; + llc_main_station.state = LLC_STATION_STATE_UP; } void __exit llc_station_exit(void) -- cgit v1.2.3 From 4e8b14526ca7fb046a81c94002c1c43b6fdf0e9b Mon Sep 17 00:00:00 2001 From: John Stultz Date: Wed, 8 Aug 2012 15:36:20 -0400 Subject: time: Improve sanity checking of timekeeping inputs Unexpected behavior could occur if the time is set to a value large enough to overflow a 64bit ktime_t (which is something larger then the year 2262). Also unexpected behavior could occur if large negative offsets are injected via adjtimex. So this patch improves the sanity check timekeeping inputs by improving the timespec_valid() check, and then makes better use of timespec_valid() to make sure we don't set the time to an invalid negative value or one that overflows ktime_t. Note: This does not protect from setting the time close to overflowing ktime_t and then letting natural accumulation cause the overflow. 
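A standalone sketch of the strengthened check; it mirrors the timespec_valid() logic this patch adds but uses plain C types and a local struct so it can be compiled and tested outside the kernel (the 64-bit KTIME_SEC_MAX definition is assumed):

#include <stdbool.h>
#include <stdint.h>
#include <stdio.h>

#define NSEC_PER_SEC    1000000000LL
#define KTIME_MAX       ((int64_t)~((uint64_t)1 << 63))
#define KTIME_SEC_MAX   (KTIME_MAX / NSEC_PER_SEC)      /* roughly the year 2262 */

struct ts64 { int64_t tv_sec; long tv_nsec; };          /* local stand-in for struct timespec */

static bool ts_valid(const struct ts64 *ts)
{
        if (ts->tv_sec < 0)                             /* dates before 1970 are bogus */
                return false;
        if (ts->tv_nsec < 0 || ts->tv_nsec >= NSEC_PER_SEC)     /* nsec must stay below one second */
                return false;
        if ((uint64_t)ts->tv_sec >= (uint64_t)KTIME_SEC_MAX)    /* would overflow ktime_t */
                return false;
        return true;
}

int main(void)
{
        struct ts64 far_future = { .tv_sec = KTIME_SEC_MAX + 1, .tv_nsec = 0 };

        printf("%d\n", ts_valid(&far_future));          /* prints 0: rejected */
        return 0;
}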
Reported-by: CAI Qian Reported-by: Sasha Levin Signed-off-by: John Stultz Cc: Peter Zijlstra Cc: Prarit Bhargava Cc: Zhouping Liu Cc: Ingo Molnar Cc: stable@vger.kernel.org Link: http://lkml.kernel.org/r/1344454580-17031-1-git-send-email-john.stultz@linaro.org Signed-off-by: Thomas Gleixner --- include/linux/ktime.h | 7 ------- include/linux/time.h | 22 ++++++++++++++++++++-- kernel/time/timekeeping.c | 26 ++++++++++++++++++++++++-- 3 files changed, 44 insertions(+), 11 deletions(-) (limited to 'include') diff --git a/include/linux/ktime.h b/include/linux/ktime.h index 603bec2913b0..06177ba10a16 100644 --- a/include/linux/ktime.h +++ b/include/linux/ktime.h @@ -58,13 +58,6 @@ union ktime { typedef union ktime ktime_t; /* Kill this */ -#define KTIME_MAX ((s64)~((u64)1 << 63)) -#if (BITS_PER_LONG == 64) -# define KTIME_SEC_MAX (KTIME_MAX / NSEC_PER_SEC) -#else -# define KTIME_SEC_MAX LONG_MAX -#endif - /* * ktime_t definitions when using the 64-bit scalar representation: */ diff --git a/include/linux/time.h b/include/linux/time.h index c81c5e40fcb5..b0bbd8f0130d 100644 --- a/include/linux/time.h +++ b/include/linux/time.h @@ -107,11 +107,29 @@ static inline struct timespec timespec_sub(struct timespec lhs, return ts_delta; } +#define KTIME_MAX ((s64)~((u64)1 << 63)) +#if (BITS_PER_LONG == 64) +# define KTIME_SEC_MAX (KTIME_MAX / NSEC_PER_SEC) +#else +# define KTIME_SEC_MAX LONG_MAX +#endif + /* * Returns true if the timespec is norm, false if denorm: */ -#define timespec_valid(ts) \ - (((ts)->tv_sec >= 0) && (((unsigned long) (ts)->tv_nsec) < NSEC_PER_SEC)) +static inline bool timespec_valid(const struct timespec *ts) +{ + /* Dates before 1970 are bogus */ + if (ts->tv_sec < 0) + return false; + /* Can't have more nanoseconds then a second */ + if ((unsigned long)ts->tv_nsec >= NSEC_PER_SEC) + return false; + /* Disallow values that could overflow ktime_t */ + if ((unsigned long long)ts->tv_sec >= KTIME_SEC_MAX) + return false; + return true; +} extern void read_persistent_clock(struct timespec *ts); extern void read_boot_clock(struct timespec *ts); diff --git a/kernel/time/timekeeping.c b/kernel/time/timekeeping.c index e16af197a2bc..898bef066a44 100644 --- a/kernel/time/timekeeping.c +++ b/kernel/time/timekeeping.c @@ -427,7 +427,7 @@ int do_settimeofday(const struct timespec *tv) struct timespec ts_delta, xt; unsigned long flags; - if ((unsigned long)tv->tv_nsec >= NSEC_PER_SEC) + if (!timespec_valid(tv)) return -EINVAL; write_seqlock_irqsave(&tk->lock, flags); @@ -463,6 +463,8 @@ int timekeeping_inject_offset(struct timespec *ts) { struct timekeeper *tk = &timekeeper; unsigned long flags; + struct timespec tmp; + int ret = 0; if ((unsigned long)ts->tv_nsec >= NSEC_PER_SEC) return -EINVAL; @@ -471,10 +473,17 @@ int timekeeping_inject_offset(struct timespec *ts) timekeeping_forward_now(tk); + /* Make sure the proposed value is valid */ + tmp = timespec_add(tk_xtime(tk), *ts); + if (!timespec_valid(&tmp)) { + ret = -EINVAL; + goto error; + } tk_xtime_add(tk, ts); tk_set_wall_to_mono(tk, timespec_sub(tk->wall_to_monotonic, *ts)); +error: /* even if we error out, we forwarded the time, so call update */ timekeeping_update(tk, true); write_sequnlock_irqrestore(&tk->lock, flags); @@ -482,7 +491,7 @@ int timekeeping_inject_offset(struct timespec *ts) /* signal hrtimers about time change */ clock_was_set(); - return 0; + return ret; } EXPORT_SYMBOL(timekeeping_inject_offset); @@ -649,7 +658,20 @@ void __init timekeeping_init(void) struct timespec now, boot, tmp; read_persistent_clock(&now); + if 
(!timespec_valid(&now)) { + pr_warn("WARNING: Persistent clock returned invalid value!\n" + " Check your CMOS/BIOS settings.\n"); + now.tv_sec = 0; + now.tv_nsec = 0; + } + read_boot_clock(&boot); + if (!timespec_valid(&boot)) { + pr_warn("WARNING: Boot clock returned invalid value!\n" + " Check your CMOS/BIOS settings.\n"); + boot.tv_sec = 0; + boot.tv_nsec = 0; + } seqlock_init(&tk->lock); -- cgit v1.2.3 From 58569aee5a1a5dcc25c34a0a2ed9a377874e6b05 Mon Sep 17 00:00:00 2001 From: "Arnaud Patard (Rtp)" Date: Thu, 26 Jul 2012 12:15:46 +0200 Subject: ARM: Orion: Set eth packet size csum offload limit The mv643xx ethernet controller limits the packet size for the TX checksum offloading. This patch sets this limit for Kirkwood and Dove, which have smaller limits than the default. As a side note, this patch is an updated version of a patch sent some years ago: http://lists.infradead.org/pipermail/linux-arm-kernel/2010-June/017320.html which seems to have been lost. Signed-off-by: Arnaud Patard Signed-off-by: Jason Cooper Cc: --- arch/arm/mach-dove/common.c | 3 ++- arch/arm/mach-kirkwood/common.c | 4 ++-- arch/arm/mach-mv78xx0/common.c | 6 ++++-- arch/arm/mach-orion5x/common.c | 3 ++- arch/arm/plat-orion/common.c | 8 ++++++-- arch/arm/plat-orion/include/plat/common.h | 6 ++++-- include/linux/mv643xx_eth.h | 2 ++ 7 files changed, 22 insertions(+), 10 deletions(-) (limited to 'include') diff --git a/arch/arm/mach-dove/common.c b/arch/arm/mach-dove/common.c index 4db5de54b6a7..6321567d8eaa 100644 --- a/arch/arm/mach-dove/common.c +++ b/arch/arm/mach-dove/common.c @@ -102,7 +102,8 @@ void __init dove_ehci1_init(void) void __init dove_ge00_init(struct mv643xx_eth_platform_data *eth_data) { orion_ge00_init(eth_data, DOVE_GE00_PHYS_BASE, - IRQ_DOVE_GE00_SUM, IRQ_DOVE_GE00_ERR); + IRQ_DOVE_GE00_SUM, IRQ_DOVE_GE00_ERR, + 1600); } /***************************************************************************** diff --git a/arch/arm/mach-kirkwood/common.c b/arch/arm/mach-kirkwood/common.c index c4b64adcbfce..3226077735b1 100644 --- a/arch/arm/mach-kirkwood/common.c +++ b/arch/arm/mach-kirkwood/common.c @@ -301,7 +301,7 @@ void __init kirkwood_ge00_init(struct mv643xx_eth_platform_data *eth_data) { orion_ge00_init(eth_data, GE00_PHYS_BASE, IRQ_KIRKWOOD_GE00_SUM, - IRQ_KIRKWOOD_GE00_ERR); + IRQ_KIRKWOOD_GE00_ERR, 1600); /* The interface forgets the MAC address assigned by u-boot if the clock is turned off, so claim the clk now. 
*/ clk_prepare_enable(ge0); @@ -315,7 +315,7 @@ void __init kirkwood_ge01_init(struct mv643xx_eth_platform_data *eth_data) { orion_ge01_init(eth_data, GE01_PHYS_BASE, IRQ_KIRKWOOD_GE01_SUM, - IRQ_KIRKWOOD_GE01_ERR); + IRQ_KIRKWOOD_GE01_ERR, 1600); clk_prepare_enable(ge1); } diff --git a/arch/arm/mach-mv78xx0/common.c b/arch/arm/mach-mv78xx0/common.c index b4c53b846c9c..3057f7d4329a 100644 --- a/arch/arm/mach-mv78xx0/common.c +++ b/arch/arm/mach-mv78xx0/common.c @@ -213,7 +213,8 @@ void __init mv78xx0_ge00_init(struct mv643xx_eth_platform_data *eth_data) { orion_ge00_init(eth_data, GE00_PHYS_BASE, IRQ_MV78XX0_GE00_SUM, - IRQ_MV78XX0_GE_ERR); + IRQ_MV78XX0_GE_ERR, + MV643XX_TX_CSUM_DEFAULT_LIMIT); } @@ -224,7 +225,8 @@ void __init mv78xx0_ge01_init(struct mv643xx_eth_platform_data *eth_data) { orion_ge01_init(eth_data, GE01_PHYS_BASE, IRQ_MV78XX0_GE01_SUM, - NO_IRQ); + NO_IRQ, + MV643XX_TX_CSUM_DEFAULT_LIMIT); } diff --git a/arch/arm/mach-orion5x/common.c b/arch/arm/mach-orion5x/common.c index 9148b229d0de..410291c67666 100644 --- a/arch/arm/mach-orion5x/common.c +++ b/arch/arm/mach-orion5x/common.c @@ -109,7 +109,8 @@ void __init orion5x_eth_init(struct mv643xx_eth_platform_data *eth_data) { orion_ge00_init(eth_data, ORION5X_ETH_PHYS_BASE, IRQ_ORION5X_ETH_SUM, - IRQ_ORION5X_ETH_ERR); + IRQ_ORION5X_ETH_ERR, + MV643XX_TX_CSUM_DEFAULT_LIMIT); } diff --git a/arch/arm/plat-orion/common.c b/arch/arm/plat-orion/common.c index d245a87dc014..b8b747a9d360 100644 --- a/arch/arm/plat-orion/common.c +++ b/arch/arm/plat-orion/common.c @@ -291,10 +291,12 @@ static struct platform_device orion_ge00 = { void __init orion_ge00_init(struct mv643xx_eth_platform_data *eth_data, unsigned long mapbase, unsigned long irq, - unsigned long irq_err) + unsigned long irq_err, + unsigned int tx_csum_limit) { fill_resources(&orion_ge00_shared, orion_ge00_shared_resources, mapbase + 0x2000, SZ_16K - 1, irq_err); + orion_ge00_shared_data.tx_csum_limit = tx_csum_limit; ge_complete(&orion_ge00_shared_data, orion_ge00_resources, irq, &orion_ge00_shared, eth_data, &orion_ge00); @@ -343,10 +345,12 @@ static struct platform_device orion_ge01 = { void __init orion_ge01_init(struct mv643xx_eth_platform_data *eth_data, unsigned long mapbase, unsigned long irq, - unsigned long irq_err) + unsigned long irq_err, + unsigned int tx_csum_limit) { fill_resources(&orion_ge01_shared, orion_ge01_shared_resources, mapbase + 0x2000, SZ_16K - 1, irq_err); + orion_ge01_shared_data.tx_csum_limit = tx_csum_limit; ge_complete(&orion_ge01_shared_data, orion_ge01_resources, irq, &orion_ge01_shared, eth_data, &orion_ge01); diff --git a/arch/arm/plat-orion/include/plat/common.h b/arch/arm/plat-orion/include/plat/common.h index e00fdb213609..ae2377ef63e5 100644 --- a/arch/arm/plat-orion/include/plat/common.h +++ b/arch/arm/plat-orion/include/plat/common.h @@ -39,12 +39,14 @@ void __init orion_rtc_init(unsigned long mapbase, void __init orion_ge00_init(struct mv643xx_eth_platform_data *eth_data, unsigned long mapbase, unsigned long irq, - unsigned long irq_err); + unsigned long irq_err, + unsigned int tx_csum_limit); void __init orion_ge01_init(struct mv643xx_eth_platform_data *eth_data, unsigned long mapbase, unsigned long irq, - unsigned long irq_err); + unsigned long irq_err, + unsigned int tx_csum_limit); void __init orion_ge10_init(struct mv643xx_eth_platform_data *eth_data, unsigned long mapbase, diff --git a/include/linux/mv643xx_eth.h b/include/linux/mv643xx_eth.h index 51bf8ada6dc0..49258e0ed1c6 100644 --- a/include/linux/mv643xx_eth.h +++ 
b/include/linux/mv643xx_eth.h @@ -15,6 +15,8 @@ #define MV643XX_ETH_SIZE_REG_4 0x2224 #define MV643XX_ETH_BASE_ADDR_ENABLE_REG 0x2290 +#define MV643XX_TX_CSUM_DEFAULT_LIMIT 0 + struct mv643xx_eth_shared_platform_data { struct mbus_dram_target_info *dram; struct platform_device *shared_smi; -- cgit v1.2.3 From ca08649eb5dd30f11a5a8fe8659b48899b7ea6a1 Mon Sep 17 00:00:00 2001 From: Konrad Rzeszutek Wilk Date: Thu, 16 Aug 2012 11:31:27 -0400 Subject: Revert "xen PVonHVM: move shared_info to MMIO before kexec" This reverts commit 00e37bdb0113a98408de42db85be002f21dbffd3. During shutdown of PVHVM guests with more than 2VCPUs on certain machines we can hit the race where the replaced shared_info is not replaced fast enough and the PV time clock retries reading the same area over and over without any success and is stuck in an infinite loop. Acked-by: Olaf Hering Signed-off-by: Konrad Rzeszutek Wilk --- arch/x86/xen/enlighten.c | 118 +++++---------------------------------------- arch/x86/xen/suspend.c | 2 +- arch/x86/xen/xen-ops.h | 2 +- drivers/xen/platform-pci.c | 15 ------ include/xen/events.h | 2 - 5 files changed, 13 insertions(+), 126 deletions(-) (limited to 'include') diff --git a/arch/x86/xen/enlighten.c b/arch/x86/xen/enlighten.c index a6f8acbdfc9a..f1814fc2cb77 100644 --- a/arch/x86/xen/enlighten.c +++ b/arch/x86/xen/enlighten.c @@ -31,7 +31,6 @@ #include #include #include -#include #include #include @@ -1472,130 +1471,38 @@ asmlinkage void __init xen_start_kernel(void) #endif } -#ifdef CONFIG_XEN_PVHVM -/* - * The pfn containing the shared_info is located somewhere in RAM. This - * will cause trouble if the current kernel is doing a kexec boot into a - * new kernel. The new kernel (and its startup code) can not know where - * the pfn is, so it can not reserve the page. The hypervisor will - * continue to update the pfn, and as a result memory corruption occours - * in the new kernel. - * - * One way to work around this issue is to allocate a page in the - * xen-platform pci device's BAR memory range. But pci init is done very - * late and the shared_info page is already in use very early to read - * the pvclock. So moving the pfn from RAM to MMIO is racy because some - * code paths on other vcpus could access the pfn during the small - * window when the old pfn is moved to the new pfn. There is even a - * small window were the old pfn is not backed by a mfn, and during that - * time all reads return -1. - * - * Because it is not known upfront where the MMIO region is located it - * can not be used right from the start in xen_hvm_init_shared_info. - * - * To minimise trouble the move of the pfn is done shortly before kexec. - * This does not eliminate the race because all vcpus are still online - * when the syscore_ops will be called. But hopefully there is no work - * pending at this point in time. Also the syscore_op is run last which - * reduces the risk further. 
- */ - -static struct shared_info *xen_hvm_shared_info; - -static void xen_hvm_connect_shared_info(unsigned long pfn) +void __ref xen_hvm_init_shared_info(void) { + int cpu; struct xen_add_to_physmap xatp; + static struct shared_info *shared_info_page = 0; + if (!shared_info_page) + shared_info_page = (struct shared_info *) + extend_brk(PAGE_SIZE, PAGE_SIZE); xatp.domid = DOMID_SELF; xatp.idx = 0; xatp.space = XENMAPSPACE_shared_info; - xatp.gpfn = pfn; + xatp.gpfn = __pa(shared_info_page) >> PAGE_SHIFT; if (HYPERVISOR_memory_op(XENMEM_add_to_physmap, &xatp)) BUG(); -} -static void xen_hvm_set_shared_info(struct shared_info *sip) -{ - int cpu; - - HYPERVISOR_shared_info = sip; + HYPERVISOR_shared_info = (struct shared_info *)shared_info_page; /* xen_vcpu is a pointer to the vcpu_info struct in the shared_info * page, we use it in the event channel upcall and in some pvclock * related functions. We don't need the vcpu_info placement * optimizations because we don't use any pv_mmu or pv_irq op on * HVM. - * When xen_hvm_set_shared_info is run at boot time only vcpu 0 is - * online but xen_hvm_set_shared_info is run at resume time too and + * When xen_hvm_init_shared_info is run at boot time only vcpu 0 is + * online but xen_hvm_init_shared_info is run at resume time too and * in that case multiple vcpus might be online. */ for_each_online_cpu(cpu) { per_cpu(xen_vcpu, cpu) = &HYPERVISOR_shared_info->vcpu_info[cpu]; } } -/* Reconnect the shared_info pfn to a mfn */ -void xen_hvm_resume_shared_info(void) -{ - xen_hvm_connect_shared_info(__pa(xen_hvm_shared_info) >> PAGE_SHIFT); -} - -#ifdef CONFIG_KEXEC -static struct shared_info *xen_hvm_shared_info_kexec; -static unsigned long xen_hvm_shared_info_pfn_kexec; - -/* Remember a pfn in MMIO space for kexec reboot */ -void __devinit xen_hvm_prepare_kexec(struct shared_info *sip, unsigned long pfn) -{ - xen_hvm_shared_info_kexec = sip; - xen_hvm_shared_info_pfn_kexec = pfn; -} - -static void xen_hvm_syscore_shutdown(void) -{ - struct xen_memory_reservation reservation = { - .domid = DOMID_SELF, - .nr_extents = 1, - }; - unsigned long prev_pfn; - int rc; - - if (!xen_hvm_shared_info_kexec) - return; - - prev_pfn = __pa(xen_hvm_shared_info) >> PAGE_SHIFT; - set_xen_guest_handle(reservation.extent_start, &prev_pfn); - - /* Move pfn to MMIO, disconnects previous pfn from mfn */ - xen_hvm_connect_shared_info(xen_hvm_shared_info_pfn_kexec); - - /* Update pointers, following hypercall is also a memory barrier */ - xen_hvm_set_shared_info(xen_hvm_shared_info_kexec); - - /* Allocate new mfn for previous pfn */ - do { - rc = HYPERVISOR_memory_op(XENMEM_populate_physmap, &reservation); - if (rc == 0) - msleep(123); - } while (rc == 0); - - /* Make sure the previous pfn is really connected to a (new) mfn */ - BUG_ON(rc != 1); -} - -static struct syscore_ops xen_hvm_syscore_ops = { - .shutdown = xen_hvm_syscore_shutdown, -}; -#endif - -/* Use a pfn in RAM, may move to MMIO before kexec. 
*/ -static void __init xen_hvm_init_shared_info(void) -{ - /* Remember pointer for resume */ - xen_hvm_shared_info = extend_brk(PAGE_SIZE, PAGE_SIZE); - xen_hvm_connect_shared_info(__pa(xen_hvm_shared_info) >> PAGE_SHIFT); - xen_hvm_set_shared_info(xen_hvm_shared_info); -} - +#ifdef CONFIG_XEN_PVHVM static void __init init_hvm_pv_info(void) { int major, minor; @@ -1646,9 +1553,6 @@ static void __init xen_hvm_guest_init(void) init_hvm_pv_info(); xen_hvm_init_shared_info(); -#ifdef CONFIG_KEXEC - register_syscore_ops(&xen_hvm_syscore_ops); -#endif if (xen_feature(XENFEAT_hvm_callback_vector)) xen_have_vector_callback = 1; diff --git a/arch/x86/xen/suspend.c b/arch/x86/xen/suspend.c index ae8a00c39de4..45329c8c226e 100644 --- a/arch/x86/xen/suspend.c +++ b/arch/x86/xen/suspend.c @@ -30,7 +30,7 @@ void xen_arch_hvm_post_suspend(int suspend_cancelled) { #ifdef CONFIG_XEN_PVHVM int cpu; - xen_hvm_resume_shared_info(); + xen_hvm_init_shared_info(); xen_callback_vector(); xen_unplug_emulated_devices(); if (xen_feature(XENFEAT_hvm_safe_pvclock)) { diff --git a/arch/x86/xen/xen-ops.h b/arch/x86/xen/xen-ops.h index 1e4329e04e0f..202d4c150154 100644 --- a/arch/x86/xen/xen-ops.h +++ b/arch/x86/xen/xen-ops.h @@ -41,7 +41,7 @@ void xen_enable_syscall(void); void xen_vcpu_restore(void); void xen_callback_vector(void); -void xen_hvm_resume_shared_info(void); +void xen_hvm_init_shared_info(void); void xen_unplug_emulated_devices(void); void __init xen_build_dynamic_phys_to_machine(void); diff --git a/drivers/xen/platform-pci.c b/drivers/xen/platform-pci.c index d4c50d63acbc..97ca359ae2bd 100644 --- a/drivers/xen/platform-pci.c +++ b/drivers/xen/platform-pci.c @@ -101,19 +101,6 @@ static int platform_pci_resume(struct pci_dev *pdev) return 0; } -static void __devinit prepare_shared_info(void) -{ -#ifdef CONFIG_KEXEC - unsigned long addr; - struct shared_info *hvm_shared_info; - - addr = alloc_xen_mmio(PAGE_SIZE); - hvm_shared_info = ioremap(addr, PAGE_SIZE); - memset(hvm_shared_info, 0, PAGE_SIZE); - xen_hvm_prepare_kexec(hvm_shared_info, addr >> PAGE_SHIFT); -#endif -} - static int __devinit platform_pci_init(struct pci_dev *pdev, const struct pci_device_id *ent) { @@ -151,8 +138,6 @@ static int __devinit platform_pci_init(struct pci_dev *pdev, platform_mmio = mmio_addr; platform_mmiolen = mmio_len; - prepare_shared_info(); - if (!xen_have_vector_callback) { ret = xen_allocate_irq(pdev); if (ret) { diff --git a/include/xen/events.h b/include/xen/events.h index 9c641deb65d2..04399b28e821 100644 --- a/include/xen/events.h +++ b/include/xen/events.h @@ -58,8 +58,6 @@ void notify_remote_via_irq(int irq); void xen_irq_resume(void); -void xen_hvm_prepare_kexec(struct shared_info *sip, unsigned long pfn); - /* Clear an irq's pending state, in preparation for polling on it */ void xen_clear_irq_pending(int irq); void xen_set_irq_pending(int irq); -- cgit v1.2.3 From 8857df3aceb7a8eb7558059b7da109e41dd1fb95 Mon Sep 17 00:00:00 2001 From: Michael Hennerich Date: Fri, 20 Jul 2012 09:31:00 +0100 Subject: iio: frequency: ADF4350: Fix potential reference div factor overflow. With small channel spacing values and high reference frequencies it is possible to exceed the range of the 10-bit counter. Workaround by checking the range and widening some constraints. We don't use the REG1_PHASE value; in this case the datasheet recommends setting it to 1 if not used. 
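The retry strategy can be sketched in isolation roughly as follows. This is a simplified model, not the driver code: the PFD/divider arithmetic and the tuning step are illustrative stand-ins, and only the two register limits reflect the real constraints (the actual logic, including adf4350_tune_r_cnt(), is in the diff below).

#include <stdio.h>

#define MAX_R_CNT	1023	/* 10-bit reference divider */
#define MAX_MODULUS	4095	/* 12-bit fractional modulus */

/* Widen the channel spacing until a legal R counter / modulus pair exists. */
static unsigned long pick_spacing(unsigned long refin_hz, unsigned long chspc_hz,
				  unsigned int *r_cnt_out, unsigned int *mod_out)
{
	unsigned int r_cnt, mod;
	unsigned long fpfd;

	for (;;) {
		r_cnt = 0;
		do {
			r_cnt++;			/* crude stand-in for adf4350_tune_r_cnt() */
			fpfd = refin_hz / r_cnt;
			mod = fpfd / chspc_hz;
		} while (mod > MAX_MODULUS && r_cnt <= MAX_R_CNT);

		if (r_cnt <= MAX_R_CNT)
			break;				/* both limits satisfied */
		chspc_hz++;				/* 10-bit counter exceeded: widen spacing, retry */
	}
	*r_cnt_out = r_cnt;
	*mod_out = mod;
	return chspc_hz;
}

int main(void)
{
	unsigned int r_cnt, mod;
	unsigned long spc = pick_spacing(250000000UL, 10UL, &r_cnt, &mod);

	printf("spacing %lu Hz -> R=%u MOD=%u\n", spc, r_cnt, mod);
	return 0;
}

With a 250 MHz reference and a requested 10 Hz spacing this settles on a slightly wider spacing instead of an out-of-range R counter, which is the trade-off the patch makes.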
Signed-off-by: Michael Hennerich Signed-off-by: Jonathan Cameron --- drivers/iio/frequency/adf4350.c | 24 +++++++++++++++--------- include/linux/iio/frequency/adf4350.h | 2 ++ 2 files changed, 17 insertions(+), 9 deletions(-) (limited to 'include') diff --git a/drivers/iio/frequency/adf4350.c b/drivers/iio/frequency/adf4350.c index 59fbb3ae40e7..e35bb8f6fe75 100644 --- a/drivers/iio/frequency/adf4350.c +++ b/drivers/iio/frequency/adf4350.c @@ -129,7 +129,7 @@ static int adf4350_set_freq(struct adf4350_state *st, unsigned long long freq) { struct adf4350_platform_data *pdata = st->pdata; u64 tmp; - u32 div_gcd, prescaler; + u32 div_gcd, prescaler, chspc; u16 mdiv, r_cnt = 0; u8 band_sel_div; @@ -158,14 +158,20 @@ static int adf4350_set_freq(struct adf4350_state *st, unsigned long long freq) if (pdata->ref_div_factor) r_cnt = pdata->ref_div_factor - 1; - do { - r_cnt = adf4350_tune_r_cnt(st, r_cnt); + chspc = st->chspc; - st->r1_mod = st->fpfd / st->chspc; - while (st->r1_mod > ADF4350_MAX_MODULUS) { - r_cnt = adf4350_tune_r_cnt(st, r_cnt); - st->r1_mod = st->fpfd / st->chspc; - } + do { + do { + do { + r_cnt = adf4350_tune_r_cnt(st, r_cnt); + st->r1_mod = st->fpfd / chspc; + if (r_cnt > ADF4350_MAX_R_CNT) { + /* try higher spacing values */ + chspc++; + r_cnt = 0; + } + } while ((st->r1_mod > ADF4350_MAX_MODULUS) && r_cnt); + } while (r_cnt == 0); tmp = freq * (u64)st->r1_mod + (st->fpfd > 1); do_div(tmp, st->fpfd); /* Div round closest (n + d/2)/d */ @@ -194,7 +200,7 @@ static int adf4350_set_freq(struct adf4350_state *st, unsigned long long freq) st->regs[ADF4350_REG0] = ADF4350_REG0_INT(st->r0_int) | ADF4350_REG0_FRACT(st->r0_fract); - st->regs[ADF4350_REG1] = ADF4350_REG1_PHASE(0) | + st->regs[ADF4350_REG1] = ADF4350_REG1_PHASE(1) | ADF4350_REG1_MOD(st->r1_mod) | prescaler; diff --git a/include/linux/iio/frequency/adf4350.h b/include/linux/iio/frequency/adf4350.h index b76b4a87065e..be91f344d5fc 100644 --- a/include/linux/iio/frequency/adf4350.h +++ b/include/linux/iio/frequency/adf4350.h @@ -87,6 +87,8 @@ #define ADF4350_MAX_BANDSEL_CLK 125000 /* Hz */ #define ADF4350_MAX_FREQ_REFIN 250000000 /* Hz */ #define ADF4350_MAX_MODULUS 4095 +#define ADF4350_MAX_R_CNT 1023 + /** * struct adf4350_platform_data - platform specific information -- cgit v1.2.3 From ac5aa7f9e0891a115ab307b4bdde9c55b9232970 Mon Sep 17 00:00:00 2001 From: Richard Genoud Date: Fri, 10 Aug 2012 16:52:58 +0200 Subject: pinctrl: header: trivial: declare struct device As struct device is used as a function argument, it should at least be declared (device.h is not included). Signed-off-by: Richard Genoud Signed-off-by: Linus Walleij --- include/linux/pinctrl/consumer.h | 1 + 1 file changed, 1 insertion(+) (limited to 'include') diff --git a/include/linux/pinctrl/consumer.h b/include/linux/pinctrl/consumer.h index 6dd96fb45482..e9b7f4350844 100644 --- a/include/linux/pinctrl/consumer.h +++ b/include/linux/pinctrl/consumer.h @@ -20,6 +20,7 @@ /* This struct is private to the core and should be regarded as a cookie */ struct pinctrl; struct pinctrl_state; +struct device; #ifdef CONFIG_PINCTRL -- cgit v1.2.3 From 8513915accc611e576dbebb93422c257e7e68be8 Mon Sep 17 00:00:00 2001 From: Randy Dunlap Date: Sat, 18 Aug 2012 17:43:05 -0700 Subject: ALSA: fix pcm.h kernel-doc warning and notation Fix kernel-doc warning in and add function name to make the kernel-doc notation complete. 
Warning(include/sound/pcm.h:1081): No description found for parameter 'substream' Signed-off-by: Randy Dunlap Signed-off-by: Takashi Iwai --- include/sound/pcm.h | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) (limited to 'include') diff --git a/include/sound/pcm.h b/include/sound/pcm.h index c75c0d1a85e2..cdca2ab1e711 100644 --- a/include/sound/pcm.h +++ b/include/sound/pcm.h @@ -1075,7 +1075,8 @@ static inline void snd_pcm_limit_isa_dma_size(int dma, size_t *max) const char *snd_pcm_format_name(snd_pcm_format_t format); /** - * Get a string naming the direction of a stream + * snd_pcm_stream_str - Get a string naming the direction of a stream + * @substream: the pcm substream instance */ static inline const char *snd_pcm_stream_str(struct snd_pcm_substream *substream) { -- cgit v1.2.3 From 3296193d1421c2d6f9e49e181cecfd917f0f5764 Mon Sep 17 00:00:00 2001 From: Timur Tabi Date: Tue, 14 Aug 2012 13:20:23 +0000 Subject: dt: introduce for_each_available_child_of_node, of_get_next_available_child Macro for_each_child_of_node() makes it easy to iterate over all of the children for a given device tree node, including those nodes that are marked as unavailable (i.e. status = "disabled"). Introduce for_each_available_child_of_node(), which is like for_each_child_of_node(), but it automatically skips unavailable nodes. This also requires the introduction of helper function of_get_next_available_child(), which returns the next available child node. Signed-off-by: Timur Tabi Signed-off-by: David S. Miller --- drivers/of/base.c | 27 +++++++++++++++++++++++++++ include/linux/of.h | 7 +++++++ 2 files changed, 34 insertions(+) (limited to 'include') diff --git a/drivers/of/base.c b/drivers/of/base.c index c181b94abc36..d4a1c9a043e1 100644 --- a/drivers/of/base.c +++ b/drivers/of/base.c @@ -363,6 +363,33 @@ struct device_node *of_get_next_child(const struct device_node *node, } EXPORT_SYMBOL(of_get_next_child); +/** + * of_get_next_available_child - Find the next available child node + * @node: parent node + * @prev: previous child of the parent node, or NULL to get first + * + * This function is like of_get_next_child(), except that it + * automatically skips any disabled nodes (i.e. status = "disabled"). + */ +struct device_node *of_get_next_available_child(const struct device_node *node, + struct device_node *prev) +{ + struct device_node *next; + + read_lock(&devtree_lock); + next = prev ? 
prev->sibling : node->child; + for (; next; next = next->sibling) { + if (!of_device_is_available(next)) + continue; + if (of_node_get(next)) + break; + } + of_node_put(prev); + read_unlock(&devtree_lock); + return next; +} +EXPORT_SYMBOL(of_get_next_available_child); + /** * of_find_node_by_path - Find a node matching a full OF path * @path: The full path to match diff --git a/include/linux/of.h b/include/linux/of.h index 5919ee33f2b7..1b1163225f3b 100644 --- a/include/linux/of.h +++ b/include/linux/of.h @@ -190,10 +190,17 @@ extern struct device_node *of_get_parent(const struct device_node *node); extern struct device_node *of_get_next_parent(struct device_node *node); extern struct device_node *of_get_next_child(const struct device_node *node, struct device_node *prev); +extern struct device_node *of_get_next_available_child( + const struct device_node *node, struct device_node *prev); + #define for_each_child_of_node(parent, child) \ for (child = of_get_next_child(parent, NULL); child != NULL; \ child = of_get_next_child(parent, child)) +#define for_each_available_child_of_node(parent, child) \ + for (child = of_get_next_available_child(parent, NULL); child != NULL; \ + child = of_get_next_available_child(parent, child)) + static inline int of_get_child_count(const struct device_node *np) { struct device_node *child; -- cgit v1.2.3 From c0de08d04215031d68fa13af36f347a6cfa252ca Mon Sep 17 00:00:00 2001 From: Eric Leblond Date: Thu, 16 Aug 2012 22:02:58 +0000 Subject: af_packet: don't emit packet on orig fanout group If a packet is emitted on one socket in one group of fanout sockets, it is transmitted again. It is thus read again on one of the sockets of the fanout group. This results in a loop for software which generates packets when receiving one. This retransmission is not the intended behavior: a fanout group must behave like a single socket. The packet should not be transmitted on a socket if it originates from a socket belonging to the same fanout group. This patch fixes the issue by changing the transmission check to take the fanout group info into account. Reported-by: Aleksandr Kotov Signed-off-by: Eric Leblond Signed-off-by: David S. Miller --- include/linux/netdevice.h | 2 ++ net/core/dev.c | 16 ++++++++++++++-- net/packet/af_packet.c | 9 +++++++++ 3 files changed, 25 insertions(+), 2 deletions(-) (limited to 'include') diff --git a/include/linux/netdevice.h b/include/linux/netdevice.h index 3560d688161e..59dc05f38247 100644 --- a/include/linux/netdevice.h +++ b/include/linux/netdevice.h @@ -1522,6 +1522,8 @@ struct packet_type { struct sk_buff **(*gro_receive)(struct sk_buff **head, struct sk_buff *skb); int (*gro_complete)(struct sk_buff *skb); + bool (*id_match)(struct packet_type *ptype, + struct sock *sk); void *af_packet_priv; struct list_head list; }; diff --git a/net/core/dev.c b/net/core/dev.c index a39354ee1432..debd9372472f 100644 --- a/net/core/dev.c +++ b/net/core/dev.c @@ -1642,6 +1642,19 @@ static inline int deliver_skb(struct sk_buff *skb, return pt_prev->func(skb, skb->dev, pt_prev, orig_dev); } +static inline bool skb_loop_sk(struct packet_type *ptype, struct sk_buff *skb) +{ + if (ptype->af_packet_priv == NULL) + return false; + + if (ptype->id_match) + return ptype->id_match(ptype, skb->sk); + else if ((struct sock *)ptype->af_packet_priv == skb->sk) + return true; + + return false; +} + /* * Support routine. Sends outgoing frames to any network * taps currently in use. 
@@ -1659,8 +1672,7 @@ static void dev_queue_xmit_nit(struct sk_buff *skb, struct net_device *dev) * they originated from - MvS (miquels@drinkel.ow.org) */ if ((ptype->dev == dev || !ptype->dev) && - (ptype->af_packet_priv == NULL || - (struct sock *)ptype->af_packet_priv != skb->sk)) { + (!skb_loop_sk(ptype, skb))) { if (pt_prev) { deliver_skb(skb2, pt_prev, skb->dev); pt_prev = ptype; diff --git a/net/packet/af_packet.c b/net/packet/af_packet.c index 8ac890a1a4c0..aee7196aac36 100644 --- a/net/packet/af_packet.c +++ b/net/packet/af_packet.c @@ -1273,6 +1273,14 @@ static void __fanout_unlink(struct sock *sk, struct packet_sock *po) spin_unlock(&f->lock); } +bool match_fanout_group(struct packet_type *ptype, struct sock * sk) +{ + if (ptype->af_packet_priv == (void*)((struct packet_sock *)sk)->fanout) + return true; + + return false; +} + static int fanout_add(struct sock *sk, u16 id, u16 type_flags) { struct packet_sock *po = pkt_sk(sk); @@ -1325,6 +1333,7 @@ static int fanout_add(struct sock *sk, u16 id, u16 type_flags) match->prot_hook.dev = po->prot_hook.dev; match->prot_hook.func = packet_rcv_fanout; match->prot_hook.af_packet_priv = match; + match->prot_hook.id_match = match_fanout_group; dev_add_pack(&match->prot_hook); list_add(&match->list, &fanout_list); } -- cgit v1.2.3 From 9d7b0fc1ef1f17aff57c0dc9a59453d8fca255c3 Mon Sep 17 00:00:00 2001 From: Patrick McHardy Date: Mon, 20 Aug 2012 02:56:56 -0700 Subject: net: ipv6: fix oops in inet_putpeer() Commit 97bab73f (inet: Hide route peer accesses behind helpers.) introduced a bug in xfrm6_policy_destroy(). The xfrm_dst's _rt6i_peer member is not initialized, causing a false positive result from inetpeer_ptr_is_peer(), which in turn causes a NULL pointer dereference in inet_putpeer(). Pid: 314, comm: kworker/0:1 Not tainted 3.6.0-rc1+ #17 To Be Filled By O.E.M. To Be Filled By O.E.M./P4S800D-X EIP: 0060:[] EFLAGS: 00010246 CPU: 0 EIP is at inet_putpeer+0xe/0x16 EAX: 00000000 EBX: f3481700 ECX: 00000000 EDX: 000dd641 ESI: f3481700 EDI: c05e949c EBP: f551def4 ESP: f551def4 DS: 007b ES: 007b FS: 0000 GS: 00e0 SS: 0068 CR0: 8005003b CR2: 00000070 CR3: 3243d000 CR4: 00000750 DR0: 00000000 DR1: 00000000 DR2: 00000000 DR3: 00000000 DR6: ffff0ff0 DR7: 00000400 f551df04 c0423de1 00000000 f3481700 f551df18 c038d5f7 f254b9f8 f551df28 f34f85d8 f551df20 c03ef48d f551df3c c0396870 f30697e8 f24e1738 c05e98f4 f5509540 c05cd2b4 f551df7c c0142d2b c043feb5 f5509540 00000000 c05cd2e8 [] xfrm6_dst_destroy+0x42/0xdb [] dst_destroy+0x1d/0xa4 [] xfrm_bundle_flo_delete+0x2b/0x36 [] flow_cache_gc_task+0x85/0x9f [] process_one_work+0x122/0x441 [] ? apic_timer_interrupt+0x31/0x38 [] ? flow_cache_new_hashrnd+0x2b/0x2b [] worker_thread+0x113/0x3cc Fix by adding a init_dst() callback to struct xfrm_policy_afinfo to properly initialize the dst's peer pointer. Signed-off-by: Patrick McHardy Signed-off-by: David S. 
Miller --- include/net/xfrm.h | 2 ++ net/ipv6/xfrm6_policy.c | 8 ++++++++ net/xfrm/xfrm_policy.c | 2 ++ 3 files changed, 12 insertions(+) (limited to 'include') diff --git a/include/net/xfrm.h b/include/net/xfrm.h index 62b619e82a90..976a81abe1a2 100644 --- a/include/net/xfrm.h +++ b/include/net/xfrm.h @@ -292,6 +292,8 @@ struct xfrm_policy_afinfo { struct flowi *fl, int reverse); int (*get_tos)(const struct flowi *fl); + void (*init_dst)(struct net *net, + struct xfrm_dst *dst); int (*init_path)(struct xfrm_dst *path, struct dst_entry *dst, int nfheader_len); diff --git a/net/ipv6/xfrm6_policy.c b/net/ipv6/xfrm6_policy.c index ef39812107b1..f8c4c08ffb60 100644 --- a/net/ipv6/xfrm6_policy.c +++ b/net/ipv6/xfrm6_policy.c @@ -73,6 +73,13 @@ static int xfrm6_get_tos(const struct flowi *fl) return 0; } +static void xfrm6_init_dst(struct net *net, struct xfrm_dst *xdst) +{ + struct rt6_info *rt = (struct rt6_info *)xdst; + + rt6_init_peer(rt, net->ipv6.peers); +} + static int xfrm6_init_path(struct xfrm_dst *path, struct dst_entry *dst, int nfheader_len) { @@ -286,6 +293,7 @@ static struct xfrm_policy_afinfo xfrm6_policy_afinfo = { .get_saddr = xfrm6_get_saddr, .decode_session = _decode_session6, .get_tos = xfrm6_get_tos, + .init_dst = xfrm6_init_dst, .init_path = xfrm6_init_path, .fill_dst = xfrm6_fill_dst, .blackhole_route = ip6_blackhole_route, diff --git a/net/xfrm/xfrm_policy.c b/net/xfrm/xfrm_policy.c index c5a5165a5927..5a2aa17e4d3c 100644 --- a/net/xfrm/xfrm_policy.c +++ b/net/xfrm/xfrm_policy.c @@ -1357,6 +1357,8 @@ static inline struct xfrm_dst *xfrm_alloc_dst(struct net *net, int family) memset(dst + 1, 0, sizeof(*xdst) - sizeof(*dst)); xdst->flo.ops = &xfrm_bundle_fc_ops; + if (afinfo->init_dst) + afinfo->init_dst(net, xdst); } else xdst = ERR_PTR(-ENOBUFS); -- cgit v1.2.3 From af74115eed22698f771fec1287a864975c9a6671 Mon Sep 17 00:00:00 2001 From: Roland Dreier Date: Wed, 15 Aug 2012 21:24:52 -0700 Subject: target: Remove unused se_cmd.cmd_spdtl This was originally for helping fabrics to determine overflow/underflow status, and has been superseded by SCF_OVERFLOW_BIT + SCF_UNDERFLOW_BIT. 
Signed-off-by: Roland Dreier Signed-off-by: Nicholas Bellinger --- drivers/target/target_core_transport.c | 2 -- include/target/target_core_base.h | 2 -- 2 files changed, 4 deletions(-) (limited to 'include') diff --git a/drivers/target/target_core_transport.c b/drivers/target/target_core_transport.c index ea9a3d2e4f55..4de3186dc44e 100644 --- a/drivers/target/target_core_transport.c +++ b/drivers/target/target_core_transport.c @@ -1165,8 +1165,6 @@ int target_cmd_size_check(struct se_cmd *cmd, unsigned int size) " 0x%02x\n", cmd->se_tfo->get_fabric_name(), cmd->data_length, size, cmd->t_task_cdb[0]); - cmd->cmd_spdtl = size; - if (cmd->data_direction == DMA_TO_DEVICE) { pr_err("Rejecting underflow/overflow" " WRITE data\n"); diff --git a/include/target/target_core_base.h b/include/target/target_core_base.h index 128ce46fa48a..015cea01ae39 100644 --- a/include/target/target_core_base.h +++ b/include/target/target_core_base.h @@ -503,8 +503,6 @@ struct se_cmd { u32 se_ordered_id; /* Total size in bytes associated with command */ u32 data_length; - /* SCSI Presented Data Transfer Length */ - u32 cmd_spdtl; u32 residual_count; u32 orig_fe_lun; /* Persistent Reservation key */ -- cgit v1.2.3 From e0e3cea46d31d23dc40df0a49a7a2c04fe8edfea Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Tue, 21 Aug 2012 06:21:17 +0000 Subject: af_netlink: force credentials passing [CVE-2012-3520] Pablo Neira Ayuso discovered that avahi and potentially NetworkManager accept spoofed Netlink messages because of a kernel bug. The kernel passes all-zero SCM_CREDENTIALS ancillary data to the receiver if the sender did not provide such data, instead of not including any such data at all or including the correct data from the peer (as it is the case with AF_UNIX). This bug was introduced in commit 16e572626961 (af_unix: dont send SCM_CREDENTIALS by default) This patch forces passing credentials for netlink, as before the regression. Another fix would be to not add SCM_CREDENTIALS in netlink messages if not provided by the sender, but it might break some programs. With help from Florian Weimer & Petr Matousek This issue is designated as CVE-2012-3520 Signed-off-by: Eric Dumazet Cc: Petr Matousek Cc: Florian Weimer Cc: Pablo Neira Ayuso Signed-off-by: David S. 
Miller --- include/net/scm.h | 4 +++- net/netlink/af_netlink.c | 2 +- net/unix/af_unix.c | 4 ++-- 3 files changed, 6 insertions(+), 4 deletions(-) (limited to 'include') diff --git a/include/net/scm.h b/include/net/scm.h index 079d7887dac1..7dc0854f0b38 100644 --- a/include/net/scm.h +++ b/include/net/scm.h @@ -70,9 +70,11 @@ static __inline__ void scm_destroy(struct scm_cookie *scm) } static __inline__ int scm_send(struct socket *sock, struct msghdr *msg, - struct scm_cookie *scm) + struct scm_cookie *scm, bool forcecreds) { memset(scm, 0, sizeof(*scm)); + if (forcecreds) + scm_set_cred(scm, task_tgid(current), current_cred()); unix_get_peersec_dgram(sock, scm); if (msg->msg_controllen <= 0) return 0; diff --git a/net/netlink/af_netlink.c b/net/netlink/af_netlink.c index 5463969da45b..1445d73533ed 100644 --- a/net/netlink/af_netlink.c +++ b/net/netlink/af_netlink.c @@ -1362,7 +1362,7 @@ static int netlink_sendmsg(struct kiocb *kiocb, struct socket *sock, if (NULL == siocb->scm) siocb->scm = &scm; - err = scm_send(sock, msg, siocb->scm); + err = scm_send(sock, msg, siocb->scm, true); if (err < 0) return err; diff --git a/net/unix/af_unix.c b/net/unix/af_unix.c index e4768c180da2..c5ee4ff61364 100644 --- a/net/unix/af_unix.c +++ b/net/unix/af_unix.c @@ -1450,7 +1450,7 @@ static int unix_dgram_sendmsg(struct kiocb *kiocb, struct socket *sock, if (NULL == siocb->scm) siocb->scm = &tmp_scm; wait_for_unix_gc(); - err = scm_send(sock, msg, siocb->scm); + err = scm_send(sock, msg, siocb->scm, false); if (err < 0) return err; @@ -1619,7 +1619,7 @@ static int unix_stream_sendmsg(struct kiocb *kiocb, struct socket *sock, if (NULL == siocb->scm) siocb->scm = &tmp_scm; wait_for_unix_gc(); - err = scm_send(sock, msg, siocb->scm); + err = scm_send(sock, msg, siocb->scm, false); if (err < 0) return err; -- cgit v1.2.3 From 04ccfe77f132b973659f11954443214659014072 Mon Sep 17 00:00:00 2001 From: Damien Lespiau Date: Fri, 17 Aug 2012 14:20:02 +0000 Subject: drm: Remove two unused fields from struct drm_display_mode Signed-off-by: Damien Lespiau Reviewed-by: Jani Nikula Signed-off-by: Dave Airlie --- drivers/gpu/drm/drm_modes.c | 3 --- include/drm/drm_crtc.h | 2 -- 2 files changed, 5 deletions(-) (limited to 'include') diff --git a/drivers/gpu/drm/drm_modes.c b/drivers/gpu/drm/drm_modes.c index b7adb4a967fd..28637c181b15 100644 --- a/drivers/gpu/drm/drm_modes.c +++ b/drivers/gpu/drm/drm_modes.c @@ -706,9 +706,6 @@ void drm_mode_set_crtcinfo(struct drm_display_mode *p, int adjust_flags) p->crtc_vblank_end = max(p->crtc_vsync_end, p->crtc_vtotal); p->crtc_hblank_start = min(p->crtc_hsync_start, p->crtc_hdisplay); p->crtc_hblank_end = max(p->crtc_hsync_end, p->crtc_htotal); - - p->crtc_hadjusted = false; - p->crtc_vadjusted = false; } EXPORT_SYMBOL(drm_mode_set_crtcinfo); diff --git a/include/drm/drm_crtc.h b/include/drm/drm_crtc.h index a1a0386e0160..ced362533e3c 100644 --- a/include/drm/drm_crtc.h +++ b/include/drm/drm_crtc.h @@ -166,8 +166,6 @@ struct drm_display_mode { int crtc_vsync_start; int crtc_vsync_end; int crtc_vtotal; - int crtc_hadjusted; - int crtc_vadjusted; /* Driver private mode info */ int private_size; -- cgit v1.2.3 From c3a5ce0416b6c172a23bc8a3760d8704d3d1535b Mon Sep 17 00:00:00 2001 From: WANG Cong Date: Tue, 21 Aug 2012 16:16:00 -0700 Subject: string: do not export memweight() to userspace Fix the following warning: usr/include/linux/string.h:8: userspace cannot reference function or variable defined in the kernel Signed-off-by: WANG Cong Acked-by: Akinobu Mita Signed-off-by: Andrew 
Morton Signed-off-by: Linus Torvalds --- include/linux/string.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'include') diff --git a/include/linux/string.h b/include/linux/string.h index ffe0442e18d2..b9178812d9df 100644 --- a/include/linux/string.h +++ b/include/linux/string.h @@ -144,8 +144,8 @@ static inline bool strstarts(const char *str, const char *prefix) { return strncmp(str, prefix, strlen(prefix)) == 0; } -#endif extern size_t memweight(const void *ptr, size_t bytes); +#endif /* __KERNEL__ */ #endif /* _LINUX_STRING_H_ */ -- cgit v1.2.3 From c67fe3752abe6ab47639e2f9b836900c3dc3da84 Mon Sep 17 00:00:00 2001 From: Mel Gorman Date: Tue, 21 Aug 2012 16:16:17 -0700 Subject: mm: compaction: Abort async compaction if locks are contended or taking too long Jim Schutt reported a problem that pointed at compaction contending heavily on locks. The workload is straight-forward and in his own words; The systems in question have 24 SAS drives spread across 3 HBAs, running 24 Ceph OSD instances, one per drive. FWIW these servers are dual-socket Intel 5675 Xeons w/48 GB memory. I've got ~160 Ceph Linux clients doing dd simultaneously to a Ceph file system backed by 12 of these servers. Early in the test everything looks fine procs -------------------memory------------------ ---swap-- -----io---- --system-- -----cpu------- r b swpd free buff cache si so bi bo in cs us sy id wa st 31 15 0 287216 576 38606628 0 0 2 1158 2 14 1 3 95 0 0 27 15 0 225288 576 38583384 0 0 18 2222016 203357 134876 11 56 17 15 0 28 17 0 219256 576 38544736 0 0 11 2305932 203141 146296 11 49 23 17 0 6 18 0 215596 576 38552872 0 0 7 2363207 215264 166502 12 45 22 20 0 22 18 0 226984 576 38596404 0 0 3 2445741 223114 179527 12 43 23 22 0 and then it goes to pot procs -------------------memory------------------ ---swap-- -----io---- --system-- -----cpu------- r b swpd free buff cache si so bi bo in cs us sy id wa st 163 8 0 464308 576 36791368 0 0 11 22210 866 536 3 13 79 4 0 207 14 0 917752 576 36181928 0 0 712 1345376 134598 47367 7 90 1 2 0 123 12 0 685516 576 36296148 0 0 429 1386615 158494 60077 8 84 5 3 0 123 12 0 598572 576 36333728 0 0 1107 1233281 147542 62351 7 84 5 4 0 622 7 0 660768 576 36118264 0 0 557 1345548 151394 59353 7 85 4 3 0 223 11 0 283960 576 36463868 0 0 46 1107160 121846 33006 6 93 1 1 0 Note that system CPU usage is very high blocks being written out has dropped by 42%. He analysed this with perf and found perf record -g -a sleep 10 perf report --sort symbol --call-graph fractal,5 34.63% [k] _raw_spin_lock_irqsave | |--97.30%-- isolate_freepages | compaction_alloc | unmap_and_move | migrate_pages | compact_zone | compact_zone_order | try_to_compact_pages | __alloc_pages_direct_compact | __alloc_pages_slowpath | __alloc_pages_nodemask | alloc_pages_vma | do_huge_pmd_anonymous_page | handle_mm_fault | do_page_fault | page_fault | | | |--87.39%-- skb_copy_datagram_iovec | | tcp_recvmsg | | inet_recvmsg | | sock_recvmsg | | sys_recvfrom | | system_call | | __recv | | | | | --100.00%-- (nil) | | | --12.61%-- memcpy --2.70%-- [...] There was other data but primarily it is all showing that compaction is contended heavily on the zone->lock and zone->lru_lock. commit [b2eef8c0: mm: compaction: minimise the time IRQs are disabled while isolating pages for migration] noted that it was possible for migration to hold the lru_lock for an excessive amount of time. Very broadly speaking this patch expands the concept. 
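The approach described in the rest of this message boils down to: take the coarse lock only when it is uncontended (or when waiting is acceptable), and in async mode back off instead of spinning. A rough userspace analogue of that decision, with a pthread mutex standing in for zone->lock/lru_lock (illustrative only; the kernel helper, compact_checklock_irqsave(), is introduced below):

#include <pthread.h>
#include <stdbool.h>
#include <stdio.h>

static bool take_lock_or_back_off(pthread_mutex_t *lock, bool async)
{
	if (pthread_mutex_trylock(lock) == 0)
		return true;			/* uncontended: proceed */

	if (async)
		return false;			/* contended: abort async work */

	pthread_mutex_lock(lock);		/* sync mode: wait it out */
	return true;
}

int main(void)
{
	pthread_mutex_t lock = PTHREAD_MUTEX_INITIALIZER;

	if (take_lock_or_back_off(&lock, true)) {
		printf("got the lock, doing the work\n");
		pthread_mutex_unlock(&lock);
	} else {
		printf("contended, backing off\n");
	}
	return 0;
}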
This patch introduces compact_checklock_irqsave() to check if a lock is contended or the process needs to be scheduled. If either condition is true then async compaction is aborted and the caller is informed. The page allocator will fail a THP allocation if compaction failed due to contention. This patch also introduces compact_trylock_irqsave() which will acquire the lock only if it is not contended and the process does not need to schedule. Reported-by: Jim Schutt Tested-by: Jim Schutt Signed-off-by: Mel Gorman Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/compaction.h | 4 +- mm/compaction.c | 100 +++++++++++++++++++++++++++++++++++---------- mm/internal.h | 1 + mm/page_alloc.c | 17 +++++--- 4 files changed, 93 insertions(+), 29 deletions(-) (limited to 'include') diff --git a/include/linux/compaction.h b/include/linux/compaction.h index 133ddcf83397..ef658147e4e8 100644 --- a/include/linux/compaction.h +++ b/include/linux/compaction.h @@ -22,7 +22,7 @@ extern int sysctl_extfrag_handler(struct ctl_table *table, int write, extern int fragmentation_index(struct zone *zone, unsigned int order); extern unsigned long try_to_compact_pages(struct zonelist *zonelist, int order, gfp_t gfp_mask, nodemask_t *mask, - bool sync); + bool sync, bool *contended); extern int compact_pgdat(pg_data_t *pgdat, int order); extern unsigned long compaction_suitable(struct zone *zone, int order); @@ -64,7 +64,7 @@ static inline bool compaction_deferred(struct zone *zone, int order) #else static inline unsigned long try_to_compact_pages(struct zonelist *zonelist, int order, gfp_t gfp_mask, nodemask_t *nodemask, - bool sync) + bool sync, bool *contended) { return COMPACT_CONTINUE; } diff --git a/mm/compaction.c b/mm/compaction.c index bcce7897e17a..7fcd3a52e68d 100644 --- a/mm/compaction.c +++ b/mm/compaction.c @@ -50,6 +50,47 @@ static inline bool migrate_async_suitable(int migratetype) return is_migrate_cma(migratetype) || migratetype == MIGRATE_MOVABLE; } +/* + * Compaction requires the taking of some coarse locks that are potentially + * very heavily contended. Check if the process needs to be scheduled or + * if the lock is contended. For async compaction, back out in the event + * if contention is severe. For sync compaction, schedule. + * + * Returns true if the lock is held. + * Returns false if the lock is released and compaction should abort + */ +static bool compact_checklock_irqsave(spinlock_t *lock, unsigned long *flags, + bool locked, struct compact_control *cc) +{ + if (need_resched() || spin_is_contended(lock)) { + if (locked) { + spin_unlock_irqrestore(lock, *flags); + locked = false; + } + + /* async aborts if taking too long or contended */ + if (!cc->sync) { + if (cc->contended) + *cc->contended = true; + return false; + } + + cond_resched(); + if (fatal_signal_pending(current)) + return false; + } + + if (!locked) + spin_lock_irqsave(lock, *flags); + return true; +} + +static inline bool compact_trylock_irqsave(spinlock_t *lock, + unsigned long *flags, struct compact_control *cc) +{ + return compact_checklock_irqsave(lock, flags, false, cc); +} + /* * Isolate free pages onto a private freelist. Caller must hold zone->lock. 
* If @strict is true, will abort returning 0 on any invalid PFNs or non-free @@ -173,7 +214,7 @@ isolate_freepages_range(unsigned long start_pfn, unsigned long end_pfn) } /* Update the number of anon and file isolated pages in the zone */ -static void acct_isolated(struct zone *zone, struct compact_control *cc) +static void acct_isolated(struct zone *zone, bool locked, struct compact_control *cc) { struct page *page; unsigned int count[2] = { 0, }; @@ -181,8 +222,14 @@ static void acct_isolated(struct zone *zone, struct compact_control *cc) list_for_each_entry(page, &cc->migratepages, lru) count[!!page_is_file_cache(page)]++; - __mod_zone_page_state(zone, NR_ISOLATED_ANON, count[0]); - __mod_zone_page_state(zone, NR_ISOLATED_FILE, count[1]); + /* If locked we can use the interrupt unsafe versions */ + if (locked) { + __mod_zone_page_state(zone, NR_ISOLATED_ANON, count[0]); + __mod_zone_page_state(zone, NR_ISOLATED_FILE, count[1]); + } else { + mod_zone_page_state(zone, NR_ISOLATED_ANON, count[0]); + mod_zone_page_state(zone, NR_ISOLATED_FILE, count[1]); + } } /* Similar to reclaim, but different enough that they don't share logic */ @@ -228,6 +275,8 @@ isolate_migratepages_range(struct zone *zone, struct compact_control *cc, struct list_head *migratelist = &cc->migratepages; isolate_mode_t mode = 0; struct lruvec *lruvec; + unsigned long flags; + bool locked; /* * Ensure that there are not too many pages isolated from the LRU @@ -247,25 +296,22 @@ isolate_migratepages_range(struct zone *zone, struct compact_control *cc, /* Time to isolate some pages for migration */ cond_resched(); - spin_lock_irq(&zone->lru_lock); + spin_lock_irqsave(&zone->lru_lock, flags); + locked = true; for (; low_pfn < end_pfn; low_pfn++) { struct page *page; - bool locked = true; /* give a chance to irqs before checking need_resched() */ if (!((low_pfn+1) % SWAP_CLUSTER_MAX)) { - spin_unlock_irq(&zone->lru_lock); + spin_unlock_irqrestore(&zone->lru_lock, flags); locked = false; } - if (need_resched() || spin_is_contended(&zone->lru_lock)) { - if (locked) - spin_unlock_irq(&zone->lru_lock); - cond_resched(); - spin_lock_irq(&zone->lru_lock); - if (fatal_signal_pending(current)) - break; - } else if (!locked) - spin_lock_irq(&zone->lru_lock); + + /* Check if it is ok to still hold the lock */ + locked = compact_checklock_irqsave(&zone->lru_lock, &flags, + locked, cc); + if (!locked) + break; /* * migrate_pfn does not necessarily start aligned to a @@ -349,9 +395,10 @@ isolate_migratepages_range(struct zone *zone, struct compact_control *cc, } } - acct_isolated(zone, cc); + acct_isolated(zone, locked, cc); - spin_unlock_irq(&zone->lru_lock); + if (locked) + spin_unlock_irqrestore(&zone->lru_lock, flags); trace_mm_compaction_isolate_migratepages(nr_scanned, nr_isolated); @@ -461,7 +508,16 @@ static void isolate_freepages(struct zone *zone, * are disabled */ isolated = 0; - spin_lock_irqsave(&zone->lock, flags); + + /* + * The zone lock must be held to isolate freepages. This + * unfortunately this is a very coarse lock and can be + * heavily contended if there are parallel allocations + * or parallel compactions. 
For async compaction do not + * spin on the lock + */ + if (!compact_trylock_irqsave(&zone->lock, &flags, cc)) + break; if (suitable_migration_target(page)) { end_pfn = min(pfn + pageblock_nr_pages, zone_end_pfn); isolated = isolate_freepages_block(pfn, end_pfn, @@ -773,7 +829,7 @@ out: static unsigned long compact_zone_order(struct zone *zone, int order, gfp_t gfp_mask, - bool sync) + bool sync, bool *contended) { struct compact_control cc = { .nr_freepages = 0, @@ -782,6 +838,7 @@ static unsigned long compact_zone_order(struct zone *zone, .migratetype = allocflags_to_migratetype(gfp_mask), .zone = zone, .sync = sync, + .contended = contended, }; INIT_LIST_HEAD(&cc.freepages); INIT_LIST_HEAD(&cc.migratepages); @@ -803,7 +860,7 @@ int sysctl_extfrag_threshold = 500; */ unsigned long try_to_compact_pages(struct zonelist *zonelist, int order, gfp_t gfp_mask, nodemask_t *nodemask, - bool sync) + bool sync, bool *contended) { enum zone_type high_zoneidx = gfp_zone(gfp_mask); int may_enter_fs = gfp_mask & __GFP_FS; @@ -827,7 +884,8 @@ unsigned long try_to_compact_pages(struct zonelist *zonelist, nodemask) { int status; - status = compact_zone_order(zone, order, gfp_mask, sync); + status = compact_zone_order(zone, order, gfp_mask, sync, + contended); rc = max(status, rc); /* If a normal allocation would succeed, stop compacting */ diff --git a/mm/internal.h b/mm/internal.h index 3314f79d775a..b8c91b342e24 100644 --- a/mm/internal.h +++ b/mm/internal.h @@ -130,6 +130,7 @@ struct compact_control { int order; /* order a direct compactor needs */ int migratetype; /* MOVABLE, RECLAIMABLE etc */ struct zone *zone; + bool *contended; /* True if a lock was contended */ }; unsigned long diff --git a/mm/page_alloc.c b/mm/page_alloc.c index 07f19248acb5..c66fb875104a 100644 --- a/mm/page_alloc.c +++ b/mm/page_alloc.c @@ -2102,7 +2102,7 @@ __alloc_pages_direct_compact(gfp_t gfp_mask, unsigned int order, struct zonelist *zonelist, enum zone_type high_zoneidx, nodemask_t *nodemask, int alloc_flags, struct zone *preferred_zone, int migratetype, bool sync_migration, - bool *deferred_compaction, + bool *contended_compaction, bool *deferred_compaction, unsigned long *did_some_progress) { struct page *page; @@ -2117,7 +2117,8 @@ __alloc_pages_direct_compact(gfp_t gfp_mask, unsigned int order, current->flags |= PF_MEMALLOC; *did_some_progress = try_to_compact_pages(zonelist, order, gfp_mask, - nodemask, sync_migration); + nodemask, sync_migration, + contended_compaction); current->flags &= ~PF_MEMALLOC; if (*did_some_progress != COMPACT_SKIPPED) { @@ -2163,7 +2164,7 @@ __alloc_pages_direct_compact(gfp_t gfp_mask, unsigned int order, struct zonelist *zonelist, enum zone_type high_zoneidx, nodemask_t *nodemask, int alloc_flags, struct zone *preferred_zone, int migratetype, bool sync_migration, - bool *deferred_compaction, + bool *contended_compaction, bool *deferred_compaction, unsigned long *did_some_progress) { return NULL; @@ -2336,6 +2337,7 @@ __alloc_pages_slowpath(gfp_t gfp_mask, unsigned int order, unsigned long did_some_progress; bool sync_migration = false; bool deferred_compaction = false; + bool contended_compaction = false; /* * In the slowpath, we sanity check order to avoid ever trying to @@ -2425,6 +2427,7 @@ rebalance: nodemask, alloc_flags, preferred_zone, migratetype, sync_migration, + &contended_compaction, &deferred_compaction, &did_some_progress); if (page) @@ -2434,10 +2437,11 @@ rebalance: /* * If compaction is deferred for high-order allocations, it is because * sync compaction recently 
failed. In this is the case and the caller - * has requested the system not be heavily disrupted, fail the - * allocation now instead of entering direct reclaim + * requested a movable allocation that does not heavily disrupt the + * system then fail the allocation instead of entering direct reclaim. */ - if (deferred_compaction && (gfp_mask & __GFP_NO_KSWAPD)) + if ((deferred_compaction || contended_compaction) && + (gfp_mask & __GFP_NO_KSWAPD)) goto nopage; /* Try direct reclaim and then allocating */ @@ -2508,6 +2512,7 @@ rebalance: nodemask, alloc_flags, preferred_zone, migratetype, sync_migration, + &contended_compaction, &deferred_compaction, &did_some_progress); if (page) -- cgit v1.2.3 From 8ad5db8a8ddbe3bd33078863a027011e28f1f4ee Mon Sep 17 00:00:00 2001 From: Al Viro Date: Fri, 17 Aug 2012 20:10:46 -0400 Subject: introduce kref_put_mutex() equivalent of mutex_lock(mutex); if (!kref_put(kref, release)) mutex_unlock(mutex); Signed-off-by: Al Viro --- include/linux/kref.h | 18 ++++++++++++++++++ 1 file changed, 18 insertions(+) (limited to 'include') diff --git a/include/linux/kref.h b/include/linux/kref.h index 9c07dcebded7..65af6887872f 100644 --- a/include/linux/kref.h +++ b/include/linux/kref.h @@ -18,6 +18,7 @@ #include #include #include +#include struct kref { atomic_t refcount; @@ -93,4 +94,21 @@ static inline int kref_put(struct kref *kref, void (*release)(struct kref *kref) { return kref_sub(kref, 1, release); } + +static inline int kref_put_mutex(struct kref *kref, + void (*release)(struct kref *kref), + struct mutex *lock) +{ + WARN_ON(release == NULL); + if (unlikely(!atomic_add_unless(&kref->refcount, -1, 1))) { + mutex_lock(lock); + if (unlikely(!atomic_dec_and_test(&kref->refcount))) { + mutex_unlock(lock); + return 0; + } + release(kref); + return 1; + } + return 0; +} #endif /* _KREF_H_ */ -- cgit v1.2.3 From c7a9b09b1a4a1fbccb2ec409daec95f9068d77c0 Mon Sep 17 00:00:00 2001 From: Arnd Bergmann Date: Wed, 15 Aug 2012 20:51:54 +0000 Subject: ARM: omap: allow building omap44xx without SMP The new omap4 cpuidle implementation currently requires ARCH_NEEDS_CPU_IDLE_COUPLED, which only works on SMP. This patch makes it possible to build a non-SMP kernel for that platform. This is not normally desired for end-users but can be useful for testing. Without this patch, building rand-0y2jSKT results in: drivers/cpuidle/coupled.c: In function 'cpuidle_coupled_poke': drivers/cpuidle/coupled.c:317:3: error: implicit declaration of function '__smp_call_function_single' [-Werror=implicit-function-declaration] It's not clear if this patch is the best solution for the problem at hand. I have made sure that we can now build the kernel in all configurations, but that does not mean it will actually work on an OMAP44xx. 
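The cpuidle.h hunk below relies on a standard idiom for this kind of build fix: when the Kconfig symbol is disabled, a static inline no-op stub stands in for the real declaration so callers still compile. A generic standalone sketch of the idiom (the names here are made up, not the kernel's):

#include <stdio.h>

#ifdef CONFIG_FEATURE_FOO
void feature_foo_barrier(int cpu);	/* real implementation built elsewhere */
#else
static inline void feature_foo_barrier(int cpu) { (void)cpu; }	/* no-op stub */
#endif

int main(void)
{
	/* Compiles (and does nothing) when built without -DCONFIG_FEATURE_FOO,
	 * just as the cpuidle_coupled_parallel_barrier() stub below lets
	 * non-coupled configurations build. */
	feature_foo_barrier(0);
	printf("call compiled away cleanly\n");
	return 0;
}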
Signed-off-by: Arnd Bergmann Acked-by: Santosh Shilimkar Tested-by: Santosh Shilimkar Cc: Kevin Hilman Cc: Tony Lindgren --- arch/arm/mach-omap2/Kconfig | 2 +- arch/arm/mach-omap2/cpuidle44xx.c | 3 ++- include/linux/cpuidle.h | 4 ++++ 3 files changed, 7 insertions(+), 2 deletions(-) (limited to 'include') diff --git a/arch/arm/mach-omap2/Kconfig b/arch/arm/mach-omap2/Kconfig index dd2db025f778..66a8be331caf 100644 --- a/arch/arm/mach-omap2/Kconfig +++ b/arch/arm/mach-omap2/Kconfig @@ -62,7 +62,7 @@ config ARCH_OMAP4 select PM_OPP if PM select USB_ARCH_HAS_EHCI if USB_SUPPORT select ARM_CPU_SUSPEND if PM - select ARCH_NEEDS_CPU_IDLE_COUPLED + select ARCH_NEEDS_CPU_IDLE_COUPLED if SMP config SOC_OMAP5 bool "TI OMAP5" diff --git a/arch/arm/mach-omap2/cpuidle44xx.c b/arch/arm/mach-omap2/cpuidle44xx.c index ee05e193fc61..288bee6cbb76 100644 --- a/arch/arm/mach-omap2/cpuidle44xx.c +++ b/arch/arm/mach-omap2/cpuidle44xx.c @@ -238,8 +238,9 @@ int __init omap4_idle_init(void) for_each_cpu(cpu_id, cpu_online_mask) { dev = &per_cpu(omap4_idle_dev, cpu_id); dev->cpu = cpu_id; +#ifdef CONFIG_ARCH_NEEDS_CPU_IDLE_COUPLED dev->coupled_cpus = *cpu_online_mask; - +#endif cpuidle_register_driver(&omap4_idle_driver); if (cpuidle_register_device(dev)) { diff --git a/include/linux/cpuidle.h b/include/linux/cpuidle.h index 040b13b5c14a..279b1eaa8b73 100644 --- a/include/linux/cpuidle.h +++ b/include/linux/cpuidle.h @@ -194,6 +194,10 @@ static inline int cpuidle_play_dead(void) {return -ENODEV; } #ifdef CONFIG_ARCH_NEEDS_CPU_IDLE_COUPLED void cpuidle_coupled_parallel_barrier(struct cpuidle_device *dev, atomic_t *a); +#else +static inline void cpuidle_coupled_parallel_barrier(struct cpuidle_device *dev, atomic_t *a) +{ +} #endif /****************************** -- cgit v1.2.3