From b0f77d0eae0c58a5a9691a067ada112ceeae2d00 Mon Sep 17 00:00:00 2001 From: Tom Herbert Date: Wed, 14 Jul 2010 20:50:29 -0700 Subject: net: fix problem in reading sock TX queue Fix problem in reading the tx_queue recorded in a socket. In dev_pick_tx, the TX queue is read by doing a check with sk_tx_queue_recorded on the socket, followed by a sk_tx_queue_get. The problem is that there is not mutual exclusion across these calls in the socket so it it is possible that the queue in the sock can be invalidated after sk_tx_queue_recorded is called so that sk_tx_queue get returns -1, which sets 65535 in queue_index and thus dev_pick_tx returns 65536 which is a bogus queue and can cause crash in dev_queue_xmit. We fix this by only calling sk_tx_queue_get which does the proper checks. The interface is that sk_tx_queue_get returns the TX queue if the sock argument is non-NULL and TX queue is recorded, else it returns -1. sk_tx_queue_recorded is no longer used so it can be completely removed. Signed-off-by: Tom Herbert Signed-off-by: David S. Miller --- include/net/sock.h | 7 +------ 1 file changed, 1 insertion(+), 6 deletions(-) (limited to 'include') diff --git a/include/net/sock.h b/include/net/sock.h index 731150d52799..0a691ea7654a 100644 --- a/include/net/sock.h +++ b/include/net/sock.h @@ -1224,12 +1224,7 @@ static inline void sk_tx_queue_clear(struct sock *sk) static inline int sk_tx_queue_get(const struct sock *sk) { - return sk->sk_tx_queue_mapping; -} - -static inline bool sk_tx_queue_recorded(const struct sock *sk) -{ - return (sk && sk->sk_tx_queue_mapping >= 0); + return sk ? sk->sk_tx_queue_mapping : -1; } static inline void sk_set_socket(struct sock *sk, struct socket *sock) -- cgit v1.2.3 From 13ceef099edd2b70c5a6f3a9ef5d6d97cda2e096 Mon Sep 17 00:00:00 2001 From: Jan Kara Date: Wed, 14 Jul 2010 07:56:33 +0200 Subject: jbd2/ocfs2: Fix block checksumming when a buffer is used in several transactions OCFS2 uses t_commit trigger to compute and store checksum of the just committed blocks. When a buffer has b_frozen_data, checksum is computed for it instead of b_data but this can result in an old checksum being written to the filesystem in the following scenario: 1) transaction1 is opened 2) handle1 is opened 3) journal_access(handle1, bh) - This sets jh->b_transaction to transaction1 4) modify(bh) 5) journal_dirty(handle1, bh) 6) handle1 is closed 7) start committing transaction1, opening transaction2 8) handle2 is opened 9) journal_access(handle2, bh) - This copies off b_frozen_data to make it safe for transaction1 to commit. jh->b_next_transaction is set to transaction2. 10) jbd2_journal_write_metadata() checksums b_frozen_data 11) the journal correctly writes b_frozen_data to the disk journal 12) handle2 is closed - There was no dirty call for the bh on handle2, so it is never queued for any more journal operation 13) Checkpointing finally happens, and it just spools the bh via normal buffer writeback. This will write b_data, which was never triggered on and thus contains a wrong (old) checksum. This patch fixes the problem by calling the trigger at the moment data is frozen for journal commit - i.e., either when b_frozen_data is created by do_get_write_access or just before we write a buffer to the log if b_frozen_data does not exist. We also rename the trigger to t_frozen as that better describes when it is called. Signed-off-by: Jan Kara Signed-off-by: Mark Fasheh Signed-off-by: Joel Becker --- fs/jbd2/journal.c | 15 +++++++-------- fs/jbd2/transaction.c | 9 ++++++--- fs/ocfs2/journal.c | 24 ++++++++++++------------ include/linux/jbd2.h | 11 ++++++----- 4 files changed, 31 insertions(+), 28 deletions(-) (limited to 'include') diff --git a/fs/jbd2/journal.c b/fs/jbd2/journal.c index bc2ff5932769..036880895bfc 100644 --- a/fs/jbd2/journal.c +++ b/fs/jbd2/journal.c @@ -297,7 +297,6 @@ int jbd2_journal_write_metadata_buffer(transaction_t *transaction, struct page *new_page; unsigned int new_offset; struct buffer_head *bh_in = jh2bh(jh_in); - struct jbd2_buffer_trigger_type *triggers; journal_t *journal = transaction->t_journal; /* @@ -328,21 +327,21 @@ repeat: done_copy_out = 1; new_page = virt_to_page(jh_in->b_frozen_data); new_offset = offset_in_page(jh_in->b_frozen_data); - triggers = jh_in->b_frozen_triggers; } else { new_page = jh2bh(jh_in)->b_page; new_offset = offset_in_page(jh2bh(jh_in)->b_data); - triggers = jh_in->b_triggers; } mapped_data = kmap_atomic(new_page, KM_USER0); /* - * Fire any commit trigger. Do this before checking for escaping, - * as the trigger may modify the magic offset. If a copy-out - * happens afterwards, it will have the correct data in the buffer. + * Fire data frozen trigger if data already wasn't frozen. Do this + * before checking for escaping, as the trigger may modify the magic + * offset. If a copy-out happens afterwards, it will have the correct + * data in the buffer. */ - jbd2_buffer_commit_trigger(jh_in, mapped_data + new_offset, - triggers); + if (!done_copy_out) + jbd2_buffer_frozen_trigger(jh_in, mapped_data + new_offset, + jh_in->b_triggers); /* * Check for escaping diff --git a/fs/jbd2/transaction.c b/fs/jbd2/transaction.c index e214d68620ac..b8e0806681bb 100644 --- a/fs/jbd2/transaction.c +++ b/fs/jbd2/transaction.c @@ -725,6 +725,9 @@ done: page = jh2bh(jh)->b_page; offset = ((unsigned long) jh2bh(jh)->b_data) & ~PAGE_MASK; source = kmap_atomic(page, KM_USER0); + /* Fire data frozen trigger just before we copy the data */ + jbd2_buffer_frozen_trigger(jh, source + offset, + jh->b_triggers); memcpy(jh->b_frozen_data, source+offset, jh2bh(jh)->b_size); kunmap_atomic(source, KM_USER0); @@ -963,15 +966,15 @@ void jbd2_journal_set_triggers(struct buffer_head *bh, jh->b_triggers = type; } -void jbd2_buffer_commit_trigger(struct journal_head *jh, void *mapped_data, +void jbd2_buffer_frozen_trigger(struct journal_head *jh, void *mapped_data, struct jbd2_buffer_trigger_type *triggers) { struct buffer_head *bh = jh2bh(jh); - if (!triggers || !triggers->t_commit) + if (!triggers || !triggers->t_frozen) return; - triggers->t_commit(triggers, bh, mapped_data, bh->b_size); + triggers->t_frozen(triggers, bh, mapped_data, bh->b_size); } void jbd2_buffer_abort_trigger(struct journal_head *jh, diff --git a/fs/ocfs2/journal.c b/fs/ocfs2/journal.c index 39113b5e79e7..625de9d7088c 100644 --- a/fs/ocfs2/journal.c +++ b/fs/ocfs2/journal.c @@ -472,7 +472,7 @@ static inline struct ocfs2_triggers *to_ocfs2_trigger(struct jbd2_buffer_trigger return container_of(triggers, struct ocfs2_triggers, ot_triggers); } -static void ocfs2_commit_trigger(struct jbd2_buffer_trigger_type *triggers, +static void ocfs2_frozen_trigger(struct jbd2_buffer_trigger_type *triggers, struct buffer_head *bh, void *data, size_t size) { @@ -491,7 +491,7 @@ static void ocfs2_commit_trigger(struct jbd2_buffer_trigger_type *triggers, * Quota blocks have their own trigger because the struct ocfs2_block_check * offset depends on the blocksize. */ -static void ocfs2_dq_commit_trigger(struct jbd2_buffer_trigger_type *triggers, +static void ocfs2_dq_frozen_trigger(struct jbd2_buffer_trigger_type *triggers, struct buffer_head *bh, void *data, size_t size) { @@ -511,7 +511,7 @@ static void ocfs2_dq_commit_trigger(struct jbd2_buffer_trigger_type *triggers, * Directory blocks also have their own trigger because the * struct ocfs2_block_check offset depends on the blocksize. */ -static void ocfs2_db_commit_trigger(struct jbd2_buffer_trigger_type *triggers, +static void ocfs2_db_frozen_trigger(struct jbd2_buffer_trigger_type *triggers, struct buffer_head *bh, void *data, size_t size) { @@ -544,7 +544,7 @@ static void ocfs2_abort_trigger(struct jbd2_buffer_trigger_type *triggers, static struct ocfs2_triggers di_triggers = { .ot_triggers = { - .t_commit = ocfs2_commit_trigger, + .t_frozen = ocfs2_frozen_trigger, .t_abort = ocfs2_abort_trigger, }, .ot_offset = offsetof(struct ocfs2_dinode, i_check), @@ -552,7 +552,7 @@ static struct ocfs2_triggers di_triggers = { static struct ocfs2_triggers eb_triggers = { .ot_triggers = { - .t_commit = ocfs2_commit_trigger, + .t_frozen = ocfs2_frozen_trigger, .t_abort = ocfs2_abort_trigger, }, .ot_offset = offsetof(struct ocfs2_extent_block, h_check), @@ -560,7 +560,7 @@ static struct ocfs2_triggers eb_triggers = { static struct ocfs2_triggers rb_triggers = { .ot_triggers = { - .t_commit = ocfs2_commit_trigger, + .t_frozen = ocfs2_frozen_trigger, .t_abort = ocfs2_abort_trigger, }, .ot_offset = offsetof(struct ocfs2_refcount_block, rf_check), @@ -568,7 +568,7 @@ static struct ocfs2_triggers rb_triggers = { static struct ocfs2_triggers gd_triggers = { .ot_triggers = { - .t_commit = ocfs2_commit_trigger, + .t_frozen = ocfs2_frozen_trigger, .t_abort = ocfs2_abort_trigger, }, .ot_offset = offsetof(struct ocfs2_group_desc, bg_check), @@ -576,14 +576,14 @@ static struct ocfs2_triggers gd_triggers = { static struct ocfs2_triggers db_triggers = { .ot_triggers = { - .t_commit = ocfs2_db_commit_trigger, + .t_frozen = ocfs2_db_frozen_trigger, .t_abort = ocfs2_abort_trigger, }, }; static struct ocfs2_triggers xb_triggers = { .ot_triggers = { - .t_commit = ocfs2_commit_trigger, + .t_frozen = ocfs2_frozen_trigger, .t_abort = ocfs2_abort_trigger, }, .ot_offset = offsetof(struct ocfs2_xattr_block, xb_check), @@ -591,14 +591,14 @@ static struct ocfs2_triggers xb_triggers = { static struct ocfs2_triggers dq_triggers = { .ot_triggers = { - .t_commit = ocfs2_dq_commit_trigger, + .t_frozen = ocfs2_dq_frozen_trigger, .t_abort = ocfs2_abort_trigger, }, }; static struct ocfs2_triggers dr_triggers = { .ot_triggers = { - .t_commit = ocfs2_commit_trigger, + .t_frozen = ocfs2_frozen_trigger, .t_abort = ocfs2_abort_trigger, }, .ot_offset = offsetof(struct ocfs2_dx_root_block, dr_check), @@ -606,7 +606,7 @@ static struct ocfs2_triggers dr_triggers = { static struct ocfs2_triggers dl_triggers = { .ot_triggers = { - .t_commit = ocfs2_commit_trigger, + .t_frozen = ocfs2_frozen_trigger, .t_abort = ocfs2_abort_trigger, }, .ot_offset = offsetof(struct ocfs2_dx_leaf, dl_check), diff --git a/include/linux/jbd2.h b/include/linux/jbd2.h index a4d2e9f7088a..adf832dec3f3 100644 --- a/include/linux/jbd2.h +++ b/include/linux/jbd2.h @@ -1026,11 +1026,12 @@ void __jbd2_journal_insert_checkpoint(struct journal_head *, transaction_t *); struct jbd2_buffer_trigger_type { /* - * Fired just before a buffer is written to the journal. - * mapped_data is a mapped buffer that is the frozen data for - * commit. + * Fired a the moment data to write to the journal are known to be + * stable - so either at the moment b_frozen_data is created or just + * before a buffer is written to the journal. mapped_data is a mapped + * buffer that is the frozen data for commit. */ - void (*t_commit)(struct jbd2_buffer_trigger_type *type, + void (*t_frozen)(struct jbd2_buffer_trigger_type *type, struct buffer_head *bh, void *mapped_data, size_t size); @@ -1042,7 +1043,7 @@ struct jbd2_buffer_trigger_type { struct buffer_head *bh); }; -extern void jbd2_buffer_commit_trigger(struct journal_head *jh, +extern void jbd2_buffer_frozen_trigger(struct journal_head *jh, void *mapped_data, struct jbd2_buffer_trigger_type *triggers); extern void jbd2_buffer_abort_trigger(struct journal_head *jh, -- cgit v1.2.3 From 58c84eda07560a6b75b03e8d3b26d6eddfc14011 Mon Sep 17 00:00:00 2001 From: Bjorn Helgaas Date: Thu, 15 Jul 2010 09:41:42 -0600 Subject: PCI: fall back to original BIOS BAR addresses If we fail to assign resources to a PCI BAR, this patch makes us try the original address from BIOS rather than leaving it disabled. Linux tries to make sure all PCI device BARs are inside the upstream PCI host bridge or P2P bridge apertures, reassigning BARs if necessary. Windows does similar reassignment. Before this patch, if we could not move a BAR into an aperture, we left the resource unassigned, i.e., at address zero. Windows leaves such BARs at the original BIOS addresses, and this patch makes Linux do the same. This is a bit ugly because we disable the resource long before we try to reassign it, so we have to keep track of the BIOS BAR address somewhere. For lack of a better place, I put it in the struct pci_dev. I think it would be cleaner to attempt the assignment immediately when the claim fails, so we could easily remember the original address. But we currently claim motherboard resources in the middle, after attempting to claim PCI resources and before assigning new PCI resources, and changing that is a fairly big job. Addresses https://bugzilla.kernel.org/show_bug.cgi?id=16263 Reported-by: Andrew Tested-by: Andrew Signed-off-by: Bjorn Helgaas Signed-off-by: Jesse Barnes --- arch/x86/pci/i386.c | 1 + drivers/pci/setup-res.c | 32 ++++++++++++++++++++++++++++++++ include/linux/pci.h | 1 + 3 files changed, 34 insertions(+) (limited to 'include') diff --git a/arch/x86/pci/i386.c b/arch/x86/pci/i386.c index 6fdb3ec30c31..55253095be84 100644 --- a/arch/x86/pci/i386.c +++ b/arch/x86/pci/i386.c @@ -184,6 +184,7 @@ static void __init pcibios_allocate_resources(int pass) idx, r, disabled, pass); if (pci_claim_resource(dev, idx) < 0) { /* We'll assign a new address later */ + dev->fw_addr[idx] = r->start; r->end -= r->start; r->start = 0; } diff --git a/drivers/pci/setup-res.c b/drivers/pci/setup-res.c index 92379e2d37e7..2aaa13150de3 100644 --- a/drivers/pci/setup-res.c +++ b/drivers/pci/setup-res.c @@ -156,6 +156,38 @@ static int __pci_assign_resource(struct pci_bus *bus, struct pci_dev *dev, pcibios_align_resource, dev); } + if (ret < 0 && dev->fw_addr[resno]) { + struct resource *root, *conflict; + resource_size_t start, end; + + /* + * If we failed to assign anything, let's try the address + * where firmware left it. That at least has a chance of + * working, which is better than just leaving it disabled. + */ + + if (res->flags & IORESOURCE_IO) + root = &ioport_resource; + else + root = &iomem_resource; + + start = res->start; + end = res->end; + res->start = dev->fw_addr[resno]; + res->end = res->start + size - 1; + dev_info(&dev->dev, "BAR %d: trying firmware assignment %pR\n", + resno, res); + conflict = request_resource_conflict(root, res); + if (conflict) { + dev_info(&dev->dev, + "BAR %d: %pR conflicts with %s %pR\n", resno, + res, conflict->name, conflict); + res->start = start; + res->end = end; + } else + ret = 0; + } + if (!ret) { res->flags &= ~IORESOURCE_STARTALIGN; dev_info(&dev->dev, "BAR %d: assigned %pR\n", resno, res); diff --git a/include/linux/pci.h b/include/linux/pci.h index 7cb00845f150..f26fda76b87f 100644 --- a/include/linux/pci.h +++ b/include/linux/pci.h @@ -288,6 +288,7 @@ struct pci_dev { */ unsigned int irq; struct resource resource[DEVICE_COUNT_RESOURCE]; /* I/O and memory regions + expansion ROMs */ + resource_size_t fw_addr[DEVICE_COUNT_RESOURCE]; /* FW-assigned addr */ /* These fields are used by common fixups */ unsigned int transparent:1; /* Transparent PCI bridge */ -- cgit v1.2.3 From 7f8275d0d660c146de6ee3017e1e2e594c49e820 Mon Sep 17 00:00:00 2001 From: Dave Chinner Date: Mon, 19 Jul 2010 14:56:17 +1000 Subject: mm: add context argument to shrinker callback The current shrinker implementation requires the registered callback to have global state to work from. This makes it difficult to shrink caches that are not global (e.g. per-filesystem caches). Pass the shrinker structure to the callback so that users can embed the shrinker structure in the context the shrinker needs to operate on and get back to it in the callback via container_of(). Signed-off-by: Dave Chinner Reviewed-by: Christoph Hellwig --- arch/x86/kvm/mmu.c | 2 +- drivers/gpu/drm/i915/i915_gem.c | 2 +- fs/dcache.c | 2 +- fs/gfs2/glock.c | 2 +- fs/gfs2/quota.c | 2 +- fs/gfs2/quota.h | 2 +- fs/inode.c | 2 +- fs/mbcache.c | 5 +++-- fs/nfs/dir.c | 2 +- fs/nfs/internal.h | 3 ++- fs/quota/dquot.c | 2 +- fs/ubifs/shrinker.c | 2 +- fs/ubifs/ubifs.h | 2 +- fs/xfs/linux-2.6/xfs_buf.c | 5 +++-- fs/xfs/linux-2.6/xfs_sync.c | 1 + fs/xfs/quota/xfs_qm.c | 7 +++++-- include/linux/mm.h | 2 +- mm/vmscan.c | 8 +++++--- 18 files changed, 31 insertions(+), 22 deletions(-) (limited to 'include') diff --git a/arch/x86/kvm/mmu.c b/arch/x86/kvm/mmu.c index 3699613e8830..b1ed0a1a5913 100644 --- a/arch/x86/kvm/mmu.c +++ b/arch/x86/kvm/mmu.c @@ -2926,7 +2926,7 @@ static int kvm_mmu_remove_some_alloc_mmu_pages(struct kvm *kvm) return kvm_mmu_zap_page(kvm, page) + 1; } -static int mmu_shrink(int nr_to_scan, gfp_t gfp_mask) +static int mmu_shrink(struct shrinker *shrink, int nr_to_scan, gfp_t gfp_mask) { struct kvm *kvm; struct kvm *kvm_freed = NULL; diff --git a/drivers/gpu/drm/i915/i915_gem.c b/drivers/gpu/drm/i915/i915_gem.c index 8757ecf6e96b..e7018708cc31 100644 --- a/drivers/gpu/drm/i915/i915_gem.c +++ b/drivers/gpu/drm/i915/i915_gem.c @@ -4978,7 +4978,7 @@ i915_gpu_is_active(struct drm_device *dev) } static int -i915_gem_shrink(int nr_to_scan, gfp_t gfp_mask) +i915_gem_shrink(struct shrinker *shrink, int nr_to_scan, gfp_t gfp_mask) { drm_i915_private_t *dev_priv, *next_dev; struct drm_i915_gem_object *obj_priv, *next_obj; diff --git a/fs/dcache.c b/fs/dcache.c index c8c78ba07827..86d4db15473e 100644 --- a/fs/dcache.c +++ b/fs/dcache.c @@ -896,7 +896,7 @@ EXPORT_SYMBOL(shrink_dcache_parent); * * In this case we return -1 to tell the caller that we baled. */ -static int shrink_dcache_memory(int nr, gfp_t gfp_mask) +static int shrink_dcache_memory(struct shrinker *shrink, int nr, gfp_t gfp_mask) { if (nr) { if (!(gfp_mask & __GFP_FS)) diff --git a/fs/gfs2/glock.c b/fs/gfs2/glock.c index dbab3fdc2582..0898f3ec8212 100644 --- a/fs/gfs2/glock.c +++ b/fs/gfs2/glock.c @@ -1358,7 +1358,7 @@ void gfs2_glock_complete(struct gfs2_glock *gl, int ret) } -static int gfs2_shrink_glock_memory(int nr, gfp_t gfp_mask) +static int gfs2_shrink_glock_memory(struct shrinker *shrink, int nr, gfp_t gfp_mask) { struct gfs2_glock *gl; int may_demote; diff --git a/fs/gfs2/quota.c b/fs/gfs2/quota.c index b256d6f24288..8f02d3db8f42 100644 --- a/fs/gfs2/quota.c +++ b/fs/gfs2/quota.c @@ -77,7 +77,7 @@ static LIST_HEAD(qd_lru_list); static atomic_t qd_lru_count = ATOMIC_INIT(0); static DEFINE_SPINLOCK(qd_lru_lock); -int gfs2_shrink_qd_memory(int nr, gfp_t gfp_mask) +int gfs2_shrink_qd_memory(struct shrinker *shrink, int nr, gfp_t gfp_mask) { struct gfs2_quota_data *qd; struct gfs2_sbd *sdp; diff --git a/fs/gfs2/quota.h b/fs/gfs2/quota.h index 195f60c8bd14..e7d236ca48bd 100644 --- a/fs/gfs2/quota.h +++ b/fs/gfs2/quota.h @@ -51,7 +51,7 @@ static inline int gfs2_quota_lock_check(struct gfs2_inode *ip) return ret; } -extern int gfs2_shrink_qd_memory(int nr, gfp_t gfp_mask); +extern int gfs2_shrink_qd_memory(struct shrinker *shrink, int nr, gfp_t gfp_mask); extern const struct quotactl_ops gfs2_quotactl_ops; #endif /* __QUOTA_DOT_H__ */ diff --git a/fs/inode.c b/fs/inode.c index 2bee20ae3d65..722860b323a9 100644 --- a/fs/inode.c +++ b/fs/inode.c @@ -512,7 +512,7 @@ static void prune_icache(int nr_to_scan) * This function is passed the number of inodes to scan, and it returns the * total number of remaining possibly-reclaimable inodes. */ -static int shrink_icache_memory(int nr, gfp_t gfp_mask) +static int shrink_icache_memory(struct shrinker *shrink, int nr, gfp_t gfp_mask) { if (nr) { /* diff --git a/fs/mbcache.c b/fs/mbcache.c index ec88ff3d04a9..e28f21b95344 100644 --- a/fs/mbcache.c +++ b/fs/mbcache.c @@ -115,7 +115,7 @@ mb_cache_indexes(struct mb_cache *cache) * What the mbcache registers as to get shrunk dynamically. */ -static int mb_cache_shrink_fn(int nr_to_scan, gfp_t gfp_mask); +static int mb_cache_shrink_fn(struct shrinker *shrink, int nr_to_scan, gfp_t gfp_mask); static struct shrinker mb_cache_shrinker = { .shrink = mb_cache_shrink_fn, @@ -191,13 +191,14 @@ forget: * This function is called by the kernel memory management when memory * gets low. * + * @shrink: (ignored) * @nr_to_scan: Number of objects to scan * @gfp_mask: (ignored) * * Returns the number of objects which are present in the cache. */ static int -mb_cache_shrink_fn(int nr_to_scan, gfp_t gfp_mask) +mb_cache_shrink_fn(struct shrinker *shrink, int nr_to_scan, gfp_t gfp_mask) { LIST_HEAD(free_list); struct list_head *l, *ltmp; diff --git a/fs/nfs/dir.c b/fs/nfs/dir.c index 782b431ef91c..e60416d3f818 100644 --- a/fs/nfs/dir.c +++ b/fs/nfs/dir.c @@ -1710,7 +1710,7 @@ static void nfs_access_free_list(struct list_head *head) } } -int nfs_access_cache_shrinker(int nr_to_scan, gfp_t gfp_mask) +int nfs_access_cache_shrinker(struct shrinker *shrink, int nr_to_scan, gfp_t gfp_mask) { LIST_HEAD(head); struct nfs_inode *nfsi; diff --git a/fs/nfs/internal.h b/fs/nfs/internal.h index d8bd619e386c..e70f44b9b3f4 100644 --- a/fs/nfs/internal.h +++ b/fs/nfs/internal.h @@ -205,7 +205,8 @@ extern struct rpc_procinfo nfs4_procedures[]; void nfs_close_context(struct nfs_open_context *ctx, int is_sync); /* dir.c */ -extern int nfs_access_cache_shrinker(int nr_to_scan, gfp_t gfp_mask); +extern int nfs_access_cache_shrinker(struct shrinker *shrink, + int nr_to_scan, gfp_t gfp_mask); /* inode.c */ extern struct workqueue_struct *nfsiod_workqueue; diff --git a/fs/quota/dquot.c b/fs/quota/dquot.c index 12c233da1b6b..437d2ca2de97 100644 --- a/fs/quota/dquot.c +++ b/fs/quota/dquot.c @@ -676,7 +676,7 @@ static void prune_dqcache(int count) * This is called from kswapd when we think we need some * more memory */ -static int shrink_dqcache_memory(int nr, gfp_t gfp_mask) +static int shrink_dqcache_memory(struct shrinker *shrink, int nr, gfp_t gfp_mask) { if (nr) { spin_lock(&dq_list_lock); diff --git a/fs/ubifs/shrinker.c b/fs/ubifs/shrinker.c index 02feb59cefca..0b201114a5ad 100644 --- a/fs/ubifs/shrinker.c +++ b/fs/ubifs/shrinker.c @@ -277,7 +277,7 @@ static int kick_a_thread(void) return 0; } -int ubifs_shrinker(int nr, gfp_t gfp_mask) +int ubifs_shrinker(struct shrinker *shrink, int nr, gfp_t gfp_mask) { int freed, contention = 0; long clean_zn_cnt = atomic_long_read(&ubifs_clean_zn_cnt); diff --git a/fs/ubifs/ubifs.h b/fs/ubifs/ubifs.h index 2eef553d50c8..04310878f449 100644 --- a/fs/ubifs/ubifs.h +++ b/fs/ubifs/ubifs.h @@ -1575,7 +1575,7 @@ int ubifs_tnc_start_commit(struct ubifs_info *c, struct ubifs_zbranch *zroot); int ubifs_tnc_end_commit(struct ubifs_info *c); /* shrinker.c */ -int ubifs_shrinker(int nr_to_scan, gfp_t gfp_mask); +int ubifs_shrinker(struct shrinker *shrink, int nr_to_scan, gfp_t gfp_mask); /* commit.c */ int ubifs_bg_thread(void *info); diff --git a/fs/xfs/linux-2.6/xfs_buf.c b/fs/xfs/linux-2.6/xfs_buf.c index 649ade8ef598..2ee3f7a60163 100644 --- a/fs/xfs/linux-2.6/xfs_buf.c +++ b/fs/xfs/linux-2.6/xfs_buf.c @@ -45,7 +45,7 @@ static kmem_zone_t *xfs_buf_zone; STATIC int xfsbufd(void *); -STATIC int xfsbufd_wakeup(int, gfp_t); +STATIC int xfsbufd_wakeup(struct shrinker *, int, gfp_t); STATIC void xfs_buf_delwri_queue(xfs_buf_t *, int); static struct shrinker xfs_buf_shake = { .shrink = xfsbufd_wakeup, @@ -340,7 +340,7 @@ _xfs_buf_lookup_pages( __func__, gfp_mask); XFS_STATS_INC(xb_page_retries); - xfsbufd_wakeup(0, gfp_mask); + xfsbufd_wakeup(NULL, 0, gfp_mask); congestion_wait(BLK_RW_ASYNC, HZ/50); goto retry; } @@ -1762,6 +1762,7 @@ xfs_buf_runall_queues( STATIC int xfsbufd_wakeup( + struct shrinker *shrink, int priority, gfp_t mask) { diff --git a/fs/xfs/linux-2.6/xfs_sync.c b/fs/xfs/linux-2.6/xfs_sync.c index ef7f0218bccb..be375827af98 100644 --- a/fs/xfs/linux-2.6/xfs_sync.c +++ b/fs/xfs/linux-2.6/xfs_sync.c @@ -838,6 +838,7 @@ static struct rw_semaphore xfs_mount_list_lock; static int xfs_reclaim_inode_shrink( + struct shrinker *shrink, int nr_to_scan, gfp_t gfp_mask) { diff --git a/fs/xfs/quota/xfs_qm.c b/fs/xfs/quota/xfs_qm.c index 8c117ff2e3ab..67c018392d62 100644 --- a/fs/xfs/quota/xfs_qm.c +++ b/fs/xfs/quota/xfs_qm.c @@ -69,7 +69,7 @@ STATIC void xfs_qm_list_destroy(xfs_dqlist_t *); STATIC int xfs_qm_init_quotainos(xfs_mount_t *); STATIC int xfs_qm_init_quotainfo(xfs_mount_t *); -STATIC int xfs_qm_shake(int, gfp_t); +STATIC int xfs_qm_shake(struct shrinker *, int, gfp_t); static struct shrinker xfs_qm_shaker = { .shrink = xfs_qm_shake, @@ -2117,7 +2117,10 @@ xfs_qm_shake_freelist( */ /* ARGSUSED */ STATIC int -xfs_qm_shake(int nr_to_scan, gfp_t gfp_mask) +xfs_qm_shake( + struct shrinker *shrink, + int nr_to_scan, + gfp_t gfp_mask) { int ndqused, nfree, n; diff --git a/include/linux/mm.h b/include/linux/mm.h index b969efb03787..a2b48041b910 100644 --- a/include/linux/mm.h +++ b/include/linux/mm.h @@ -999,7 +999,7 @@ static inline void sync_mm_rss(struct task_struct *task, struct mm_struct *mm) * querying the cache size, so a fastpath for that case is appropriate. */ struct shrinker { - int (*shrink)(int nr_to_scan, gfp_t gfp_mask); + int (*shrink)(struct shrinker *, int nr_to_scan, gfp_t gfp_mask); int seeks; /* seeks to recreate an obj */ /* These are for internal use */ diff --git a/mm/vmscan.c b/mm/vmscan.c index 9c7e57cc63a3..199fa436c0dd 100644 --- a/mm/vmscan.c +++ b/mm/vmscan.c @@ -213,8 +213,9 @@ unsigned long shrink_slab(unsigned long scanned, gfp_t gfp_mask, list_for_each_entry(shrinker, &shrinker_list, list) { unsigned long long delta; unsigned long total_scan; - unsigned long max_pass = (*shrinker->shrink)(0, gfp_mask); + unsigned long max_pass; + max_pass = (*shrinker->shrink)(shrinker, 0, gfp_mask); delta = (4 * scanned) / shrinker->seeks; delta *= max_pass; do_div(delta, lru_pages + 1); @@ -242,8 +243,9 @@ unsigned long shrink_slab(unsigned long scanned, gfp_t gfp_mask, int shrink_ret; int nr_before; - nr_before = (*shrinker->shrink)(0, gfp_mask); - shrink_ret = (*shrinker->shrink)(this_scan, gfp_mask); + nr_before = (*shrinker->shrink)(shrinker, 0, gfp_mask); + shrink_ret = (*shrinker->shrink)(shrinker, this_scan, + gfp_mask); if (shrink_ret == -1) break; if (shrink_ret < nr_before) -- cgit v1.2.3 From 772a2f9b488f4d27c314da5eeabde750b9ead41b Mon Sep 17 00:00:00 2001 From: Dan Carpenter Date: Thu, 15 Jul 2010 10:39:47 +0200 Subject: fb: handle allocation failure in alloc_apertures() If the kzalloc() fails we should return NULL. All the places that call alloc_apertures() check for this already. Signed-off-by: Dan Carpenter Acked-by: James Simmons Acked-by: Marcin Slusarz Signed-off-by: Dave Airlie --- include/linux/fb.h | 2 ++ 1 file changed, 2 insertions(+) (limited to 'include') diff --git a/include/linux/fb.h b/include/linux/fb.h index 8e5a9dfb76bf..e7445df44d6c 100644 --- a/include/linux/fb.h +++ b/include/linux/fb.h @@ -873,6 +873,8 @@ struct fb_info { static inline struct apertures_struct *alloc_apertures(unsigned int max_num) { struct apertures_struct *a = kzalloc(sizeof(struct apertures_struct) + max_num * sizeof(struct aperture), GFP_KERNEL); + if (!a) + return NULL; a->count = max_num; return a; } -- cgit v1.2.3 From 07fca0e57fca925032526349f4370f97ed580cc9 Mon Sep 17 00:00:00 2001 From: Sam Ravnborg Date: Sat, 10 Jul 2010 08:35:00 +0200 Subject: tracing: Properly align linker defined symbols We define a number of symbols in the linker scipt like this: __start_syscalls_metadata = .; *(__syscalls_metadata) But we do not know the alignment of "." when we assign the __start_syscalls_metadata symbol. gcc started to uses bigger alignment for structs (32 bytes), so we saw situations where the linker due to alignment constraints increased the value of "." after the symbol assignment. This resulted in boot fails. Fix this by forcing a 32 byte alignment of "." before the assignment. This patch introduces the forced alignment for ftrace_events and syscalls_metadata. It may be required in more places. Reported-by: Zeev Tarantov Signed-off-by: Sam Ravnborg LKML-Reference: <20100710063459.GA14596@merkur.ravnborg.org> Cc: Frederic Weisbecker Signed-off-by: Steven Rostedt --- include/asm-generic/vmlinux.lds.h | 10 ++++++++++ 1 file changed, 10 insertions(+) (limited to 'include') diff --git a/include/asm-generic/vmlinux.lds.h b/include/asm-generic/vmlinux.lds.h index 48c5299cbf26..4b5902ad0d5d 100644 --- a/include/asm-generic/vmlinux.lds.h +++ b/include/asm-generic/vmlinux.lds.h @@ -63,6 +63,12 @@ /* Align . to a 8 byte boundary equals to maximum function alignment. */ #define ALIGN_FUNCTION() . = ALIGN(8) +/* + * Align to a 32 byte boundary equal to the + * alignment gcc 4.5 uses for a struct + */ +#define STRUCT_ALIGN() . = ALIGN(32) + /* The actual configuration determine if the init/exit sections * are handled as text/data or they can be discarded (which * often happens at runtime) @@ -166,7 +172,11 @@ LIKELY_PROFILE() \ BRANCH_PROFILE() \ TRACE_PRINTKS() \ + \ + STRUCT_ALIGN(); \ FTRACE_EVENTS() \ + \ + STRUCT_ALIGN(); \ TRACE_SYSCALLS() /* -- cgit v1.2.3 From 844b9a8707f1fcf0482e0c52f44a555e799ccda6 Mon Sep 17 00:00:00 2001 From: "Paul E. McKenney" Date: Tue, 20 Jul 2010 13:24:34 -0700 Subject: vfs: fix RCU-lockdep false positive due to /proc If a single-threaded process does a file-descriptor operation, and some other process accesses that same file descriptor via /proc, the current rcu_dereference_check_fdtable() can give a false-positive RCU-lockdep splat due to the reference count being increased by the /proc access after the reference-count check in fget_light() but before the check in rcu_dereference_check_fdtable(). This commit prevents this false positive by checking for a single-threaded process. To avoid #include hell, this commit uses the wrapper for thread_group_empty(current) defined by rcu_my_thread_group_empty() provided in a separate commit. Located-by: Miles Lane Located-by: Eric Dumazet Signed-off-by: Paul E. McKenney Cc: Al Viro Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/fdtable.h | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) (limited to 'include') diff --git a/include/linux/fdtable.h b/include/linux/fdtable.h index 013dc529e95f..d147461bc271 100644 --- a/include/linux/fdtable.h +++ b/include/linux/fdtable.h @@ -61,7 +61,8 @@ struct files_struct { (rcu_dereference_check((fdtfd), \ rcu_read_lock_held() || \ lockdep_is_held(&(files)->file_lock) || \ - atomic_read(&(files)->count) == 1)) + atomic_read(&(files)->count) == 1 || \ + rcu_my_thread_group_empty())) #define files_fdtable(files) \ (rcu_dereference_check_fdtable((files), (files)->fdt)) -- cgit v1.2.3 From a6a1a095ec8ace2912fc280d371eee8ff5da5736 Mon Sep 17 00:00:00 2001 From: Doug Goldstein Date: Tue, 20 Jul 2010 15:22:25 -0700 Subject: include/linux/vgaarb.h: add missing part of include guard vgaarb.h was missing the #define of the #ifndef at the top for the guard to prevent multiple #include's from causing re-define errors Signed-off-by: Doug Goldstein Cc: Dave Airlie Cc: Jesse Barnes Signed-off-by: Andrew Morton Signed-off-by: Dave Airlie --- include/linux/vgaarb.h | 1 + 1 file changed, 1 insertion(+) (limited to 'include') diff --git a/include/linux/vgaarb.h b/include/linux/vgaarb.h index c9a975976995..814f294d4cd0 100644 --- a/include/linux/vgaarb.h +++ b/include/linux/vgaarb.h @@ -29,6 +29,7 @@ */ #ifndef LINUX_VGA_H +#define LINUX_VGA_H #include -- cgit v1.2.3 From f8324e20f8289dffc646d64366332e05eaacab25 Mon Sep 17 00:00:00 2001 From: Mikael Pettersson Date: Tue, 20 Jul 2010 18:45:14 -0700 Subject: math-emu: correct test for downshifting fraction in _FP_FROM_INT() The kernel's math-emu code contains a macro _FP_FROM_INT() which is used to convert an integer to a raw normalized floating-point value. It does this basically in three steps: 1. Compute the exponent from the number of leading zero bits. 2. Downshift large fractions to put the MSB in the right position for normalized fractions. 3. Upshift small fractions to put the MSB in the right position. There is an boundary error in step 2, causing a fraction with its MSB exactly one bit above the normalized MSB position to not be downshifted. This results in a non-normalized raw float, which when packed becomes a massively inaccurate representation for that input. The impact of this depends on a number of arch-specific factors, but it is known to have broken emulation of FXTOD instructions on UltraSPARC III, which was originally reported as GCC bug 44631 . Any arch which uses math-emu to emulate conversions from integers to same-size floats may be affected. The fix is simple: the exponent comparison used to determine if the fraction should be downshifted must be "<=" not "<". I'm sending a kernel module to test this as a reply to this message. There are also SPARC user-space test cases in the GCC bug entry. Signed-off-by: Mikael Pettersson Signed-off-by: David S. Miller --- include/math-emu/op-common.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'include') diff --git a/include/math-emu/op-common.h b/include/math-emu/op-common.h index fd882261225e..9696a5e2c437 100644 --- a/include/math-emu/op-common.h +++ b/include/math-emu/op-common.h @@ -799,7 +799,7 @@ do { \ X##_e -= (_FP_W_TYPE_SIZE - rsize); \ X##_e = rsize - X##_e - 1; \ \ - if (_FP_FRACBITS_##fs < rsize && _FP_WFRACBITS_##fs < X##_e) \ + if (_FP_FRACBITS_##fs < rsize && _FP_WFRACBITS_##fs <= X##_e) \ __FP_FRAC_SRS_1(ur_, (X##_e - _FP_WFRACBITS_##fs + 1), rsize);\ _FP_FRAC_DISASSEMBLE_##wc(X, ur_, rsize); \ if ((_FP_WFRACBITS_##fs - X##_e - 1) > 0) \ -- cgit v1.2.3 From edd63cb6b91024332d6983fc51058ac1ef0c081e Mon Sep 17 00:00:00 2001 From: Jason Wessel Date: Wed, 21 Jul 2010 19:27:07 -0500 Subject: sysrq,kdb: Use __handle_sysrq() for kdb's sysrq function The kdb code should not toggle the sysrq state in case an end user wants to try and resume the normal kernel execution. Signed-off-by: Jason Wessel Acked-by: Dmitry Torokhov --- drivers/char/sysrq.c | 2 +- include/linux/sysrq.h | 1 + kernel/debug/kdb/kdb_main.c | 3 +-- 3 files changed, 3 insertions(+), 3 deletions(-) (limited to 'include') diff --git a/drivers/char/sysrq.c b/drivers/char/sysrq.c index 5d64e3acb000..878ac0c2cc68 100644 --- a/drivers/char/sysrq.c +++ b/drivers/char/sysrq.c @@ -493,7 +493,7 @@ static void __sysrq_put_key_op(int key, struct sysrq_key_op *op_p) sysrq_key_table[i] = op_p; } -static void __handle_sysrq(int key, struct tty_struct *tty, int check_mask) +void __handle_sysrq(int key, struct tty_struct *tty, int check_mask) { struct sysrq_key_op *op_p; int orig_log_level; diff --git a/include/linux/sysrq.h b/include/linux/sysrq.h index 4496322e28dd..609e8ca5f534 100644 --- a/include/linux/sysrq.h +++ b/include/linux/sysrq.h @@ -45,6 +45,7 @@ struct sysrq_key_op { */ void handle_sysrq(int key, struct tty_struct *tty); +void __handle_sysrq(int key, struct tty_struct *tty, int check_mask); int register_sysrq_key(int key, struct sysrq_key_op *op); int unregister_sysrq_key(int key, struct sysrq_key_op *op); struct sysrq_key_op *__sysrq_get_key_op(int key); diff --git a/kernel/debug/kdb/kdb_main.c b/kernel/debug/kdb/kdb_main.c index 7e9bfd54a0db..ebe4a287419e 100644 --- a/kernel/debug/kdb/kdb_main.c +++ b/kernel/debug/kdb/kdb_main.c @@ -1820,9 +1820,8 @@ static int kdb_sr(int argc, const char **argv) { if (argc != 1) return KDB_ARGCOUNT; - sysrq_toggle_support(1); kdb_trap_printk++; - handle_sysrq(*argv[1], NULL); + __handle_sysrq(*argv[1], NULL, 0); kdb_trap_printk--; return 0; -- cgit v1.2.3 From 8a35747a5d13b99e076b0222729e0caa48cb69b6 Mon Sep 17 00:00:00 2001 From: Herbert Xu Date: Wed, 21 Jul 2010 21:44:31 +0000 Subject: macvtap: Limit packet queue length Mark Wagner reported OOM symptoms when sending UDP traffic over a macvtap link to a kvm receiver. This appears to be caused by the fact that macvtap packet queues are unlimited in length. This means that if the receiver can't keep up with the rate of flow, then we will hit OOM. Of course it gets worse if the OOM killer then decides to kill the receiver. This patch imposes a cap on the packet queue length, in the same way as the tuntap driver, using the device TX queue length. Please note that macvtap currently has no way of giving congestion notification, that means the software device TX queue cannot be used and packets will always be dropped once the macvtap driver queue fills up. This shouldn't be a great problem for the scenario where macvtap is used to feed a kvm receiver, as the traffic is most likely external in origin so congestion notification can't be applied anyway. Of course, if anybody decides to complain about guest-to-guest UDP packet loss down the track, then we may have to revisit this. Incidentally, this patch also fixes a real memory leak when macvtap_get_queue fails. Chris Wright noticed that for this patch to work, we need a non-zero TX queue length. This patch includes his work to change the default macvtap TX queue length to 500. Reported-by: Mark Wagner Signed-off-by: Herbert Xu Acked-by: Chris Wright Acked-by: Arnd Bergmann Signed-off-by: David S. Miller --- drivers/net/macvlan.c | 10 ++++++++-- drivers/net/macvtap.c | 18 ++++++++++++++++-- include/linux/if_macvlan.h | 2 ++ 3 files changed, 26 insertions(+), 4 deletions(-) (limited to 'include') diff --git a/drivers/net/macvlan.c b/drivers/net/macvlan.c index 87e8d4cb4057..f15fe2cf72ae 100644 --- a/drivers/net/macvlan.c +++ b/drivers/net/macvlan.c @@ -499,7 +499,7 @@ static const struct net_device_ops macvlan_netdev_ops = { .ndo_validate_addr = eth_validate_addr, }; -static void macvlan_setup(struct net_device *dev) +void macvlan_common_setup(struct net_device *dev) { ether_setup(dev); @@ -508,6 +508,12 @@ static void macvlan_setup(struct net_device *dev) dev->destructor = free_netdev; dev->header_ops = &macvlan_hard_header_ops, dev->ethtool_ops = &macvlan_ethtool_ops; +} +EXPORT_SYMBOL_GPL(macvlan_common_setup); + +static void macvlan_setup(struct net_device *dev) +{ + macvlan_common_setup(dev); dev->tx_queue_len = 0; } @@ -705,7 +711,6 @@ int macvlan_link_register(struct rtnl_link_ops *ops) /* common fields */ ops->priv_size = sizeof(struct macvlan_dev); ops->get_tx_queues = macvlan_get_tx_queues; - ops->setup = macvlan_setup; ops->validate = macvlan_validate; ops->maxtype = IFLA_MACVLAN_MAX; ops->policy = macvlan_policy; @@ -719,6 +724,7 @@ EXPORT_SYMBOL_GPL(macvlan_link_register); static struct rtnl_link_ops macvlan_link_ops = { .kind = "macvlan", + .setup = macvlan_setup, .newlink = macvlan_newlink, .dellink = macvlan_dellink, }; diff --git a/drivers/net/macvtap.c b/drivers/net/macvtap.c index a8a94e2f6ddc..ff02b836c3c4 100644 --- a/drivers/net/macvtap.c +++ b/drivers/net/macvtap.c @@ -180,11 +180,18 @@ static int macvtap_forward(struct net_device *dev, struct sk_buff *skb) { struct macvtap_queue *q = macvtap_get_queue(dev, skb); if (!q) - return -ENOLINK; + goto drop; + + if (skb_queue_len(&q->sk.sk_receive_queue) >= dev->tx_queue_len) + goto drop; skb_queue_tail(&q->sk.sk_receive_queue, skb); wake_up_interruptible_poll(sk_sleep(&q->sk), POLLIN | POLLRDNORM | POLLRDBAND); - return 0; + return NET_RX_SUCCESS; + +drop: + kfree_skb(skb); + return NET_RX_DROP; } /* @@ -235,8 +242,15 @@ static void macvtap_dellink(struct net_device *dev, macvlan_dellink(dev, head); } +static void macvtap_setup(struct net_device *dev) +{ + macvlan_common_setup(dev); + dev->tx_queue_len = TUN_READQ_SIZE; +} + static struct rtnl_link_ops macvtap_link_ops __read_mostly = { .kind = "macvtap", + .setup = macvtap_setup, .newlink = macvtap_newlink, .dellink = macvtap_dellink, }; diff --git a/include/linux/if_macvlan.h b/include/linux/if_macvlan.h index 9ea047aca795..1ffaeffeff74 100644 --- a/include/linux/if_macvlan.h +++ b/include/linux/if_macvlan.h @@ -67,6 +67,8 @@ static inline void macvlan_count_rx(const struct macvlan_dev *vlan, } } +extern void macvlan_common_setup(struct net_device *dev); + extern int macvlan_common_newlink(struct net *src_net, struct net_device *dev, struct nlattr *tb[], struct nlattr *data[], int (*receive)(struct sk_buff *skb), -- cgit v1.2.3 From 718be4aaf3613cf7c2d097f925abc3d3553c0605 Mon Sep 17 00:00:00 2001 From: Len Brown Date: Thu, 22 Jul 2010 16:54:27 -0400 Subject: ACPI: skip checking BM_STS if the BIOS doesn't ask for it It turns out that there is a bit in the _CST for Intel FFH C3 that tells the OS if we should be checking BM_STS or not. Linux has been unconditionally checking BM_STS. If the chip-set is configured to enable BM_STS, it can retard or completely prevent entry into deep C-states -- as illustrated by turbostat: http://userweb.kernel.org/~lenb/acpi/utils/pmtools/turbostat/ ref: Intel Processor Vendor-Specific ACPI Interface Specification table 4 "_CST FFH GAS Field Encoding" Bit 1: Set to 1 if OSPM should use Bus Master avoidance for this C-state https://bugzilla.kernel.org/show_bug.cgi?id=15886 Signed-off-by: Len Brown --- arch/x86/kernel/acpi/cstate.c | 9 +++++++++ drivers/acpi/processor_idle.c | 2 +- include/acpi/processor.h | 3 ++- 3 files changed, 12 insertions(+), 2 deletions(-) (limited to 'include') diff --git a/arch/x86/kernel/acpi/cstate.c b/arch/x86/kernel/acpi/cstate.c index 2e837f5080fe..fb7a5f052e2b 100644 --- a/arch/x86/kernel/acpi/cstate.c +++ b/arch/x86/kernel/acpi/cstate.c @@ -145,6 +145,15 @@ int acpi_processor_ffh_cstate_probe(unsigned int cpu, percpu_entry->states[cx->index].eax = cx->address; percpu_entry->states[cx->index].ecx = MWAIT_ECX_INTERRUPT_BREAK; } + + /* + * For _CST FFH on Intel, if GAS.access_size bit 1 is cleared, + * then we should skip checking BM_STS for this C-state. + * ref: "Intel Processor Vendor-Specific ACPI Interface Specification" + */ + if ((c->x86_vendor == X86_VENDOR_INTEL) && !(reg->access_size & 0x2)) + cx->bm_sts_skip = 1; + return retval; } EXPORT_SYMBOL_GPL(acpi_processor_ffh_cstate_probe); diff --git a/drivers/acpi/processor_idle.c b/drivers/acpi/processor_idle.c index b1b385692f46..b351342f1faf 100644 --- a/drivers/acpi/processor_idle.c +++ b/drivers/acpi/processor_idle.c @@ -947,7 +947,7 @@ static int acpi_idle_enter_bm(struct cpuidle_device *dev, if (acpi_idle_suspend) return(acpi_idle_enter_c1(dev, state)); - if (acpi_idle_bm_check()) { + if (!cx->bm_sts_skip && acpi_idle_bm_check()) { if (dev->safe_state) { dev->last_state = dev->safe_state; return dev->safe_state->enter(dev, dev->safe_state); diff --git a/include/acpi/processor.h b/include/acpi/processor.h index da565a48240e..a68ca8a11a53 100644 --- a/include/acpi/processor.h +++ b/include/acpi/processor.h @@ -48,7 +48,7 @@ struct acpi_power_register { u8 space_id; u8 bit_width; u8 bit_offset; - u8 reserved; + u8 access_size; u64 address; } __attribute__ ((packed)); @@ -63,6 +63,7 @@ struct acpi_processor_cx { u32 power; u32 usage; u64 time; + u8 bm_sts_skip; char desc[ACPI_CX_DESC_LEN]; }; -- cgit v1.2.3 From da5e37efe8704fc2b354626467f80f73c5e3c020 Mon Sep 17 00:00:00 2001 From: Sam Ravnborg Date: Tue, 13 Jul 2010 11:39:42 +0200 Subject: vmlinux.lds: fix .data..init_task output section (fix popwerpc boot) The .data..init_task output section was missing a load offset causing a popwerpc target to fail to boot. Sean MacLennan tracked it down to the definition of INIT_TASK_DATA_SECTION(). There are only two users of INIT_TASK_DATA_SECTION() in the kernel today: cris and popwerpc. cris do not support relocatable kernels and is thus not impacted by this change. Fix INIT_TASK_DATA_SECTION() to specify load offset like all other output sections. Reported-by: Sean MacLennan Signed-off-by: Sam Ravnborg Signed-off-by: Benjamin Herrenschmidt --- include/asm-generic/vmlinux.lds.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'include') diff --git a/include/asm-generic/vmlinux.lds.h b/include/asm-generic/vmlinux.lds.h index 48c5299cbf26..cdfff74e9739 100644 --- a/include/asm-generic/vmlinux.lds.h +++ b/include/asm-generic/vmlinux.lds.h @@ -435,7 +435,7 @@ */ #define INIT_TASK_DATA_SECTION(align) \ . = ALIGN(align); \ - .data..init_task : { \ + .data..init_task : AT(ADDR(.data..init_task) - LOAD_OFFSET) { \ INIT_TASK_DATA(align) \ } -- cgit v1.2.3 From 72ad5d77fb981963edae15eee8196c80238f5ed0 Mon Sep 17 00:00:00 2001 From: "Rafael J. Wysocki" Date: Fri, 23 Jul 2010 22:59:09 +0200 Subject: ACPI / Sleep: Allow the NVS saving to be skipped during suspend to RAM Commit 2a6b69765ad794389f2fc3e14a0afa1a995221c2 (ACPI: Store NVS state even when entering suspend to RAM) caused the ACPI suspend code save the NVS area during suspend and restore it during resume unconditionally, although it is known that some systems need to use acpi_sleep=s4_nonvs for hibernation to work. To allow the affected systems to avoid saving and restoring the NVS area during suspend to RAM and resume, introduce kernel command line option acpi_sleep=nonvs and make acpi_sleep=s4_nonvs work as its alias temporarily (add acpi_sleep=s4_nonvs to the feature removal file). Addresses https://bugzilla.kernel.org/show_bug.cgi?id=16396 . Signed-off-by: Rafael J. Wysocki Reported-and-tested-by: tomas m Signed-off-by: Len Brown --- Documentation/feature-removal-schedule.txt | 7 ++++++ Documentation/kernel-parameters.txt | 4 ++-- arch/x86/kernel/acpi/sleep.c | 9 ++++++-- drivers/acpi/sleep.c | 35 +++++++++++++++--------------- include/linux/acpi.h | 2 +- 5 files changed, 34 insertions(+), 23 deletions(-) (limited to 'include') diff --git a/Documentation/feature-removal-schedule.txt b/Documentation/feature-removal-schedule.txt index c268783bc4e7..1571c0c83dba 100644 --- a/Documentation/feature-removal-schedule.txt +++ b/Documentation/feature-removal-schedule.txt @@ -647,3 +647,10 @@ Who: Stefan Richter ---------------------------- +What: The acpi_sleep=s4_nonvs command line option +When: 2.6.37 +Files: arch/x86/kernel/acpi/sleep.c +Why: superseded by acpi_sleep=nonvs +Who: Rafael J. Wysocki + +---------------------------- diff --git a/Documentation/kernel-parameters.txt b/Documentation/kernel-parameters.txt index 4ddb58df081e..2b2407d9a6d0 100644 --- a/Documentation/kernel-parameters.txt +++ b/Documentation/kernel-parameters.txt @@ -254,8 +254,8 @@ and is between 256 and 4096 characters. It is defined in the file control method, with respect to putting devices into low power states, to be enforced (the ACPI 2.0 ordering of _PTS is used by default). - s4_nonvs prevents the kernel from saving/restoring the - ACPI NVS memory during hibernation. + nonvs prevents the kernel from saving/restoring the + ACPI NVS memory during suspend/hibernation and resume. sci_force_enable causes the kernel to set SCI_EN directly on resume from S1/S3 (which is against the ACPI spec, but some broken systems don't work without it). diff --git a/arch/x86/kernel/acpi/sleep.c b/arch/x86/kernel/acpi/sleep.c index 82e508677b91..fcc3c61fdecc 100644 --- a/arch/x86/kernel/acpi/sleep.c +++ b/arch/x86/kernel/acpi/sleep.c @@ -157,9 +157,14 @@ static int __init acpi_sleep_setup(char *str) #ifdef CONFIG_HIBERNATION if (strncmp(str, "s4_nohwsig", 10) == 0) acpi_no_s4_hw_signature(); - if (strncmp(str, "s4_nonvs", 8) == 0) - acpi_s4_no_nvs(); + if (strncmp(str, "s4_nonvs", 8) == 0) { + pr_warning("ACPI: acpi_sleep=s4_nonvs is deprecated, " + "please use acpi_sleep=nonvs instead"); + acpi_nvs_nosave(); + } #endif + if (strncmp(str, "nonvs", 5) == 0) + acpi_nvs_nosave(); if (strncmp(str, "old_ordering", 12) == 0) acpi_old_suspend_ordering(); str = strchr(str, ','); diff --git a/drivers/acpi/sleep.c b/drivers/acpi/sleep.c index 5b7c52e4a00f..2862c781b372 100644 --- a/drivers/acpi/sleep.c +++ b/drivers/acpi/sleep.c @@ -81,6 +81,20 @@ static int acpi_sleep_prepare(u32 acpi_state) #ifdef CONFIG_ACPI_SLEEP static u32 acpi_target_sleep_state = ACPI_STATE_S0; +/* + * The ACPI specification wants us to save NVS memory regions during hibernation + * and to restore them during the subsequent resume. Windows does that also for + * suspend to RAM. However, it is known that this mechanism does not work on + * all machines, so we allow the user to disable it with the help of the + * 'acpi_sleep=nonvs' kernel command line option. + */ +static bool nvs_nosave; + +void __init acpi_nvs_nosave(void) +{ + nvs_nosave = true; +} + /* * ACPI 1.0 wants us to execute _PTS before suspending devices, so we allow the * user to request that behavior by using the 'acpi_old_suspend_ordering' @@ -197,8 +211,7 @@ static int acpi_suspend_begin(suspend_state_t pm_state) u32 acpi_state = acpi_suspend_states[pm_state]; int error = 0; - error = suspend_nvs_alloc(); - + error = nvs_nosave ? 0 : suspend_nvs_alloc(); if (error) return error; @@ -388,20 +401,6 @@ static struct dmi_system_id __initdata acpisleep_dmi_table[] = { #endif /* CONFIG_SUSPEND */ #ifdef CONFIG_HIBERNATION -/* - * The ACPI specification wants us to save NVS memory regions during hibernation - * and to restore them during the subsequent resume. However, it is not certain - * if this mechanism is going to work on all machines, so we allow the user to - * disable this mechanism using the 'acpi_sleep=s4_nonvs' kernel command line - * option. - */ -static bool s4_no_nvs; - -void __init acpi_s4_no_nvs(void) -{ - s4_no_nvs = true; -} - static unsigned long s4_hardware_signature; static struct acpi_table_facs *facs; static bool nosigcheck; @@ -415,7 +414,7 @@ static int acpi_hibernation_begin(void) { int error; - error = s4_no_nvs ? 0 : suspend_nvs_alloc(); + error = nvs_nosave ? 0 : suspend_nvs_alloc(); if (!error) { acpi_target_sleep_state = ACPI_STATE_S4; acpi_sleep_tts_switch(acpi_target_sleep_state); @@ -510,7 +509,7 @@ static int acpi_hibernation_begin_old(void) error = acpi_sleep_prepare(ACPI_STATE_S4); if (!error) { - if (!s4_no_nvs) + if (!nvs_nosave) error = suspend_nvs_alloc(); if (!error) acpi_target_sleep_state = ACPI_STATE_S4; diff --git a/include/linux/acpi.h b/include/linux/acpi.h index 224a38c960d4..ccf94dc5acdf 100644 --- a/include/linux/acpi.h +++ b/include/linux/acpi.h @@ -253,7 +253,7 @@ int acpi_resources_are_enforced(void); #ifdef CONFIG_PM_SLEEP void __init acpi_no_s4_hw_signature(void); void __init acpi_old_suspend_ordering(void); -void __init acpi_s4_no_nvs(void); +void __init acpi_nvs_nosave(void); #endif /* CONFIG_PM_SLEEP */ struct acpi_osc_context { -- cgit v1.2.3 From 3b87956ea645fb4de7e59c7d0aa94de04be72615 Mon Sep 17 00:00:00 2001 From: stephen hemminger Date: Thu, 22 Jul 2010 18:45:04 +0000 Subject: net sched: fix race in mirred device removal This fixes hang when target device of mirred packet classifier action is removed. If a mirror or redirection action is configured to cause packets to go to another device, the classifier holds a ref count, but was assuming the adminstrator cleaned up all redirections before removing. The fix is to add a notifier and cleanup during unregister. The new list is implicitly protected by RTNL mutex because it is held during filter add/delete as well as notifier. Signed-off-by: Stephen Hemminger Acked-by: Jamal Hadi Salim Signed-off-by: David S. Miller --- include/net/tc_act/tc_mirred.h | 1 + net/sched/act_mirred.c | 43 +++++++++++++++++++++++++++++++++++++++--- 2 files changed, 41 insertions(+), 3 deletions(-) (limited to 'include') diff --git a/include/net/tc_act/tc_mirred.h b/include/net/tc_act/tc_mirred.h index ceac661cdfd5..cfe2943690ff 100644 --- a/include/net/tc_act/tc_mirred.h +++ b/include/net/tc_act/tc_mirred.h @@ -9,6 +9,7 @@ struct tcf_mirred { int tcfm_ifindex; int tcfm_ok_push; struct net_device *tcfm_dev; + struct list_head tcfm_list; }; #define to_mirred(pc) \ container_of(pc, struct tcf_mirred, common) diff --git a/net/sched/act_mirred.c b/net/sched/act_mirred.c index c0b6863e3b87..1980b71c283f 100644 --- a/net/sched/act_mirred.c +++ b/net/sched/act_mirred.c @@ -33,6 +33,7 @@ static struct tcf_common *tcf_mirred_ht[MIRRED_TAB_MASK + 1]; static u32 mirred_idx_gen; static DEFINE_RWLOCK(mirred_lock); +static LIST_HEAD(mirred_list); static struct tcf_hashinfo mirred_hash_info = { .htab = tcf_mirred_ht, @@ -47,7 +48,9 @@ static inline int tcf_mirred_release(struct tcf_mirred *m, int bind) m->tcf_bindcnt--; m->tcf_refcnt--; if(!m->tcf_bindcnt && m->tcf_refcnt <= 0) { - dev_put(m->tcfm_dev); + list_del(&m->tcfm_list); + if (m->tcfm_dev) + dev_put(m->tcfm_dev); tcf_hash_destroy(&m->common, &mirred_hash_info); return 1; } @@ -134,8 +137,10 @@ static int tcf_mirred_init(struct nlattr *nla, struct nlattr *est, m->tcfm_ok_push = ok_push; } spin_unlock_bh(&m->tcf_lock); - if (ret == ACT_P_CREATED) + if (ret == ACT_P_CREATED) { + list_add(&m->tcfm_list, &mirred_list); tcf_hash_insert(pc, &mirred_hash_info); + } return ret; } @@ -162,9 +167,14 @@ static int tcf_mirred(struct sk_buff *skb, struct tc_action *a, m->tcf_tm.lastuse = jiffies; dev = m->tcfm_dev; + if (!dev) { + printk_once(KERN_NOTICE "tc mirred: target device is gone\n"); + goto out; + } + if (!(dev->flags & IFF_UP)) { if (net_ratelimit()) - pr_notice("tc mirred to Houston: device %s is gone!\n", + pr_notice("tc mirred to Houston: device %s is down\n", dev->name); goto out; } @@ -232,6 +242,28 @@ nla_put_failure: return -1; } +static int mirred_device_event(struct notifier_block *unused, + unsigned long event, void *ptr) +{ + struct net_device *dev = ptr; + struct tcf_mirred *m; + + if (event == NETDEV_UNREGISTER) + list_for_each_entry(m, &mirred_list, tcfm_list) { + if (m->tcfm_dev == dev) { + dev_put(dev); + m->tcfm_dev = NULL; + } + } + + return NOTIFY_DONE; +} + +static struct notifier_block mirred_device_notifier = { + .notifier_call = mirred_device_event, +}; + + static struct tc_action_ops act_mirred_ops = { .kind = "mirred", .hinfo = &mirred_hash_info, @@ -252,12 +284,17 @@ MODULE_LICENSE("GPL"); static int __init mirred_init_module(void) { + int err = register_netdevice_notifier(&mirred_device_notifier); + if (err) + return err; + pr_info("Mirror/redirect action on\n"); return tcf_register_action(&act_mirred_ops); } static void __exit mirred_cleanup_module(void) { + unregister_netdevice_notifier(&mirred_device_notifier); tcf_unregister_action(&act_mirred_ops); } -- cgit v1.2.3