From 6ae0516b8a50ece5d766be608a305707e0450060 Mon Sep 17 00:00:00 2001 From: Shaohua Li Date: Fri, 16 Dec 2011 14:04:23 +0100 Subject: block, cfq: fix empty queue crash caused by request merge All requests of a queue could be merged to other requests of other queue. Such queue will not have request in it, but it's in service tree. This will cause kernel oops. I encounter a BUG_ON() in cfq_dispatch_request() with next patch, but the issue should exist without the patch. Signed-off-by: Shaohua Li Signed-off-by: Jens Axboe --- block/cfq-iosched.c | 12 ++++++++++++ 1 file changed, 12 insertions(+) diff --git a/block/cfq-iosched.c b/block/cfq-iosched.c index 4c12869fcf77..3548705b04e4 100644 --- a/block/cfq-iosched.c +++ b/block/cfq-iosched.c @@ -1655,6 +1655,8 @@ cfq_merged_requests(struct request_queue *q, struct request *rq, struct request *next) { struct cfq_queue *cfqq = RQ_CFQQ(rq); + struct cfq_data *cfqd = q->elevator->elevator_data; + /* * reposition in fifo if next is older than rq */ @@ -1669,6 +1671,16 @@ cfq_merged_requests(struct request_queue *q, struct request *rq, cfq_remove_request(next); cfq_blkiocg_update_io_merged_stats(&(RQ_CFQG(rq))->blkg, rq_data_dir(next), rq_is_sync(next)); + + cfqq = RQ_CFQQ(next); + /* + * all requests of this queue are merged to other queues, delete it + * from the service tree. If it's the active_queue, + * cfq_dispatch_requests() will choose to expire it or do idle + */ + if (cfq_cfqq_on_rr(cfqq) && RB_EMPTY_ROOT(&cfqq->sort_list) && + cfqq != cfqd->active_queue) + cfq_del_cfqq_rr(cfqd, cfqq); } static int cfq_allow_merge(struct request_queue *q, struct request *rq, -- cgit v1.2.3 From 8bd6960c6ae65d7f92bfb708154ee813417d7b26 Mon Sep 17 00:00:00 2001 From: KyongHo Cho Date: Fri, 16 Dec 2011 21:38:25 +0900 Subject: iommu: Initialize domain->handler in iommu_domain_alloc() Since it is not guaranteed that an iommu driver initializes in its domain_init() function, it must be initialized with NULL to prevent calling a function in an arbitrary location when iommu fault occurred. Signed-off-by: KyongHo Cho Signed-off-by: Joerg Roedel --- drivers/iommu/iommu.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/iommu/iommu.c b/drivers/iommu/iommu.c index 2fb2963df553..5b5fa5cdaa31 100644 --- a/drivers/iommu/iommu.c +++ b/drivers/iommu/iommu.c @@ -90,7 +90,7 @@ struct iommu_domain *iommu_domain_alloc(struct bus_type *bus) if (bus == NULL || bus->iommu_ops == NULL) return NULL; - domain = kmalloc(sizeof(*domain), GFP_KERNEL); + domain = kzalloc(sizeof(*domain), GFP_KERNEL); if (!domain) return NULL; -- cgit v1.2.3 From b3bba872ddb0320a7ecb54decae53c13ceb2ed4c Mon Sep 17 00:00:00 2001 From: Wu Fengguang Date: Thu, 8 Dec 2011 16:53:54 -0600 Subject: writeback: show writeback reason with __print_symbolic This makes the binary trace understandable by trace-cmd. CC: Dave Chinner CC: Curt Wohlgemuth CC: Steven Rostedt Signed-off-by: Wu Fengguang --- fs/fs-writeback.c | 11 ----------- include/trace/events/writeback.h | 15 +++++++++++++-- 2 files changed, 13 insertions(+), 13 deletions(-) diff --git a/fs/fs-writeback.c b/fs/fs-writeback.c index ac86f8b3e3cb..517f211a3bd4 100644 --- a/fs/fs-writeback.c +++ b/fs/fs-writeback.c @@ -47,17 +47,6 @@ struct wb_writeback_work { struct completion *done; /* set if the caller waits */ }; -const char *wb_reason_name[] = { - [WB_REASON_BACKGROUND] = "background", - [WB_REASON_TRY_TO_FREE_PAGES] = "try_to_free_pages", - [WB_REASON_SYNC] = "sync", - [WB_REASON_PERIODIC] = "periodic", - [WB_REASON_LAPTOP_TIMER] = "laptop_timer", - [WB_REASON_FREE_MORE_MEM] = "free_more_memory", - [WB_REASON_FS_FREE_SPACE] = "fs_free_space", - [WB_REASON_FORKER_THREAD] = "forker_thread" -}; - /* * Include the creation of the trace points after defining the * wb_writeback_work structure so that the definition remains local to this diff --git a/include/trace/events/writeback.h b/include/trace/events/writeback.h index b99caa8b780c..99d1d0decf88 100644 --- a/include/trace/events/writeback.h +++ b/include/trace/events/writeback.h @@ -21,6 +21,16 @@ {I_REFERENCED, "I_REFERENCED"} \ ) +#define WB_WORK_REASON \ + {WB_REASON_BACKGROUND, "background"}, \ + {WB_REASON_TRY_TO_FREE_PAGES, "try_to_free_pages"}, \ + {WB_REASON_SYNC, "sync"}, \ + {WB_REASON_PERIODIC, "periodic"}, \ + {WB_REASON_LAPTOP_TIMER, "laptop_timer"}, \ + {WB_REASON_FREE_MORE_MEM, "free_more_memory"}, \ + {WB_REASON_FS_FREE_SPACE, "fs_free_space"}, \ + {WB_REASON_FORKER_THREAD, "forker_thread"} + struct wb_writeback_work; DECLARE_EVENT_CLASS(writeback_work_class, @@ -55,7 +65,7 @@ DECLARE_EVENT_CLASS(writeback_work_class, __entry->for_kupdate, __entry->range_cyclic, __entry->for_background, - wb_reason_name[__entry->reason] + __print_symbolic(__entry->reason, WB_WORK_REASON) ) ); #define DEFINE_WRITEBACK_WORK_EVENT(name) \ @@ -184,7 +194,8 @@ TRACE_EVENT(writeback_queue_io, __entry->older, /* older_than_this in jiffies */ __entry->age, /* older_than_this in relative milliseconds */ __entry->moved, - wb_reason_name[__entry->reason]) + __print_symbolic(__entry->reason, WB_WORK_REASON) + ) ); TRACE_EVENT(global_dirty_state, -- cgit v1.2.3 From fa0ad6575f6d459e215dded90b10cc455a889145 Mon Sep 17 00:00:00 2001 From: Paul Bolle Date: Sat, 5 Nov 2011 12:21:30 +0100 Subject: kconfig: adapt update-po-config to new UML layout Commit 5c48b108 ("um: take arch/um/sys-x86 to arch/x86/um") broke the make target update-po-config, as its symlink trick (again) fails. (Previous breakage was fixed with commit bdc69ca4 ("kconfig: change update-po-config to reflect new layout of arch/um").) The new UML layout allows to drop the symlick trick entirely. And if, one day, another architecture supports UML too, that should now work without again breaking this make target. Signed-off-by: Paul Bolle Signed-off-by: Michal Marek --- scripts/kconfig/Makefile | 5 ++--- 1 file changed, 2 insertions(+), 3 deletions(-) diff --git a/scripts/kconfig/Makefile b/scripts/kconfig/Makefile index ba573fe7c74d..914833d99b06 100644 --- a/scripts/kconfig/Makefile +++ b/scripts/kconfig/Makefile @@ -60,8 +60,8 @@ update-po-config: $(obj)/kxgettext $(obj)/gconf.glade.h --directory=$(srctree) --directory=$(objtree) \ --output $(obj)/config.pot $(Q)sed -i s/CHARSET/UTF-8/ $(obj)/config.pot - $(Q)ln -fs Kconfig.x86 arch/um/Kconfig - $(Q)(for i in `ls $(srctree)/arch/*/Kconfig`; \ + $(Q)(for i in `ls $(srctree)/arch/*/Kconfig \ + $(srctree)/arch/*/um/Kconfig`; \ do \ echo " GEN $$i"; \ $(obj)/kxgettext $$i \ @@ -69,7 +69,6 @@ update-po-config: $(obj)/kxgettext $(obj)/gconf.glade.h done ) $(Q)msguniq --sort-by-file --to-code=UTF-8 $(obj)/config.pot \ --output $(obj)/linux.pot - $(Q)rm -f $(srctree)/arch/um/Kconfig $(Q)rm -f $(obj)/config.pot PHONY += allnoconfig allyesconfig allmodconfig alldefconfig randconfig -- cgit v1.2.3 From c070e38e4ee005f55895df177a9e14d90d6204b3 Mon Sep 17 00:00:00 2001 From: Laurent Pinchart Date: Wed, 16 Nov 2011 10:54:02 -0300 Subject: [media] omap3isp: Fix crash caused by subdevs now having a pointer to devnodes Commit 3e0ec41c5c5ee14e27f65e28d4a616de34f59a97 ("V4L: dynamically allocate video_device nodes in subdevices") makes the embedding video_device directly. Fix accesses to the devnode accordingly. Signed-off-by: Laurent Pinchart Acked-by: Sakari Ailus Signed-off-by: Mauro Carvalho Chehab --- drivers/media/video/omap3isp/ispccdc.c | 2 +- drivers/media/video/omap3isp/ispstat.c | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/drivers/media/video/omap3isp/ispccdc.c b/drivers/media/video/omap3isp/ispccdc.c index b0b0fa5a3572..54a4a3f22e2e 100644 --- a/drivers/media/video/omap3isp/ispccdc.c +++ b/drivers/media/video/omap3isp/ispccdc.c @@ -1408,7 +1408,7 @@ static void ccdc_hs_vs_isr(struct isp_ccdc_device *ccdc) { struct isp_pipeline *pipe = to_isp_pipeline(&ccdc->video_out.video.entity); - struct video_device *vdev = &ccdc->subdev.devnode; + struct video_device *vdev = ccdc->subdev.devnode; struct v4l2_event event; memset(&event, 0, sizeof(event)); diff --git a/drivers/media/video/omap3isp/ispstat.c b/drivers/media/video/omap3isp/ispstat.c index 68d539456c55..bc0b2c7349b9 100644 --- a/drivers/media/video/omap3isp/ispstat.c +++ b/drivers/media/video/omap3isp/ispstat.c @@ -496,7 +496,7 @@ static int isp_stat_bufs_alloc(struct ispstat *stat, u32 size) static void isp_stat_queue_event(struct ispstat *stat, int err) { - struct video_device *vdev = &stat->subdev.devnode; + struct video_device *vdev = stat->subdev.devnode; struct v4l2_event event; struct omap3isp_stat_event_status *status = (void *)event.u.data; -- cgit v1.2.3 From 609f6ea1c9cdfe0c43a927e13205a57d0c266d5a Mon Sep 17 00:00:00 2001 From: majianpeng Date: Wed, 21 Dec 2011 15:27:24 +0100 Subject: block: re-use existing 'reading' variable instead of checking direction again Signed-off-by: majianpeng Signed-off-by: Jens Axboe --- block/blk-map.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/block/blk-map.c b/block/blk-map.c index 164cd0059706..623e1cd4cffe 100644 --- a/block/blk-map.c +++ b/block/blk-map.c @@ -311,7 +311,7 @@ int blk_rq_map_kern(struct request_queue *q, struct request *rq, void *kbuf, if (IS_ERR(bio)) return PTR_ERR(bio); - if (rq_data_dir(rq) == WRITE) + if (!reading) bio->bi_rw |= REQ_WRITE; if (do_copy) -- cgit v1.2.3 From e30e2fdfe56288576ee9e04dbb06b4bd5f282203 Mon Sep 17 00:00:00 2001 From: "Srivatsa S. Bhat" Date: Thu, 22 Dec 2011 02:45:29 +0530 Subject: VFS: Fix race between CPU hotplug and lglocks Currently, the *_global_[un]lock_online() routines are not at all synchronized with CPU hotplug. Soft-lockups detected as a consequence of this race was reported earlier at https://lkml.org/lkml/2011/8/24/185. (Thanks to Cong Meng for finding out that the root-cause of this issue is the race condition between br_write_[un]lock() and CPU hotplug, which results in the lock states getting messed up). Fixing this race by just adding {get,put}_online_cpus() at appropriate places in *_global_[un]lock_online() is not a good option, because, then suddenly br_write_[un]lock() would become blocking, whereas they have been kept as non-blocking all this time, and we would want to keep them that way. So, overall, we want to ensure 3 things: 1. br_write_lock() and br_write_unlock() must remain as non-blocking. 2. The corresponding lock and unlock of the per-cpu spinlocks must not happen for different sets of CPUs. 3. Either prevent any new CPU online operation in between this lock-unlock, or ensure that the newly onlined CPU does not proceed with its corresponding per-cpu spinlock unlocked. To achieve all this: (a) We introduce a new spinlock that is taken by the *_global_lock_online() routine and released by the *_global_unlock_online() routine. (b) We register a callback for CPU hotplug notifications, and this callback takes the same spinlock as above. (c) We maintain a bitmap which is close to the cpu_online_mask, and once it is initialized in the lock_init() code, all future updates to it are done in the callback, under the above spinlock. (d) The above bitmap is used (instead of cpu_online_mask) while locking and unlocking the per-cpu locks. The callback takes the spinlock upon the CPU_UP_PREPARE event. So, if the br_write_lock-unlock sequence is in progress, the callback keeps spinning, thus preventing the CPU online operation till the lock-unlock sequence is complete. This takes care of requirement (3). The bitmap that we maintain remains unmodified throughout the lock-unlock sequence, since all updates to it are managed by the callback, which takes the same spinlock as the one taken by the lock code and released only by the unlock routine. Combining this with (d) above, satisfies requirement (2). Overall, since we use a spinlock (mentioned in (a)) to prevent CPU hotplug operations from racing with br_write_lock-unlock, requirement (1) is also taken care of. By the way, it is to be noted that a CPU offline operation can actually run in parallel with our lock-unlock sequence, because our callback doesn't react to notifications earlier than CPU_DEAD (in order to maintain our bitmap properly). And this means, since we use our own bitmap (which is stale, on purpose) during the lock-unlock sequence, we could end up unlocking the per-cpu lock of an offline CPU (because we had locked it earlier, when the CPU was online), in order to satisfy requirement (2). But this is harmless, though it looks a bit awkward. Debugged-by: Cong Meng Signed-off-by: Srivatsa S. Bhat Signed-off-by: Al Viro Cc: stable@vger.kernel.org --- include/linux/lglock.h | 36 ++++++++++++++++++++++++++++++++---- 1 file changed, 32 insertions(+), 4 deletions(-) diff --git a/include/linux/lglock.h b/include/linux/lglock.h index f549056fb20b..87f402ccec55 100644 --- a/include/linux/lglock.h +++ b/include/linux/lglock.h @@ -22,6 +22,7 @@ #include #include #include +#include /* can make br locks by using local lock for read side, global lock for write */ #define br_lock_init(name) name##_lock_init() @@ -72,9 +73,31 @@ #define DEFINE_LGLOCK(name) \ \ + DEFINE_SPINLOCK(name##_cpu_lock); \ + cpumask_t name##_cpus __read_mostly; \ DEFINE_PER_CPU(arch_spinlock_t, name##_lock); \ DEFINE_LGLOCK_LOCKDEP(name); \ \ + static int \ + name##_lg_cpu_callback(struct notifier_block *nb, \ + unsigned long action, void *hcpu) \ + { \ + switch (action & ~CPU_TASKS_FROZEN) { \ + case CPU_UP_PREPARE: \ + spin_lock(&name##_cpu_lock); \ + cpu_set((unsigned long)hcpu, name##_cpus); \ + spin_unlock(&name##_cpu_lock); \ + break; \ + case CPU_UP_CANCELED: case CPU_DEAD: \ + spin_lock(&name##_cpu_lock); \ + cpu_clear((unsigned long)hcpu, name##_cpus); \ + spin_unlock(&name##_cpu_lock); \ + } \ + return NOTIFY_OK; \ + } \ + static struct notifier_block name##_lg_cpu_notifier = { \ + .notifier_call = name##_lg_cpu_callback, \ + }; \ void name##_lock_init(void) { \ int i; \ LOCKDEP_INIT_MAP(&name##_lock_dep_map, #name, &name##_lock_key, 0); \ @@ -83,6 +106,11 @@ lock = &per_cpu(name##_lock, i); \ *lock = (arch_spinlock_t)__ARCH_SPIN_LOCK_UNLOCKED; \ } \ + register_hotcpu_notifier(&name##_lg_cpu_notifier); \ + get_online_cpus(); \ + for_each_online_cpu(i) \ + cpu_set(i, name##_cpus); \ + put_online_cpus(); \ } \ EXPORT_SYMBOL(name##_lock_init); \ \ @@ -124,9 +152,9 @@ \ void name##_global_lock_online(void) { \ int i; \ - preempt_disable(); \ + spin_lock(&name##_cpu_lock); \ rwlock_acquire(&name##_lock_dep_map, 0, 0, _RET_IP_); \ - for_each_online_cpu(i) { \ + for_each_cpu(i, &name##_cpus) { \ arch_spinlock_t *lock; \ lock = &per_cpu(name##_lock, i); \ arch_spin_lock(lock); \ @@ -137,12 +165,12 @@ void name##_global_unlock_online(void) { \ int i; \ rwlock_release(&name##_lock_dep_map, 1, _RET_IP_); \ - for_each_online_cpu(i) { \ + for_each_cpu(i, &name##_cpus) { \ arch_spinlock_t *lock; \ lock = &per_cpu(name##_lock, i); \ arch_spin_unlock(lock); \ } \ - preempt_enable(); \ + spin_unlock(&name##_cpu_lock); \ } \ EXPORT_SYMBOL(name##_global_unlock_online); \ \ -- cgit v1.2.3 From 77e00f2ea94abee1ad13bdfde19cf7aa25992b0e Mon Sep 17 00:00:00 2001 From: Alex Deucher Date: Wed, 21 Dec 2011 11:58:17 -0500 Subject: drm/radeon/kms: bail on BTC parts if MC ucode is missing We already do this for cayman, need to also do it for BTC parts. The default memory and voltage setup is not adequate for advanced operation. Continuing will result in an unusable display. Signed-off-by: Alex Deucher Cc: stable@kernel.org Cc: Jean Delvare Signed-off-by: Dave Airlie --- drivers/gpu/drm/radeon/evergreen.c | 12 ++++++++++++ 1 file changed, 12 insertions(+) diff --git a/drivers/gpu/drm/radeon/evergreen.c b/drivers/gpu/drm/radeon/evergreen.c index 5e00d1670aa9..92c9628c572d 100644 --- a/drivers/gpu/drm/radeon/evergreen.c +++ b/drivers/gpu/drm/radeon/evergreen.c @@ -3276,6 +3276,18 @@ int evergreen_init(struct radeon_device *rdev) rdev->accel_working = false; } } + + /* Don't start up if the MC ucode is missing on BTC parts. + * The default clocks and voltages before the MC ucode + * is loaded are not suffient for advanced operations. + */ + if (ASIC_IS_DCE5(rdev)) { + if (!rdev->mc_fw && !(rdev->flags & RADEON_IS_IGP)) { + DRM_ERROR("radeon: MC ucode required for NI+.\n"); + return -EINVAL; + } + } + return 0; } -- cgit v1.2.3 From 8a78389651b3e411ec5a7df61404734f52d6f4eb Mon Sep 17 00:00:00 2001 From: Xi Wang Date: Wed, 21 Dec 2011 05:18:33 -0500 Subject: vmwgfx: fix incorrect VRAM size check in vmw_kms_fb_create() Commit e133e737 didn't correctly fix the integer overflow issue. - unsigned int required_size; + u64 required_size; ... required_size = mode_cmd->pitch * mode_cmd->height; - if (unlikely(required_size > dev_priv->vram_size)) { + if (unlikely(required_size > (u64) dev_priv->vram_size)) { Note that both pitch and height are u32. Their product is still u32 and would overflow before being assigned to required_size. A correct way is to convert pitch and height to u64 before the multiplication. required_size = (u64)mode_cmd->pitch * (u64)mode_cmd->height; This patch calls the existing vmw_kms_validate_mode_vram() for validation. Signed-off-by: Xi Wang Reviewed-and-tested-by: Thomas Hellstrom Signed-off-by: Dave Airlie --- drivers/gpu/drm/vmwgfx/vmwgfx_kms.c | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/drivers/gpu/drm/vmwgfx/vmwgfx_kms.c b/drivers/gpu/drm/vmwgfx/vmwgfx_kms.c index 8aa1dbb45c67..f94b33ae2215 100644 --- a/drivers/gpu/drm/vmwgfx/vmwgfx_kms.c +++ b/drivers/gpu/drm/vmwgfx/vmwgfx_kms.c @@ -1093,7 +1093,6 @@ static struct drm_framebuffer *vmw_kms_fb_create(struct drm_device *dev, struct vmw_surface *surface = NULL; struct vmw_dma_buffer *bo = NULL; struct ttm_base_object *user_obj; - u64 required_size; int ret; /** @@ -1102,8 +1101,9 @@ static struct drm_framebuffer *vmw_kms_fb_create(struct drm_device *dev, * requested framebuffer. */ - required_size = mode_cmd->pitch * mode_cmd->height; - if (unlikely(required_size > (u64) dev_priv->vram_size)) { + if (!vmw_kms_validate_mode_vram(dev_priv, + mode_cmd->pitch, + mode_cmd->height)) { DRM_ERROR("VRAM size is too small for requested mode.\n"); return ERR_PTR(-ENOMEM); } -- cgit v1.2.3 From 7cc8583372a21d98a23b703ad96cab03180b5030 Mon Sep 17 00:00:00 2001 From: "David S. Miller" Date: Thu, 22 Dec 2011 13:23:59 -0800 Subject: sparc64: Fix MSIQ HV call ordering in pci_sun4v_msiq_build_irq(). This silently was working for many years and stopped working on Niagara-T3 machines. We need to set the MSIQ to VALID before we can set it's state to IDLE. On Niagara-T3, setting the state to IDLE first was causing HV_EINVAL errors. The hypervisor documentation says, rather ambiguously, that the MSIQ must be "initialized" before one can set the state. I previously understood this to mean merely that a successful setconf() operation has been performed on the MSIQ, which we have done at this point. But it seems to also mean that it has been set VALID too. Signed-off-by: David S. Miller --- arch/sparc/kernel/pci_sun4v.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/arch/sparc/kernel/pci_sun4v.c b/arch/sparc/kernel/pci_sun4v.c index b272cda35a01..af5755d20fbe 100644 --- a/arch/sparc/kernel/pci_sun4v.c +++ b/arch/sparc/kernel/pci_sun4v.c @@ -849,10 +849,10 @@ static int pci_sun4v_msiq_build_irq(struct pci_pbm_info *pbm, if (!irq) return -ENOMEM; - if (pci_sun4v_msiq_setstate(pbm->devhandle, msiqid, HV_MSIQSTATE_IDLE)) - return -EINVAL; if (pci_sun4v_msiq_setvalid(pbm->devhandle, msiqid, HV_MSIQ_VALID)) return -EINVAL; + if (pci_sun4v_msiq_setstate(pbm->devhandle, msiqid, HV_MSIQSTATE_IDLE)) + return -EINVAL; return irq; } -- cgit v1.2.3 From 09cd9270ea52e0f9851528e8ed028073f96b3c34 Mon Sep 17 00:00:00 2001 From: NeilBrown Date: Fri, 23 Dec 2011 09:56:55 +1100 Subject: md/linear: fix hot-add of devices to linear arrays. commit d70ed2e4fafdbef0800e73942482bb075c21578b broke hot-add to a linear array. After that commit, metadata if not written to devices until they have been fully integrated into the array as determined by saved_raid_disk. That patch arranged to clear that field after a recovery completed. However for linear arrays, there is no recovery - the integration is instantaneous. So we need to explicitly clear the saved_raid_disk field. Signed-off-by: NeilBrown --- drivers/md/linear.c | 1 + 1 file changed, 1 insertion(+) diff --git a/drivers/md/linear.c b/drivers/md/linear.c index c3273efd08cb..627456542fb3 100644 --- a/drivers/md/linear.c +++ b/drivers/md/linear.c @@ -230,6 +230,7 @@ static int linear_add(struct mddev *mddev, struct md_rdev *rdev) return -EINVAL; rdev->raid_disk = rdev->saved_raid_disk; + rdev->saved_raid_disk = -1; newconf = linear_conf(mddev,mddev->raid_disks+1); -- cgit v1.2.3 From 30d7a4836847bdb10b32c78a4879d4aebe0f193b Mon Sep 17 00:00:00 2001 From: NeilBrown Date: Fri, 23 Dec 2011 09:57:00 +1100 Subject: md/raid5: ensure correct assessment of drives during degraded reshape. While reshaping a degraded array (as when reshaping a RAID0 by first converting it to a degraded RAID4) we currently get confused about which devices are in_sync. In most cases we get it right, but in the region that is being reshaped we need to treat non-failed devices as in-sync when we have the data but haven't actually written it out yet. Reported-by: Adam Kwolek Signed-off-by: NeilBrown --- drivers/md/raid5.c | 14 ++++++++++---- 1 file changed, 10 insertions(+), 4 deletions(-) diff --git a/drivers/md/raid5.c b/drivers/md/raid5.c index 31670f8d6b65..858fdbb7eb07 100644 --- a/drivers/md/raid5.c +++ b/drivers/md/raid5.c @@ -3065,11 +3065,17 @@ static void analyse_stripe(struct stripe_head *sh, struct stripe_head_state *s) } } else if (test_bit(In_sync, &rdev->flags)) set_bit(R5_Insync, &dev->flags); - else { + else if (sh->sector + STRIPE_SECTORS <= rdev->recovery_offset) /* in sync if before recovery_offset */ - if (sh->sector + STRIPE_SECTORS <= rdev->recovery_offset) - set_bit(R5_Insync, &dev->flags); - } + set_bit(R5_Insync, &dev->flags); + else if (test_bit(R5_UPTODATE, &dev->flags) && + test_bit(R5_Expanded, &dev->flags)) + /* If we've reshaped into here, we assume it is Insync. + * We will shortly update recovery_offset to make + * it official. + */ + set_bit(R5_Insync, &dev->flags); + if (rdev && test_bit(R5_WriteError, &dev->flags)) { clear_bit(R5_Insync, &dev->flags); if (!test_bit(Faulty, &rdev->flags)) { -- cgit v1.2.3 From 60fc13702a1b35118c1548e9c257fa038cecb658 Mon Sep 17 00:00:00 2001 From: NeilBrown Date: Fri, 23 Dec 2011 09:57:19 +1100 Subject: md: don't give up looking for spares on first failure-to-add Before performing a recovery we try to remove any spares that might not be working, then add any that might have become relevant. Currently we abort on the first spare that cannot be added. This is a false optimisation. It is conceivable that - depending on rules in the personality - a subsequent spare might be accepted. Also the loop does other things like count the available spares and reset the 'recovery_offset' value. If we abort early these might not happen properly. So remove the early abort. In particular if you have an array what is undergoing recovery and which has extra spares, then the recovery may not restart after as reboot as the could of 'spares' might end up as zero. Reported-by: Anssi Hannula Signed-off-by: NeilBrown --- drivers/md/md.c | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/drivers/md/md.c b/drivers/md/md.c index ee981737edfc..f47f1f8ac44b 100644 --- a/drivers/md/md.c +++ b/drivers/md/md.c @@ -7360,8 +7360,7 @@ static int remove_and_add_spares(struct mddev *mddev) spares++; md_new_event(mddev); set_bit(MD_CHANGE_DEVS, &mddev->flags); - } else - break; + } } } } -- cgit v1.2.3 From 961902c0f8240175729274cd14198872f42072b7 Mon Sep 17 00:00:00 2001 From: NeilBrown Date: Fri, 23 Dec 2011 09:57:48 +1100 Subject: md/bitmap: It is OK to clear bits during recovery. commit d0a4bb492772ce5c4bdfba3744a99ed6f6fb238f introduced a regression which is annoying but fairly harmless. When writing to an array that is undergoing recovery (a spare in being integrated into the array), writing to the array will set bits in the bitmap, but they will not be cleared when the write completes. For bits covering areas that have not been recovered yet this is not a problem as the recovery will clear the bits. However bits set in already-recovered region will stay set and never be cleared. This doesn't risk data integrity. The only negatives are: - next time there is a crash, more resyncing than necessary will be done. - the bitmap doesn't look clean, which is confusing. While an array is recovering we don't want to update the 'events_cleared' setting in the bitmap but we do still want to clear bits that have very recently been set - providing they were written to the recovering device. So split those two needs - which previously both depended on 'success' and always clear the bit of the write went to all devices. Signed-off-by: NeilBrown --- drivers/md/bitmap.c | 5 +---- 1 file changed, 1 insertion(+), 4 deletions(-) diff --git a/drivers/md/bitmap.c b/drivers/md/bitmap.c index b6907118283a..6d03774b176e 100644 --- a/drivers/md/bitmap.c +++ b/drivers/md/bitmap.c @@ -1393,9 +1393,6 @@ void bitmap_endwrite(struct bitmap *bitmap, sector_t offset, unsigned long secto atomic_read(&bitmap->behind_writes), bitmap->mddev->bitmap_info.max_write_behind); } - if (bitmap->mddev->degraded) - /* Never clear bits or update events_cleared when degraded */ - success = 0; while (sectors) { sector_t blocks; @@ -1409,7 +1406,7 @@ void bitmap_endwrite(struct bitmap *bitmap, sector_t offset, unsigned long secto return; } - if (success && + if (success && !bitmap->mddev->degraded && bitmap->events_cleared < bitmap->mddev->events) { bitmap->events_cleared = bitmap->mddev->events; bitmap->need_sync = 1; -- cgit v1.2.3 From 55205c916e179e09773d98d290334d319f45ac6b Mon Sep 17 00:00:00 2001 From: Vladimir Zapolskiy Date: Thu, 22 Dec 2011 16:15:40 +0100 Subject: oprofile, arm/sh: Fix oprofile_arch_exit() linkage issue This change fixes a linking problem, which happens if oprofile is selected to be compiled as built-in: `oprofile_arch_exit' referenced in section `.init.text' of arch/arm/oprofile/built-in.o: defined in discarded section `.exit.text' of arch/arm/oprofile/built-in.o The problem is appeared after commit 87121ca504, which introduced oprofile_arch_exit() calls from __init function. Note that the aforementioned commit has been backported to stable branches, and the problem is known to be reproduced at least with 3.0.13 and 3.1.5 kernels. Signed-off-by: Vladimir Zapolskiy Signed-off-by: Robert Richter Cc: Will Deacon Cc: oprofile-list Cc: Link: http://lkml.kernel.org/r/20111222151540.GB16765@erda.amd.com Signed-off-by: Ingo Molnar --- arch/arm/oprofile/common.c | 2 +- arch/sh/oprofile/common.c | 4 ++-- 2 files changed, 3 insertions(+), 3 deletions(-) diff --git a/arch/arm/oprofile/common.c b/arch/arm/oprofile/common.c index c074e66ad224..4e0a371630b3 100644 --- a/arch/arm/oprofile/common.c +++ b/arch/arm/oprofile/common.c @@ -116,7 +116,7 @@ int __init oprofile_arch_init(struct oprofile_operations *ops) return oprofile_perf_init(ops); } -void __exit oprofile_arch_exit(void) +void oprofile_arch_exit(void) { oprofile_perf_exit(); } diff --git a/arch/sh/oprofile/common.c b/arch/sh/oprofile/common.c index b4c2d2b946dd..e4dd5d5a1115 100644 --- a/arch/sh/oprofile/common.c +++ b/arch/sh/oprofile/common.c @@ -49,7 +49,7 @@ int __init oprofile_arch_init(struct oprofile_operations *ops) return oprofile_perf_init(ops); } -void __exit oprofile_arch_exit(void) +void oprofile_arch_exit(void) { oprofile_perf_exit(); kfree(sh_pmu_op_name); @@ -60,5 +60,5 @@ int __init oprofile_arch_init(struct oprofile_operations *ops) ops->backtrace = sh_backtrace; return -ENODEV; } -void __exit oprofile_arch_exit(void) {} +void oprofile_arch_exit(void) {} #endif /* CONFIG_HW_PERF_EVENTS */ -- cgit v1.2.3 From 8d532b2afb2eacc84588db709ec280a3d1219be3 Mon Sep 17 00:00:00 2001 From: Chris Mason Date: Fri, 23 Dec 2011 07:53:00 -0500 Subject: Btrfs: fix worker lock misuse in find_worker Dan Carpenter noticed that we were doing a double unlock on the worker lock, and sometimes picking a worker thread without the lock held. This fixes both errors. Signed-off-by: Chris Mason Reported-by: Dan Carpenter --- fs/btrfs/async-thread.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/fs/btrfs/async-thread.c b/fs/btrfs/async-thread.c index cb97174e2366..0b394580d860 100644 --- a/fs/btrfs/async-thread.c +++ b/fs/btrfs/async-thread.c @@ -563,8 +563,8 @@ static struct btrfs_worker_thread *find_worker(struct btrfs_workers *workers) struct list_head *fallback; int ret; -again: spin_lock_irqsave(&workers->lock, flags); +again: worker = next_worker(workers); if (!worker) { @@ -579,6 +579,7 @@ again: spin_unlock_irqrestore(&workers->lock, flags); /* we're below the limit, start another worker */ ret = __btrfs_start_workers(workers); + spin_lock_irqsave(&workers->lock, flags); if (ret) goto fallback; goto again; -- cgit v1.2.3 From 08c422c27f855d27b0b3d9fa30ebd938d4ae6f1f Mon Sep 17 00:00:00 2001 From: Al Viro Date: Fri, 23 Dec 2011 07:58:13 -0500 Subject: Btrfs: call d_instantiate after all ops are setup This closes races where btrfs is calling d_instantiate too soon during inode creation. All of the callers of btrfs_add_nondir are updated to instantiate after the inode is fully setup in memory. Signed-off-by: Al Viro Signed-off-by: Chris Mason --- fs/btrfs/inode.c | 9 +++++---- 1 file changed, 5 insertions(+), 4 deletions(-) diff --git a/fs/btrfs/inode.c b/fs/btrfs/inode.c index 740e67bbe249..13b0542015ff 100644 --- a/fs/btrfs/inode.c +++ b/fs/btrfs/inode.c @@ -4590,10 +4590,6 @@ static int btrfs_add_nondir(struct btrfs_trans_handle *trans, int err = btrfs_add_link(trans, dir, inode, dentry->d_name.name, dentry->d_name.len, backref, index); - if (!err) { - d_instantiate(dentry, inode); - return 0; - } if (err > 0) err = -EEXIST; return err; @@ -4655,6 +4651,7 @@ static int btrfs_mknod(struct inode *dir, struct dentry *dentry, else { init_special_inode(inode, inode->i_mode, rdev); btrfs_update_inode(trans, root, inode); + d_instantiate(dentry, inode); } out_unlock: nr = trans->blocks_used; @@ -4722,6 +4719,7 @@ static int btrfs_create(struct inode *dir, struct dentry *dentry, inode->i_mapping->a_ops = &btrfs_aops; inode->i_mapping->backing_dev_info = &root->fs_info->bdi; BTRFS_I(inode)->io_tree.ops = &btrfs_extent_io_ops; + d_instantiate(dentry, inode); } out_unlock: nr = trans->blocks_used; @@ -4779,6 +4777,7 @@ static int btrfs_link(struct dentry *old_dentry, struct inode *dir, struct dentry *parent = dentry->d_parent; err = btrfs_update_inode(trans, root, inode); BUG_ON(err); + d_instantiate(dentry, inode); btrfs_log_new_name(trans, inode, NULL, parent); } @@ -7245,6 +7244,8 @@ static int btrfs_symlink(struct inode *dir, struct dentry *dentry, drop_inode = 1; out_unlock: + if (!err) + d_instantiate(dentry, inode); nr = trans->blocks_used; btrfs_end_transaction_throttle(trans, root); if (drop_inode) { -- cgit v1.2.3 From 2e64694de21a812d637dcbea4471ad1f7897b049 Mon Sep 17 00:00:00 2001 From: Robert Richter Date: Fri, 23 Dec 2011 14:24:25 +0100 Subject: perf/x86: Fix raw_spin_unlock_irqrestore() usage Use raw_spin_unlock_irqrestore() as equivalent to raw_spin_lock_irqsave(). Signed-off-by: Robert Richter Cc: Stephane Eranian Cc: Peter Zijlstra Link: http://lkml.kernel.org/r/1324646665-13334-1-git-send-email-robert.richter@amd.com Signed-off-by: Ingo Molnar --- arch/x86/kernel/cpu/perf_event_intel.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/arch/x86/kernel/cpu/perf_event_intel.c b/arch/x86/kernel/cpu/perf_event_intel.c index 8d601b18bf9f..121f1be4da19 100644 --- a/arch/x86/kernel/cpu/perf_event_intel.c +++ b/arch/x86/kernel/cpu/perf_event_intel.c @@ -1169,7 +1169,7 @@ again: */ c = &unconstrained; } else if (intel_try_alt_er(event, orig_idx)) { - raw_spin_unlock(&era->lock); + raw_spin_unlock_irqrestore(&era->lock, flags); goto again; } raw_spin_unlock_irqrestore(&era->lock, flags); -- cgit v1.2.3 From 0b8fd3033c308e4088760aa1d38ce77197b4e074 Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Sun, 18 Dec 2011 15:49:55 +0000 Subject: xfs: log the inode in ->write_inode calls for kupdate If the writeback code writes back an inode because it has expired we currently use the non-blockin ->write_inode path. This means any inode that is pinned is skipped. With delayed logging and a workload that has very little log traffic otherwise it is very likely that an inode that gets constantly written to is always pinned, and thus we keep refusing to write it. The VM writeback code at that point redirties it and doesn't try to write it again for another 30 seconds. This means under certain scenarious time based metadata writeback never happens. Fix this by calling into xfs_log_inode for kupdate in addition to data integrity syncs, and thus transfer the inode to the log ASAP. Signed-off-by: Christoph Hellwig Reviewed-by: Dave Chinner Tested-by: Mark Tinguely Reviewed-by: Mark Tinguely Signed-off-by: Ben Myers --- fs/xfs/xfs_super.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/fs/xfs/xfs_super.c b/fs/xfs/xfs_super.c index 3eca58f51ae9..1add17ca3350 100644 --- a/fs/xfs/xfs_super.c +++ b/fs/xfs/xfs_super.c @@ -905,7 +905,7 @@ xfs_fs_write_inode( if (!ip->i_update_core) return 0; - if (wbc->sync_mode == WB_SYNC_ALL) { + if (wbc->sync_mode == WB_SYNC_ALL || wbc->for_kupdate) { /* * Make sure the inode has made it it into the log. Instead * of forcing it all the way to stable storage using a -- cgit v1.2.3 From be4f1ac828776bbc7868a68b465cd8eedb733cfd Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Tue, 20 Dec 2011 20:08:41 +0000 Subject: xfs: log all dirty inodes in xfs_fs_sync_fs Since Linux 2.6.36 the writeback code has introduces various measures for live lock prevention during sync(). Unfortunately some of these are actively harmful for the XFS model, where the inode gets marked dirty for metadata from the data I/O handler. The older_than_this checks that are now more strictly enforced since writeback: avoid livelocking WB_SYNC_ALL writeback by only calling into __writeback_inodes_sb and thus only sampling the current cut off time once. But on a slow enough devices the previous asynchronous sync pass might not have fully completed yet, and thus XFS might mark metadata dirty only after that sampling of the cut off time for the blocking pass already happened. I have not myself reproduced this myself on a real system, but by introducing artificial delay into the XFS I/O completion workqueues it can be reproduced easily. Fix this by iterating over all XFS inodes in ->sync_fs and log all that are dirty. This might log inode that only got redirtied after the previous pass, but given how cheap delayed logging of inodes is it isn't a major concern for performance. Signed-off-by: Christoph Hellwig Reviewed-by: Dave Chinner Tested-by: Mark Tinguely Reviewed-by: Mark Tinguely Signed-off-by: Ben Myers --- fs/xfs/xfs_super.c | 28 ++++------------------------ fs/xfs/xfs_sync.c | 36 ++++++++++++++++++++++++++++++++++++ fs/xfs/xfs_sync.h | 2 ++ 3 files changed, 42 insertions(+), 24 deletions(-) diff --git a/fs/xfs/xfs_super.c b/fs/xfs/xfs_super.c index 1add17ca3350..8a899496fd5f 100644 --- a/fs/xfs/xfs_super.c +++ b/fs/xfs/xfs_super.c @@ -868,27 +868,6 @@ xfs_fs_dirty_inode( XFS_I(inode)->i_update_core = 1; } -STATIC int -xfs_log_inode( - struct xfs_inode *ip) -{ - struct xfs_mount *mp = ip->i_mount; - struct xfs_trans *tp; - int error; - - tp = xfs_trans_alloc(mp, XFS_TRANS_FSYNC_TS); - error = xfs_trans_reserve(tp, 0, XFS_FSYNC_TS_LOG_RES(mp), 0, 0, 0); - if (error) { - xfs_trans_cancel(tp, 0); - return error; - } - - xfs_ilock(ip, XFS_ILOCK_EXCL); - xfs_trans_ijoin(tp, ip, XFS_ILOCK_EXCL); - xfs_trans_log_inode(tp, ip, XFS_ILOG_CORE); - return xfs_trans_commit(tp, 0); -} - STATIC int xfs_fs_write_inode( struct inode *inode, @@ -902,8 +881,6 @@ xfs_fs_write_inode( if (XFS_FORCED_SHUTDOWN(mp)) return -XFS_ERROR(EIO); - if (!ip->i_update_core) - return 0; if (wbc->sync_mode == WB_SYNC_ALL || wbc->for_kupdate) { /* @@ -913,11 +890,14 @@ xfs_fs_write_inode( * ->sync_fs call do that for thus, which reduces the number * of synchronous log forces dramatically. */ - error = xfs_log_inode(ip); + error = xfs_log_dirty_inode(ip, NULL, 0); if (error) goto out; return 0; } else { + if (!ip->i_update_core) + return 0; + /* * We make this non-blocking if the inode is contended, return * EAGAIN to indicate to the caller that they did not succeed. diff --git a/fs/xfs/xfs_sync.c b/fs/xfs/xfs_sync.c index be5c51d8f757..f0994aedcd15 100644 --- a/fs/xfs/xfs_sync.c +++ b/fs/xfs/xfs_sync.c @@ -336,6 +336,32 @@ xfs_sync_fsdata( return error; } +int +xfs_log_dirty_inode( + struct xfs_inode *ip, + struct xfs_perag *pag, + int flags) +{ + struct xfs_mount *mp = ip->i_mount; + struct xfs_trans *tp; + int error; + + if (!ip->i_update_core) + return 0; + + tp = xfs_trans_alloc(mp, XFS_TRANS_FSYNC_TS); + error = xfs_trans_reserve(tp, 0, XFS_FSYNC_TS_LOG_RES(mp), 0, 0, 0); + if (error) { + xfs_trans_cancel(tp, 0); + return error; + } + + xfs_ilock(ip, XFS_ILOCK_EXCL); + xfs_trans_ijoin(tp, ip, XFS_ILOCK_EXCL); + xfs_trans_log_inode(tp, ip, XFS_ILOG_CORE); + return xfs_trans_commit(tp, 0); +} + /* * When remounting a filesystem read-only or freezing the filesystem, we have * two phases to execute. This first phase is syncing the data before we @@ -359,6 +385,16 @@ xfs_quiesce_data( { int error, error2 = 0; + /* + * Log all pending size and timestamp updates. The vfs writeback + * code is supposed to do this, but due to its overagressive + * livelock detection it will skip inodes where appending writes + * were written out in the first non-blocking sync phase if their + * completion took long enough that it happened after taking the + * timestamp for the cut-off in the blocking phase. + */ + xfs_inode_ag_iterator(mp, xfs_log_dirty_inode, 0); + xfs_qm_sync(mp, SYNC_TRYLOCK); xfs_qm_sync(mp, SYNC_WAIT); diff --git a/fs/xfs/xfs_sync.h b/fs/xfs/xfs_sync.h index 941202e7ac6e..fa965479d788 100644 --- a/fs/xfs/xfs_sync.h +++ b/fs/xfs/xfs_sync.h @@ -34,6 +34,8 @@ void xfs_quiesce_attr(struct xfs_mount *mp); void xfs_flush_inodes(struct xfs_inode *ip); +int xfs_log_dirty_inode(struct xfs_inode *ip, struct xfs_perag *pag, int flags); + int xfs_reclaim_inodes(struct xfs_mount *mp, int mode); int xfs_reclaim_inodes_count(struct xfs_mount *mp); void xfs_reclaim_inodes_nr(struct xfs_mount *mp, int nr_to_scan); -- cgit v1.2.3 From 5f0a6e2d503896062f641639dacfe5055c2f593b Mon Sep 17 00:00:00 2001 From: Linus Torvalds Date: Fri, 23 Dec 2011 21:51:06 -0800 Subject: Linux 3.2-rc7 --- Makefile | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/Makefile b/Makefile index a43733df3978..ea51081812f3 100644 --- a/Makefile +++ b/Makefile @@ -1,7 +1,7 @@ VERSION = 3 PATCHLEVEL = 2 SUBLEVEL = 0 -EXTRAVERSION = -rc6 +EXTRAVERSION = -rc7 NAME = Saber-toothed Squirrel # *DOCUMENTATION* -- cgit v1.2.3 From 81378f728fe560e175fb2e8fd33206793567e896 Mon Sep 17 00:00:00 2001 From: Pablo Neira Ayuso Date: Sat, 24 Dec 2011 19:03:46 +0100 Subject: netfilter: ctnetlink: fix return value of ctnetlink_get_expect() This fixes one bogus error that is returned to user-space: libnetfilter_conntrack/utils# ./expect_get TEST: get expectation (-1)(Unknown error 18446744073709551504) This patch includes the correct handling for EAGAIN (nfnetlink uses this error value to restart the operation after module auto-loading). Signed-off-by: Pablo Neira Ayuso --- net/netfilter/nf_conntrack_netlink.c | 15 ++++++++++----- 1 file changed, 10 insertions(+), 5 deletions(-) diff --git a/net/netfilter/nf_conntrack_netlink.c b/net/netfilter/nf_conntrack_netlink.c index ef21b221f036..3d7ea7af76fc 100644 --- a/net/netfilter/nf_conntrack_netlink.c +++ b/net/netfilter/nf_conntrack_netlink.c @@ -1869,25 +1869,30 @@ ctnetlink_get_expect(struct sock *ctnl, struct sk_buff *skb, err = -ENOMEM; skb2 = nlmsg_new(NLMSG_DEFAULT_SIZE, GFP_KERNEL); - if (skb2 == NULL) + if (skb2 == NULL) { + nf_ct_expect_put(exp); goto out; + } rcu_read_lock(); err = ctnetlink_exp_fill_info(skb2, NETLINK_CB(skb).pid, nlh->nlmsg_seq, IPCTNL_MSG_EXP_NEW, exp); rcu_read_unlock(); + nf_ct_expect_put(exp); if (err <= 0) goto free; - nf_ct_expect_put(exp); + err = netlink_unicast(ctnl, skb2, NETLINK_CB(skb).pid, MSG_DONTWAIT); + if (err < 0) + goto out; - return netlink_unicast(ctnl, skb2, NETLINK_CB(skb).pid, MSG_DONTWAIT); + return 0; free: kfree_skb(skb2); out: - nf_ct_expect_put(exp); - return err; + /* this avoids a loop in nfnetlink. */ + return err == -EAGAIN ? -ENOBUFS : err; } static int -- cgit v1.2.3 From 1a31a4a8388a90e9240fb4e5e5c9c909fcfdfd0e Mon Sep 17 00:00:00 2001 From: Pablo Neira Ayuso Date: Sat, 24 Dec 2011 19:28:47 +0100 Subject: netfilter: ctnetlink: fix scheduling while atomic if helper is autoloaded This patch fixes one scheduling while atomic error: [ 385.565186] ctnetlink v0.93: registering with nfnetlink. [ 385.565349] BUG: scheduling while atomic: lt-expect_creat/16163/0x00000200 It can be triggered with utils/expect_create included in libnetfilter_conntrack if the FTP helper is not loaded. Signed-off-by: Pablo Neira Ayuso --- net/netfilter/nf_conntrack_netlink.c | 3 +++ 1 file changed, 3 insertions(+) diff --git a/net/netfilter/nf_conntrack_netlink.c b/net/netfilter/nf_conntrack_netlink.c index 3d7ea7af76fc..b6977776d715 100644 --- a/net/netfilter/nf_conntrack_netlink.c +++ b/net/netfilter/nf_conntrack_netlink.c @@ -1358,12 +1358,15 @@ ctnetlink_create_conntrack(struct net *net, u16 zone, nf_ct_protonum(ct)); if (helper == NULL) { rcu_read_unlock(); + spin_unlock_bh(&nf_conntrack_lock); #ifdef CONFIG_MODULES if (request_module("nfct-helper-%s", helpname) < 0) { + spin_lock_bh(&nf_conntrack_lock); err = -EOPNOTSUPP; goto err1; } + spin_lock_bh(&nf_conntrack_lock); rcu_read_lock(); helper = __nf_conntrack_helper_find(helpname, nf_ct_l3num(ct), -- cgit v1.2.3 From bb52c7acf871537a468433775151339f783d2e8c Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Fri, 23 Dec 2011 19:28:51 +0000 Subject: netem: dont call vfree() under spinlock and BH disabled commit 6373a9a286 (netem: use vmalloc for distribution table) added a regression, since vfree() is called while holding a spinlock and BH being disabled. Fix this by doing the pointers swap in critical section, and freeing after spinlock release. Also add __GFP_NOWARN to the kmalloc() try, since we fallback to vmalloc(). Signed-off-by: Eric Dumazet Acked-by: Stephen Hemminger Signed-off-by: David S. Miller --- net/sched/sch_netem.c | 7 ++++--- 1 file changed, 4 insertions(+), 3 deletions(-) diff --git a/net/sched/sch_netem.c b/net/sched/sch_netem.c index eb3b9a86c6ed..a4ab207cdc59 100644 --- a/net/sched/sch_netem.c +++ b/net/sched/sch_netem.c @@ -488,7 +488,7 @@ static int get_dist_table(struct Qdisc *sch, const struct nlattr *attr) return -EINVAL; s = sizeof(struct disttable) + n * sizeof(s16); - d = kmalloc(s, GFP_KERNEL); + d = kmalloc(s, GFP_KERNEL | __GFP_NOWARN); if (!d) d = vmalloc(s); if (!d) @@ -501,9 +501,10 @@ static int get_dist_table(struct Qdisc *sch, const struct nlattr *attr) root_lock = qdisc_root_sleeping_lock(sch); spin_lock_bh(root_lock); - dist_free(q->delay_dist); - q->delay_dist = d; + swap(q->delay_dist, d); spin_unlock_bh(root_lock); + + dist_free(d); return 0; } -- cgit v1.2.3 From 2ca526bf4953380abfe5dff455e356967b239c70 Mon Sep 17 00:00:00 2001 From: Stefan Richter Date: Tue, 20 Dec 2011 21:23:28 +0100 Subject: MAINTAINERS: firewire git URL update Signed-off-by: Stefan Richter --- MAINTAINERS | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/MAINTAINERS b/MAINTAINERS index 28f65c249b97..37deb5eead9f 100644 --- a/MAINTAINERS +++ b/MAINTAINERS @@ -2624,7 +2624,7 @@ FIREWIRE SUBSYSTEM M: Stefan Richter L: linux1394-devel@lists.sourceforge.net W: http://ieee1394.wiki.kernel.org/ -T: git git://git.kernel.org/pub/scm/linux/kernel/git/ieee1394/linux1394-2.6.git +T: git git://git.kernel.org/pub/scm/linux/kernel/git/ieee1394/linux1394.git S: Maintained F: drivers/firewire/ F: include/linux/firewire*.h -- cgit v1.2.3 From 0924ab2cfa98b1ece26c033d696651fd62896c69 Mon Sep 17 00:00:00 2001 From: Jan Kiszka Date: Wed, 14 Dec 2011 19:25:13 +0100 Subject: KVM: x86: Prevent starting PIT timers in the absence of irqchip support User space may create the PIT and forgets about setting up the irqchips. In that case, firing PIT IRQs will crash the host: BUG: unable to handle kernel NULL pointer dereference at 0000000000000128 IP: [] kvm_set_irq+0x30/0x170 [kvm] ... Call Trace: [] pit_do_work+0x51/0xd0 [kvm] [] process_one_work+0x111/0x4d0 [] worker_thread+0x152/0x340 [] kthread+0x7e/0x90 [] kernel_thread_helper+0x4/0x10 Prevent this by checking the irqchip mode before starting a timer. We can't deny creating the PIT if the irqchips aren't set up yet as current user land expects this order to work. Signed-off-by: Jan Kiszka Signed-off-by: Marcelo Tosatti --- arch/x86/kvm/i8254.c | 10 +++++++--- 1 file changed, 7 insertions(+), 3 deletions(-) diff --git a/arch/x86/kvm/i8254.c b/arch/x86/kvm/i8254.c index 76e3f1cd0369..405f2620392f 100644 --- a/arch/x86/kvm/i8254.c +++ b/arch/x86/kvm/i8254.c @@ -338,11 +338,15 @@ static enum hrtimer_restart pit_timer_fn(struct hrtimer *data) return HRTIMER_NORESTART; } -static void create_pit_timer(struct kvm_kpit_state *ps, u32 val, int is_period) +static void create_pit_timer(struct kvm *kvm, u32 val, int is_period) { + struct kvm_kpit_state *ps = &kvm->arch.vpit->pit_state; struct kvm_timer *pt = &ps->pit_timer; s64 interval; + if (!irqchip_in_kernel(kvm)) + return; + interval = muldiv64(val, NSEC_PER_SEC, KVM_PIT_FREQ); pr_debug("create pit timer, interval is %llu nsec\n", interval); @@ -394,13 +398,13 @@ static void pit_load_count(struct kvm *kvm, int channel, u32 val) /* FIXME: enhance mode 4 precision */ case 4: if (!(ps->flags & KVM_PIT_FLAGS_HPET_LEGACY)) { - create_pit_timer(ps, val, 0); + create_pit_timer(kvm, val, 0); } break; case 2: case 3: if (!(ps->flags & KVM_PIT_FLAGS_HPET_LEGACY)){ - create_pit_timer(ps, val, 1); + create_pit_timer(kvm, val, 1); } break; default: -- cgit v1.2.3 From 423873736b78f549fbfa2f715f2e4de7e6c5e1e9 Mon Sep 17 00:00:00 2001 From: Alex Williamson Date: Tue, 20 Dec 2011 21:59:03 -0700 Subject: KVM: Remove ability to assign a device without iommu support This option has no users and it exposes a security hole that we can allow devices to be assigned without iommu protection. Make KVM_DEV_ASSIGN_ENABLE_IOMMU a mandatory option. Signed-off-by: Alex Williamson Signed-off-by: Marcelo Tosatti --- Documentation/virtual/kvm/api.txt | 3 +++ virt/kvm/assigned-dev.c | 18 +++++++++--------- 2 files changed, 12 insertions(+), 9 deletions(-) diff --git a/Documentation/virtual/kvm/api.txt b/Documentation/virtual/kvm/api.txt index 7945b0bd35e2..ee2c96b3ba5a 100644 --- a/Documentation/virtual/kvm/api.txt +++ b/Documentation/virtual/kvm/api.txt @@ -1151,6 +1151,9 @@ following flags are specified: /* Depends on KVM_CAP_IOMMU */ #define KVM_DEV_ASSIGN_ENABLE_IOMMU (1 << 0) +The KVM_DEV_ASSIGN_ENABLE_IOMMU flag is a mandatory option to ensure +isolation of the device. Usages not specifying this flag are deprecated. + 4.49 KVM_DEASSIGN_PCI_DEVICE Capability: KVM_CAP_DEVICE_DEASSIGNMENT diff --git a/virt/kvm/assigned-dev.c b/virt/kvm/assigned-dev.c index 3ad0925d23a9..a251a28f79c7 100644 --- a/virt/kvm/assigned-dev.c +++ b/virt/kvm/assigned-dev.c @@ -487,6 +487,9 @@ static int kvm_vm_ioctl_assign_device(struct kvm *kvm, struct kvm_assigned_dev_kernel *match; struct pci_dev *dev; + if (!(assigned_dev->flags & KVM_DEV_ASSIGN_ENABLE_IOMMU)) + return -EINVAL; + mutex_lock(&kvm->lock); idx = srcu_read_lock(&kvm->srcu); @@ -544,16 +547,14 @@ static int kvm_vm_ioctl_assign_device(struct kvm *kvm, list_add(&match->list, &kvm->arch.assigned_dev_head); - if (assigned_dev->flags & KVM_DEV_ASSIGN_ENABLE_IOMMU) { - if (!kvm->arch.iommu_domain) { - r = kvm_iommu_map_guest(kvm); - if (r) - goto out_list_del; - } - r = kvm_assign_device(kvm, match); + if (!kvm->arch.iommu_domain) { + r = kvm_iommu_map_guest(kvm); if (r) goto out_list_del; } + r = kvm_assign_device(kvm, match); + if (r) + goto out_list_del; out: srcu_read_unlock(&kvm->srcu, idx); @@ -593,8 +594,7 @@ static int kvm_vm_ioctl_deassign_device(struct kvm *kvm, goto out; } - if (match->flags & KVM_DEV_ASSIGN_ENABLE_IOMMU) - kvm_deassign_device(kvm, match); + kvm_deassign_device(kvm, match); kvm_free_assigned_device(kvm, match); -- cgit v1.2.3 From 3d27e23b17010c668db311140b17bbbb70c78fb9 Mon Sep 17 00:00:00 2001 From: Alex Williamson Date: Tue, 20 Dec 2011 21:59:09 -0700 Subject: KVM: Device assignment permission checks Only allow KVM device assignment to attach to devices which: - Are not bridges - Have BAR resources (assume others are special devices) - The user has permissions to use Assigning a bridge is a configuration error, it's not supported, and typically doesn't result in the behavior the user is expecting anyway. Devices without BAR resources are typically chipset components that also don't have host drivers. We don't want users to hold such devices captive or cause system problems by fencing them off into an iommu domain. We determine "permission to use" by testing whether the user has access to the PCI sysfs resource files. By default a normal user will not have access to these files, so it provides a good indication that an administration agent has granted the user access to the device. [Yang Bai: add missing #include] [avi: fix comment style] Signed-off-by: Alex Williamson Signed-off-by: Yang Bai Signed-off-by: Marcelo Tosatti --- Documentation/virtual/kvm/api.txt | 4 +++ virt/kvm/assigned-dev.c | 75 +++++++++++++++++++++++++++++++++++++++ 2 files changed, 79 insertions(+) diff --git a/Documentation/virtual/kvm/api.txt b/Documentation/virtual/kvm/api.txt index ee2c96b3ba5a..4df9af4f6132 100644 --- a/Documentation/virtual/kvm/api.txt +++ b/Documentation/virtual/kvm/api.txt @@ -1154,6 +1154,10 @@ following flags are specified: The KVM_DEV_ASSIGN_ENABLE_IOMMU flag is a mandatory option to ensure isolation of the device. Usages not specifying this flag are deprecated. +Only PCI header type 0 devices with PCI BAR resources are supported by +device assignment. The user requesting this ioctl must have read/write +access to the PCI sysfs resource files associated with the device. + 4.49 KVM_DEASSIGN_PCI_DEVICE Capability: KVM_CAP_DEVICE_DEASSIGNMENT diff --git a/virt/kvm/assigned-dev.c b/virt/kvm/assigned-dev.c index a251a28f79c7..758e3b36d4cf 100644 --- a/virt/kvm/assigned-dev.c +++ b/virt/kvm/assigned-dev.c @@ -17,6 +17,8 @@ #include #include #include +#include +#include #include "irq.h" static struct kvm_assigned_dev_kernel *kvm_find_assigned_dev(struct list_head *head, @@ -480,12 +482,73 @@ out: return r; } +/* + * We want to test whether the caller has been granted permissions to + * use this device. To be able to configure and control the device, + * the user needs access to PCI configuration space and BAR resources. + * These are accessed through PCI sysfs. PCI config space is often + * passed to the process calling this ioctl via file descriptor, so we + * can't rely on access to that file. We can check for permissions + * on each of the BAR resource files, which is a pretty clear + * indicator that the user has been granted access to the device. + */ +static int probe_sysfs_permissions(struct pci_dev *dev) +{ +#ifdef CONFIG_SYSFS + int i; + bool bar_found = false; + + for (i = PCI_STD_RESOURCES; i <= PCI_STD_RESOURCE_END; i++) { + char *kpath, *syspath; + struct path path; + struct inode *inode; + int r; + + if (!pci_resource_len(dev, i)) + continue; + + kpath = kobject_get_path(&dev->dev.kobj, GFP_KERNEL); + if (!kpath) + return -ENOMEM; + + /* Per sysfs-rules, sysfs is always at /sys */ + syspath = kasprintf(GFP_KERNEL, "/sys%s/resource%d", kpath, i); + kfree(kpath); + if (!syspath) + return -ENOMEM; + + r = kern_path(syspath, LOOKUP_FOLLOW, &path); + kfree(syspath); + if (r) + return r; + + inode = path.dentry->d_inode; + + r = inode_permission(inode, MAY_READ | MAY_WRITE | MAY_ACCESS); + path_put(&path); + if (r) + return r; + + bar_found = true; + } + + /* If no resources, probably something special */ + if (!bar_found) + return -EPERM; + + return 0; +#else + return -EINVAL; /* No way to control the device without sysfs */ +#endif +} + static int kvm_vm_ioctl_assign_device(struct kvm *kvm, struct kvm_assigned_pci_dev *assigned_dev) { int r = 0, idx; struct kvm_assigned_dev_kernel *match; struct pci_dev *dev; + u8 header_type; if (!(assigned_dev->flags & KVM_DEV_ASSIGN_ENABLE_IOMMU)) return -EINVAL; @@ -516,6 +579,18 @@ static int kvm_vm_ioctl_assign_device(struct kvm *kvm, r = -EINVAL; goto out_free; } + + /* Don't allow bridges to be assigned */ + pci_read_config_byte(dev, PCI_HEADER_TYPE, &header_type); + if ((header_type & PCI_HEADER_TYPE) != PCI_HEADER_TYPE_NORMAL) { + r = -EPERM; + goto out_put; + } + + r = probe_sysfs_permissions(dev); + if (r) + goto out_put; + if (pci_enable_device(dev)) { printk(KERN_INFO "%s: Could not enable PCI device\n", __func__); r = -EBUSY; -- cgit v1.2.3 From 4d25a066b69fb749a39d0d4c610689dd765a0b0e Mon Sep 17 00:00:00 2001 From: Jan Kiszka Date: Wed, 21 Dec 2011 12:28:29 +0100 Subject: KVM: Don't automatically expose the TSC deadline timer in cpuid Unlike all of the other cpuid bits, the TSC deadline timer bit is set unconditionally, regardless of what userspace wants. This is broken in several ways: - if userspace doesn't use KVM_CREATE_IRQCHIP, and doesn't emulate the TSC deadline timer feature, a guest that uses the feature will break - live migration to older host kernels that don't support the TSC deadline timer will cause the feature to be pulled from under the guest's feet; breaking it - guests that are broken wrt the feature will fail. Fix by not enabling the feature automatically; instead report it to userspace. Because the feature depends on KVM_CREATE_IRQCHIP, which we cannot guarantee will be called, we expose it via a KVM_CAP_TSC_DEADLINE_TIMER and not KVM_GET_SUPPORTED_CPUID. Fixes the Illumos guest kernel, which uses the TSC deadline timer feature. [avi: add the KVM_CAP + documentation] Reported-by: Alexey Zaytsev Tested-by: Alexey Zaytsev Signed-off-by: Jan Kiszka Signed-off-by: Avi Kivity --- Documentation/virtual/kvm/api.txt | 9 +++++++++ arch/x86/kvm/x86.c | 19 +++++++++---------- include/linux/kvm.h | 1 + 3 files changed, 19 insertions(+), 10 deletions(-) diff --git a/Documentation/virtual/kvm/api.txt b/Documentation/virtual/kvm/api.txt index 4df9af4f6132..e2a4b5287361 100644 --- a/Documentation/virtual/kvm/api.txt +++ b/Documentation/virtual/kvm/api.txt @@ -1100,6 +1100,15 @@ emulate them efficiently. The fields in each entry are defined as follows: eax, ebx, ecx, edx: the values returned by the cpuid instruction for this function/index combination +The TSC deadline timer feature (CPUID leaf 1, ecx[24]) is always returned +as false, since the feature depends on KVM_CREATE_IRQCHIP for local APIC +support. Instead it is reported via + + ioctl(KVM_CHECK_EXTENSION, KVM_CAP_TSC_DEADLINE_TIMER) + +if that returns true and you use KVM_CREATE_IRQCHIP, or if you emulate the +feature in userspace, then you can enable the feature for KVM_SET_CPUID2. + 4.47 KVM_PPC_GET_PVINFO Capability: KVM_CAP_PPC_GET_PVINFO diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c index c38efd7b792e..4c938da2ba00 100644 --- a/arch/x86/kvm/x86.c +++ b/arch/x86/kvm/x86.c @@ -602,7 +602,6 @@ static void update_cpuid(struct kvm_vcpu *vcpu) { struct kvm_cpuid_entry2 *best; struct kvm_lapic *apic = vcpu->arch.apic; - u32 timer_mode_mask; best = kvm_find_cpuid_entry(vcpu, 1, 0); if (!best) @@ -615,15 +614,12 @@ static void update_cpuid(struct kvm_vcpu *vcpu) best->ecx |= bit(X86_FEATURE_OSXSAVE); } - if (boot_cpu_data.x86_vendor == X86_VENDOR_INTEL && - best->function == 0x1) { - best->ecx |= bit(X86_FEATURE_TSC_DEADLINE_TIMER); - timer_mode_mask = 3 << 17; - } else - timer_mode_mask = 1 << 17; - - if (apic) - apic->lapic_timer.timer_mode_mask = timer_mode_mask; + if (apic) { + if (best->ecx & bit(X86_FEATURE_TSC_DEADLINE_TIMER)) + apic->lapic_timer.timer_mode_mask = 3 << 17; + else + apic->lapic_timer.timer_mode_mask = 1 << 17; + } } int kvm_set_cr4(struct kvm_vcpu *vcpu, unsigned long cr4) @@ -2135,6 +2131,9 @@ int kvm_dev_ioctl_check_extension(long ext) case KVM_CAP_TSC_CONTROL: r = kvm_has_tsc_control; break; + case KVM_CAP_TSC_DEADLINE_TIMER: + r = boot_cpu_has(X86_FEATURE_TSC_DEADLINE_TIMER); + break; default: r = 0; break; diff --git a/include/linux/kvm.h b/include/linux/kvm.h index c3892fc1d538..68e67e50d028 100644 --- a/include/linux/kvm.h +++ b/include/linux/kvm.h @@ -557,6 +557,7 @@ struct kvm_ppc_pvinfo { #define KVM_CAP_MAX_VCPUS 66 /* returns max vcpus per vm */ #define KVM_CAP_PPC_PAPR 68 #define KVM_CAP_S390_GMAP 71 +#define KVM_CAP_TSC_DEADLINE_TIMER 72 #ifdef KVM_CAP_IRQ_ROUTING -- cgit v1.2.3 From 36cc66d638d3ffbc635b0d48b29c1128fdad38f4 Mon Sep 17 00:00:00 2001 From: Andreas Schwab Date: Tue, 8 Nov 2011 07:08:52 +0000 Subject: KVM: PPC: move compute_tlbie_rb to book3s_64 common header compute_tlbie_rb is only used on ppc64 and cannot be compiled on ppc32. Signed-off-by: Andreas Schwab Signed-off-by: Alexander Graf --- arch/powerpc/include/asm/kvm_book3s.h | 33 -------------------------------- arch/powerpc/include/asm/kvm_book3s_64.h | 33 ++++++++++++++++++++++++++++++++ 2 files changed, 33 insertions(+), 33 deletions(-) diff --git a/arch/powerpc/include/asm/kvm_book3s.h b/arch/powerpc/include/asm/kvm_book3s.h index d4df013ad779..69c7377d2071 100644 --- a/arch/powerpc/include/asm/kvm_book3s.h +++ b/arch/powerpc/include/asm/kvm_book3s.h @@ -381,39 +381,6 @@ static inline bool kvmppc_critical_section(struct kvm_vcpu *vcpu) } #endif -static inline unsigned long compute_tlbie_rb(unsigned long v, unsigned long r, - unsigned long pte_index) -{ - unsigned long rb, va_low; - - rb = (v & ~0x7fUL) << 16; /* AVA field */ - va_low = pte_index >> 3; - if (v & HPTE_V_SECONDARY) - va_low = ~va_low; - /* xor vsid from AVA */ - if (!(v & HPTE_V_1TB_SEG)) - va_low ^= v >> 12; - else - va_low ^= v >> 24; - va_low &= 0x7ff; - if (v & HPTE_V_LARGE) { - rb |= 1; /* L field */ - if (cpu_has_feature(CPU_FTR_ARCH_206) && - (r & 0xff000)) { - /* non-16MB large page, must be 64k */ - /* (masks depend on page size) */ - rb |= 0x1000; /* page encoding in LP field */ - rb |= (va_low & 0x7f) << 16; /* 7b of VA in AVA/LP field */ - rb |= (va_low & 0xfe); /* AVAL field (P7 doesn't seem to care) */ - } - } else { - /* 4kB page */ - rb |= (va_low & 0x7ff) << 12; /* remaining 11b of VA */ - } - rb |= (v >> 54) & 0x300; /* B field */ - return rb; -} - /* Magic register values loaded into r3 and r4 before the 'sc' assembly * instruction for the OSI hypercalls */ #define OSI_SC_MAGIC_R3 0x113724FA diff --git a/arch/powerpc/include/asm/kvm_book3s_64.h b/arch/powerpc/include/asm/kvm_book3s_64.h index e43fe42b9875..d0ac94f98f9e 100644 --- a/arch/powerpc/include/asm/kvm_book3s_64.h +++ b/arch/powerpc/include/asm/kvm_book3s_64.h @@ -29,4 +29,37 @@ static inline struct kvmppc_book3s_shadow_vcpu *to_svcpu(struct kvm_vcpu *vcpu) #define SPAPR_TCE_SHIFT 12 +static inline unsigned long compute_tlbie_rb(unsigned long v, unsigned long r, + unsigned long pte_index) +{ + unsigned long rb, va_low; + + rb = (v & ~0x7fUL) << 16; /* AVA field */ + va_low = pte_index >> 3; + if (v & HPTE_V_SECONDARY) + va_low = ~va_low; + /* xor vsid from AVA */ + if (!(v & HPTE_V_1TB_SEG)) + va_low ^= v >> 12; + else + va_low ^= v >> 24; + va_low &= 0x7ff; + if (v & HPTE_V_LARGE) { + rb |= 1; /* L field */ + if (cpu_has_feature(CPU_FTR_ARCH_206) && + (r & 0xff000)) { + /* non-16MB large page, must be 64k */ + /* (masks depend on page size) */ + rb |= 0x1000; /* page encoding in LP field */ + rb |= (va_low & 0x7f) << 16; /* 7b of VA in AVA/LP field */ + rb |= (va_low & 0xfe); /* AVAL field (P7 doesn't seem to care) */ + } + } else { + /* 4kB page */ + rb |= (va_low & 0x7ff) << 12; /* remaining 11b of VA */ + } + rb |= (v >> 54) & 0x300; /* B field */ + return rb; +} + #endif /* __ASM_KVM_BOOK3S_64_H__ */ -- cgit v1.2.3 From 96f38d72867bc54c312decaf8463f1e9607136da Mon Sep 17 00:00:00 2001 From: Andreas Schwab Date: Tue, 8 Nov 2011 07:17:39 +0000 Subject: KVM: PPC: protect use of kvmppc_h_pr kvmppc_h_pr is only available if CONFIG_KVM_BOOK3S_64_PR. Signed-off-by: Andreas Schwab Signed-off-by: Alexander Graf --- arch/powerpc/kvm/book3s_pr.c | 2 ++ 1 file changed, 2 insertions(+) diff --git a/arch/powerpc/kvm/book3s_pr.c b/arch/powerpc/kvm/book3s_pr.c index 3c791e1eb675..e2cfb9e1e20e 100644 --- a/arch/powerpc/kvm/book3s_pr.c +++ b/arch/powerpc/kvm/book3s_pr.c @@ -658,10 +658,12 @@ program_interrupt: ulong cmd = kvmppc_get_gpr(vcpu, 3); int i; +#ifdef CONFIG_KVM_BOOK3S_64_PR if (kvmppc_h_pr(vcpu, cmd) == EMULATE_DONE) { r = RESUME_GUEST; break; } +#endif run->papr_hcall.nr = cmd; for (i = 0; i < 9; ++i) { -- cgit v1.2.3 From 251da03897b383904901620835044e298061875f Mon Sep 17 00:00:00 2001 From: Michael Neuling Date: Thu, 10 Nov 2011 16:03:20 +0000 Subject: KVM: PPC: fix kvmppc_start_thread() for CONFIG_SMP=N Currently kvmppc_start_thread() tries to wake other SMT threads via xics_wake_cpu(). Unfortunately xics_wake_cpu only exists when CONFIG_SMP=Y so when compiling with CONFIG_SMP=N we get: arch/powerpc/kvm/built-in.o: In function `.kvmppc_start_thread': book3s_hv.c:(.text+0xa1e0): undefined reference to `.xics_wake_cpu' The following should be fine since kvmppc_start_thread() shouldn't called to start non-zero threads when SMP=N since threads_per_core=1. Signed-off-by: Michael Neuling Signed-off-by: Alexander Graf --- arch/powerpc/kvm/book3s_hv.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/arch/powerpc/kvm/book3s_hv.c b/arch/powerpc/kvm/book3s_hv.c index 0cb137a9b038..336983da9e72 100644 --- a/arch/powerpc/kvm/book3s_hv.c +++ b/arch/powerpc/kvm/book3s_hv.c @@ -538,7 +538,7 @@ static void kvmppc_start_thread(struct kvm_vcpu *vcpu) tpaca->kvm_hstate.napping = 0; vcpu->cpu = vc->pcpu; smp_wmb(); -#ifdef CONFIG_PPC_ICP_NATIVE +#if defined(CONFIG_PPC_ICP_NATIVE) && defined(CONFIG_SMP) if (vcpu->arch.ptid) { tpaca->cpu_start = 0x80; wmb(); -- cgit v1.2.3 From fae9dbb4b462d2c908186a47464c7a5299ee27a9 Mon Sep 17 00:00:00 2001 From: Scott Wood Date: Tue, 20 Dec 2011 14:43:45 +0000 Subject: KVM: PPC: e500: include linux/export.h This is required for THIS_MODULE. We recently stopped acquiring it via some other header. Signed-off-by: Scott Wood Signed-off-by: Alexander Graf --- arch/powerpc/kvm/e500.c | 1 + 1 file changed, 1 insertion(+) diff --git a/arch/powerpc/kvm/e500.c b/arch/powerpc/kvm/e500.c index 26d20903f2bc..8c0d45a6faf7 100644 --- a/arch/powerpc/kvm/e500.c +++ b/arch/powerpc/kvm/e500.c @@ -15,6 +15,7 @@ #include #include #include +#include #include #include -- cgit v1.2.3 From 6d4b9e38d3980826abccfbd90e95bf4bd41b8dd2 Mon Sep 17 00:00:00 2001 From: Linus Torvalds Date: Mon, 26 Dec 2011 10:25:26 -0800 Subject: vfs: fix handling of lock allocation failure in lease-break case Bruce Fields notes that commit 778fc546f749 ("locks: fix tracking of inprogress lease breaks") introduced a possible error pointer dereference on failure to allocate memory. locks_conflict() will dereference the passed-in new lease lock structure that may be an error pointer. This means an open (without O_NONBLOCK set) on a file with a lease applied (generally only done when Samba or nfsd (with v4) is running) could crash if a kmalloc() fails. So instead of playing games with IS_ERROR() all over the place, just check the allocation failure early. That makes the code more straightforward, and avoids this possible bad pointer dereference. Based-on-patch-by: J. Bruce Fields Cc: Al Viro Signed-off-by: Linus Torvalds --- fs/locks.c | 11 +++-------- 1 file changed, 3 insertions(+), 8 deletions(-) diff --git a/fs/locks.c b/fs/locks.c index 3b0d05dcd7c1..637694bf3a03 100644 --- a/fs/locks.c +++ b/fs/locks.c @@ -1205,6 +1205,8 @@ int __break_lease(struct inode *inode, unsigned int mode) int want_write = (mode & O_ACCMODE) != O_RDONLY; new_fl = lease_alloc(NULL, want_write ? F_WRLCK : F_RDLCK); + if (IS_ERR(new_fl)) + return PTR_ERR(new_fl); lock_flocks(); @@ -1221,12 +1223,6 @@ int __break_lease(struct inode *inode, unsigned int mode) if (fl->fl_owner == current->files) i_have_this_lease = 1; - if (IS_ERR(new_fl) && !i_have_this_lease - && ((mode & O_NONBLOCK) == 0)) { - error = PTR_ERR(new_fl); - goto out; - } - break_time = 0; if (lease_break_time > 0) { break_time = jiffies + lease_break_time * HZ; @@ -1284,8 +1280,7 @@ restart: out: unlock_flocks(); - if (!IS_ERR(new_fl)) - locks_free_lock(new_fl); + locks_free_lock(new_fl); return error; } -- cgit v1.2.3 From ebbd857e6b9a92c0aff4aacd1b1d2361d888633e Mon Sep 17 00:00:00 2001 From: Keith Packard Date: Mon, 26 Dec 2011 17:02:10 -0800 Subject: drm/i915: Disable semaphores by default on SNB Semaphores still cause problems on some machines: > From Udo Steinberg: > > With Linux-3.2-rc6 I'm frequently seeing GPU hangs when large amounts of > text scroll in an xterm, such as when extracting a tar archive. Such as this > one (note the timestamps): > > I can reproduce it fairly easily with something > as simple as: > > while true; do dmesg; done This patch turns them off on SNB while leaving them on for IVB. Reported-by: Udo Steinberg Cc: Daniel Vetter Cc: Eugeni Dodonov Signed-off-by: Keith Packard Signed-off-by: Linus Torvalds --- drivers/gpu/drm/i915/i915_gem_execbuffer.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/drivers/gpu/drm/i915/i915_gem_execbuffer.c b/drivers/gpu/drm/i915/i915_gem_execbuffer.c index c681dc149d2a..b9da8900ae4e 100644 --- a/drivers/gpu/drm/i915/i915_gem_execbuffer.c +++ b/drivers/gpu/drm/i915/i915_gem_execbuffer.c @@ -756,9 +756,9 @@ intel_enable_semaphores(struct drm_device *dev) if (i915_semaphores >= 0) return i915_semaphores; - /* Enable semaphores on SNB when IO remapping is off */ + /* Disable semaphores on SNB */ if (INTEL_INFO(dev)->gen == 6) - return !intel_iommu_enabled; + return 0; return 1; } -- cgit v1.2.3 From 371de6e4e0042adf4f9b54c414154f57414ddd37 Mon Sep 17 00:00:00 2001 From: Keith Packard Date: Mon, 26 Dec 2011 17:02:11 -0800 Subject: drm/i915: Disable RC6 on Sandybridge by default RC6 fails again. > I found my system freeze mostly during starting up X and KDE. Sometimes it > works for some minutes, sometimes it freezes immediatly. When the freeze > happens, everything is dead (even the reset button does not work, I need to > power cycle). > I disabled RC6, and my system runs wonderfully. > The system is a Z68 Pro board with Sandybridge i5-2500K processor, 8 > GB of RAM and UEFI firmware. Reported-by: Kai Krakow Signed-off-by: Keith Packard Signed-off-by: Linus Torvalds --- drivers/gpu/drm/i915/intel_display.c | 8 +++----- 1 file changed, 3 insertions(+), 5 deletions(-) diff --git a/drivers/gpu/drm/i915/intel_display.c b/drivers/gpu/drm/i915/intel_display.c index d809b038ca88..daa5743ccbd6 100644 --- a/drivers/gpu/drm/i915/intel_display.c +++ b/drivers/gpu/drm/i915/intel_display.c @@ -7922,13 +7922,11 @@ static bool intel_enable_rc6(struct drm_device *dev) return 0; /* - * Enable rc6 on Sandybridge if DMA remapping is disabled + * Disable rc6 on Sandybridge */ if (INTEL_INFO(dev)->gen == 6) { - DRM_DEBUG_DRIVER("Sandybridge: intel_iommu_enabled %s -- RC6 %sabled\n", - intel_iommu_enabled ? "true" : "false", - !intel_iommu_enabled ? "en" : "dis"); - return !intel_iommu_enabled; + DRM_DEBUG_DRIVER("Sandybridge: RC6 disabled\n"); + return 0; } DRM_DEBUG_DRIVER("RC6 enabled\n"); return 1; -- cgit v1.2.3 From aef950b4ba3196622a5bd5e21ab1d63f30658285 Mon Sep 17 00:00:00 2001 From: Wei Yongjun Date: Tue, 27 Dec 2011 22:32:41 -0500 Subject: packet: fix possible dev refcnt leak when bind fail If bind is fail when bind is called after set PACKET_FANOUT sock option, the dev refcnt will leak. Signed-off-by: Wei Yongjun Signed-off-by: David S. Miller --- net/packet/af_packet.c | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) diff --git a/net/packet/af_packet.c b/net/packet/af_packet.c index 3891702b81df..d9d4970b9b07 100644 --- a/net/packet/af_packet.c +++ b/net/packet/af_packet.c @@ -2448,8 +2448,12 @@ static int packet_do_bind(struct sock *sk, struct net_device *dev, __be16 protoc { struct packet_sock *po = pkt_sk(sk); - if (po->fanout) + if (po->fanout) { + if (dev) + dev_put(dev); + return -EINVAL; + } lock_sock(sk); -- cgit v1.2.3 From f2b20d436534f22ccc3f5ad172499fcb013bb315 Mon Sep 17 00:00:00 2001 From: Dan Williams Date: Thu, 29 Dec 2011 09:16:28 +0100 Subject: block: fix blk_queue_end_tag() Commit 5e081591 "block: warn if tag is greater than real_max_depth" cleaned up blk_queue_end_tag() to warn when the tag is truly invalid (greater than real_max_depth). However, it changed behavior in the tag < max_depth case to not end the request. Leading to triggering of BUG_ON(blk_queued_rq(rq)) in the request completion path: http://marc.info/?l=linux-kernel&m=132204370518629&w=2 In order to allow blk_queue_resize_tags() to shrink the tag space blk_queue_end_tag() must always complete tags with a value less than real_max_depth regardless of the current max_depth. The comment about "handling the shrink case" seems to be what prompted changes in this space, so remove it and BUG on all invalid tags (made even simpler by Matthew's suggestion to use an unsigned compare). Signed-off-by: Dan Williams Cc: Tao Ma Cc: Matthew Wilcox Reported-by: Meelis Roos Reported-by: Ed Nadolski Cc: Tejun Heo Signed-off-by: Andrew Morton Signed-off-by: Jens Axboe --- block/blk-tag.c | 13 ++----------- 1 file changed, 2 insertions(+), 11 deletions(-) diff --git a/block/blk-tag.c b/block/blk-tag.c index e74d6d13838f..4af6f5cc1167 100644 --- a/block/blk-tag.c +++ b/block/blk-tag.c @@ -282,18 +282,9 @@ EXPORT_SYMBOL(blk_queue_resize_tags); void blk_queue_end_tag(struct request_queue *q, struct request *rq) { struct blk_queue_tag *bqt = q->queue_tags; - int tag = rq->tag; + unsigned tag = rq->tag; /* negative tags invalid */ - BUG_ON(tag == -1); - - if (unlikely(tag >= bqt->max_depth)) { - /* - * This can happen after tag depth has been reduced. - * But tag shouldn't be larger than real_max_depth. - */ - WARN_ON(tag >= bqt->real_max_depth); - return; - } + BUG_ON(tag >= bqt->real_max_depth); list_del_init(&rq->queuelist); rq->cmd_flags &= ~REQ_QUEUED; -- cgit v1.2.3 From 757e55c23dc62eb5adf45368a72f6b26d6a71ae5 Mon Sep 17 00:00:00 2001 From: Hans de Goede Date: Thu, 29 Dec 2011 19:09:21 -0200 Subject: gspca: Fix bulk mode cameras no longer working (regression fix) The new iso bandwidth calculation code accidentally has broken support for bulk mode cameras. This has broken the following drivers: finepix, jeilinj, ovfx2, ov534, ov534_9, se401, sq905, sq905c, sq930x, stv0680, vicam. Thix patch fixes this. Fix tested with: se401, sq905, sq905c, stv0680 & vicam cams. Signed-off-by: Hans de Goede Signed-off-by: Mauro Carvalho Chehab Signed-off-by: Linus Torvalds --- drivers/media/video/gspca/gspca.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/drivers/media/video/gspca/gspca.c b/drivers/media/video/gspca/gspca.c index 881e04c7ffe6..512f32ff446a 100644 --- a/drivers/media/video/gspca/gspca.c +++ b/drivers/media/video/gspca/gspca.c @@ -838,13 +838,13 @@ static int gspca_init_transfer(struct gspca_dev *gspca_dev) gspca_dev->usb_err = 0; /* do the specific subdriver stuff before endpoint selection */ - gspca_dev->alt = 0; + intf = usb_ifnum_to_if(gspca_dev->dev, gspca_dev->iface); + gspca_dev->alt = gspca_dev->cam.bulk ? intf->num_altsetting : 0; if (gspca_dev->sd_desc->isoc_init) { ret = gspca_dev->sd_desc->isoc_init(gspca_dev); if (ret < 0) goto unlock; } - intf = usb_ifnum_to_if(gspca_dev->dev, gspca_dev->iface); xfer = gspca_dev->cam.bulk ? USB_ENDPOINT_XFER_BULK : USB_ENDPOINT_XFER_ISOC; -- cgit v1.2.3 From e26a51148f3ebd859bca8bf2e0f212839b447f62 Mon Sep 17 00:00:00 2001 From: KOSAKI Motohiro Date: Wed, 28 Dec 2011 15:57:11 -0800 Subject: mm/mempolicy.c: refix mbind_range() vma issue commit 8aacc9f550 ("mm/mempolicy.c: fix pgoff in mbind vma merge") is the slightly incorrect fix. Why? Think following case. 1. map 4 pages of a file at offset 0 [0123] 2. map 2 pages just after the first mapping of the same file but with page offset 2 [0123][23] 3. mbind() 2 pages from the first mapping at offset 2. mbind_range() should treat new vma is, [0123][23] |23| mbind vma but it does [0123][23] |01| mbind vma Oops. then, it makes wrong vma merge and splitting ([01][0123] or similar). This patch fixes it. [testcase] test result - before the patch case4: 126: test failed. expect '2,4', actual '2,2,2' case5: passed case6: passed case7: passed case8: passed case_n: 246: test failed. expect '4,2', actual '1,4' ------------[ cut here ]------------ kernel BUG at mm/filemap.c:135! invalid opcode: 0000 [#4] SMP DEBUG_PAGEALLOC (snip long bug on messages) test result - after the patch case4: passed case5: passed case6: passed case7: passed case8: passed case_n: passed source: mbind_vma_test.c ============================================================ #include #include #include #include #include #include #include static unsigned long pagesize; void* mmap_addr; struct bitmask *nmask; char buf[1024]; FILE *file; char retbuf[10240] = ""; int mapped_fd; char *rubysrc = "ruby -e '\ pid = %d; \ vstart = 0x%llx; \ vend = 0x%llx; \ s = `pmap -q #{pid}`; \ rary = []; \ s.each_line {|line|; \ ary=line.split(\" \"); \ addr = ary[0].to_i(16); \ if(vstart <= addr && addr < vend) then \ rary.push(ary[1].to_i()/4); \ end; \ }; \ print rary.join(\",\"); \ '"; void init(void) { void* addr; char buf[128]; nmask = numa_allocate_nodemask(); numa_bitmask_setbit(nmask, 0); pagesize = getpagesize(); sprintf(buf, "%s", "mbind_vma_XXXXXX"); mapped_fd = mkstemp(buf); if (mapped_fd == -1) perror("mkstemp "), exit(1); unlink(buf); if (lseek(mapped_fd, pagesize*8, SEEK_SET) < 0) perror("lseek "), exit(1); if (write(mapped_fd, "\0", 1) < 0) perror("write "), exit(1); addr = mmap(NULL, pagesize*8, PROT_NONE, MAP_SHARED, mapped_fd, 0); if (addr == MAP_FAILED) perror("mmap "), exit(1); if (mprotect(addr+pagesize, pagesize*6, PROT_READ|PROT_WRITE) < 0) perror("mprotect "), exit(1); mmap_addr = addr + pagesize; /* make page populate */ memset(mmap_addr, 0, pagesize*6); } void fin(void) { void* addr = mmap_addr - pagesize; munmap(addr, pagesize*8); memset(buf, 0, sizeof(buf)); memset(retbuf, 0, sizeof(retbuf)); } void mem_bind(int index, int len) { int err; err = mbind(mmap_addr+pagesize*index, pagesize*len, MPOL_BIND, nmask->maskp, nmask->size, 0); if (err) perror("mbind "), exit(err); } void mem_interleave(int index, int len) { int err; err = mbind(mmap_addr+pagesize*index, pagesize*len, MPOL_INTERLEAVE, nmask->maskp, nmask->size, 0); if (err) perror("mbind "), exit(err); } void mem_unbind(int index, int len) { int err; err = mbind(mmap_addr+pagesize*index, pagesize*len, MPOL_DEFAULT, NULL, 0, 0); if (err) perror("mbind "), exit(err); } void Assert(char *expected, char *value, char *name, int line) { if (strcmp(expected, value) == 0) { fprintf(stderr, "%s: passed\n", name); return; } else { fprintf(stderr, "%s: %d: test failed. expect '%s', actual '%s'\n", name, line, expected, value); // exit(1); } } /* AAAA PPPPPPNNNNNN might become PPNNNNNNNNNN case 4 below */ void case4(void) { init(); sprintf(buf, rubysrc, getpid(), mmap_addr, mmap_addr+pagesize*6); mem_bind(0, 4); mem_unbind(2, 2); file = popen(buf, "r"); fread(retbuf, sizeof(retbuf), 1, file); Assert("2,4", retbuf, "case4", __LINE__); fin(); } /* AAAA PPPPPPNNNNNN might become PPPPPPPPPPNN case 5 below */ void case5(void) { init(); sprintf(buf, rubysrc, getpid(), mmap_addr, mmap_addr+pagesize*6); mem_bind(0, 2); mem_bind(2, 2); file = popen(buf, "r"); fread(retbuf, sizeof(retbuf), 1, file); Assert("4,2", retbuf, "case5", __LINE__); fin(); } /* AAAA PPPPNNNNXXXX might become PPPPPPPPPPPP 6 */ void case6(void) { init(); sprintf(buf, rubysrc, getpid(), mmap_addr, mmap_addr+pagesize*6); mem_bind(0, 2); mem_bind(4, 2); mem_bind(2, 2); file = popen(buf, "r"); fread(retbuf, sizeof(retbuf), 1, file); Assert("6", retbuf, "case6", __LINE__); fin(); } /* AAAA PPPPNNNNXXXX might become PPPPPPPPXXXX 7 */ void case7(void) { init(); sprintf(buf, rubysrc, getpid(), mmap_addr, mmap_addr+pagesize*6); mem_bind(0, 2); mem_interleave(4, 2); mem_bind(2, 2); file = popen(buf, "r"); fread(retbuf, sizeof(retbuf), 1, file); Assert("4,2", retbuf, "case7", __LINE__); fin(); } /* AAAA PPPPNNNNXXXX might become PPPPNNNNNNNN 8 */ void case8(void) { init(); sprintf(buf, rubysrc, getpid(), mmap_addr, mmap_addr+pagesize*6); mem_bind(0, 2); mem_interleave(4, 2); mem_interleave(2, 2); file = popen(buf, "r"); fread(retbuf, sizeof(retbuf), 1, file); Assert("2,4", retbuf, "case8", __LINE__); fin(); } void case_n(void) { init(); sprintf(buf, rubysrc, getpid(), mmap_addr, mmap_addr+pagesize*6); /* make redundunt mappings [0][1234][34][7] */ mmap(mmap_addr + pagesize*4, pagesize*2, PROT_READ|PROT_WRITE, MAP_FIXED|MAP_SHARED, mapped_fd, pagesize*3); /* Expect to do nothing. */ mem_unbind(2, 2); file = popen(buf, "r"); fread(retbuf, sizeof(retbuf), 1, file); Assert("4,2", retbuf, "case_n", __LINE__); fin(); } int main(int argc, char** argv) { case4(); case5(); case6(); case7(); case8(); case_n(); return 0; } ============================================================= Signed-off-by: KOSAKI Motohiro Acked-by: Johannes Weiner Cc: Minchan Kim Cc: Caspar Zhang Cc: KOSAKI Motohiro Cc: Christoph Lameter Cc: Hugh Dickins Cc: Mel Gorman Cc: Lee Schermerhorn Cc: [3.1.x] Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/mempolicy.c | 11 ++++++++++- 1 file changed, 10 insertions(+), 1 deletion(-) diff --git a/mm/mempolicy.c b/mm/mempolicy.c index adc395481813..c3fdbcb17658 100644 --- a/mm/mempolicy.c +++ b/mm/mempolicy.c @@ -636,6 +636,7 @@ static int mbind_range(struct mm_struct *mm, unsigned long start, struct vm_area_struct *prev; struct vm_area_struct *vma; int err = 0; + pgoff_t pgoff; unsigned long vmstart; unsigned long vmend; @@ -643,13 +644,21 @@ static int mbind_range(struct mm_struct *mm, unsigned long start, if (!vma || vma->vm_start > start) return -EFAULT; + if (start > vma->vm_start) + prev = vma; + for (; vma && vma->vm_start < end; prev = vma, vma = next) { next = vma->vm_next; vmstart = max(start, vma->vm_start); vmend = min(end, vma->vm_end); + if (mpol_equal(vma_policy(vma), new_pol)) + continue; + + pgoff = vma->vm_pgoff + + ((vmstart - vma->vm_start) >> PAGE_SHIFT); prev = vma_merge(mm, prev, vmstart, vmend, vma->vm_flags, - vma->anon_vma, vma->vm_file, vma->vm_pgoff, + vma->anon_vma, vma->vm_file, pgoff, new_pol); if (prev) { vma = prev; -- cgit v1.2.3 From 34845636a184f3be91a531098192592cbe6db587 Mon Sep 17 00:00:00 2001 From: Andreas Schwab Date: Wed, 28 Dec 2011 15:57:15 -0800 Subject: procfs: do not confuse jiffies with cputime64_t Commit 2a95ea6c0d129b4 ("procfs: do not overflow get_{idle,iowait}_time for nohz") did not take into account that one some architectures jiffies and cputime use different units. This causes get_idle_time() to return numbers in the wrong units, making the idle time fields in /proc/stat wrong. Instead of converting the usec value returned by get_cpu_{idle,iowait}_time_us to units of jiffies, use the new function usecs_to_cputime64 to convert it to the correct unit of cputime64_t. Signed-off-by: Andreas Schwab Acked-by: Michal Hocko Cc: Arnd Bergmann Cc: "Artem S. Tashkinov" Cc: Dave Jones Cc: Alexey Dobriyan Cc: Thomas Gleixner Cc: "Luck, Tony" Cc: Benjamin Herrenschmidt Cc: Martin Schwidefsky Cc: Heiko Carstens Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- arch/ia64/include/asm/cputime.h | 1 + arch/powerpc/include/asm/cputime.h | 2 ++ arch/s390/include/asm/cputime.h | 2 ++ fs/proc/stat.c | 4 ++-- include/asm-generic/cputime.h | 1 + 5 files changed, 8 insertions(+), 2 deletions(-) diff --git a/arch/ia64/include/asm/cputime.h b/arch/ia64/include/asm/cputime.h index 6073b187528a..5a274af31b2b 100644 --- a/arch/ia64/include/asm/cputime.h +++ b/arch/ia64/include/asm/cputime.h @@ -60,6 +60,7 @@ typedef u64 cputime64_t; */ #define cputime_to_usecs(__ct) ((__ct) / NSEC_PER_USEC) #define usecs_to_cputime(__usecs) ((__usecs) * NSEC_PER_USEC) +#define usecs_to_cputime64(__usecs) usecs_to_cputime(__usecs) /* * Convert cputime <-> seconds diff --git a/arch/powerpc/include/asm/cputime.h b/arch/powerpc/include/asm/cputime.h index 1cf20bdfbeca..98b7c4b49c9d 100644 --- a/arch/powerpc/include/asm/cputime.h +++ b/arch/powerpc/include/asm/cputime.h @@ -150,6 +150,8 @@ static inline cputime_t usecs_to_cputime(const unsigned long us) return ct; } +#define usecs_to_cputime64(us) usecs_to_cputime(us) + /* * Convert cputime <-> seconds */ diff --git a/arch/s390/include/asm/cputime.h b/arch/s390/include/asm/cputime.h index 081434878296..b9acaaa175d8 100644 --- a/arch/s390/include/asm/cputime.h +++ b/arch/s390/include/asm/cputime.h @@ -87,6 +87,8 @@ usecs_to_cputime(const unsigned int m) return (cputime_t) m * 4096; } +#define usecs_to_cputime64(m) usecs_to_cputime(m) + /* * Convert cputime to milliseconds and back. */ diff --git a/fs/proc/stat.c b/fs/proc/stat.c index 2a30d67dd6b8..0855e6f20391 100644 --- a/fs/proc/stat.c +++ b/fs/proc/stat.c @@ -32,7 +32,7 @@ static cputime64_t get_idle_time(int cpu) idle = kstat_cpu(cpu).cpustat.idle; idle = cputime64_add(idle, arch_idle_time(cpu)); } else - idle = nsecs_to_jiffies64(1000 * idle_time); + idle = usecs_to_cputime64(idle_time); return idle; } @@ -46,7 +46,7 @@ static cputime64_t get_iowait_time(int cpu) /* !NO_HZ so we can rely on cpustat.iowait */ iowait = kstat_cpu(cpu).cpustat.iowait; else - iowait = nsecs_to_jiffies64(1000 * iowait_time); + iowait = usecs_to_cputime64(iowait_time); return iowait; } diff --git a/include/asm-generic/cputime.h b/include/asm-generic/cputime.h index 62ce6823c0f2..12a1764f612b 100644 --- a/include/asm-generic/cputime.h +++ b/include/asm-generic/cputime.h @@ -40,6 +40,7 @@ typedef u64 cputime64_t; */ #define cputime_to_usecs(__ct) jiffies_to_usecs(__ct) #define usecs_to_cputime(__msecs) usecs_to_jiffies(__msecs) +#define usecs_to_cputime64(__msecs) nsecs_to_jiffies64((__msecs) * 1000) /* * Convert cputime to seconds and back. -- cgit v1.2.3 From b0365c8d0cb6e79eb5f21418ae61ab511f31b575 Mon Sep 17 00:00:00 2001 From: Hillf Danton Date: Wed, 28 Dec 2011 15:57:16 -0800 Subject: mm: hugetlb: fix non-atomic enqueue of huge page If a huge page is enqueued under the protection of hugetlb_lock, then the operation is atomic and safe. Signed-off-by: Hillf Danton Reviewed-by: Michal Hocko Acked-by: KAMEZAWA Hiroyuki Cc: [2.6.37+] Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/hugetlb.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/mm/hugetlb.c b/mm/hugetlb.c index 73f17c0293c0..2316840b337a 100644 --- a/mm/hugetlb.c +++ b/mm/hugetlb.c @@ -901,7 +901,6 @@ retry: h->resv_huge_pages += delta; ret = 0; - spin_unlock(&hugetlb_lock); /* Free the needed pages to the hugetlb pool */ list_for_each_entry_safe(page, tmp, &surplus_list, lru) { if ((--needed) < 0) @@ -915,6 +914,7 @@ retry: VM_BUG_ON(page_count(page)); enqueue_huge_page(h, page); } + spin_unlock(&hugetlb_lock); /* Free unnecessary surplus pages to the buddy allocator */ free: -- cgit v1.2.3