From d4acd722b7bb5f48b9fc3848e8c2a845b100d84f Mon Sep 17 00:00:00 2001 From: James Bottomley Date: Wed, 31 Oct 2007 09:38:04 -0500 Subject: [SCSI] sysfs: add filter function to groups This patch allows the various users of attribute_groups to selectively allow the appearance of group attributes. The primary consumer of this will be the transport classes in which we currently have elaborate attribute selection algorithms to do this same thing. Acked-by: Greg KH Signed-off-by: James Bottomley --- kernel/params.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/params.c b/kernel/params.c index 7686417ee00e..dfef46474e55 100644 --- a/kernel/params.c +++ b/kernel/params.c @@ -472,7 +472,7 @@ param_sysfs_setup(struct module_kobject *mk, sizeof(mp->grp.attrs[0])); size[1] = (valid_attrs + 1) * sizeof(mp->grp.attrs[0]); - mp = kmalloc(size[0] + size[1], GFP_KERNEL); + mp = kzalloc(size[0] + size[1], GFP_KERNEL); if (!mp) return ERR_PTR(-ENOMEM); -- cgit v1.2.3 From fabe874a48de45b137f99b4ed3641e0413f465ce Mon Sep 17 00:00:00 2001 From: Arjan van de Ven Date: Thu, 24 Jan 2008 07:00:45 +0100 Subject: lockdep: fix kernel crash on module unload Michael Wu noticed in his lkml post at http://marc.info/?l=linux-kernel&m=119396182726091&w=2 that certain wireless drivers ended up having their name in module memory, which would then crash the kernel on module unload. The patch he proposed was a bit clumsy in that it increased the size of a lockdep entry significantly; the patch below tries another approach, it checks, on module teardown, if the name of a class is in module space and then zaps the class. This is very similar to what we already do with keys that are in module space. Signed-off-by: Arjan van de Ven Signed-off-by: Ingo Molnar Acked-by: Peter Zijlstra Signed-off-by: Linus Torvalds --- kernel/lockdep.c | 7 +++++-- 1 file changed, 5 insertions(+), 2 deletions(-) (limited to 'kernel') diff --git a/kernel/lockdep.c b/kernel/lockdep.c index 4335f12a27c6..e2c07ece367d 100644 --- a/kernel/lockdep.c +++ b/kernel/lockdep.c @@ -2932,7 +2932,7 @@ static void zap_class(struct lock_class *class) } -static inline int within(void *addr, void *start, unsigned long size) +static inline int within(const void *addr, void *start, unsigned long size) { return addr >= start && addr < start + size; } @@ -2955,9 +2955,12 @@ void lockdep_free_key_range(void *start, unsigned long size) head = classhash_table + i; if (list_empty(head)) continue; - list_for_each_entry_safe(class, next, head, hash_entry) + list_for_each_entry_safe(class, next, head, hash_entry) { if (within(class->key, start, size)) zap_class(class); + else if (within(class->name, start, size)) + zap_class(class); + } } if (locked) -- cgit v1.2.3 From 3514faca19a6fdc209734431c509631ea92b094e Mon Sep 17 00:00:00 2001 From: Greg Kroah-Hartman Date: Tue, 16 Oct 2007 10:11:44 -0600 Subject: kobject: remove struct kobj_type from struct kset We don't need a "default" ktype for a kset. We should set this explicitly every time for each kset. This change is needed so that we can make ksets dynamic, and cleans up one of the odd, undocumented assumption that the kset/kobject/ktype model has. This patch is based on a lot of help from Kay Sievers. 
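In sketch form, the conversion at each registration site looks like the block/genhd.c hunk further down: instead of inheriting a default ktype from its kset, the kobject now names both its kset and its ktype explicitly before kobject_init(). (Condensed illustration only; the real hunks follow in the diff.)

    /* before: the kset supplied a default ktype for its members */
    decl_subsys(block, &ktype_block, &block_uevent_ops);
    ...
    kobj_set_kset_s(disk, block_subsys);
    kobject_init(&disk->kobj);

    /* after: the kobject carries its own ktype */
    decl_subsys(block, &block_uevent_ops);
    ...
    disk->kobj.kset = &block_subsys;
    disk->kobj.ktype = &ktype_block;
    kobject_init(&disk->kobj);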
Nasty bug in the block code was found by Dave Young Cc: Kay Sievers Cc: Dave Young Signed-off-by: Greg Kroah-Hartman --- arch/powerpc/platforms/pseries/power.c | 2 +- arch/s390/hypfs/inode.c | 4 ++-- arch/s390/kernel/ipl.c | 8 ++++---- block/genhd.c | 5 +++-- drivers/acpi/bus.c | 2 +- drivers/base/bus.c | 5 +++-- drivers/base/class.c | 8 +++++--- drivers/base/core.c | 5 +++-- drivers/base/firmware.c | 5 +++-- drivers/base/hypervisor.c | 2 +- drivers/base/sys.c | 3 ++- drivers/edac/edac_mc_sysfs.c | 2 +- drivers/firmware/edd.c | 5 +++-- drivers/firmware/efivars.c | 9 +++++---- drivers/parisc/pdc_stable.c | 9 +++++---- drivers/pci/hotplug/pci_hotplug_core.c | 7 ++++--- drivers/pci/hotplug/rpadlpar_sysfs.c | 1 - drivers/uio/uio.c | 2 +- fs/configfs/mount.c | 4 ++-- fs/debugfs/inode.c | 4 ++-- fs/dlm/lockspace.c | 6 ++---- fs/ecryptfs/main.c | 4 ++-- fs/fuse/inode.c | 8 ++++---- fs/gfs2/locking/dlm/sysfs.c | 6 ++---- fs/gfs2/sys.c | 6 ++---- fs/namespace.c | 2 +- fs/ocfs2/cluster/masklog.c | 2 +- fs/ocfs2/cluster/sys.c | 2 +- fs/sysfs/file.c | 4 +--- include/linux/kobject.h | 15 ++++----------- kernel/ksysfs.c | 2 +- kernel/module.c | 2 +- kernel/params.c | 9 +++++---- kernel/power/main.c | 2 +- mm/slub.c | 5 +++-- security/inode.c | 4 ++-- 36 files changed, 84 insertions(+), 87 deletions(-) (limited to 'kernel') diff --git a/arch/powerpc/platforms/pseries/power.c b/arch/powerpc/platforms/pseries/power.c index 73e69023d90a..08d7a5007167 100644 --- a/arch/powerpc/platforms/pseries/power.c +++ b/arch/powerpc/platforms/pseries/power.c @@ -57,7 +57,7 @@ static struct subsys_attribute auto_poweron_attr = { }; #ifndef CONFIG_PM -decl_subsys(power,NULL,NULL); +decl_subsys(power, NULL); static struct attribute *g[] = { &auto_poweron_attr.attr, diff --git a/arch/s390/hypfs/inode.c b/arch/s390/hypfs/inode.c index 5245717295b8..c022ccc04d41 100644 --- a/arch/s390/hypfs/inode.c +++ b/arch/s390/hypfs/inode.c @@ -490,7 +490,7 @@ static struct super_operations hypfs_s_ops = { .show_options = hypfs_show_options, }; -static decl_subsys(s390, NULL, NULL); +static decl_subsys(s390, NULL); static int __init hypfs_init(void) { @@ -506,7 +506,7 @@ static int __init hypfs_init(void) goto fail_diag; } } - kobj_set_kset_s(&s390_subsys, hypervisor_subsys); + s390_subsys.kobj.kset = &hypervisor_subsys; rc = subsystem_register(&s390_subsys); if (rc) goto fail_sysfs; diff --git a/arch/s390/kernel/ipl.c b/arch/s390/kernel/ipl.c index ce0856d32500..cae793af5423 100644 --- a/arch/s390/kernel/ipl.c +++ b/arch/s390/kernel/ipl.c @@ -418,7 +418,7 @@ static struct attribute_group ipl_unknown_attr_group = { .attrs = ipl_unknown_attrs, }; -static decl_subsys(ipl, NULL, NULL); +static decl_subsys(ipl, NULL); /* * reipl section @@ -590,7 +590,7 @@ static ssize_t reipl_type_store(struct kset *kset, const char *buf, static struct subsys_attribute reipl_type_attr = __ATTR(reipl_type, 0644, reipl_type_show, reipl_type_store); -static decl_subsys(reipl, NULL, NULL); +static decl_subsys(reipl, NULL); /* * dump section @@ -685,13 +685,13 @@ static ssize_t dump_type_store(struct kset *kset, const char *buf, static struct subsys_attribute dump_type_attr = __ATTR(dump_type, 0644, dump_type_show, dump_type_store); -static decl_subsys(dump, NULL, NULL); +static decl_subsys(dump, NULL); /* * Shutdown actions section */ -static decl_subsys(shutdown_actions, NULL, NULL); +static decl_subsys(shutdown_actions, NULL); /* on panic */ diff --git a/block/genhd.c b/block/genhd.c index f2ac914160d1..32227b7ecd17 100644 --- a/block/genhd.c +++ 
b/block/genhd.c @@ -584,7 +584,7 @@ static struct kset_uevent_ops block_uevent_ops = { .uevent = block_uevent, }; -decl_subsys(block, &ktype_block, &block_uevent_ops); +decl_subsys(block, &block_uevent_ops); /* * aggregate disk stat collector. Uses the same stats that the sysfs @@ -721,7 +721,8 @@ struct gendisk *alloc_disk_node(int minors, int node_id) } } disk->minors = minors; - kobj_set_kset_s(disk,block_subsys); + disk->kobj.kset = &block_subsys; + disk->kobj.ktype = &ktype_block; kobject_init(&disk->kobj); rand_initialize_disk(disk); INIT_WORK(&disk->async_notify, diff --git a/drivers/acpi/bus.c b/drivers/acpi/bus.c index f4487c38d9f2..7c172d9d7acf 100644 --- a/drivers/acpi/bus.c +++ b/drivers/acpi/bus.c @@ -743,7 +743,7 @@ static int __init acpi_bus_init(void) return -ENODEV; } -decl_subsys(acpi, NULL, NULL); +decl_subsys(acpi, NULL); static int __init acpi_init(void) { diff --git a/drivers/base/bus.c b/drivers/base/bus.c index 9a19b071c573..630956037e18 100644 --- a/drivers/base/bus.c +++ b/drivers/base/bus.c @@ -166,7 +166,7 @@ static struct kset_uevent_ops bus_uevent_ops = { .filter = bus_uevent_filter, }; -static decl_subsys(bus, &bus_ktype, &bus_uevent_ops); +static decl_subsys(bus, &bus_uevent_ops); #ifdef CONFIG_HOTPLUG @@ -639,6 +639,7 @@ int bus_add_driver(struct device_driver *drv) if (error) goto out_put_bus; drv->kobj.kset = &bus->drivers; + drv->kobj.ktype = &driver_ktype; error = kobject_register(&drv->kobj); if (error) goto out_put_bus; @@ -851,6 +852,7 @@ int bus_register(struct bus_type * bus) goto out; bus->subsys.kobj.kset = &bus_subsys; + bus->subsys.kobj.ktype = &bus_ktype; retval = subsystem_register(&bus->subsys); if (retval) @@ -868,7 +870,6 @@ int bus_register(struct bus_type * bus) kobject_set_name(&bus->drivers.kobj, "drivers"); bus->drivers.kobj.parent = &bus->subsys.kobj; - bus->drivers.ktype = &driver_ktype; retval = kset_register(&bus->drivers); if (retval) goto bus_drivers_fail; diff --git a/drivers/base/class.c b/drivers/base/class.c index a863bb091e11..8ad98924cddb 100644 --- a/drivers/base/class.c +++ b/drivers/base/class.c @@ -71,7 +71,7 @@ static struct kobj_type class_ktype = { }; /* Hotplug events for classes go to the class_obj subsys */ -static decl_subsys(class, &class_ktype, NULL); +static decl_subsys(class, NULL); int class_create_file(struct class * cls, const struct class_attribute * attr) @@ -150,6 +150,7 @@ int class_register(struct class * cls) return error; cls->subsys.kobj.kset = &class_subsys; + cls->subsys.kobj.ktype = &class_ktype; error = subsystem_register(&cls->subsys); if (!error) { @@ -452,7 +453,7 @@ static struct kset_uevent_ops class_uevent_ops = { .uevent = class_uevent, }; -static decl_subsys(class_obj, &class_device_ktype, &class_uevent_ops); +static decl_subsys(class_obj, &class_uevent_ops); static int class_device_add_attrs(struct class_device * cd) @@ -537,7 +538,8 @@ static struct class_device_attribute class_uevent_attr = void class_device_initialize(struct class_device *class_dev) { - kobj_set_kset_s(class_dev, class_obj_subsys); + class_dev->kobj.kset = &class_obj_subsys; + class_dev->kobj.ktype = &class_device_ktype; kobject_init(&class_dev->kobj); INIT_LIST_HEAD(&class_dev->node); } diff --git a/drivers/base/core.c b/drivers/base/core.c index ce6b64c489ad..c8f2ac03d46d 100644 --- a/drivers/base/core.c +++ b/drivers/base/core.c @@ -405,7 +405,7 @@ static struct device_attribute devt_attr = * devices_subsys - structure to be registered with kobject core. 
*/ -decl_subsys(devices, &device_ktype, &device_uevent_ops); +decl_subsys(devices, &device_uevent_ops); /** @@ -525,7 +525,8 @@ static void klist_children_put(struct klist_node *n) void device_initialize(struct device *dev) { - kobj_set_kset_s(dev, devices_subsys); + dev->kobj.kset = &devices_subsys; + dev->kobj.ktype = &device_ktype; kobject_init(&dev->kobj); klist_init(&dev->klist_children, klist_children_get, klist_children_put); diff --git a/drivers/base/firmware.c b/drivers/base/firmware.c index 90c862932169..336be0450d54 100644 --- a/drivers/base/firmware.c +++ b/drivers/base/firmware.c @@ -15,11 +15,12 @@ #include "base.h" -static decl_subsys(firmware, NULL, NULL); +static decl_subsys(firmware, NULL); int firmware_register(struct kset *s) { - kobj_set_kset_s(s, firmware_subsys); + s->kobj.kset = &firmware_subsys; + s->kobj.ktype = NULL; return subsystem_register(s); } diff --git a/drivers/base/hypervisor.c b/drivers/base/hypervisor.c index 7080b413ddc9..14e75e9ec783 100644 --- a/drivers/base/hypervisor.c +++ b/drivers/base/hypervisor.c @@ -11,7 +11,7 @@ #include "base.h" -decl_subsys(hypervisor, NULL, NULL); +decl_subsys(hypervisor, NULL); EXPORT_SYMBOL_GPL(hypervisor_subsys); int __init hypervisor_init(void) diff --git a/drivers/base/sys.c b/drivers/base/sys.c index ac7ff6d0c6e5..7cf19fc318da 100644 --- a/drivers/base/sys.c +++ b/drivers/base/sys.c @@ -131,7 +131,7 @@ EXPORT_SYMBOL_GPL(sysdev_class_remove_file); /* * declare system_subsys */ -static decl_subsys(system, &ktype_sysdev_class, NULL); +static decl_subsys(system, NULL); int sysdev_class_register(struct sysdev_class * cls) { @@ -139,6 +139,7 @@ int sysdev_class_register(struct sysdev_class * cls) kobject_name(&cls->kset.kobj)); INIT_LIST_HEAD(&cls->drivers); cls->kset.kobj.parent = &system_subsys.kobj; + cls->kset.kobj.ktype = &ktype_sysdev_class; cls->kset.kobj.kset = &system_subsys; return kset_register(&cls->kset); } diff --git a/drivers/edac/edac_mc_sysfs.c b/drivers/edac/edac_mc_sysfs.c index 3706b2bc0987..905fcd73c26e 100644 --- a/drivers/edac/edac_mc_sysfs.c +++ b/drivers/edac/edac_mc_sysfs.c @@ -744,7 +744,6 @@ static struct kobj_type ktype_mc_set_attribs = { */ static struct kset mc_kset = { .kobj = {.ktype = &ktype_mc_set_attribs }, - .ktype = &ktype_mci, }; @@ -767,6 +766,7 @@ int edac_mc_register_sysfs_main_kobj(struct mem_ctl_info *mci) /* this instance become part of the mc_kset */ kobj_mci->kset = &mc_kset; + kobj_mci->ktype = &ktype_mci; /* set the name of the mc object */ err = kobject_set_name(kobj_mci, "mc%d", mci->mc_idx); diff --git a/drivers/firmware/edd.c b/drivers/firmware/edd.c index 6942e065e609..fc567fad3f7c 100644 --- a/drivers/firmware/edd.c +++ b/drivers/firmware/edd.c @@ -631,7 +631,7 @@ static struct kobj_type edd_ktype = { .default_attrs = def_attrs, }; -static decl_subsys(edd, &edd_ktype, NULL); +static decl_subsys(edd, NULL); /** @@ -723,7 +723,8 @@ edd_device_register(struct edd_device *edev, int i) edd_dev_set_info(edev, i); kobject_set_name(&edev->kobj, "int13_dev%02x", 0x80 + i); - kobj_set_kset_s(edev,edd_subsys); + edev->kobj.kset = &edd_subsys; + edev->kobj.ktype = &edd_ktype; error = kobject_register(&edev->kobj); if (!error) edd_populate_dir(edev); diff --git a/drivers/firmware/efivars.c b/drivers/firmware/efivars.c index 858a7b95933b..06ecdb9f6013 100644 --- a/drivers/firmware/efivars.c +++ b/drivers/firmware/efivars.c @@ -583,8 +583,8 @@ static struct subsys_attribute *efi_subsys_attrs[] = { NULL, /* maybe more in the future? 
*/ }; -static decl_subsys(vars, &efivar_ktype, NULL); -static decl_subsys(efi, NULL, NULL); +static decl_subsys(vars, NULL); +static decl_subsys(efi, NULL); /* * efivar_create_sysfs_entry() @@ -629,7 +629,8 @@ efivar_create_sysfs_entry(unsigned long variable_name_size, efi_guid_unparse(vendor_guid, short_name + strlen(short_name)); kobject_set_name(&new_efivar->kobj, "%s", short_name); - kobj_set_kset_s(new_efivar, vars_subsys); + new_efivar->kobj.kset = &vars_subsys; + new_efivar->kobj.ktype = &efivar_ktype; i = kobject_register(&new_efivar->kobj); if (i) { kfree(short_name); @@ -687,7 +688,7 @@ efivars_init(void) goto out_free; } - kobj_set_kset_s(&vars_subsys, efi_subsys); + vars_subsys.kobj.kset = &efi_subsys; error = subsystem_register(&vars_subsys); diff --git a/drivers/parisc/pdc_stable.c b/drivers/parisc/pdc_stable.c index ebb09e98d215..1382be64cc3f 100644 --- a/drivers/parisc/pdc_stable.c +++ b/drivers/parisc/pdc_stable.c @@ -964,8 +964,8 @@ static struct subsys_attribute *pdcs_subsys_attrs[] = { NULL, }; -static decl_subsys(paths, &ktype_pdcspath, NULL); -static decl_subsys(stable, NULL, NULL); +static decl_subsys(paths, NULL); +static decl_subsys(stable, NULL); /** * pdcs_register_pathentries - Prepares path entries kobjects for sysfs usage. @@ -997,7 +997,8 @@ pdcs_register_pathentries(void) if ((err = kobject_set_name(&entry->kobj, "%s", entry->name))) return err; - kobj_set_kset_s(entry, paths_subsys); + entry->kobj.kset = &paths_subsys; + entry->kobj.ktype = &ktype_pdcspath; if ((err = kobject_register(&entry->kobj))) return err; @@ -1072,7 +1073,7 @@ pdc_stable_init(void) error = subsys_create_file(&stable_subsys, attr); /* register the paths subsys as a subsystem of stable subsys */ - kobj_set_kset_s(&paths_subsys, stable_subsys); + paths_subsys.kobj.kset = &stable_subsys; if ((rc = subsystem_register(&paths_subsys))) goto fail_subsysreg; diff --git a/drivers/pci/hotplug/pci_hotplug_core.c b/drivers/pci/hotplug/pci_hotplug_core.c index 01c351c176ac..ce1cff0fdeca 100644 --- a/drivers/pci/hotplug/pci_hotplug_core.c +++ b/drivers/pci/hotplug/pci_hotplug_core.c @@ -96,7 +96,7 @@ static struct kobj_type hotplug_slot_ktype = { .release = &hotplug_slot_release, }; -decl_subsys_name(pci_hotplug_slots, slots, &hotplug_slot_ktype, NULL); +decl_subsys_name(pci_hotplug_slots, slots, NULL); /* these strings match up with the values in pci_bus_speed */ static char *pci_bus_speed_strings[] = { @@ -633,7 +633,8 @@ int pci_hp_register (struct hotplug_slot *slot) } kobject_set_name(&slot->kobj, "%s", slot->name); - kobj_set_kset_s(slot, pci_hotplug_slots_subsys); + slot->kobj.kset = &pci_hotplug_slots_subsys; + slot->kobj.ktype = &hotplug_slot_ktype; /* this can fail if we have already registered a slot with the same name */ if (kobject_register(&slot->kobj)) { @@ -701,7 +702,7 @@ static int __init pci_hotplug_init (void) { int result; - kobj_set_kset_s(&pci_hotplug_slots_subsys, pci_bus_type.subsys); + pci_hotplug_slots_subsys.kobj.kset = &pci_bus_type.subsys; result = subsystem_register(&pci_hotplug_slots_subsys); if (result) { err("Register subsys with error %d\n", result); diff --git a/drivers/pci/hotplug/rpadlpar_sysfs.c b/drivers/pci/hotplug/rpadlpar_sysfs.c index a080fedf0332..76090937c758 100644 --- a/drivers/pci/hotplug/rpadlpar_sysfs.c +++ b/drivers/pci/hotplug/rpadlpar_sysfs.c @@ -131,7 +131,6 @@ struct kobj_type ktype_dlpar_io = { struct kset dlpar_io_kset = { .kobj = {.ktype = &ktype_dlpar_io, .parent = &pci_hotplug_slots_subsys.kobj}, - .ktype = &ktype_dlpar_io, }; int 
dlpar_sysfs_init(void) diff --git a/drivers/uio/uio.c b/drivers/uio/uio.c index 865f32b63b5c..606aae7490ab 100644 --- a/drivers/uio/uio.c +++ b/drivers/uio/uio.c @@ -160,7 +160,7 @@ static int uio_dev_add_attributes(struct uio_device *idev) if (!map_found) { map_found = 1; kobject_set_name(&idev->map_attr_kset.kobj,"maps"); - idev->map_attr_kset.ktype = &map_attr_type; + idev->map_attr_kset.kobj.ktype = &map_attr_type; idev->map_attr_kset.kobj.parent = &idev->dev->kobj; ret = kset_register(&idev->map_attr_kset); if (ret) diff --git a/fs/configfs/mount.c b/fs/configfs/mount.c index 3bf0278ea843..374ddbd6648d 100644 --- a/fs/configfs/mount.c +++ b/fs/configfs/mount.c @@ -128,7 +128,7 @@ void configfs_release_fs(void) } -static decl_subsys(config, NULL, NULL); +static decl_subsys(config, NULL); static int __init configfs_init(void) { @@ -140,7 +140,7 @@ static int __init configfs_init(void) if (!configfs_dir_cachep) goto out; - kobj_set_kset_s(&config_subsys, kernel_subsys); + config_subsys.kobj.kset = &kernel_subsys; err = subsystem_register(&config_subsys); if (err) { kmem_cache_destroy(configfs_dir_cachep); diff --git a/fs/debugfs/inode.c b/fs/debugfs/inode.c index 6a713b33992f..f7f13516fc1a 100644 --- a/fs/debugfs/inode.c +++ b/fs/debugfs/inode.c @@ -426,13 +426,13 @@ exit: } EXPORT_SYMBOL_GPL(debugfs_rename); -static decl_subsys(debug, NULL, NULL); +static decl_subsys(debug, NULL); static int __init debugfs_init(void) { int retval; - kobj_set_kset_s(&debug_subsys, kernel_subsys); + debug_subsys.kobj.kset = &kernel_subsys; retval = subsystem_register(&debug_subsys); if (retval) return retval; diff --git a/fs/dlm/lockspace.c b/fs/dlm/lockspace.c index 6353a8384520..18e4a17b9bee 100644 --- a/fs/dlm/lockspace.c +++ b/fs/dlm/lockspace.c @@ -166,9 +166,7 @@ static struct kobj_type dlm_ktype = { .release = lockspace_kobj_release, }; -static struct kset dlm_kset = { - .ktype = &dlm_ktype, -}; +static struct kset dlm_kset; static int kobject_setup(struct dlm_ls *ls) { @@ -228,7 +226,7 @@ int dlm_lockspace_init(void) spin_lock_init(&lslist_lock); kobject_set_name(&dlm_kset.kobj, "dlm"); - kobj_set_kset_s(&dlm_kset, kernel_subsys); + dlm_kset.kobj.kset = &kernel_subsys; error = kset_register(&dlm_kset); if (error) printk("dlm_lockspace_init: cannot register kset %d\n", error); diff --git a/fs/ecryptfs/main.c b/fs/ecryptfs/main.c index f9f32472c505..fe2f44fa17cc 100644 --- a/fs/ecryptfs/main.c +++ b/fs/ecryptfs/main.c @@ -734,7 +734,7 @@ static int ecryptfs_init_kmem_caches(void) return 0; } -static decl_subsys(ecryptfs, NULL, NULL); +static decl_subsys(ecryptfs, NULL); static ssize_t version_show(struct kset *kset, char *buff) { @@ -798,6 +798,7 @@ static int do_sysfs_registration(void) { int rc; + ecryptfs_subsys.kobj.kset = &fs_subsys; rc = subsystem_register(&ecryptfs_subsys); if (rc) { printk(KERN_ERR @@ -845,7 +846,6 @@ static int __init ecryptfs_init(void) printk(KERN_ERR "Failed to register filesystem\n"); goto out_free_kmem_caches; } - kobj_set_kset_s(&ecryptfs_subsys, fs_subsys); rc = do_sysfs_registration(); if (rc) { printk(KERN_ERR "sysfs registration failed\n"); diff --git a/fs/fuse/inode.c b/fs/fuse/inode.c index 84f9f7dfdf5b..f5e4182c482e 100644 --- a/fs/fuse/inode.c +++ b/fs/fuse/inode.c @@ -744,8 +744,8 @@ static inline void unregister_fuseblk(void) } #endif -static decl_subsys(fuse, NULL, NULL); -static decl_subsys(connections, NULL, NULL); +static decl_subsys(fuse, NULL); +static decl_subsys(connections, NULL); static void fuse_inode_init_once(struct kmem_cache *cachep, void 
*foo) { @@ -795,12 +795,12 @@ static int fuse_sysfs_init(void) { int err; - kobj_set_kset_s(&fuse_subsys, fs_subsys); + fuse_subsys.kobj.kset = &fs_subsys; err = subsystem_register(&fuse_subsys); if (err) goto out_err; - kobj_set_kset_s(&connections_subsys, fuse_subsys); + connections_subsys.kobj.kset = &fuse_subsys; err = subsystem_register(&connections_subsys); if (err) goto out_fuse_unregister; diff --git a/fs/gfs2/locking/dlm/sysfs.c b/fs/gfs2/locking/dlm/sysfs.c index ae9e6a25fe2b..93e66b22757f 100644 --- a/fs/gfs2/locking/dlm/sysfs.c +++ b/fs/gfs2/locking/dlm/sysfs.c @@ -189,9 +189,7 @@ static struct kobj_type gdlm_ktype = { .sysfs_ops = &gdlm_attr_ops, }; -static struct kset gdlm_kset = { - .ktype = &gdlm_ktype, -}; +static struct kset gdlm_kset; int gdlm_kobject_setup(struct gdlm_ls *ls, struct kobject *fskobj) { @@ -224,7 +222,7 @@ int gdlm_sysfs_init(void) int error; kobject_set_name(&gdlm_kset.kobj, "lock_dlm"); - kobj_set_kset_s(&gdlm_kset, kernel_subsys); + gdlm_kset.kobj.kset = &kernel_subsys; error = kset_register(&gdlm_kset); if (error) printk("lock_dlm: cannot register kset %d\n", error); diff --git a/fs/gfs2/sys.c b/fs/gfs2/sys.c index 06e0b7768d97..d7fa54443f0c 100644 --- a/fs/gfs2/sys.c +++ b/fs/gfs2/sys.c @@ -221,9 +221,7 @@ static struct kobj_type gfs2_ktype = { .sysfs_ops = &gfs2_attr_ops, }; -static struct kset gfs2_kset = { - .ktype = &gfs2_ktype, -}; +static struct kset gfs2_kset; /* * display struct lm_lockstruct fields @@ -551,7 +549,7 @@ int gfs2_sys_init(void) gfs2_sys_margs = NULL; spin_lock_init(&gfs2_sys_margs_lock); kobject_set_name(&gfs2_kset.kobj, "gfs2"); - kobj_set_kset_s(&gfs2_kset, fs_subsys); + gfs2_kset.kobj.kset = &fs_subsys; return kset_register(&gfs2_kset); } diff --git a/fs/namespace.c b/fs/namespace.c index 06083885b21e..a4a3f70e7e26 100644 --- a/fs/namespace.c +++ b/fs/namespace.c @@ -41,7 +41,7 @@ static struct kmem_cache *mnt_cache __read_mostly; static struct rw_semaphore namespace_sem; /* /sys/fs */ -decl_subsys(fs, NULL, NULL); +decl_subsys(fs, NULL); EXPORT_SYMBOL_GPL(fs_subsys); static inline unsigned long hash(struct vfsmount *mnt, struct dentry *dentry) diff --git a/fs/ocfs2/cluster/masklog.c b/fs/ocfs2/cluster/masklog.c index a4882c8df945..dead319932b3 100644 --- a/fs/ocfs2/cluster/masklog.c +++ b/fs/ocfs2/cluster/masklog.c @@ -157,7 +157,7 @@ int mlog_sys_init(struct kset *o2cb_subsys) mlog_attr_ptrs[i] = NULL; kobject_set_name(&mlog_kset.kobj, "logmask"); - kobj_set_kset_s(&mlog_kset, *o2cb_subsys); + mlog_kset.kobj.kset = o2cb_subsys; return kset_register(&mlog_kset); } diff --git a/fs/ocfs2/cluster/sys.c b/fs/ocfs2/cluster/sys.c index 64f6f378fd09..880d0138bb0a 100644 --- a/fs/ocfs2/cluster/sys.c +++ b/fs/ocfs2/cluster/sys.c @@ -72,7 +72,7 @@ static struct kobj_type o2cb_subsys_type = { }; /* gives us o2cb_subsys */ -static decl_subsys(o2cb, NULL, NULL); +static decl_subsys(o2cb, NULL); static ssize_t o2cb_show(struct kobject * kobj, struct attribute * attr, char * buffer) diff --git a/fs/sysfs/file.c b/fs/sysfs/file.c index 09a0611b3364..387a63662793 100644 --- a/fs/sysfs/file.c +++ b/fs/sysfs/file.c @@ -365,9 +365,7 @@ static int sysfs_open_file(struct inode *inode, struct file *file) /* if the kobject has no ktype, then we assume that it is a subsystem * itself, and use ops for it. 
*/ - if (kobj->kset && kobj->kset->ktype) - ops = kobj->kset->ktype->sysfs_ops; - else if (kobj->ktype) + if (kobj->ktype) ops = kobj->ktype->sysfs_ops; else ops = &subsys_sysfs_ops; diff --git a/include/linux/kobject.h b/include/linux/kobject.h index e2b8c3dae425..5031565ab30d 100644 --- a/include/linux/kobject.h +++ b/include/linux/kobject.h @@ -135,7 +135,6 @@ struct kset_uevent_ops { * define the attribute callbacks and other common events that happen to * a kobject. * - * @ktype: the struct kobj_type for this specific kset * @list: the list of all kobjects for this kset * @list_lock: a lock for iterating over the kobjects * @kobj: the embedded kobject for this kset (recursion, isn't it fun...) @@ -145,7 +144,6 @@ struct kset_uevent_ops { * desired. */ struct kset { - struct kobj_type *ktype; struct list_head list; spinlock_t list_lock; struct kobject kobj; @@ -173,12 +171,9 @@ static inline void kset_put(struct kset * k) kobject_put(&k->kobj); } -static inline struct kobj_type * get_ktype(struct kobject * k) +static inline struct kobj_type *get_ktype(struct kobject *kobj) { - if (k->kset && k->kset->ktype) - return k->kset->ktype; - else - return k->ktype; + return kobj->ktype; } extern struct kobject * kset_find_obj(struct kset *, const char *); @@ -191,16 +186,14 @@ extern struct kobject * kset_find_obj(struct kset *, const char *); #define set_kset_name(str) .kset = { .kobj = { .k_name = str } } -#define decl_subsys(_name,_type,_uevent_ops) \ +#define decl_subsys(_name,_uevent_ops) \ struct kset _name##_subsys = { \ .kobj = { .k_name = __stringify(_name) }, \ - .ktype = _type, \ .uevent_ops =_uevent_ops, \ } -#define decl_subsys_name(_varname,_name,_type,_uevent_ops) \ +#define decl_subsys_name(_varname,_name,_uevent_ops) \ struct kset _varname##_subsys = { \ .kobj = { .k_name = __stringify(_name) }, \ - .ktype = _type, \ .uevent_ops =_uevent_ops, \ } diff --git a/kernel/ksysfs.c b/kernel/ksysfs.c index 65daa5373ca6..094e2bc101a8 100644 --- a/kernel/ksysfs.c +++ b/kernel/ksysfs.c @@ -94,7 +94,7 @@ static struct bin_attribute notes_attr = { .read = ¬es_read, }; -decl_subsys(kernel, NULL, NULL); +decl_subsys(kernel, NULL); EXPORT_SYMBOL_GPL(kernel_subsys); static struct attribute * kernel_attrs[] = { diff --git a/kernel/module.c b/kernel/module.c index c2e3e2e98801..68df79738b3b 100644 --- a/kernel/module.c +++ b/kernel/module.c @@ -1223,7 +1223,7 @@ int mod_sysfs_init(struct module *mod) err = kobject_set_name(&mod->mkobj.kobj, "%s", mod->name); if (err) goto out; - kobj_set_kset_s(&mod->mkobj, module_subsys); + mod->mkobj.kobj.kset = &module_subsys; mod->mkobj.mod = mod; kobject_init(&mod->mkobj.kobj); diff --git a/kernel/params.c b/kernel/params.c index 7686417ee00e..9f051824097d 100644 --- a/kernel/params.c +++ b/kernel/params.c @@ -30,6 +30,8 @@ #define DEBUGP(fmt, a...) 
#endif +static struct kobj_type module_ktype; + static inline char dash2underscore(char c) { if (c == '-') @@ -560,7 +562,8 @@ static void __init kernel_param_sysfs_setup(const char *name, BUG_ON(!mk); mk->mod = THIS_MODULE; - kobj_set_kset_s(mk, module_subsys); + mk->kobj.kset = &module_subsys; + mk->kobj.ktype = &module_ktype; kobject_set_name(&mk->kobj, name); kobject_init(&mk->kobj); ret = kobject_add(&mk->kobj); @@ -679,8 +682,6 @@ static struct sysfs_ops module_sysfs_ops = { .store = module_attr_store, }; -static struct kobj_type module_ktype; - static int uevent_filter(struct kset *kset, struct kobject *kobj) { struct kobj_type *ktype = get_ktype(kobj); @@ -694,7 +695,7 @@ static struct kset_uevent_ops module_uevent_ops = { .filter = uevent_filter, }; -decl_subsys(module, &module_ktype, &module_uevent_ops); +decl_subsys(module, &module_uevent_ops); int module_sysfs_initialized; static void module_release(struct kobject *kobj) diff --git a/kernel/power/main.c b/kernel/power/main.c index f71c9504a5c5..1ef31c91ce0e 100644 --- a/kernel/power/main.c +++ b/kernel/power/main.c @@ -276,7 +276,7 @@ EXPORT_SYMBOL(pm_suspend); #endif /* CONFIG_SUSPEND */ -decl_subsys(power,NULL,NULL); +decl_subsys(power, NULL); /** diff --git a/mm/slub.c b/mm/slub.c index 474945ecd89d..40bdf41035e5 100644 --- a/mm/slub.c +++ b/mm/slub.c @@ -3962,7 +3962,7 @@ static struct kset_uevent_ops slab_uevent_ops = { .filter = uevent_filter, }; -static decl_subsys(slab, &slab_ktype, &slab_uevent_ops); +static decl_subsys(slab, &slab_uevent_ops); #define ID_STR_LENGTH 64 @@ -4025,8 +4025,9 @@ static int sysfs_slab_add(struct kmem_cache *s) name = create_unique_id(s); } - kobj_set_kset_s(s, slab_subsys); kobject_set_name(&s->kobj, name); + s->kobj.kset = &slab_subsys; + s->kobj.ktype = &slab_ktype; kobject_init(&s->kobj); err = kobject_add(&s->kobj); if (err) diff --git a/security/inode.c b/security/inode.c index b28a8acae34d..9e42f5f705b2 100644 --- a/security/inode.c +++ b/security/inode.c @@ -315,13 +315,13 @@ void securityfs_remove(struct dentry *dentry) } EXPORT_SYMBOL_GPL(securityfs_remove); -static decl_subsys(security, NULL, NULL); +static decl_subsys(security, NULL); static int __init securityfs_init(void) { int retval; - kobj_set_kset_s(&security_subsys, kernel_subsys); + security_subsys.kobj.kset = &kernel_subsys; retval = subsystem_register(&security_subsys); if (retval) return retval; -- cgit v1.2.3 From 4ff6abff832fbc6cb1d769f6106c841bc2b09f63 Mon Sep 17 00:00:00 2001 From: Greg Kroah-Hartman Date: Mon, 5 Nov 2007 22:24:43 -0800 Subject: kobject: get rid of kobject_add_dir kobject_create_and_add is the same as kobject_add_dir, so drop kobject_add_dir. 
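The two helpers take the same name/parent pair, just in the opposite order, and both return the new kobject or NULL on failure, so call sites convert mechanically (condensed from the fs/partitions/check.c hunk below):

    /* before */
    disk->holder_dir = kobject_add_dir(k, "holders");

    /* after: name first, parent second; NULL still signals failure */
    disk->holder_dir = kobject_create_and_add("holders", k);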
Cc: Kay Sievers Signed-off-by: Greg Kroah-Hartman --- drivers/base/core.c | 3 ++- fs/partitions/check.c | 6 +++--- include/linux/kobject.h | 1 - kernel/module.c | 6 +++--- lib/kobject.c | 12 ------------ 5 files changed, 8 insertions(+), 20 deletions(-) (limited to 'kernel') diff --git a/drivers/base/core.c b/drivers/base/core.c index c8f2ac03d46d..992eba3289bd 100644 --- a/drivers/base/core.c +++ b/drivers/base/core.c @@ -562,7 +562,8 @@ static struct kobject *virtual_device_parent(struct device *dev) static struct kobject *virtual_dir = NULL; if (!virtual_dir) - virtual_dir = kobject_add_dir(&devices_subsys.kobj, "virtual"); + virtual_dir = kobject_create_and_add("virtual", + &devices_subsys.kobj); return virtual_dir; } diff --git a/fs/partitions/check.c b/fs/partitions/check.c index 722e12e5acc7..69685bb51c62 100644 --- a/fs/partitions/check.c +++ b/fs/partitions/check.c @@ -335,7 +335,7 @@ static inline void partition_sysfs_add_subdir(struct hd_struct *p) struct kobject *k; k = kobject_get(&p->kobj); - p->holder_dir = kobject_add_dir(k, "holders"); + p->holder_dir = kobject_create_and_add("holders", k); kobject_put(k); } @@ -344,8 +344,8 @@ static inline void disk_sysfs_add_subdirs(struct gendisk *disk) struct kobject *k; k = kobject_get(&disk->kobj); - disk->holder_dir = kobject_add_dir(k, "holders"); - disk->slave_dir = kobject_add_dir(k, "slaves"); + disk->holder_dir = kobject_create_and_add("holders", k); + disk->slave_dir = kobject_create_and_add("slaves", k); kobject_put(k); } diff --git a/include/linux/kobject.h b/include/linux/kobject.h index 33e7a6142a75..7b09136fb211 100644 --- a/include/linux/kobject.h +++ b/include/linux/kobject.h @@ -105,7 +105,6 @@ extern void kobject_put(struct kobject *); extern struct kobject *kobject_kset_add_dir(struct kset *kset, struct kobject *, const char *); -extern struct kobject *kobject_add_dir(struct kobject *, const char *); extern char * kobject_get_path(struct kobject *, gfp_t); diff --git a/kernel/module.c b/kernel/module.c index 68df79738b3b..55142775c581 100644 --- a/kernel/module.c +++ b/kernel/module.c @@ -1122,7 +1122,7 @@ static void add_notes_attrs(struct module *mod, unsigned int nsect, ++loaded; } - notes_attrs->dir = kobject_add_dir(&mod->mkobj.kobj, "notes"); + notes_attrs->dir = kobject_create_and_add("notes", &mod->mkobj.kobj); if (!notes_attrs->dir) goto out; @@ -1243,7 +1243,7 @@ int mod_sysfs_setup(struct module *mod, if (err) goto out; - mod->holders_dir = kobject_add_dir(&mod->mkobj.kobj, "holders"); + mod->holders_dir = kobject_create_and_add("holders", &mod->mkobj.kobj); if (!mod->holders_dir) { err = -ENOMEM; goto out_unreg; @@ -2521,7 +2521,7 @@ static void module_create_drivers_dir(struct module_kobject *mk) if (!mk || mk->drivers_dir) return; - mk->drivers_dir = kobject_add_dir(&mk->kobj, "drivers"); + mk->drivers_dir = kobject_create_and_add("drivers", &mk->kobj); } void module_add_driver(struct module *mod, struct device_driver *drv) diff --git a/lib/kobject.c b/lib/kobject.c index 98422a3eeffc..96b61d9a9284 100644 --- a/lib/kobject.c +++ b/lib/kobject.c @@ -712,18 +712,6 @@ struct kobject *kobject_kset_add_dir(struct kset *kset, return k; } -/** - * kobject_add_dir - add sub directory of object. - * @parent: object in which a directory is created. - * @name: directory name. - * - * Add a plain directory object as child of given object. 
- */ -struct kobject *kobject_add_dir(struct kobject *parent, const char *name) -{ - return kobject_create_and_add(name, parent); -} - /** * kset_init - initialize a kset for use * @k: kset -- cgit v1.2.3 From bd35b93d8049ab47b5bfaf6b10ba39badf21d1c3 Mon Sep 17 00:00:00 2001 From: Greg Kroah-Hartman Date: Mon, 29 Oct 2007 20:13:17 +0100 Subject: kset: convert kernel_subsys to use kset_create Dynamically create the kset instead of declaring it statically. We also rename kernel_subsys to kernel_kset to catch all users of this symbol with a build error instead of an easy-to-ignore build warning. Cc: Kay Sievers Signed-off-by: Greg Kroah-Hartman --- fs/configfs/mount.c | 2 +- fs/debugfs/inode.c | 2 +- fs/dlm/lockspace.c | 2 +- fs/gfs2/locking/dlm/sysfs.c | 3 +-- include/linux/kobject.h | 4 ++-- kernel/ksysfs.c | 42 ++++++++++++++++++++++++++++++------------ kernel/user.c | 4 ++-- security/inode.c | 2 +- 8 files changed, 39 insertions(+), 22 deletions(-) (limited to 'kernel') diff --git a/fs/configfs/mount.c b/fs/configfs/mount.c index 13300466464b..c4ee7f05de8b 100644 --- a/fs/configfs/mount.c +++ b/fs/configfs/mount.c @@ -140,7 +140,7 @@ static int __init configfs_init(void) if (!configfs_dir_cachep) goto out; - config_kobj = kobject_create_and_add("config", &kernel_subsys.kobj); + config_kobj = kobject_create_and_add("config", &kernel_kset->kobj); if (!config_kobj) { kmem_cache_destroy(configfs_dir_cachep); configfs_dir_cachep = NULL; diff --git a/fs/debugfs/inode.c b/fs/debugfs/inode.c index 667214200b03..5ce92c3d3b59 100644 --- a/fs/debugfs/inode.c +++ b/fs/debugfs/inode.c @@ -432,7 +432,7 @@ static int __init debugfs_init(void) { int retval; - debug_kobj = kobject_create_and_add("debug", &kernel_subsys.kobj); + debug_kobj = kobject_create_and_add("debug", &kernel_kset->kobj); if (!debug_kobj) return -EINVAL; diff --git a/fs/dlm/lockspace.c b/fs/dlm/lockspace.c index 83a9c4dd5114..0828beb2d35d 100644 --- a/fs/dlm/lockspace.c +++ b/fs/dlm/lockspace.c @@ -223,7 +223,7 @@ int dlm_lockspace_init(void) INIT_LIST_HEAD(&lslist); spin_lock_init(&lslist_lock); - dlm_kset = kset_create_and_add("dlm", NULL, &kernel_subsys.kobj); + dlm_kset = kset_create_and_add("dlm", NULL, &kernel_kset->kobj); if (!dlm_kset) { printk(KERN_WARNING "%s: can not create kset\n", __FUNCTION__); return -ENOMEM; diff --git a/fs/gfs2/locking/dlm/sysfs.c b/fs/gfs2/locking/dlm/sysfs.c index 0a8614088ec6..1a92b6f7bc10 100644 --- a/fs/gfs2/locking/dlm/sysfs.c +++ b/fs/gfs2/locking/dlm/sysfs.c @@ -219,8 +219,7 @@ void gdlm_kobject_release(struct gdlm_ls *ls) int gdlm_sysfs_init(void) { - gdlm_kset = kset_create_and_add("lock_dlm", NULL, - &kernel_subsys.kobj); + gdlm_kset = kset_create_and_add("lock_dlm", NULL, &kernel_kset->kobj); if (!gdlm_kset) { printk(KERN_WARNING "%s: can not create kset\n", __FUNCTION__); return -ENOMEM; diff --git a/include/linux/kobject.h b/include/linux/kobject.h index 390ae14b73e8..bd741e86c11e 100644 --- a/include/linux/kobject.h +++ b/include/linux/kobject.h @@ -194,8 +194,8 @@ struct kset _name##_subsys = { \ .uevent_ops =_uevent_ops, \ } -/* The global /sys/kernel/ subsystem for people to chain off of */ -extern struct kset kernel_subsys; +/* The global /sys/kernel/ kset for people to chain off of */ +extern struct kset *kernel_kset; /* The global /sys/hypervisor/ subsystem */ extern struct kset hypervisor_subsys; diff --git a/kernel/ksysfs.c b/kernel/ksysfs.c index 094e2bc101a8..cf02d4ba9add 100644 --- a/kernel/ksysfs.c +++ b/kernel/ksysfs.c @@ -94,8 +94,8 @@ static struct bin_attribute 
notes_attr = { .read = ¬es_read, }; -decl_subsys(kernel, NULL); -EXPORT_SYMBOL_GPL(kernel_subsys); +struct kset *kernel_kset; +EXPORT_SYMBOL_GPL(kernel_kset); static struct attribute * kernel_attrs[] = { #if defined(CONFIG_HOTPLUG) && defined(CONFIG_NET) @@ -116,24 +116,42 @@ static struct attribute_group kernel_attr_group = { static int __init ksysfs_init(void) { - int error = subsystem_register(&kernel_subsys); - if (!error) - error = sysfs_create_group(&kernel_subsys.kobj, - &kernel_attr_group); + int error; - if (!error && notes_size > 0) { + kernel_kset = kset_create_and_add("kernel", NULL, NULL); + if (!kernel_kset) { + error = -ENOMEM; + goto exit; + } + error = sysfs_create_group(&kernel_kset->kobj, &kernel_attr_group); + if (error) + goto kset_exit; + + if (notes_size > 0) { notes_attr.size = notes_size; - error = sysfs_create_bin_file(&kernel_subsys.kobj, - ¬es_attr); + error = sysfs_create_bin_file(&kernel_kset->kobj, ¬es_attr); + if (error) + goto group_exit; } /* * Create "/sys/kernel/uids" directory and corresponding root user's * directory under it. */ - if (!error) - error = uids_kobject_init(); - + error = uids_kobject_init(); + if (error) + goto notes_exit; + + return 0; + +notes_exit: + if (notes_size > 0) + sysfs_remove_bin_file(&kernel_kset->kobj, ¬es_attr); +group_exit: + sysfs_remove_group(&kernel_kset->kobj, &kernel_attr_group); +kset_exit: + kset_unregister(kernel_kset); +exit: return error; } diff --git a/kernel/user.c b/kernel/user.c index 8320a87f3e5a..80f1116b8fcd 100644 --- a/kernel/user.c +++ b/kernel/user.c @@ -198,8 +198,8 @@ int __init uids_kobject_init(void) int error; /* create under /sys/kernel dir */ - uids_kobject.parent = &kernel_subsys.kobj; - uids_kobject.kset = &kernel_subsys; + uids_kobject.parent = &kernel_kset->kobj; + uids_kobject.kset = kernel_kset; kobject_set_name(&uids_kobject, "uids"); kobject_init(&uids_kobject); diff --git a/security/inode.c b/security/inode.c index dfc5978d4298..dbe040ac0549 100644 --- a/security/inode.c +++ b/security/inode.c @@ -321,7 +321,7 @@ static int __init securityfs_init(void) { int retval; - security_kobj = kobject_create_and_add("security", &kernel_subsys.kobj); + security_kobj = kobject_create_and_add("security", &kernel_kset->kobj); if (!security_kobj) return -EINVAL; -- cgit v1.2.3 From 7405c1e15edfe43b137bfbc5882f1af34d6d414d Mon Sep 17 00:00:00 2001 From: Greg Kroah-Hartman Date: Thu, 1 Nov 2007 10:39:50 -0700 Subject: kset: convert /sys/module to use kset_create Dynamically create the kset instead of declaring it statically. We also rename module_subsys to module_kset to catch all users of the variable. 
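The shape of the conversion is easiest to see in the kernel/params.c hunk below: the static decl_subsys() declaration plus the subsystem_register() call collapse into a single kset_create_and_add(), with a NULL parent placing the kset at the top of sysfs (condensed sketch):

    /* before */
    decl_subsys(module, &module_uevent_ops);
    ...
    ret = subsystem_register(&module_subsys);

    /* after: allocate, name and register in one step -> /sys/module */
    module_kset = kset_create_and_add("module", &module_uevent_ops, NULL);
    if (!module_kset)
        return -ENOMEM;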
Cc: Kay Sievers Signed-off-by: Greg Kroah-Hartman --- include/linux/module.h | 4 +++- kernel/module.c | 7 +++---- kernel/params.c | 29 +++++++++-------------------- 3 files changed, 15 insertions(+), 25 deletions(-) (limited to 'kernel') diff --git a/include/linux/module.h b/include/linux/module.h index 2cbc0b87e329..fbe930b9b69c 100644 --- a/include/linux/module.h +++ b/include/linux/module.h @@ -574,7 +574,9 @@ struct device_driver; #ifdef CONFIG_SYSFS struct module; -extern struct kset module_subsys; +extern struct kset *module_kset; +extern struct kobj_type module_ktype; +extern int module_sysfs_initialized; int mod_sysfs_init(struct module *mod); int mod_sysfs_setup(struct module *mod, diff --git a/kernel/module.c b/kernel/module.c index 55142775c581..d03fcd9d652c 100644 --- a/kernel/module.c +++ b/kernel/module.c @@ -47,8 +47,6 @@ #include #include -extern int module_sysfs_initialized; - #if 0 #define DEBUGP printk #else @@ -1223,7 +1221,8 @@ int mod_sysfs_init(struct module *mod) err = kobject_set_name(&mod->mkobj.kobj, "%s", mod->name); if (err) goto out; - mod->mkobj.kobj.kset = &module_subsys; + mod->mkobj.kobj.kset = module_kset; + mod->mkobj.kobj.ktype = &module_ktype; mod->mkobj.mod = mod; kobject_init(&mod->mkobj.kobj); @@ -2539,7 +2538,7 @@ void module_add_driver(struct module *mod, struct device_driver *drv) struct kobject *mkobj; /* Lookup built-in module entry in /sys/modules */ - mkobj = kset_find_obj(&module_subsys, drv->mod_name); + mkobj = kset_find_obj(module_kset, drv->mod_name); if (mkobj) { mk = container_of(mkobj, struct module_kobject, kobj); /* remember our module structure */ diff --git a/kernel/params.c b/kernel/params.c index 9f051824097d..97e092312155 100644 --- a/kernel/params.c +++ b/kernel/params.c @@ -30,8 +30,6 @@ #define DEBUGP(fmt, a...) 
#endif -static struct kobj_type module_ktype; - static inline char dash2underscore(char c) { if (c == '-') @@ -562,7 +560,7 @@ static void __init kernel_param_sysfs_setup(const char *name, BUG_ON(!mk); mk->mod = THIS_MODULE; - mk->kobj.kset = &module_subsys; + mk->kobj.kset = module_kset; mk->kobj.ktype = &module_ktype; kobject_set_name(&mk->kobj, name); kobject_init(&mk->kobj); @@ -695,7 +693,7 @@ static struct kset_uevent_ops module_uevent_ops = { .filter = uevent_filter, }; -decl_subsys(module, &module_uevent_ops); +struct kset *module_kset; int module_sysfs_initialized; static void module_release(struct kobject *kobj) @@ -707,7 +705,7 @@ static void module_release(struct kobject *kobj) */ } -static struct kobj_type module_ktype = { +struct kobj_type module_ktype = { .sysfs_ops = &module_sysfs_ops, .release = module_release, }; @@ -717,13 +715,11 @@ static struct kobj_type module_ktype = { */ static int __init param_sysfs_init(void) { - int ret; - - ret = subsystem_register(&module_subsys); - if (ret < 0) { - printk(KERN_WARNING "%s (%d): subsystem_register error: %d\n", - __FILE__, __LINE__, ret); - return ret; + module_kset = kset_create_and_add("module", &module_uevent_ops, NULL); + if (!module_kset) { + printk(KERN_WARNING "%s (%d): error creating kset\n", + __FILE__, __LINE__); + return -ENOMEM; } module_sysfs_initialized = 1; @@ -733,14 +729,7 @@ static int __init param_sysfs_init(void) } subsys_initcall(param_sysfs_init); -#else -#if 0 -static struct sysfs_ops module_sysfs_ops = { - .show = NULL, - .store = NULL, -}; -#endif -#endif +#endif /* CONFIG_SYSFS */ EXPORT_SYMBOL(param_set_byte); EXPORT_SYMBOL(param_get_byte); -- cgit v1.2.3 From 039a5dcd2fc45188a2d522df630db4f7ef903a0f Mon Sep 17 00:00:00 2001 From: Greg Kroah-Hartman Date: Thu, 1 Nov 2007 10:39:50 -0700 Subject: kset: convert /sys/power to use kset_create Dynamically create the kset instead of declaring it statically. We also rename power_subsys to power_kset to catch all users of the variable and we properly export it so that people don't have to guess that it really is present in the system. The pseries code is wierd, why is it createing /sys/power if CONFIG_PM is disabled? Oh well, stupid big boxes ignoring config options... 
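Both init paths touched below (pseries and kernel/power/main.c) reduce to the same three steps: create the kset, fail with -ENOMEM if that did not work, then hang the attribute group off the kset's embedded kobject (sketch, taken almost verbatim from the hunks that follow):

    power_kset = kset_create_and_add("power", NULL, NULL);
    if (!power_kset)
        return -ENOMEM;
    return sysfs_create_group(&power_kset->kobj, &attr_group);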
Cc: Kay Sievers Signed-off-by: Greg Kroah-Hartman --- arch/arm/mach-omap1/pm.c | 3 +-- arch/powerpc/platforms/pseries/power.c | 14 ++++++-------- include/linux/kobject.h | 2 ++ kernel/power/disk.c | 2 +- kernel/power/main.c | 11 +++++------ kernel/power/power.h | 2 -- 6 files changed, 15 insertions(+), 19 deletions(-) (limited to 'kernel') diff --git a/arch/arm/mach-omap1/pm.c b/arch/arm/mach-omap1/pm.c index 3bf01e28df33..402113c72981 100644 --- a/arch/arm/mach-omap1/pm.c +++ b/arch/arm/mach-omap1/pm.c @@ -97,7 +97,6 @@ static struct subsys_attribute sleep_while_idle_attr = { .store = omap_pm_sleep_while_idle_store, }; -extern struct kset power_subsys; static void (*omap_sram_idle)(void) = NULL; static void (*omap_sram_suspend)(unsigned long r0, unsigned long r1) = NULL; @@ -726,7 +725,7 @@ static int __init omap_pm_init(void) omap_pm_init_proc(); #endif - error = subsys_create_file(&power_subsys, &sleep_while_idle_attr); + error = subsys_create_file(power_kset, &sleep_while_idle_attr); if (error) printk(KERN_ERR "subsys_create_file failed: %d\n", error); diff --git a/arch/powerpc/platforms/pseries/power.c b/arch/powerpc/platforms/pseries/power.c index 08d7a5007167..c36febe7ce7d 100644 --- a/arch/powerpc/platforms/pseries/power.c +++ b/arch/powerpc/platforms/pseries/power.c @@ -57,7 +57,7 @@ static struct subsys_attribute auto_poweron_attr = { }; #ifndef CONFIG_PM -decl_subsys(power, NULL); +struct kset *power_kset; static struct attribute *g[] = { &auto_poweron_attr.attr, @@ -70,18 +70,16 @@ static struct attribute_group attr_group = { static int __init pm_init(void) { - int error = subsystem_register(&power_subsys); - if (!error) - error = sysfs_create_group(&power_subsys.kobj, &attr_group); - return error; + power_kset = kset_create_and_add("power", NULL, NULL); + if (!power_kset) + return -ENOMEM; + return sysfs_create_group(&power_kset->kobj, &attr_group); } core_initcall(pm_init); #else -extern struct kset power_subsys; - static int __init apo_pm_init(void) { - return (subsys_create_file(&power_subsys, &auto_poweron_attr)); + return (subsys_create_file(power_kset, &auto_poweron_attr)); } __initcall(apo_pm_init); #endif diff --git a/include/linux/kobject.h b/include/linux/kobject.h index f2483f6fd639..a6dd669cda9d 100644 --- a/include/linux/kobject.h +++ b/include/linux/kobject.h @@ -198,6 +198,8 @@ struct kset _name##_subsys = { \ extern struct kset *kernel_kset; /* The global /sys/hypervisor/ kobject for people to chain off of */ extern struct kobject *hypervisor_kobj; +/* The global /sys/power/ kset for people to chain off of */ +extern struct kset *power_kset; extern int __must_check subsystem_register(struct kset *); extern void subsystem_unregister(struct kset *); diff --git a/kernel/power/disk.c b/kernel/power/disk.c index 05b64790fe83..c3f0e61365dd 100644 --- a/kernel/power/disk.c +++ b/kernel/power/disk.c @@ -708,7 +708,7 @@ static struct attribute_group attr_group = { static int __init pm_disk_init(void) { - return sysfs_create_group(&power_subsys.kobj, &attr_group); + return sysfs_create_group(&power_kset->kobj, &attr_group); } core_initcall(pm_disk_init); diff --git a/kernel/power/main.c b/kernel/power/main.c index 1ef31c91ce0e..dce2d76d66de 100644 --- a/kernel/power/main.c +++ b/kernel/power/main.c @@ -276,8 +276,7 @@ EXPORT_SYMBOL(pm_suspend); #endif /* CONFIG_SUSPEND */ -decl_subsys(power, NULL); - +struct kset *power_kset; /** * state - control system power state. 
@@ -386,10 +385,10 @@ static struct attribute_group attr_group = { static int __init pm_init(void) { - int error = subsystem_register(&power_subsys); - if (!error) - error = sysfs_create_group(&power_subsys.kobj,&attr_group); - return error; + power_kset = kset_create_and_add("power", NULL, NULL); + if (!power_kset) + return -ENOMEM; + return sysfs_create_group(&power_kset->kobj, &attr_group); } core_initcall(pm_init); diff --git a/kernel/power/power.h b/kernel/power/power.h index 195dc4611764..1083e6b188ab 100644 --- a/kernel/power/power.h +++ b/kernel/power/power.h @@ -63,8 +63,6 @@ static struct subsys_attribute _name##_attr = { \ .store = _name##_store, \ } -extern struct kset power_subsys; - /* Preferred image size in bytes (default 500 MB) */ extern unsigned long image_size; extern int in_suspend; -- cgit v1.2.3 From 386f275f5d097758f867bc99ddeaeb7a03b6b190 Mon Sep 17 00:00:00 2001 From: Kay Sievers Date: Fri, 2 Nov 2007 13:47:53 +0100 Subject: Driver Core: switch all dynamic ksets to kobj_sysfs_ops Switch all dynamically created ksets, that export simple attributes, to kobj_attribute from subsys_attribute. Struct subsys_attribute will be removed. Signed-off-by: Kay Sievers Cc: Mike Halcrow Cc: Phillip Hellewell Signed-off-by: Greg Kroah-Hartman --- fs/ecryptfs/main.c | 10 ++++++---- kernel/ksysfs.c | 35 +++++++++++++++++++++-------------- kernel/power/disk.c | 18 ++++++++++++------ kernel/power/main.c | 12 ++++++++---- kernel/power/power.h | 2 +- lib/kobject.c | 10 ++++++---- 6 files changed, 54 insertions(+), 33 deletions(-) (limited to 'kernel') diff --git a/fs/ecryptfs/main.c b/fs/ecryptfs/main.c index bdeac3877a84..6ded37b467ff 100644 --- a/fs/ecryptfs/main.c +++ b/fs/ecryptfs/main.c @@ -736,12 +736,13 @@ static int ecryptfs_init_kmem_caches(void) static struct kset *ecryptfs_kset; -static ssize_t version_show(struct kset *kset, char *buff) +static ssize_t version_show(struct kobject *kobj, + struct kobj_attribute *attr, char *buff) { return snprintf(buff, PAGE_SIZE, "%d\n", ECRYPTFS_VERSIONING_MASK); } -static struct subsys_attribute version_attr = __ATTR_RO(version); +static struct kobj_attribute version_attr = __ATTR_RO(version); static struct ecryptfs_version_str_map_elem { u32 flag; @@ -755,7 +756,8 @@ static struct ecryptfs_version_str_map_elem { {ECRYPTFS_VERSIONING_MULTKEY, "multiple keys per file"} }; -static ssize_t version_str_show(struct kset *kset, char *buff) +static ssize_t version_str_show(struct kobject *kobj, + struct kobj_attribute *attr, char *buff) { int i; int remaining = PAGE_SIZE; @@ -782,7 +784,7 @@ out: return total_written; } -static struct subsys_attribute version_attr_str = __ATTR_RO(version_str); +static struct kobj_attribute version_attr_str = __ATTR_RO(version_str); static struct attribute *attributes[] = { &version_attr.attr, diff --git a/kernel/ksysfs.c b/kernel/ksysfs.c index cf02d4ba9add..dd0f9e7f3414 100644 --- a/kernel/ksysfs.c +++ b/kernel/ksysfs.c @@ -17,30 +17,34 @@ #include #define KERNEL_ATTR_RO(_name) \ -static struct subsys_attribute _name##_attr = __ATTR_RO(_name) +static struct kobj_attribute _name##_attr = __ATTR_RO(_name) #define KERNEL_ATTR_RW(_name) \ -static struct subsys_attribute _name##_attr = \ +static struct kobj_attribute _name##_attr = \ __ATTR(_name, 0644, _name##_show, _name##_store) #if defined(CONFIG_HOTPLUG) && defined(CONFIG_NET) /* current uevent sequence number */ -static ssize_t uevent_seqnum_show(struct kset *kset, char *page) +static ssize_t uevent_seqnum_show(struct kobject *kobj, + struct kobj_attribute 
*attr, char *buf) { - return sprintf(page, "%llu\n", (unsigned long long)uevent_seqnum); + return sprintf(buf, "%llu\n", (unsigned long long)uevent_seqnum); } KERNEL_ATTR_RO(uevent_seqnum); /* uevent helper program, used during early boo */ -static ssize_t uevent_helper_show(struct kset *kset, char *page) +static ssize_t uevent_helper_show(struct kobject *kobj, + struct kobj_attribute *attr, char *buf) { - return sprintf(page, "%s\n", uevent_helper); + return sprintf(buf, "%s\n", uevent_helper); } -static ssize_t uevent_helper_store(struct kset *kset, const char *page, size_t count) +static ssize_t uevent_helper_store(struct kobject *kobj, + struct kobj_attribute *attr, + const char *buf, size_t count) { if (count+1 > UEVENT_HELPER_PATH_LEN) return -ENOENT; - memcpy(uevent_helper, page, count); + memcpy(uevent_helper, buf, count); uevent_helper[count] = '\0'; if (count && uevent_helper[count-1] == '\n') uevent_helper[count-1] = '\0'; @@ -50,21 +54,24 @@ KERNEL_ATTR_RW(uevent_helper); #endif #ifdef CONFIG_KEXEC -static ssize_t kexec_loaded_show(struct kset *kset, char *page) +static ssize_t kexec_loaded_show(struct kobject *kobj, + struct kobj_attribute *attr, char *buf) { - return sprintf(page, "%d\n", !!kexec_image); + return sprintf(buf, "%d\n", !!kexec_image); } KERNEL_ATTR_RO(kexec_loaded); -static ssize_t kexec_crash_loaded_show(struct kset *kset, char *page) +static ssize_t kexec_crash_loaded_show(struct kobject *kobj, + struct kobj_attribute *attr, char *buf) { - return sprintf(page, "%d\n", !!kexec_crash_image); + return sprintf(buf, "%d\n", !!kexec_crash_image); } KERNEL_ATTR_RO(kexec_crash_loaded); -static ssize_t vmcoreinfo_show(struct kset *kset, char *page) +static ssize_t vmcoreinfo_show(struct kobject *kobj, + struct kobj_attribute *attr, char *buf) { - return sprintf(page, "%lx %x\n", + return sprintf(buf, "%lx %x\n", paddr_vmcoreinfo_note(), (unsigned int)vmcoreinfo_max_size); } diff --git a/kernel/power/disk.c b/kernel/power/disk.c index c3f0e61365dd..ef5aa2ca0ab0 100644 --- a/kernel/power/disk.c +++ b/kernel/power/disk.c @@ -567,7 +567,8 @@ static const char * const hibernation_modes[] = { * supports it (as determined by having hibernation_ops). 
*/ -static ssize_t disk_show(struct kset *kset, char *buf) +static ssize_t disk_show(struct kobject *kobj, struct kobj_attribute *attr, + char *buf) { int i; char *start = buf; @@ -597,7 +598,8 @@ static ssize_t disk_show(struct kset *kset, char *buf) } -static ssize_t disk_store(struct kset *kset, const char *buf, size_t n) +static ssize_t disk_store(struct kobject *kobj, struct kobj_attribute *attr, + const char *buf, size_t n) { int error = 0; int i; @@ -642,13 +644,15 @@ static ssize_t disk_store(struct kset *kset, const char *buf, size_t n) power_attr(disk); -static ssize_t resume_show(struct kset *kset, char *buf) +static ssize_t resume_show(struct kobject *kobj, struct kobj_attribute *attr, + char *buf) { return sprintf(buf,"%d:%d\n", MAJOR(swsusp_resume_device), MINOR(swsusp_resume_device)); } -static ssize_t resume_store(struct kset *kset, const char *buf, size_t n) +static ssize_t resume_store(struct kobject *kobj, struct kobj_attribute *attr, + const char *buf, size_t n) { unsigned int maj, min; dev_t res; @@ -674,12 +678,14 @@ static ssize_t resume_store(struct kset *kset, const char *buf, size_t n) power_attr(resume); -static ssize_t image_size_show(struct kset *kset, char *buf) +static ssize_t image_size_show(struct kobject *kobj, struct kobj_attribute *attr, + char *buf) { return sprintf(buf, "%lu\n", image_size); } -static ssize_t image_size_store(struct kset *kset, const char *buf, size_t n) +static ssize_t image_size_store(struct kobject *kobj, struct kobj_attribute *attr, + const char *buf, size_t n) { unsigned long size; diff --git a/kernel/power/main.c b/kernel/power/main.c index dce2d76d66de..b8139493b856 100644 --- a/kernel/power/main.c +++ b/kernel/power/main.c @@ -289,7 +289,8 @@ struct kset *power_kset; * proper enumerated value, and initiates a suspend transition. 
*/ -static ssize_t state_show(struct kset *kset, char *buf) +static ssize_t state_show(struct kobject *kobj, struct kobj_attribute *attr, + char *buf) { char *s = buf; #ifdef CONFIG_SUSPEND @@ -310,7 +311,8 @@ static ssize_t state_show(struct kset *kset, char *buf) return (s - buf); } -static ssize_t state_store(struct kset *kset, const char *buf, size_t n) +static ssize_t state_store(struct kobject *kobj, struct kobj_attribute *attr, + const char *buf, size_t n) { #ifdef CONFIG_SUSPEND suspend_state_t state = PM_SUSPEND_STANDBY; @@ -347,13 +349,15 @@ power_attr(state); #ifdef CONFIG_PM_TRACE int pm_trace_enabled; -static ssize_t pm_trace_show(struct kset *kset, char *buf) +static ssize_t pm_trace_show(struct kobject *kobj, struct kobj_attribute *attr, + char *buf) { return sprintf(buf, "%d\n", pm_trace_enabled); } static ssize_t -pm_trace_store(struct kset *kset, const char *buf, size_t n) +pm_trace_store(struct kobject *kobj, struct kobj_attribute *attr, + const char *buf, size_t n) { int val; diff --git a/kernel/power/power.h b/kernel/power/power.h index 1083e6b188ab..2093c3a9a994 100644 --- a/kernel/power/power.h +++ b/kernel/power/power.h @@ -54,7 +54,7 @@ extern int pfn_is_nosave(unsigned long); extern struct mutex pm_mutex; #define power_attr(_name) \ -static struct subsys_attribute _name##_attr = { \ +static struct kobj_attribute _name##_attr = { \ .attr = { \ .name = __stringify(_name), \ .mode = 0644, \ diff --git a/lib/kobject.c b/lib/kobject.c index 1c343fe4ba63..99f6354a5751 100644 --- a/lib/kobject.c +++ b/lib/kobject.c @@ -626,7 +626,8 @@ static void dynamic_kobj_release(struct kobject *kobj) } static struct kobj_type dynamic_kobj_ktype = { - .release = dynamic_kobj_release, + .release = dynamic_kobj_release, + .sysfs_ops = &kobj_sysfs_ops, }; /** @@ -836,7 +837,8 @@ static void kset_release(struct kobject *kobj) kfree(kset); } -static struct kobj_type kset_type = { +static struct kobj_type kset_ktype = { + .sysfs_ops = &kobj_sysfs_ops, .release = kset_release, }; @@ -869,11 +871,11 @@ static struct kset *kset_create(const char *name, kset->kobj.parent = parent_kobj; /* - * The kobject of this kset will have a type of kset_type and belong to + * The kobject of this kset will have a type of kset_ktype and belong to * no kset itself. That way we can properly free it when it is * finished being used. */ - kset->kobj.ktype = &kset_type; + kset->kobj.ktype = &kset_ktype; kset->kobj.kset = NULL; return kset; -- cgit v1.2.3 From eb41d9465cdafee45e0cb30f3b7338646221908e Mon Sep 17 00:00:00 2001 From: Kay Sievers Date: Fri, 2 Nov 2007 13:47:53 +0100 Subject: fix struct user_info export's sysfs interaction Clean up the use of ksets and kobjects. Kobjects are instances of objects (like struct user_info), ksets are collections of objects of a similar type (like the uids directory containing the user_info directories). So, use kobjects for the user_info directories, and a kset for the "uids" directory. On object cleanup, the final kobject_put() was missing. 
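With the per-uid directories now plain kobjects, teardown in kernel/user.c mirrors setup and ends with the reference drop that used to be missing (sketch, condensed from the remove_user_sysfs_dir() hunk below):

    kobject_uevent(&up->kobj, KOBJ_REMOVE);
    kobject_del(&up->kobj);   /* unlink from sysfs */
    kobject_put(&up->kobj);   /* drop the final reference */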
Cc: Dhaval Giani Cc: Srivatsa Vaddagiri Signed-off-by: Kay Sievers Signed-off-by: Greg Kroah-Hartman --- include/linux/sched.h | 9 +---- kernel/ksysfs.c | 7 +--- kernel/user.c | 104 +++++++++++++++++++++++++------------------------- 3 files changed, 55 insertions(+), 65 deletions(-) (limited to 'kernel') diff --git a/include/linux/sched.h b/include/linux/sched.h index cc14656f8682..d6eacda765ca 100644 --- a/include/linux/sched.h +++ b/include/linux/sched.h @@ -552,18 +552,13 @@ struct user_struct { #ifdef CONFIG_FAIR_USER_SCHED struct task_group *tg; #ifdef CONFIG_SYSFS - struct kset kset; - struct subsys_attribute user_attr; + struct kobject kobj; struct work_struct work; #endif #endif }; -#ifdef CONFIG_FAIR_USER_SCHED -extern int uids_kobject_init(void); -#else -static inline int uids_kobject_init(void) { return 0; } -#endif +extern int uids_sysfs_init(void); extern struct user_struct *find_user(uid_t); diff --git a/kernel/ksysfs.c b/kernel/ksysfs.c index dd0f9e7f3414..45e646585605 100644 --- a/kernel/ksysfs.c +++ b/kernel/ksysfs.c @@ -141,11 +141,8 @@ static int __init ksysfs_init(void) goto group_exit; } - /* - * Create "/sys/kernel/uids" directory and corresponding root user's - * directory under it. - */ - error = uids_kobject_init(); + /* create the /sys/kernel/uids/ directory */ + error = uids_sysfs_init(); if (error) goto notes_exit; diff --git a/kernel/user.c b/kernel/user.c index 80f1116b8fcd..5a106f3fdf05 100644 --- a/kernel/user.c +++ b/kernel/user.c @@ -115,7 +115,7 @@ static void sched_switch_user(struct task_struct *p) { } #if defined(CONFIG_FAIR_USER_SCHED) && defined(CONFIG_SYSFS) -static struct kobject uids_kobject; /* represents /sys/kernel/uids directory */ +static struct kset *uids_kset; /* represents the /sys/kernel/uids/ directory */ static DEFINE_MUTEX(uids_mutex); static inline void uids_mutex_lock(void) @@ -128,86 +128,84 @@ static inline void uids_mutex_unlock(void) mutex_unlock(&uids_mutex); } -/* return cpu shares held by the user */ -static ssize_t cpu_shares_show(struct kset *kset, char *buffer) +/* uid directory attributes */ +static ssize_t cpu_shares_show(struct kobject *kobj, + struct kobj_attribute *attr, + char *buf) { - struct user_struct *up = container_of(kset, struct user_struct, kset); + struct user_struct *up = container_of(kobj, struct user_struct, kobj); - return sprintf(buffer, "%lu\n", sched_group_shares(up->tg)); + return sprintf(buf, "%lu\n", sched_group_shares(up->tg)); } -/* modify cpu shares held by the user */ -static ssize_t cpu_shares_store(struct kset *kset, const char *buffer, - size_t size) +static ssize_t cpu_shares_store(struct kobject *kobj, + struct kobj_attribute *attr, + const char *buf, size_t size) { - struct user_struct *up = container_of(kset, struct user_struct, kset); + struct user_struct *up = container_of(kobj, struct user_struct, kobj); unsigned long shares; int rc; - sscanf(buffer, "%lu", &shares); + sscanf(buf, "%lu", &shares); rc = sched_group_set_shares(up->tg, shares); return (rc ? 
rc : size); } -static void user_attr_init(struct subsys_attribute *sa, char *name, int mode) +static struct kobj_attribute cpu_share_attr = + __ATTR(cpu_share, 0644, cpu_shares_show, cpu_shares_store); + +/* default attributes per uid directory */ +static struct attribute *uids_attributes[] = { + &cpu_share_attr.attr, + NULL +}; + +/* the lifetime of user_struct is not managed by the core (now) */ +static void uids_release(struct kobject *kobj) { - sa->attr.name = name; - sa->attr.mode = mode; - sa->show = cpu_shares_show; - sa->store = cpu_shares_store; + return; } -/* Create "/sys/kernel/uids/" directory and - * "/sys/kernel/uids//cpu_share" file for this user. - */ -static int user_kobject_create(struct user_struct *up) +static struct kobj_type uids_ktype = { + .sysfs_ops = &kobj_sysfs_ops, + .default_attrs = uids_attributes, + .release = uids_release, +}; + +/* create /sys/kernel/uids//cpu_share file for this user */ +static int uids_user_create(struct user_struct *up) { - struct kset *kset = &up->kset; - struct kobject *kobj = &kset->kobj; + struct kobject *kobj = &up->kobj; int error; - memset(kset, 0, sizeof(struct kset)); - kobj->parent = &uids_kobject; /* create under /sys/kernel/uids dir */ - kobject_set_name(kobj, "%d", up->uid); - kset_init(kset); - user_attr_init(&up->user_attr, "cpu_share", 0644); - + memset(kobj, 0, sizeof(struct kobject)); + kobj->ktype = &uids_ktype; + kobj->kset = uids_kset; + kobject_init(kobj); + kobject_set_name(&up->kobj, "%d", up->uid); error = kobject_add(kobj); if (error) goto done; - error = sysfs_create_file(kobj, &up->user_attr.attr); - if (error) - kobject_del(kobj); - kobject_uevent(kobj, KOBJ_ADD); - done: return error; } -/* create these in sysfs filesystem: +/* create these entries in sysfs: * "/sys/kernel/uids" directory * "/sys/kernel/uids/0" directory (for root user) * "/sys/kernel/uids/0/cpu_share" file (for root user) */ -int __init uids_kobject_init(void) +int __init uids_sysfs_init(void) { - int error; + uids_kset = kset_create_and_add("uids", NULL, &kernel_kset->kobj); + if (!uids_kset) + return -ENOMEM; - /* create under /sys/kernel dir */ - uids_kobject.parent = &kernel_kset->kobj; - uids_kobject.kset = kernel_kset; - kobject_set_name(&uids_kobject, "uids"); - kobject_init(&uids_kobject); - - error = kobject_add(&uids_kobject); - if (!error) - error = user_kobject_create(&root_user); - - return error; + return uids_user_create(&root_user); } /* work function to remove sysfs directory for a user and free up @@ -216,7 +214,6 @@ int __init uids_kobject_init(void) static void remove_user_sysfs_dir(struct work_struct *w) { struct user_struct *up = container_of(w, struct user_struct, work); - struct kobject *kobj = &up->kset.kobj; unsigned long flags; int remove_user = 0; @@ -238,9 +235,9 @@ static void remove_user_sysfs_dir(struct work_struct *w) if (!remove_user) goto done; - sysfs_remove_file(kobj, &up->user_attr.attr); - kobject_uevent(kobj, KOBJ_REMOVE); - kobject_del(kobj); + kobject_uevent(&up->kobj, KOBJ_REMOVE); + kobject_del(&up->kobj); + kobject_put(&up->kobj); sched_destroy_user(up); key_put(up->uid_keyring); @@ -267,7 +264,8 @@ static inline void free_user(struct user_struct *up, unsigned long flags) #else /* CONFIG_FAIR_USER_SCHED && CONFIG_SYSFS */ -static inline int user_kobject_create(struct user_struct *up) { return 0; } +int uids_sysfs_init(void) { return 0; } +static inline int uids_user_create(struct user_struct *up) { return 0; } static inline void uids_mutex_lock(void) { } static inline void 
uids_mutex_unlock(void) { } @@ -324,7 +322,7 @@ struct user_struct * alloc_uid(struct user_namespace *ns, uid_t uid) struct hlist_head *hashent = uidhashentry(ns, uid); struct user_struct *up; - /* Make uid_hash_find() + user_kobject_create() + uid_hash_insert() + /* Make uid_hash_find() + uids_user_create() + uid_hash_insert() * atomic. */ uids_mutex_lock(); @@ -370,7 +368,7 @@ struct user_struct * alloc_uid(struct user_namespace *ns, uid_t uid) return NULL; } - if (user_kobject_create(new)) { + if (uids_user_create(new)) { sched_destroy_user(new); key_put(new->uid_keyring); key_put(new->session_keyring); -- cgit v1.2.3 From 0ff21e46630abce11fdaaffabd72bbd4eed5ac2c Mon Sep 17 00:00:00 2001 From: Greg Kroah-Hartman Date: Tue, 6 Nov 2007 10:36:58 -0800 Subject: kobject: convert kernel_kset to be a kobject kernel_kset does not need to be a kset, but a much simpler kobject now that we have kobj_attributes. We also rename kernel_kset to kernel_kobj to catch all users of this symbol with a build error instead of an easy-to-ignore build warning. Cc: Kay Sievers Signed-off-by: Greg Kroah-Hartman --- fs/configfs/mount.c | 2 +- fs/debugfs/inode.c | 2 +- fs/dlm/lockspace.c | 2 +- fs/gfs2/locking/dlm/sysfs.c | 2 +- include/linux/kobject.h | 4 ++-- kernel/ksysfs.c | 18 +++++++++--------- kernel/user.c | 2 +- mm/slub.c | 3 +-- security/inode.c | 2 +- 9 files changed, 18 insertions(+), 19 deletions(-) (limited to 'kernel') diff --git a/fs/configfs/mount.c b/fs/configfs/mount.c index c4ee7f05de8b..54bf0db0d4b0 100644 --- a/fs/configfs/mount.c +++ b/fs/configfs/mount.c @@ -140,7 +140,7 @@ static int __init configfs_init(void) if (!configfs_dir_cachep) goto out; - config_kobj = kobject_create_and_add("config", &kernel_kset->kobj); + config_kobj = kobject_create_and_add("config", kernel_kobj); if (!config_kobj) { kmem_cache_destroy(configfs_dir_cachep); configfs_dir_cachep = NULL; diff --git a/fs/debugfs/inode.c b/fs/debugfs/inode.c index 5ce92c3d3b59..97f6381c36c2 100644 --- a/fs/debugfs/inode.c +++ b/fs/debugfs/inode.c @@ -432,7 +432,7 @@ static int __init debugfs_init(void) { int retval; - debug_kobj = kobject_create_and_add("debug", &kernel_kset->kobj); + debug_kobj = kobject_create_and_add("debug", kernel_kobj); if (!debug_kobj) return -EINVAL; diff --git a/fs/dlm/lockspace.c b/fs/dlm/lockspace.c index 0828beb2d35d..e64b0dc664f3 100644 --- a/fs/dlm/lockspace.c +++ b/fs/dlm/lockspace.c @@ -223,7 +223,7 @@ int dlm_lockspace_init(void) INIT_LIST_HEAD(&lslist); spin_lock_init(&lslist_lock); - dlm_kset = kset_create_and_add("dlm", NULL, &kernel_kset->kobj); + dlm_kset = kset_create_and_add("dlm", NULL, kernel_kobj); if (!dlm_kset) { printk(KERN_WARNING "%s: can not create kset\n", __FUNCTION__); return -ENOMEM; diff --git a/fs/gfs2/locking/dlm/sysfs.c b/fs/gfs2/locking/dlm/sysfs.c index 1a92b6f7bc10..e5a4fbf7265f 100644 --- a/fs/gfs2/locking/dlm/sysfs.c +++ b/fs/gfs2/locking/dlm/sysfs.c @@ -219,7 +219,7 @@ void gdlm_kobject_release(struct gdlm_ls *ls) int gdlm_sysfs_init(void) { - gdlm_kset = kset_create_and_add("lock_dlm", NULL, &kernel_kset->kobj); + gdlm_kset = kset_create_and_add("lock_dlm", NULL, kernel_kobj); if (!gdlm_kset) { printk(KERN_WARNING "%s: can not create kset\n", __FUNCTION__); return -ENOMEM; diff --git a/include/linux/kobject.h b/include/linux/kobject.h index 9da3523e4a65..0930efdcc094 100644 --- a/include/linux/kobject.h +++ b/include/linux/kobject.h @@ -198,8 +198,8 @@ extern struct kobject * kset_find_obj(struct kset *, const char *); #define set_kset_name(str) .kset = { .kobj = { 
.k_name = str } } -/* The global /sys/kernel/ kset for people to chain off of */ -extern struct kset *kernel_kset; +/* The global /sys/kernel/ kobject for people to chain off of */ +extern struct kobject *kernel_kobj; /* The global /sys/hypervisor/ kobject for people to chain off of */ extern struct kobject *hypervisor_kobj; /* The global /sys/power/ kset for people to chain off of */ diff --git a/kernel/ksysfs.c b/kernel/ksysfs.c index 45e646585605..1081aff5fb9e 100644 --- a/kernel/ksysfs.c +++ b/kernel/ksysfs.c @@ -101,8 +101,8 @@ static struct bin_attribute notes_attr = { .read = ¬es_read, }; -struct kset *kernel_kset; -EXPORT_SYMBOL_GPL(kernel_kset); +struct kobject *kernel_kobj; +EXPORT_SYMBOL_GPL(kernel_kobj); static struct attribute * kernel_attrs[] = { #if defined(CONFIG_HOTPLUG) && defined(CONFIG_NET) @@ -125,18 +125,18 @@ static int __init ksysfs_init(void) { int error; - kernel_kset = kset_create_and_add("kernel", NULL, NULL); - if (!kernel_kset) { + kernel_kobj = kobject_create_and_add("kernel", NULL); + if (!kernel_kobj) { error = -ENOMEM; goto exit; } - error = sysfs_create_group(&kernel_kset->kobj, &kernel_attr_group); + error = sysfs_create_group(kernel_kobj, &kernel_attr_group); if (error) goto kset_exit; if (notes_size > 0) { notes_attr.size = notes_size; - error = sysfs_create_bin_file(&kernel_kset->kobj, ¬es_attr); + error = sysfs_create_bin_file(kernel_kobj, ¬es_attr); if (error) goto group_exit; } @@ -150,11 +150,11 @@ static int __init ksysfs_init(void) notes_exit: if (notes_size > 0) - sysfs_remove_bin_file(&kernel_kset->kobj, ¬es_attr); + sysfs_remove_bin_file(kernel_kobj, ¬es_attr); group_exit: - sysfs_remove_group(&kernel_kset->kobj, &kernel_attr_group); + sysfs_remove_group(kernel_kobj, &kernel_attr_group); kset_exit: - kset_unregister(kernel_kset); + kobject_unregister(kernel_kobj); exit: return error; } diff --git a/kernel/user.c b/kernel/user.c index 5a106f3fdf05..7f17e6e8fd65 100644 --- a/kernel/user.c +++ b/kernel/user.c @@ -201,7 +201,7 @@ done: */ int __init uids_sysfs_init(void) { - uids_kset = kset_create_and_add("uids", NULL, &kernel_kset->kobj); + uids_kset = kset_create_and_add("uids", NULL, kernel_kobj); if (!uids_kset) return -ENOMEM; diff --git a/mm/slub.c b/mm/slub.c index b6c79462157e..d26177fb293b 100644 --- a/mm/slub.c +++ b/mm/slub.c @@ -4091,8 +4091,7 @@ static int __init slab_sysfs_init(void) struct kmem_cache *s; int err; - slab_kset = kset_create_and_add("slab", &slab_uevent_ops, - &kernel_kset->kobj); + slab_kset = kset_create_and_add("slab", &slab_uevent_ops, kernel_kobj); if (!slab_kset) { printk(KERN_ERR "Cannot register slab subsystem.\n"); return -ENOSYS; diff --git a/security/inode.c b/security/inode.c index dbe040ac0549..def0cc1b07f2 100644 --- a/security/inode.c +++ b/security/inode.c @@ -321,7 +321,7 @@ static int __init securityfs_init(void) { int retval; - security_kobj = kobject_create_and_add("security", &kernel_kset->kobj); + security_kobj = kobject_create_and_add("security", kernel_kobj); if (!security_kobj) return -EINVAL; -- cgit v1.2.3 From d76e15fb20eeb7632ef38876a884fe3508b2c01d Mon Sep 17 00:00:00 2001 From: Greg Kroah-Hartman Date: Tue, 27 Nov 2007 11:28:26 -0800 Subject: driver core: make /sys/power a kobject /sys/power should not be a kset, that's overkill. This patch renames it to power_kset and fixes up all usages of it in the tree. 
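The replacement pattern, reduced to its essentials, looks roughly like this. The "example" directory, attribute and function names are invented for illustration; the real conversions are in the hunks below.

/* A bare kobject is enough for a simple sysfs directory that only
 * carries attributes and never contains other kobjects. */
#include <linux/init.h>
#include <linux/kernel.h>
#include <linux/kobject.h>
#include <linux/sysfs.h>

static struct kobject *example_kobj;

static ssize_t status_show(struct kobject *kobj, struct kobj_attribute *attr,
			   char *buf)
{
	return sprintf(buf, "ok\n");
}

static struct kobj_attribute status_attr = __ATTR(status, 0444, status_show, NULL);

static struct attribute *example_attrs[] = {
	&status_attr.attr,
	NULL
};

static struct attribute_group example_attr_group = {
	.attrs = example_attrs,
};

static int __init example_init(void)
{
	/* creates /sys/example; previously a kset plus &kset->kobj uses */
	example_kobj = kobject_create_and_add("example", NULL);
	if (!example_kobj)
		return -ENOMEM;
	return sysfs_create_group(example_kobj, &example_attr_group);
}
core_initcall(example_init);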
Cc: Kay Sievers Signed-off-by: Greg Kroah-Hartman --- arch/arm/mach-omap1/pm.c | 2 +- arch/powerpc/platforms/pseries/power.c | 10 +++++----- include/linux/kobject.h | 4 ++-- kernel/power/disk.c | 2 +- kernel/power/main.c | 8 ++++---- 5 files changed, 13 insertions(+), 13 deletions(-) (limited to 'kernel') diff --git a/arch/arm/mach-omap1/pm.c b/arch/arm/mach-omap1/pm.c index 63edafb59490..d9805e3d9304 100644 --- a/arch/arm/mach-omap1/pm.c +++ b/arch/arm/mach-omap1/pm.c @@ -719,7 +719,7 @@ static int __init omap_pm_init(void) omap_pm_init_proc(); #endif - error = sysfs_create_file(&power_kset->kobj, &sleep_while_idle_attr); + error = sysfs_create_file(power_kobj, &sleep_while_idle_attr); if (error) printk(KERN_ERR "sysfs_create_file failed: %d\n", error); diff --git a/arch/powerpc/platforms/pseries/power.c b/arch/powerpc/platforms/pseries/power.c index 90706cf840fd..e95fc1594c84 100644 --- a/arch/powerpc/platforms/pseries/power.c +++ b/arch/powerpc/platforms/pseries/power.c @@ -53,7 +53,7 @@ static struct kobj_attribute auto_poweron_attr = __ATTR(auto_poweron, 0644, auto_poweron_show, auto_poweron_store); #ifndef CONFIG_PM -struct kset *power_kset; +struct kobject *power_kobj; static struct attribute *g[] = { &auto_poweron_attr.attr, @@ -66,16 +66,16 @@ static struct attribute_group attr_group = { static int __init pm_init(void) { - power_kset = kset_create_and_add("power", NULL, NULL); - if (!power_kset) + power_kobj = kobject_create_and_add("power", NULL); + if (!power_kobj) return -ENOMEM; - return sysfs_create_group(&power_kset->kobj, &attr_group); + return sysfs_create_group(power_kobj, &attr_group); } core_initcall(pm_init); #else static int __init apo_pm_init(void) { - return (sysfs_create_file(&power_kset->kobj, &auto_poweron_attr)); + return (sysfs_create_file(power_kobj, &auto_poweron_attr)); } __initcall(apo_pm_init); #endif diff --git a/include/linux/kobject.h b/include/linux/kobject.h index 78c851b4e67e..bb6868475edb 100644 --- a/include/linux/kobject.h +++ b/include/linux/kobject.h @@ -202,8 +202,8 @@ extern struct kobject * kset_find_obj(struct kset *, const char *); extern struct kobject *kernel_kobj; /* The global /sys/hypervisor/ kobject for people to chain off of */ extern struct kobject *hypervisor_kobj; -/* The global /sys/power/ kset for people to chain off of */ -extern struct kset *power_kset; +/* The global /sys/power/ kobject for people to chain off of */ +extern struct kobject *power_kobj; /* The global /sys/firmware/ kobject for people to chain off of */ extern struct kobject *firmware_kobj; diff --git a/kernel/power/disk.c b/kernel/power/disk.c index ef5aa2ca0ab0..b138b431e271 100644 --- a/kernel/power/disk.c +++ b/kernel/power/disk.c @@ -714,7 +714,7 @@ static struct attribute_group attr_group = { static int __init pm_disk_init(void) { - return sysfs_create_group(&power_kset->kobj, &attr_group); + return sysfs_create_group(power_kobj, &attr_group); } core_initcall(pm_disk_init); diff --git a/kernel/power/main.c b/kernel/power/main.c index b8139493b856..efc08360e627 100644 --- a/kernel/power/main.c +++ b/kernel/power/main.c @@ -276,7 +276,7 @@ EXPORT_SYMBOL(pm_suspend); #endif /* CONFIG_SUSPEND */ -struct kset *power_kset; +struct kobject *power_kobj; /** * state - control system power state. 
@@ -389,10 +389,10 @@ static struct attribute_group attr_group = { static int __init pm_init(void) { - power_kset = kset_create_and_add("power", NULL, NULL); - if (!power_kset) + power_kobj = kobject_create_and_add("power", NULL); + if (!power_kobj) return -ENOMEM; - return sysfs_create_group(&power_kset->kobj, &attr_group); + return sysfs_create_group(power_kobj, &attr_group); } core_initcall(pm_init); -- cgit v1.2.3 From e43b9192c59402685bd1f809068dd13aa5931570 Mon Sep 17 00:00:00 2001 From: Greg Kroah-Hartman Date: Mon, 17 Dec 2007 23:05:35 -0700 Subject: Kobject: convert kernel/params.c to use kobject_init/add_ng() This converts the code to use the new kobject functions, cleaning up the logic in doing so. Cc: Kay Sievers Signed-off-by: Greg Kroah-Hartman --- kernel/params.c | 6 ++---- 1 file changed, 2 insertions(+), 4 deletions(-) (limited to 'kernel') diff --git a/kernel/params.c b/kernel/params.c index 97e092312155..1078b148ca80 100644 --- a/kernel/params.c +++ b/kernel/params.c @@ -561,11 +561,9 @@ static void __init kernel_param_sysfs_setup(const char *name, mk->mod = THIS_MODULE; mk->kobj.kset = module_kset; - mk->kobj.ktype = &module_ktype; - kobject_set_name(&mk->kobj, name); - kobject_init(&mk->kobj); - ret = kobject_add(&mk->kobj); + ret = kobject_init_and_add(&mk->kobj, &module_ktype, NULL, "%s", name); if (ret) { + kobject_put(&mk->kobj); printk(KERN_ERR "Module '%s' failed to be added to sysfs, " "error number %d\n", name, ret); printk(KERN_ERR "The system will be unstable now.\n"); -- cgit v1.2.3 From cf15126b3d4511e06e5299781ab74922590900be Mon Sep 17 00:00:00 2001 From: Greg Kroah-Hartman Date: Mon, 17 Dec 2007 23:05:35 -0700 Subject: Kobject: convert kernel/user.c to use kobject_init/add_ng() This converts the code to use the new kobject functions, cleaning up the logic in doing so. Cc: Kay Sievers Signed-off-by: Greg Kroah-Hartman --- kernel/user.c | 9 ++++----- 1 file changed, 4 insertions(+), 5 deletions(-) (limited to 'kernel') diff --git a/kernel/user.c b/kernel/user.c index 7f17e6e8fd65..ab4fd706993b 100644 --- a/kernel/user.c +++ b/kernel/user.c @@ -181,13 +181,12 @@ static int uids_user_create(struct user_struct *up) int error; memset(kobj, 0, sizeof(struct kobject)); - kobj->ktype = &uids_ktype; kobj->kset = uids_kset; - kobject_init(kobj); - kobject_set_name(&up->kobj, "%d", up->uid); - error = kobject_add(kobj); - if (error) + error = kobject_init_and_add(kobj, &uids_ktype, NULL, "%d", up->uid); + if (error) { + kobject_put(kobj); goto done; + } kobject_uevent(kobj, KOBJ_ADD); done: -- cgit v1.2.3 From c63469a3985a9771c18a916b8d42845d044ea0b1 Mon Sep 17 00:00:00 2001 From: Greg Kroah-Hartman Date: Wed, 28 Nov 2007 12:23:18 -0800 Subject: Driver core: move the driver specific module code into the driver core The module driver specific code should belong in the driver core, not in the kernel/ directory. So move this code. This is done in preparation for some struct device_driver rework that should be confined to the driver core code only. This also lets us keep from exporting these functions, as no external code should ever be calling it. Thanks to Andrew Morton for the !CONFIG_MODULES fix. 
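As a rough illustration of why the stubs live in the private drivers/base/base.h header, a hypothetical driver-core caller (not part of this patch) can use the helpers without any #ifdef of its own:

#include <linux/device.h>
#include "base.h"		/* module_add_driver() or its no-op stub */

static int example_attach_driver(struct device_driver *drv)
{
	/* compiles away to nothing when CONFIG_MODULES is not set */
	module_add_driver(drv->owner, drv);
	return 0;
}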
Signed-off-by: Greg Kroah-Hartman --- drivers/base/Makefile | 1 + drivers/base/base.h | 9 +++++ drivers/base/module.c | 94 ++++++++++++++++++++++++++++++++++++++++++++++++++ include/linux/module.h | 15 -------- kernel/module.c | 87 ---------------------------------------------- 5 files changed, 104 insertions(+), 102 deletions(-) create mode 100644 drivers/base/module.c (limited to 'kernel') diff --git a/drivers/base/Makefile b/drivers/base/Makefile index b39ea3f59c9b..ff2696823f8d 100644 --- a/drivers/base/Makefile +++ b/drivers/base/Makefile @@ -11,6 +11,7 @@ obj-$(CONFIG_FW_LOADER) += firmware_class.o obj-$(CONFIG_NUMA) += node.o obj-$(CONFIG_MEMORY_HOTPLUG_SPARSE) += memory.o obj-$(CONFIG_SMP) += topology.o +obj-$(CONFIG_MODULES) += module.o obj-$(CONFIG_SYS_HYPERVISOR) += hypervisor.o ifeq ($(CONFIG_DEBUG_DRIVER),y) diff --git a/drivers/base/base.h b/drivers/base/base.h index ca6d273064f8..05472360f6a2 100644 --- a/drivers/base/base.h +++ b/drivers/base/base.h @@ -73,3 +73,12 @@ extern char *make_class_name(const char *name, struct kobject *kobj); extern int devres_release_all(struct device *dev); extern struct kset *devices_kset; + +#ifdef CONFIG_MODULES +extern void module_add_driver(struct module *mod, struct device_driver *drv); +extern void module_remove_driver(struct device_driver *drv); +#else +static inline void module_add_driver(struct module *mod, + struct device_driver *drv) { } +static inline void module_remove_driver(struct device_driver *drv) { } +#endif diff --git a/drivers/base/module.c b/drivers/base/module.c new file mode 100644 index 000000000000..cad07be5de1a --- /dev/null +++ b/drivers/base/module.c @@ -0,0 +1,94 @@ +/* + * module.c - module sysfs fun for drivers + * + * This file is released under the GPLv2 + * + */ +#include +#include +#include +#include +#include "base.h" + +static char *make_driver_name(struct device_driver *drv) +{ + char *driver_name; + + driver_name = kmalloc(strlen(drv->name) + strlen(drv->bus->name) + 2, + GFP_KERNEL); + if (!driver_name) + return NULL; + + sprintf(driver_name, "%s:%s", drv->bus->name, drv->name); + return driver_name; +} + +static void module_create_drivers_dir(struct module_kobject *mk) +{ + if (!mk || mk->drivers_dir) + return; + + mk->drivers_dir = kobject_create_and_add("drivers", &mk->kobj); +} + +void module_add_driver(struct module *mod, struct device_driver *drv) +{ + char *driver_name; + int no_warn; + struct module_kobject *mk = NULL; + + if (!drv) + return; + + if (mod) + mk = &mod->mkobj; + else if (drv->mod_name) { + struct kobject *mkobj; + + /* Lookup built-in module entry in /sys/modules */ + mkobj = kset_find_obj(module_kset, drv->mod_name); + if (mkobj) { + mk = container_of(mkobj, struct module_kobject, kobj); + /* remember our module structure */ + drv->mkobj = mk; + /* kset_find_obj took a reference */ + kobject_put(mkobj); + } + } + + if (!mk) + return; + + /* Don't check return codes; these calls are idempotent */ + no_warn = sysfs_create_link(&drv->kobj, &mk->kobj, "module"); + driver_name = make_driver_name(drv); + if (driver_name) { + module_create_drivers_dir(mk); + no_warn = sysfs_create_link(mk->drivers_dir, &drv->kobj, + driver_name); + kfree(driver_name); + } +} + +void module_remove_driver(struct device_driver *drv) +{ + struct module_kobject *mk = NULL; + char *driver_name; + + if (!drv) + return; + + sysfs_remove_link(&drv->kobj, "module"); + + if (drv->owner) + mk = &drv->owner->mkobj; + else if (drv->mkobj) + mk = drv->mkobj; + if (mk && mk->drivers_dir) { + driver_name = 
make_driver_name(drv); + if (driver_name) { + sysfs_remove_link(mk->drivers_dir, driver_name); + kfree(driver_name); + } + } +} diff --git a/include/linux/module.h b/include/linux/module.h index fbe930b9b69c..c97bdb7eb957 100644 --- a/include/linux/module.h +++ b/include/linux/module.h @@ -609,21 +609,6 @@ static inline void module_remove_modinfo_attrs(struct module *mod) #endif /* CONFIG_SYSFS */ -#if defined(CONFIG_SYSFS) && defined(CONFIG_MODULES) - -void module_add_driver(struct module *mod, struct device_driver *drv); -void module_remove_driver(struct device_driver *drv); - -#else /* not both CONFIG_SYSFS && CONFIG_MODULES */ - -static inline void module_add_driver(struct module *mod, struct device_driver *drv) -{ } - -static inline void module_remove_driver(struct device_driver *drv) -{ } - -#endif - #define symbol_request(x) try_then_request_module(symbol_get(x), "symbol:" #x) /* BELOW HERE ALL THESE ARE OBSOLETE AND WILL VANISH */ diff --git a/kernel/module.c b/kernel/module.c index d03fcd9d652c..dc4d3f5ce820 100644 --- a/kernel/module.c +++ b/kernel/module.c @@ -2501,93 +2501,6 @@ void print_modules(void) printk("\n"); } -#ifdef CONFIG_SYSFS -static char *make_driver_name(struct device_driver *drv) -{ - char *driver_name; - - driver_name = kmalloc(strlen(drv->name) + strlen(drv->bus->name) + 2, - GFP_KERNEL); - if (!driver_name) - return NULL; - - sprintf(driver_name, "%s:%s", drv->bus->name, drv->name); - return driver_name; -} - -static void module_create_drivers_dir(struct module_kobject *mk) -{ - if (!mk || mk->drivers_dir) - return; - - mk->drivers_dir = kobject_create_and_add("drivers", &mk->kobj); -} - -void module_add_driver(struct module *mod, struct device_driver *drv) -{ - char *driver_name; - int no_warn; - struct module_kobject *mk = NULL; - - if (!drv) - return; - - if (mod) - mk = &mod->mkobj; - else if (drv->mod_name) { - struct kobject *mkobj; - - /* Lookup built-in module entry in /sys/modules */ - mkobj = kset_find_obj(module_kset, drv->mod_name); - if (mkobj) { - mk = container_of(mkobj, struct module_kobject, kobj); - /* remember our module structure */ - drv->mkobj = mk; - /* kset_find_obj took a reference */ - kobject_put(mkobj); - } - } - - if (!mk) - return; - - /* Don't check return codes; these calls are idempotent */ - no_warn = sysfs_create_link(&drv->kobj, &mk->kobj, "module"); - driver_name = make_driver_name(drv); - if (driver_name) { - module_create_drivers_dir(mk); - no_warn = sysfs_create_link(mk->drivers_dir, &drv->kobj, - driver_name); - kfree(driver_name); - } -} -EXPORT_SYMBOL(module_add_driver); - -void module_remove_driver(struct device_driver *drv) -{ - struct module_kobject *mk = NULL; - char *driver_name; - - if (!drv) - return; - - sysfs_remove_link(&drv->kobj, "module"); - - if (drv->owner) - mk = &drv->owner->mkobj; - else if (drv->mkobj) - mk = drv->mkobj; - if (mk && mk->drivers_dir) { - driver_name = make_driver_name(drv); - if (driver_name) { - sysfs_remove_link(mk->drivers_dir, driver_name); - kfree(driver_name); - } - } -} -EXPORT_SYMBOL(module_remove_driver); -#endif - #ifdef CONFIG_MODVERSIONS /* Generate the signature for struct module here, too, for modversions. 
*/ void struct_module(struct module *mod) { return; } -- cgit v1.2.3 From 97c146ef075dc40ae34407c03d3860fc3850b8e8 Mon Sep 17 00:00:00 2001 From: Kay Sievers Date: Thu, 29 Nov 2007 23:46:11 +0100 Subject: sysfs: fix /sys/module/*/holders after sysfs logic change Sysfs symlinks now require fully registered kobjects as a target, otherwise the call to create a symlink will fail. Here we register the kobject before we request the symlink in the holders directory. Signed-off-by: Kay Sievers Cc: Tejun Heo Signed-off-by: Greg Kroah-Hartman --- kernel/module.c | 15 +++++++-------- 1 file changed, 7 insertions(+), 8 deletions(-) (limited to 'kernel') diff --git a/kernel/module.c b/kernel/module.c index dc4d3f5ce820..0ae811785c59 100644 --- a/kernel/module.c +++ b/kernel/module.c @@ -1227,6 +1227,8 @@ int mod_sysfs_init(struct module *mod) kobject_init(&mod->mkobj.kobj); + /* delay uevent until full sysfs population */ + err = kobject_add(&mod->mkobj.kobj); out: return err; } @@ -1237,11 +1239,6 @@ int mod_sysfs_setup(struct module *mod, { int err; - /* delay uevent until full sysfs population */ - err = kobject_add(&mod->mkobj.kobj); - if (err) - goto out; - mod->holders_dir = kobject_create_and_add("holders", &mod->mkobj.kobj); if (!mod->holders_dir) { err = -ENOMEM; @@ -1266,7 +1263,6 @@ out_unreg_holders: out_unreg: kobject_del(&mod->mkobj.kobj); kobject_put(&mod->mkobj.kobj); -out: return err; } #endif @@ -1883,10 +1879,10 @@ static struct module *load_module(void __user *umod, /* Now we've moved module, initialize linked lists, etc. */ module_unload_init(mod); - /* Initialize kobject, so we can reference it. */ + /* add kobject, so we can reference it. */ err = mod_sysfs_init(mod); if (err) - goto cleanup; + goto free_unload; /* Set up license info based on the info section */ set_license(mod, get_modinfo(sechdrs, infoindex, "license")); @@ -2056,6 +2052,9 @@ static struct module *load_module(void __user *umod, arch_cleanup: module_arch_cleanup(mod); cleanup: + kobject_del(&mod->mkobj.kobj); + kobject_put(&mod->mkobj.kobj); + free_unload: module_unload_free(mod); module_free(mod, mod->module_init); free_core: -- cgit v1.2.3 From ac3c8141f62f357169980ec21b7be6d29964a394 Mon Sep 17 00:00:00 2001 From: Greg Kroah-Hartman Date: Mon, 17 Dec 2007 23:05:35 -0700 Subject: Kobject: convert kernel/module.c to use kobject_init/add_ng() This converts the code to use the new kobject functions, cleaning up the logic in doing so. 
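The shape of the conversion, shown on a hypothetical embedded kobject ("item" names are made up; the old multi-call sequence is noted in the comment). The important detail is that a failed kobject_init_and_add() must be followed by kobject_put(), since the kobject already holds a reference and possibly an allocated name.

#include <linux/kobject.h>
#include <linux/string.h>

struct item {
	struct kobject kobj;
};

static void item_release(struct kobject *kobj)
{
	/* the containing structure is freed elsewhere */
}

static struct kobj_type item_ktype = {
	.sysfs_ops	= &kobj_sysfs_ops,
	.release	= item_release,
};

static int item_sysfs_init(struct item *it, struct kset *kset, const char *name)
{
	int err;

	/* old: kobject_set_name() + kobject_init() + kobject_add() */
	memset(&it->kobj, 0, sizeof(it->kobj));
	it->kobj.kset = kset;
	err = kobject_init_and_add(&it->kobj, &item_ktype, NULL, "%s", name);
	if (err)
		kobject_put(&it->kobj);	/* drop the initial reference on failure */
	return err;
}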
Cc: Kay Sievers Signed-off-by: Greg Kroah-Hartman --- kernel/module.c | 14 ++++++-------- 1 file changed, 6 insertions(+), 8 deletions(-) (limited to 'kernel') diff --git a/kernel/module.c b/kernel/module.c index 0ae811785c59..89cd4c7361d8 100644 --- a/kernel/module.c +++ b/kernel/module.c @@ -1217,18 +1217,16 @@ int mod_sysfs_init(struct module *mod) err = -EINVAL; goto out; } - memset(&mod->mkobj.kobj, 0, sizeof(mod->mkobj.kobj)); - err = kobject_set_name(&mod->mkobj.kobj, "%s", mod->name); - if (err) - goto out; - mod->mkobj.kobj.kset = module_kset; - mod->mkobj.kobj.ktype = &module_ktype; mod->mkobj.mod = mod; - kobject_init(&mod->mkobj.kobj); + memset(&mod->mkobj.kobj, 0, sizeof(mod->mkobj.kobj)); + mod->mkobj.kobj.kset = module_kset; + err = kobject_init_and_add(&mod->mkobj.kobj, &module_ktype, NULL, + "%s", mod->name); + if (err) + kobject_put(&mod->mkobj.kobj); /* delay uevent until full sysfs population */ - err = kobject_add(&mod->mkobj.kobj); out: return err; } -- cgit v1.2.3 From 7a6a41615bfb2f03ce797bc24104c50b42c935e5 Mon Sep 17 00:00:00 2001 From: Greg Kroah-Hartman Date: Sat, 22 Dec 2007 21:18:25 -0800 Subject: Modules: remove unneeded release function Now that kobjects properly clean up their name structures, no matter if they have a release function or not, we can drop this empty module kobject release function too (it was needed prior to this because of the way we handled static kobject names, we based the fact that if a release function was present, then we could safely free the name string, now we are more smart about things and only free names we have previously set.) Cc: Kay Sievers Signed-off-by: Greg Kroah-Hartman --- kernel/params.c | 10 ---------- 1 file changed, 10 deletions(-) (limited to 'kernel') diff --git a/kernel/params.c b/kernel/params.c index 1078b148ca80..b4da9505f4d2 100644 --- a/kernel/params.c +++ b/kernel/params.c @@ -694,18 +694,8 @@ static struct kset_uevent_ops module_uevent_ops = { struct kset *module_kset; int module_sysfs_initialized; -static void module_release(struct kobject *kobj) -{ - /* - * Stupid empty release function to allow the memory for the kobject to - * be properly cleaned up. This will not need to be present for 2.6.25 - * with the upcoming kobject core rework. - */ -} - struct kobj_type module_ktype = { .sysfs_ops = &module_sysfs_ops, - .release = module_release, }; /* -- cgit v1.2.3 From 78a2d906b40fe530ea800c1e873bfe8f02326f1e Mon Sep 17 00:00:00 2001 From: Greg Kroah-Hartman Date: Thu, 20 Dec 2007 08:13:05 -0800 Subject: Kobject: convert remaining kobject_unregister() to kobject_put() There is no need for kobject_unregister() anymore, thanks to Kay's kobject cleanup changes, so replace all instances of it with kobject_put(). 
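In practice the conversion is mechanical; a sketch with made-up names:

#include <linux/kobject.h>

static struct kobject *example_kobj;	/* from kobject_create_and_add() */
static struct kset *example_kset;	/* from kset_create_and_add() */

static void example_teardown(void)
{
	/* after the kobject core rework in this series, the final put
	 * also removes the sysfs entry, so no separate unregister step
	 * is needed */
	kset_unregister(example_kset);		/* now just a kobject_put() */
	kobject_put(example_kobj);		/* was kobject_unregister() */
}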
Cc: Kay Sievers Signed-off-by: Greg Kroah-Hartman --- kernel/ksysfs.c | 2 +- kernel/module.c | 9 ++++----- lib/kobject.c | 4 ++-- net/bridge/br_sysfs_br.c | 2 +- security/inode.c | 2 +- 5 files changed, 9 insertions(+), 10 deletions(-) (limited to 'kernel') diff --git a/kernel/ksysfs.c b/kernel/ksysfs.c index 1081aff5fb9e..e53bc30e9ba5 100644 --- a/kernel/ksysfs.c +++ b/kernel/ksysfs.c @@ -154,7 +154,7 @@ notes_exit: group_exit: sysfs_remove_group(kernel_kobj, &kernel_attr_group); kset_exit: - kobject_unregister(kernel_kobj); + kobject_put(kernel_kobj); exit: return error; } diff --git a/kernel/module.c b/kernel/module.c index 89cd4c7361d8..dcb8a2cbf75e 100644 --- a/kernel/module.c +++ b/kernel/module.c @@ -1257,9 +1257,8 @@ int mod_sysfs_setup(struct module *mod, out_unreg_param: module_param_sysfs_remove(mod); out_unreg_holders: - kobject_unregister(mod->holders_dir); + kobject_put(mod->holders_dir); out_unreg: - kobject_del(&mod->mkobj.kobj); kobject_put(&mod->mkobj.kobj); return err; } @@ -1269,9 +1268,9 @@ static void mod_kobject_remove(struct module *mod) { module_remove_modinfo_attrs(mod); module_param_sysfs_remove(mod); - kobject_unregister(mod->mkobj.drivers_dir); - kobject_unregister(mod->holders_dir); - kobject_unregister(&mod->mkobj.kobj); + kobject_put(mod->mkobj.drivers_dir); + kobject_put(mod->holders_dir); + kobject_put(&mod->mkobj.kobj); } /* diff --git a/lib/kobject.c b/lib/kobject.c index 4fce5ca42c2e..462946ee3e64 100644 --- a/lib/kobject.c +++ b/lib/kobject.c @@ -664,7 +664,7 @@ struct kobject *kobject_create(void) * * This function creates a kset structure dynamically and registers it * with sysfs. When you are finished with this structure, call - * kobject_unregister() and the structure will be dynamically freed when + * kobject_put() and the structure will be dynamically freed when * it is no longer being used. * * If the kobject was not able to be created, NULL will be returned. @@ -761,7 +761,7 @@ void kset_unregister(struct kset * k) { if (!k) return; - kobject_unregister(&k->kobj); + kobject_put(&k->kobj); } diff --git a/net/bridge/br_sysfs_br.c b/net/bridge/br_sysfs_br.c index 4e7f91fafedd..9cf0538d1717 100644 --- a/net/bridge/br_sysfs_br.c +++ b/net/bridge/br_sysfs_br.c @@ -447,7 +447,7 @@ void br_sysfs_delbr(struct net_device *dev) struct kobject *kobj = &dev->dev.kobj; struct net_bridge *br = netdev_priv(dev); - kobject_unregister(br->ifobj); + kobject_put(br->ifobj); sysfs_remove_bin_file(kobj, &bridge_forward); sysfs_remove_group(kobj, &bridge_group); } diff --git a/security/inode.c b/security/inode.c index def0cc1b07f2..acc6cf0d7900 100644 --- a/security/inode.c +++ b/security/inode.c @@ -327,7 +327,7 @@ static int __init securityfs_init(void) retval = register_filesystem(&fs_type); if (retval) - kobject_unregister(security_kobj); + kobject_put(security_kobj); return retval; } -- cgit v1.2.3 From af5ca3f4ec5cc4432a42a73b050dd8898ce8fd00 Mon Sep 17 00:00:00 2001 From: Kay Sievers Date: Thu, 20 Dec 2007 02:09:39 +0100 Subject: Driver core: change sysdev classes to use dynamic kobject names All kobjects require a dynamically allocated name now. We no longer need to keep track if the name is statically assigned, we can just unconditionally free() all kobject names on cleanup. 
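The conversion is one line per sysdev class: the name becomes a plain string member and the kobject name is allocated dynamically at registration time. A minimal made-up class for illustration (the "demo" names are not from the patch):

#include <linux/init.h>
#include <linux/sysdev.h>

static int demo_resume(struct sys_device *dev)
{
	return 0;
}

static struct sysdev_class demo_sysclass = {
	.name	= "demo",		/* was: set_kset_name("demo") */
	.resume	= demo_resume,
};

static struct sys_device demo_device = {
	.id	= 0,
	.cls	= &demo_sysclass,
};

static int __init demo_init(void)
{
	int err = sysdev_class_register(&demo_sysclass);

	if (!err)
		err = sysdev_register(&demo_device);
	return err;
}
device_initcall(demo_init);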
Signed-off-by: Kay Sievers Signed-off-by: Greg Kroah-Hartman --- arch/arm/kernel/time.c | 4 ++-- arch/arm/mach-integrator/integrator_ap.c | 2 +- arch/arm/mach-pxa/cm-x270.c | 2 +- arch/arm/mach-pxa/lpd270.c | 2 +- arch/arm/mach-pxa/lubbock.c | 2 +- arch/arm/mach-pxa/mainstone.c | 2 +- arch/arm/mach-s3c2410/s3c2410.c | 2 +- arch/arm/mach-s3c2412/s3c2412.c | 2 +- arch/arm/mach-s3c2440/mach-osiris.c | 2 +- arch/arm/mach-s3c2443/s3c2443.c | 2 +- arch/arm/mach-sa1100/irq.c | 2 +- arch/arm/oprofile/common.c | 2 +- arch/arm/plat-omap/gpio.c | 2 +- arch/arm/plat-s3c24xx/dma.c | 2 +- arch/arm/plat-s3c24xx/s3c244x.c | 4 ++-- arch/avr32/kernel/time.c | 2 +- arch/mips/kernel/i8259.c | 2 +- arch/powerpc/platforms/cell/spu_base.c | 2 +- arch/powerpc/platforms/powermac/pic.c | 2 +- arch/powerpc/sysdev/ipic.c | 2 +- arch/powerpc/sysdev/mpic.c | 2 +- arch/powerpc/sysdev/qe_lib/qe_ic.c | 2 +- arch/ppc/syslib/ipic.c | 2 +- arch/ppc/syslib/open_pic.c | 2 +- arch/ppc/syslib/open_pic2.c | 2 +- arch/s390/kernel/time.c | 2 +- arch/sh/drivers/dma/dma-sysfs.c | 2 +- arch/sh/kernel/time.c | 2 +- arch/x86/kernel/apic_32.c | 2 +- arch/x86/kernel/apic_64.c | 2 +- arch/x86/kernel/cpu/mcheck/mce_64.c | 2 +- arch/x86/kernel/i8237.c | 2 +- arch/x86/kernel/i8259_32.c | 2 +- arch/x86/kernel/i8259_64.c | 2 +- arch/x86/kernel/io_apic_32.c | 2 +- arch/x86/kernel/io_apic_64.c | 2 +- arch/x86/kernel/nmi_32.c | 2 +- arch/x86/kernel/nmi_64.c | 2 +- arch/x86/oprofile/nmi_int.c | 2 +- drivers/acpi/pci_link.c | 2 +- drivers/base/class.c | 2 +- drivers/base/cpu.c | 2 +- drivers/base/memory.c | 2 +- drivers/base/node.c | 2 +- drivers/base/sys.c | 1 + drivers/edac/edac_module.c | 2 +- drivers/kvm/kvm_main.c | 2 +- drivers/macintosh/via-pmu.c | 2 +- drivers/scsi/libsas/sas_scsi_host.c | 2 +- include/linux/kobject.h | 13 ++----------- include/linux/sysdev.h | 1 + kernel/rtmutex-tester.c | 2 +- kernel/time/clocksource.c | 2 +- kernel/time/timekeeping.c | 2 +- lib/kobject.c | 14 +++++--------- 55 files changed, 62 insertions(+), 73 deletions(-) (limited to 'kernel') diff --git a/arch/arm/kernel/time.c b/arch/arm/kernel/time.c index 1533d3ecd7a0..f6f3689a86ee 100644 --- a/arch/arm/kernel/time.c +++ b/arch/arm/kernel/time.c @@ -195,7 +195,7 @@ static int leds_shutdown(struct sys_device *dev) } static struct sysdev_class leds_sysclass = { - set_kset_name("leds"), + .name = "leds", .shutdown = leds_shutdown, .suspend = leds_suspend, .resume = leds_resume, @@ -369,7 +369,7 @@ static int timer_resume(struct sys_device *dev) #endif static struct sysdev_class timer_sysclass = { - set_kset_name("timer"), + .name = "timer", .suspend = timer_suspend, .resume = timer_resume, }; diff --git a/arch/arm/mach-integrator/integrator_ap.c b/arch/arm/mach-integrator/integrator_ap.c index 72280754354d..df37e93c6fc9 100644 --- a/arch/arm/mach-integrator/integrator_ap.c +++ b/arch/arm/mach-integrator/integrator_ap.c @@ -214,7 +214,7 @@ static int irq_resume(struct sys_device *dev) #endif static struct sysdev_class irq_class = { - set_kset_name("irq"), + .name = "irq", .suspend = irq_suspend, .resume = irq_resume, }; diff --git a/arch/arm/mach-pxa/cm-x270.c b/arch/arm/mach-pxa/cm-x270.c index 177664ccb2e2..a16349272f54 100644 --- a/arch/arm/mach-pxa/cm-x270.c +++ b/arch/arm/mach-pxa/cm-x270.c @@ -566,7 +566,7 @@ static int cmx270_resume(struct sys_device *dev) } static struct sysdev_class cmx270_pm_sysclass = { - set_kset_name("pm"), + .name = "pm", .resume = cmx270_resume, .suspend = cmx270_suspend, }; diff --git a/arch/arm/mach-pxa/lpd270.c 
b/arch/arm/mach-pxa/lpd270.c index 26116440a7c9..78ebad063cba 100644 --- a/arch/arm/mach-pxa/lpd270.c +++ b/arch/arm/mach-pxa/lpd270.c @@ -122,7 +122,7 @@ static int lpd270_irq_resume(struct sys_device *dev) } static struct sysdev_class lpd270_irq_sysclass = { - set_kset_name("cpld_irq"), + .name = "cpld_irq", .resume = lpd270_irq_resume, }; diff --git a/arch/arm/mach-pxa/lubbock.c b/arch/arm/mach-pxa/lubbock.c index 011a1a72b61c..1d3112dc629e 100644 --- a/arch/arm/mach-pxa/lubbock.c +++ b/arch/arm/mach-pxa/lubbock.c @@ -126,7 +126,7 @@ static int lubbock_irq_resume(struct sys_device *dev) } static struct sysdev_class lubbock_irq_sysclass = { - set_kset_name("cpld_irq"), + .name = "cpld_irq", .resume = lubbock_irq_resume, }; diff --git a/arch/arm/mach-pxa/mainstone.c b/arch/arm/mach-pxa/mainstone.c index a4bc3483cbb3..41d8c6cea62b 100644 --- a/arch/arm/mach-pxa/mainstone.c +++ b/arch/arm/mach-pxa/mainstone.c @@ -120,7 +120,7 @@ static int mainstone_irq_resume(struct sys_device *dev) } static struct sysdev_class mainstone_irq_sysclass = { - set_kset_name("cpld_irq"), + .name = "cpld_irq", .resume = mainstone_irq_resume, }; diff --git a/arch/arm/mach-s3c2410/s3c2410.c b/arch/arm/mach-s3c2410/s3c2410.c index e580303cb0ab..0e7991940f81 100644 --- a/arch/arm/mach-s3c2410/s3c2410.c +++ b/arch/arm/mach-s3c2410/s3c2410.c @@ -100,7 +100,7 @@ void __init s3c2410_init_clocks(int xtal) } struct sysdev_class s3c2410_sysclass = { - set_kset_name("s3c2410-core"), + .name = "s3c2410-core", }; static struct sys_device s3c2410_sysdev = { diff --git a/arch/arm/mach-s3c2412/s3c2412.c b/arch/arm/mach-s3c2412/s3c2412.c index 4f92a1562d77..265cd3f567a3 100644 --- a/arch/arm/mach-s3c2412/s3c2412.c +++ b/arch/arm/mach-s3c2412/s3c2412.c @@ -196,7 +196,7 @@ void __init s3c2412_init_clocks(int xtal) */ struct sysdev_class s3c2412_sysclass = { - set_kset_name("s3c2412-core"), + .name = "s3c2412-core", }; static int __init s3c2412_core_init(void) diff --git a/arch/arm/mach-s3c2440/mach-osiris.c b/arch/arm/mach-s3c2440/mach-osiris.c index c326983f4a8f..78af7664988b 100644 --- a/arch/arm/mach-s3c2440/mach-osiris.c +++ b/arch/arm/mach-s3c2440/mach-osiris.c @@ -312,7 +312,7 @@ static int osiris_pm_resume(struct sys_device *sd) #endif static struct sysdev_class osiris_pm_sysclass = { - set_kset_name("mach-osiris"), + .name = "mach-osiris", .suspend = osiris_pm_suspend, .resume = osiris_pm_resume, }; diff --git a/arch/arm/mach-s3c2443/s3c2443.c b/arch/arm/mach-s3c2443/s3c2443.c index 8d8117158d23..9ce490560af9 100644 --- a/arch/arm/mach-s3c2443/s3c2443.c +++ b/arch/arm/mach-s3c2443/s3c2443.c @@ -43,7 +43,7 @@ static struct map_desc s3c2443_iodesc[] __initdata = { }; struct sysdev_class s3c2443_sysclass = { - set_kset_name("s3c2443-core"), + .name = "s3c2443-core", }; static struct sys_device s3c2443_sysdev = { diff --git a/arch/arm/mach-sa1100/irq.c b/arch/arm/mach-sa1100/irq.c index edf3347d9c5b..3dc17d7bf38e 100644 --- a/arch/arm/mach-sa1100/irq.c +++ b/arch/arm/mach-sa1100/irq.c @@ -283,7 +283,7 @@ static int sa1100irq_resume(struct sys_device *dev) } static struct sysdev_class sa1100irq_sysclass = { - set_kset_name("sa11x0-irq"), + .name = "sa11x0-irq", .suspend = sa1100irq_suspend, .resume = sa1100irq_resume, }; diff --git a/arch/arm/oprofile/common.c b/arch/arm/oprofile/common.c index a9de727c9327..0a5cf3a6438b 100644 --- a/arch/arm/oprofile/common.c +++ b/arch/arm/oprofile/common.c @@ -96,7 +96,7 @@ static int op_arm_resume(struct sys_device *dev) } static struct sysdev_class oprofile_sysclass = { - 
set_kset_name("oprofile"), + .name = "oprofile", .resume = op_arm_resume, .suspend = op_arm_suspend, }; diff --git a/arch/arm/plat-omap/gpio.c b/arch/arm/plat-omap/gpio.c index 6097753394ad..b2a87b8ef673 100644 --- a/arch/arm/plat-omap/gpio.c +++ b/arch/arm/plat-omap/gpio.c @@ -1455,7 +1455,7 @@ static int omap_gpio_resume(struct sys_device *dev) } static struct sysdev_class omap_gpio_sysclass = { - set_kset_name("gpio"), + .name = "gpio", .suspend = omap_gpio_suspend, .resume = omap_gpio_resume, }; diff --git a/arch/arm/plat-s3c24xx/dma.c b/arch/arm/plat-s3c24xx/dma.c index 29696e46ed65..aae1b9cbaf44 100644 --- a/arch/arm/plat-s3c24xx/dma.c +++ b/arch/arm/plat-s3c24xx/dma.c @@ -1265,7 +1265,7 @@ static int s3c2410_dma_resume(struct sys_device *dev) #endif /* CONFIG_PM */ struct sysdev_class dma_sysclass = { - set_kset_name("s3c24xx-dma"), + .name = "s3c24xx-dma", .suspend = s3c2410_dma_suspend, .resume = s3c2410_dma_resume, }; diff --git a/arch/arm/plat-s3c24xx/s3c244x.c b/arch/arm/plat-s3c24xx/s3c244x.c index 3444b13afac5..f197bb3a2366 100644 --- a/arch/arm/plat-s3c24xx/s3c244x.c +++ b/arch/arm/plat-s3c24xx/s3c244x.c @@ -151,13 +151,13 @@ static int s3c244x_resume(struct sys_device *dev) /* Since the S3C2442 and S3C2440 share items, put both sysclasses here */ struct sysdev_class s3c2440_sysclass = { - set_kset_name("s3c2440-core"), + .name = "s3c2440-core", .suspend = s3c244x_suspend, .resume = s3c244x_resume }; struct sysdev_class s3c2442_sysclass = { - set_kset_name("s3c2442-core"), + .name = "s3c2442-core", .suspend = s3c244x_suspend, .resume = s3c244x_resume }; diff --git a/arch/avr32/kernel/time.c b/arch/avr32/kernel/time.c index 7014a3571ec0..36a46c3ae308 100644 --- a/arch/avr32/kernel/time.c +++ b/arch/avr32/kernel/time.c @@ -214,7 +214,7 @@ void __init time_init(void) } static struct sysdev_class timer_class = { - set_kset_name("timer"), + .name = "timer", }; static struct sys_device timer_device = { diff --git a/arch/mips/kernel/i8259.c b/arch/mips/kernel/i8259.c index 471013577108..197d7977de35 100644 --- a/arch/mips/kernel/i8259.c +++ b/arch/mips/kernel/i8259.c @@ -238,7 +238,7 @@ static int i8259A_shutdown(struct sys_device *dev) } static struct sysdev_class i8259_sysdev_class = { - set_kset_name("i8259"), + .name = "i8259", .resume = i8259A_resume, .shutdown = i8259A_shutdown, }; diff --git a/arch/powerpc/platforms/cell/spu_base.c b/arch/powerpc/platforms/cell/spu_base.c index c83c3e3f5178..a08862203643 100644 --- a/arch/powerpc/platforms/cell/spu_base.c +++ b/arch/powerpc/platforms/cell/spu_base.c @@ -459,7 +459,7 @@ static int spu_shutdown(struct sys_device *sysdev) } static struct sysdev_class spu_sysdev_class = { - set_kset_name("spu"), + .name = "spu", .shutdown = spu_shutdown, }; diff --git a/arch/powerpc/platforms/powermac/pic.c b/arch/powerpc/platforms/powermac/pic.c index 999f5e160897..84c0d4ef76a2 100644 --- a/arch/powerpc/platforms/powermac/pic.c +++ b/arch/powerpc/platforms/powermac/pic.c @@ -663,7 +663,7 @@ static int pmacpic_resume(struct sys_device *sysdev) #endif /* CONFIG_PM && CONFIG_PPC32 */ static struct sysdev_class pmacpic_sysclass = { - set_kset_name("pmac_pic"), + .name = "pmac_pic", }; static struct sys_device device_pmacpic = { diff --git a/arch/powerpc/sysdev/ipic.c b/arch/powerpc/sysdev/ipic.c index 05a56e55804c..e898ff4d2b97 100644 --- a/arch/powerpc/sysdev/ipic.c +++ b/arch/powerpc/sysdev/ipic.c @@ -725,7 +725,7 @@ unsigned int ipic_get_irq(void) } static struct sysdev_class ipic_sysclass = { - set_kset_name("ipic"), + .name = "ipic", }; static 
struct sys_device device_ipic = { diff --git a/arch/powerpc/sysdev/mpic.c b/arch/powerpc/sysdev/mpic.c index e47938899a92..212a94f5d34b 100644 --- a/arch/powerpc/sysdev/mpic.c +++ b/arch/powerpc/sysdev/mpic.c @@ -1584,7 +1584,7 @@ static struct sysdev_class mpic_sysclass = { .resume = mpic_resume, .suspend = mpic_suspend, #endif - set_kset_name("mpic"), + .name = "mpic", }; static int mpic_init_sys(void) diff --git a/arch/powerpc/sysdev/qe_lib/qe_ic.c b/arch/powerpc/sysdev/qe_lib/qe_ic.c index e1c0fd6dbc1a..f59444d3be75 100644 --- a/arch/powerpc/sysdev/qe_lib/qe_ic.c +++ b/arch/powerpc/sysdev/qe_lib/qe_ic.c @@ -483,7 +483,7 @@ int qe_ic_set_high_priority(unsigned int virq, unsigned int priority, int high) } static struct sysdev_class qe_ic_sysclass = { - set_kset_name("qe_ic"), + .name = "qe_ic", }; static struct sys_device device_qe_ic = { diff --git a/arch/ppc/syslib/ipic.c b/arch/ppc/syslib/ipic.c index 9192777d0f78..4f163e20939e 100644 --- a/arch/ppc/syslib/ipic.c +++ b/arch/ppc/syslib/ipic.c @@ -614,7 +614,7 @@ int ipic_get_irq(void) } static struct sysdev_class ipic_sysclass = { - set_kset_name("ipic"), + .name = "ipic", }; static struct sys_device device_ipic = { diff --git a/arch/ppc/syslib/open_pic.c b/arch/ppc/syslib/open_pic.c index 18ec94733293..da36522d327a 100644 --- a/arch/ppc/syslib/open_pic.c +++ b/arch/ppc/syslib/open_pic.c @@ -1043,7 +1043,7 @@ int openpic_resume(struct sys_device *sysdev) #endif /* CONFIG_PM */ static struct sysdev_class openpic_sysclass = { - set_kset_name("openpic"), + .name = "openpic", }; static struct sys_device device_openpic = { diff --git a/arch/ppc/syslib/open_pic2.c b/arch/ppc/syslib/open_pic2.c index d585207f9f77..449075a04798 100644 --- a/arch/ppc/syslib/open_pic2.c +++ b/arch/ppc/syslib/open_pic2.c @@ -666,7 +666,7 @@ int openpic2_resume(struct sys_device *sysdev) /* HACK ALERT */ static struct sysdev_class openpic2_sysclass = { - set_kset_name("openpic2"), + .name = "openpic2", }; static struct sys_device device_openpic2 = { diff --git a/arch/s390/kernel/time.c b/arch/s390/kernel/time.c index 22b800ce2126..3bbac1293be4 100644 --- a/arch/s390/kernel/time.c +++ b/arch/s390/kernel/time.c @@ -1145,7 +1145,7 @@ static void etr_work_fn(struct work_struct *work) * Sysfs interface functions */ static struct sysdev_class etr_sysclass = { - set_kset_name("etr") + .name = "etr", }; static struct sys_device etr_port0_dev = { diff --git a/arch/sh/drivers/dma/dma-sysfs.c b/arch/sh/drivers/dma/dma-sysfs.c index eebcd4768bbf..51b57c0d1a3c 100644 --- a/arch/sh/drivers/dma/dma-sysfs.c +++ b/arch/sh/drivers/dma/dma-sysfs.c @@ -19,7 +19,7 @@ #include static struct sysdev_class dma_sysclass = { - set_kset_name("dma"), + .name = "dma", }; EXPORT_SYMBOL(dma_sysclass); diff --git a/arch/sh/kernel/time.c b/arch/sh/kernel/time.c index a3a67d151e52..2bc04bfee738 100644 --- a/arch/sh/kernel/time.c +++ b/arch/sh/kernel/time.c @@ -174,7 +174,7 @@ int timer_resume(struct sys_device *dev) #endif static struct sysdev_class timer_sysclass = { - set_kset_name("timer"), + .name = "timer", .suspend = timer_suspend, .resume = timer_resume, }; diff --git a/arch/x86/kernel/apic_32.c b/arch/x86/kernel/apic_32.c index edb5108e5d0e..a56c782653be 100644 --- a/arch/x86/kernel/apic_32.c +++ b/arch/x86/kernel/apic_32.c @@ -1530,7 +1530,7 @@ static int lapic_resume(struct sys_device *dev) */ static struct sysdev_class lapic_sysclass = { - set_kset_name("lapic"), + .name = "lapic", .resume = lapic_resume, .suspend = lapic_suspend, }; diff --git a/arch/x86/kernel/apic_64.c 
b/arch/x86/kernel/apic_64.c index f28ccb588fba..fa6cdee6d303 100644 --- a/arch/x86/kernel/apic_64.c +++ b/arch/x86/kernel/apic_64.c @@ -639,7 +639,7 @@ static int lapic_resume(struct sys_device *dev) } static struct sysdev_class lapic_sysclass = { - set_kset_name("lapic"), + .name = "lapic", .resume = lapic_resume, .suspend = lapic_suspend, }; diff --git a/arch/x86/kernel/cpu/mcheck/mce_64.c b/arch/x86/kernel/cpu/mcheck/mce_64.c index 4b21d29fb5aa..242e8668dbeb 100644 --- a/arch/x86/kernel/cpu/mcheck/mce_64.c +++ b/arch/x86/kernel/cpu/mcheck/mce_64.c @@ -745,7 +745,7 @@ static void mce_restart(void) static struct sysdev_class mce_sysclass = { .resume = mce_resume, - set_kset_name("machinecheck"), + .name = "machinecheck", }; DEFINE_PER_CPU(struct sys_device, device_mce); diff --git a/arch/x86/kernel/i8237.c b/arch/x86/kernel/i8237.c index 29313832df0c..dbd6c1d1b638 100644 --- a/arch/x86/kernel/i8237.c +++ b/arch/x86/kernel/i8237.c @@ -51,7 +51,7 @@ static int i8237A_suspend(struct sys_device *dev, pm_message_t state) } static struct sysdev_class i8237_sysdev_class = { - set_kset_name("i8237"), + .name = "i8237", .suspend = i8237A_suspend, .resume = i8237A_resume, }; diff --git a/arch/x86/kernel/i8259_32.c b/arch/x86/kernel/i8259_32.c index f634fc715c99..5f3496d01984 100644 --- a/arch/x86/kernel/i8259_32.c +++ b/arch/x86/kernel/i8259_32.c @@ -258,7 +258,7 @@ static int i8259A_shutdown(struct sys_device *dev) } static struct sysdev_class i8259_sysdev_class = { - set_kset_name("i8259"), + .name = "i8259", .suspend = i8259A_suspend, .resume = i8259A_resume, .shutdown = i8259A_shutdown, diff --git a/arch/x86/kernel/i8259_64.c b/arch/x86/kernel/i8259_64.c index 3f27ea0b9816..ba6d57286f56 100644 --- a/arch/x86/kernel/i8259_64.c +++ b/arch/x86/kernel/i8259_64.c @@ -370,7 +370,7 @@ static int i8259A_shutdown(struct sys_device *dev) } static struct sysdev_class i8259_sysdev_class = { - set_kset_name("i8259"), + .name = "i8259", .suspend = i8259A_suspend, .resume = i8259A_resume, .shutdown = i8259A_shutdown, diff --git a/arch/x86/kernel/io_apic_32.c b/arch/x86/kernel/io_apic_32.c index a6b1490e00c4..ab77f1905469 100644 --- a/arch/x86/kernel/io_apic_32.c +++ b/arch/x86/kernel/io_apic_32.c @@ -2401,7 +2401,7 @@ static int ioapic_resume(struct sys_device *dev) } static struct sysdev_class ioapic_sysdev_class = { - set_kset_name("ioapic"), + .name = "ioapic", .suspend = ioapic_suspend, .resume = ioapic_resume, }; diff --git a/arch/x86/kernel/io_apic_64.c b/arch/x86/kernel/io_apic_64.c index cbac1670c7c3..23a3ac06a23e 100644 --- a/arch/x86/kernel/io_apic_64.c +++ b/arch/x86/kernel/io_apic_64.c @@ -1850,7 +1850,7 @@ static int ioapic_resume(struct sys_device *dev) } static struct sysdev_class ioapic_sysdev_class = { - set_kset_name("ioapic"), + .name = "ioapic", .suspend = ioapic_suspend, .resume = ioapic_resume, }; diff --git a/arch/x86/kernel/nmi_32.c b/arch/x86/kernel/nmi_32.c index 852db2906921..4f4bfd3a88b6 100644 --- a/arch/x86/kernel/nmi_32.c +++ b/arch/x86/kernel/nmi_32.c @@ -176,7 +176,7 @@ static int lapic_nmi_resume(struct sys_device *dev) static struct sysdev_class nmi_sysclass = { - set_kset_name("lapic_nmi"), + .name = "lapic_nmi", .resume = lapic_nmi_resume, .suspend = lapic_nmi_suspend, }; diff --git a/arch/x86/kernel/nmi_64.c b/arch/x86/kernel/nmi_64.c index 4253c4e8849c..c3d1476b6a11 100644 --- a/arch/x86/kernel/nmi_64.c +++ b/arch/x86/kernel/nmi_64.c @@ -211,7 +211,7 @@ static int lapic_nmi_resume(struct sys_device *dev) } static struct sysdev_class nmi_sysclass = { - 
set_kset_name("lapic_nmi"), + .name = "lapic_nmi", .resume = lapic_nmi_resume, .suspend = lapic_nmi_suspend, }; diff --git a/arch/x86/oprofile/nmi_int.c b/arch/x86/oprofile/nmi_int.c index 944bbcdd2b8d..c8ab79ef4276 100644 --- a/arch/x86/oprofile/nmi_int.c +++ b/arch/x86/oprofile/nmi_int.c @@ -51,7 +51,7 @@ static int nmi_resume(struct sys_device *dev) static struct sysdev_class oprofile_sysclass = { - set_kset_name("oprofile"), + .name = "oprofile", .resume = nmi_resume, .suspend = nmi_suspend, }; diff --git a/drivers/acpi/pci_link.c b/drivers/acpi/pci_link.c index c9f526e55392..5400ea173f6f 100644 --- a/drivers/acpi/pci_link.c +++ b/drivers/acpi/pci_link.c @@ -911,7 +911,7 @@ __setup("acpi_irq_balance", acpi_irq_balance_set); /* FIXME: we will remove this interface after all drivers call pci_disable_device */ static struct sysdev_class irqrouter_sysdev_class = { - set_kset_name("irqrouter"), + .name = "irqrouter", .resume = irqrouter_resume, }; diff --git a/drivers/base/class.c b/drivers/base/class.c index 61fd26cc9f0e..b962a76875d2 100644 --- a/drivers/base/class.c +++ b/drivers/base/class.c @@ -466,7 +466,6 @@ static struct kset_uevent_ops class_uevent_ops = { * entirely soon. */ static struct kset class_obj_subsys = { - .kobj = { .k_name = "class_obj", }, .uevent_ops = &class_uevent_ops, }; @@ -872,6 +871,7 @@ int __init classes_init(void) /* ick, this is ugly, the things we go through to keep from showing up * in sysfs... */ kset_init(&class_obj_subsys); + kobject_set_name(&class_obj_subsys.kobj, "class_obj"); if (!class_obj_subsys.kobj.parent) class_obj_subsys.kobj.parent = &class_obj_subsys.kobj; return 0; diff --git a/drivers/base/cpu.c b/drivers/base/cpu.c index 40545071e3c9..c5885f5ce0ac 100644 --- a/drivers/base/cpu.c +++ b/drivers/base/cpu.c @@ -14,7 +14,7 @@ #include "base.h" struct sysdev_class cpu_sysdev_class = { - set_kset_name("cpu"), + .name = "cpu", }; EXPORT_SYMBOL(cpu_sysdev_class); diff --git a/drivers/base/memory.c b/drivers/base/memory.c index 7868707c7eda..7ae413fdd5fc 100644 --- a/drivers/base/memory.c +++ b/drivers/base/memory.c @@ -26,7 +26,7 @@ #define MEMORY_CLASS_NAME "memory" static struct sysdev_class memory_sysdev_class = { - set_kset_name(MEMORY_CLASS_NAME), + .name = MEMORY_CLASS_NAME, }; static const char *memory_uevent_name(struct kset *kset, struct kobject *kobj) diff --git a/drivers/base/node.c b/drivers/base/node.c index 88eeed72b5d6..e59861f18ce5 100644 --- a/drivers/base/node.c +++ b/drivers/base/node.c @@ -15,7 +15,7 @@ #include static struct sysdev_class node_class = { - set_kset_name("node"), + .name = "node", }; diff --git a/drivers/base/sys.c b/drivers/base/sys.c index e666441dd76b..2f79c55acdcc 100644 --- a/drivers/base/sys.c +++ b/drivers/base/sys.c @@ -136,6 +136,7 @@ int sysdev_class_register(struct sysdev_class * cls) cls->kset.kobj.parent = &system_kset->kobj; cls->kset.kobj.ktype = &ktype_sysdev_class; cls->kset.kobj.kset = system_kset; + kobject_set_name(&cls->kset.kobj, cls->name); return kset_register(&cls->kset); } diff --git a/drivers/edac/edac_module.c b/drivers/edac/edac_module.c index e0c4a4086055..7e1374afd967 100644 --- a/drivers/edac/edac_module.c +++ b/drivers/edac/edac_module.c @@ -31,7 +31,7 @@ struct workqueue_struct *edac_workqueue; * need to export to other files in this modules */ static struct sysdev_class edac_class = { - set_kset_name("edac"), + .name = "edac", }; static int edac_class_valid; diff --git a/drivers/kvm/kvm_main.c b/drivers/kvm/kvm_main.c index 47c10b8f89b3..c0f372f1d761 100644 --- 
a/drivers/kvm/kvm_main.c +++ b/drivers/kvm/kvm_main.c @@ -3451,7 +3451,7 @@ static int kvm_resume(struct sys_device *dev) } static struct sysdev_class kvm_sysdev_class = { - set_kset_name("kvm"), + .name = "kvm", .suspend = kvm_suspend, .resume = kvm_resume, }; diff --git a/drivers/macintosh/via-pmu.c b/drivers/macintosh/via-pmu.c index 6123c70153d3..ac420b17e16f 100644 --- a/drivers/macintosh/via-pmu.c +++ b/drivers/macintosh/via-pmu.c @@ -2796,7 +2796,7 @@ static int pmu_sys_resume(struct sys_device *sysdev) #endif /* CONFIG_PM_SLEEP && CONFIG_PPC32 */ static struct sysdev_class pmu_sysclass = { - set_kset_name("pmu"), + .name = "pmu", }; static struct sys_device device_pmu = { diff --git a/drivers/scsi/libsas/sas_scsi_host.c b/drivers/scsi/libsas/sas_scsi_host.c index 7663841eb4cf..a3fdc57e2673 100644 --- a/drivers/scsi/libsas/sas_scsi_host.c +++ b/drivers/scsi/libsas/sas_scsi_host.c @@ -464,7 +464,7 @@ int sas_eh_bus_reset_handler(struct scsi_cmnd *cmd) res = sas_phy_reset(phy, 1); if (res) SAS_DPRINTK("Bus reset of %s failed 0x%x\n", - phy->dev.kobj.k_name, + kobject_name(&phy->dev.kobj), res); if (res == TMF_RESP_FUNC_SUCC || res == TMF_RESP_FUNC_COMPLETE) return SUCCESS; diff --git a/include/linux/kobject.h b/include/linux/kobject.h index 504ac0eb4412..4adbe1d83081 100644 --- a/include/linux/kobject.h +++ b/include/linux/kobject.h @@ -61,7 +61,7 @@ enum kobject_action { }; struct kobject { - const char * k_name; + const char *name; struct kref kref; struct list_head entry; struct kobject * parent; @@ -69,7 +69,6 @@ struct kobject { struct kobj_type * ktype; struct sysfs_dirent * sd; unsigned int state_initialized:1; - unsigned int state_name_set:1; unsigned int state_in_sysfs:1; unsigned int state_add_uevent_sent:1; unsigned int state_remove_uevent_sent:1; @@ -80,7 +79,7 @@ extern int kobject_set_name(struct kobject *, const char *, ...) static inline const char * kobject_name(const struct kobject * kobj) { - return kobj->k_name; + return kobj->name; } extern void kobject_init(struct kobject *kobj, struct kobj_type *ktype); @@ -189,14 +188,6 @@ static inline struct kobj_type *get_ktype(struct kobject *kobj) extern struct kobject * kset_find_obj(struct kset *, const char *); - -/* - * Use this when initializing an embedded kset with no other - * fields to initialize. 
- */ -#define set_kset_name(str) .kset = { .kobj = { .k_name = str } } - - /* The global /sys/kernel/ kobject for people to chain off of */ extern struct kobject *kernel_kobj; /* The global /sys/hypervisor/ kobject for people to chain off of */ diff --git a/include/linux/sysdev.h b/include/linux/sysdev.h index e285746588d6..f752e73bf977 100644 --- a/include/linux/sysdev.h +++ b/include/linux/sysdev.h @@ -29,6 +29,7 @@ struct sys_device; struct sysdev_class { + const char *name; struct list_head drivers; /* Default operations for these types of devices */ diff --git a/kernel/rtmutex-tester.c b/kernel/rtmutex-tester.c index e3055ba69159..092e4c620af9 100644 --- a/kernel/rtmutex-tester.c +++ b/kernel/rtmutex-tester.c @@ -394,7 +394,7 @@ static SYSDEV_ATTR(status, 0600, sysfs_test_status, NULL); static SYSDEV_ATTR(command, 0600, NULL, sysfs_test_command); static struct sysdev_class rttest_sysclass = { - set_kset_name("rttest"), + .name = "rttest", }; static int init_test_thread(int id) diff --git a/kernel/time/clocksource.c b/kernel/time/clocksource.c index c8a9d13874df..8d6125ad2cf0 100644 --- a/kernel/time/clocksource.c +++ b/kernel/time/clocksource.c @@ -441,7 +441,7 @@ static SYSDEV_ATTR(available_clocksource, 0600, sysfs_show_available_clocksources, NULL); static struct sysdev_class clocksource_sysclass = { - set_kset_name("clocksource"), + .name = "clocksource", }; static struct sys_device device_clocksource = { diff --git a/kernel/time/timekeeping.c b/kernel/time/timekeeping.c index e5e466b27598..ab46ae8c062b 100644 --- a/kernel/time/timekeeping.c +++ b/kernel/time/timekeeping.c @@ -335,9 +335,9 @@ static int timekeeping_suspend(struct sys_device *dev, pm_message_t state) /* sysfs resume/suspend bits for timekeeping */ static struct sysdev_class timekeeping_sysclass = { + .name = "timekeeping", .resume = timekeeping_resume, .suspend = timekeeping_suspend, - set_kset_name("timekeeping"), }; static struct sys_device device_timer = { diff --git a/lib/kobject.c b/lib/kobject.c index a0773734545c..8dc32454661d 100644 --- a/lib/kobject.c +++ b/lib/kobject.c @@ -165,7 +165,7 @@ static int kobject_add_internal(struct kobject *kobj) if (!kobj) return -ENOENT; - if (!kobj->k_name || !kobj->k_name[0]) { + if (!kobj->name || !kobj->name[0]) { pr_debug("kobject: (%p): attempted to be registered with empty " "name!\n", kobj); WARN_ON(1); @@ -228,13 +228,11 @@ static int kobject_set_name_vargs(struct kobject *kobj, const char *fmt, if (!name) return -ENOMEM; - /* Free the old name, if necessary. 
*/ - kfree(kobj->k_name); + kfree(kobj->name); /* Now, set the new name */ - kobj->k_name = name; - kobj->state_name_set = 1; + kobj->name = name; return 0; } @@ -295,7 +293,6 @@ void kobject_init(struct kobject *kobj, struct kobj_type *ktype) kref_init(&kobj->kref); INIT_LIST_HEAD(&kobj->entry); kobj->ktype = ktype; - kobj->state_name_set = 0; kobj->state_in_sysfs = 0; kobj->state_add_uevent_sent = 0; kobj->state_remove_uevent_sent = 0; @@ -551,8 +548,7 @@ struct kobject * kobject_get(struct kobject * kobj) static void kobject_cleanup(struct kobject *kobj) { struct kobj_type *t = get_ktype(kobj); - const char *name = kobj->k_name; - int name_set = kobj->state_name_set; + const char *name = kobj->name; pr_debug("kobject: '%s' (%p): %s\n", kobject_name(kobj), kobj, __FUNCTION__); @@ -583,7 +579,7 @@ static void kobject_cleanup(struct kobject *kobj) } /* free name if we allocated it */ - if (name_set && name) { + if (name) { pr_debug("kobject: '%s': free name\n", name); kfree(name); } -- cgit v1.2.3 From 6ea6dd93c9454cc9521134f907bc970d09f460e4 Mon Sep 17 00:00:00 2001 From: Haavard Skinnemoen Date: Tue, 27 Nov 2007 13:02:40 +0100 Subject: ptrace: Call arch_ptrace_attach() when request=PTRACE_TRACEME arch_ptrace_attach() is a hook that allows the architecture to do book-keeping after a ptrace attach. This patch adds a call to this hook when handling a PTRACE_TRACEME request as well. Currently only one architecture, m32r, implements this hook. When called, it initializes a number of debug trap slots in the ptraced task's thread struct, and it looks to me like this is the right thing to do after a PTRACE_TRACEME request as well, not only after PTRACE_ATTACH. Please correct me if I'm wrong. I want to use this hook on AVR32 to turn the debugging hardware on when a process is actually being debugged and keep it off otherwise. To be able to do this, I need to intercept PTRACE_TRACEME and PTRACE_ATTACH, as well as PTRACE_DETACH and thread exit. The latter two can be handled by existing hooks. Signed-off-by: Haavard Skinnemoen --- kernel/ptrace.c | 2 ++ 1 file changed, 2 insertions(+) (limited to 'kernel') diff --git a/kernel/ptrace.c b/kernel/ptrace.c index c25db863081d..c719bb9d79ab 100644 --- a/kernel/ptrace.c +++ b/kernel/ptrace.c @@ -470,6 +470,8 @@ asmlinkage long sys_ptrace(long request, long pid, long addr, long data) lock_kernel(); if (request == PTRACE_TRACEME) { ret = ptrace_traceme(); + if (!ret) + arch_ptrace_attach(current); goto out; } -- cgit v1.2.3 From 32a76006683f7b28ae3cc491da37716e002f198e Mon Sep 17 00:00:00 2001 From: Ingo Molnar Date: Fri, 25 Jan 2008 21:07:58 +0100 Subject: printk: make printk more robust by not allowing recursion make printk more robust by allowing recursion only if there's a crash going on. Also add recursion detection. I've tested it with an artificially injected printk recursion - instead of a lockup or spontaneous reboot or other crash, the output was a well controlled: [ 41.057335] SysRq : <2>BUG: recent printk recursion! [ 41.057335] loglevel0-8 reBoot Crashdump show-all-locks(D) tErm Full kIll saK showMem Nice powerOff showPc show-all-timers(Q) unRaw Sync showTasks Unmount shoW-blocked-tasks also do all this printk-debug logic with irqs disabled. 
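As a rough user-space model of the guard described above (an illustrative sketch, not part of the patch): the CPU currently printing is remembered, re-entry on the same CPU is allowed only while an oops is in progress, and otherwise the recursion is flagged and reported on the next call. The names my_printk, printing_cpu, in_oops and recursion_flagged are hypothetical stand-ins for vprintk(), printk_cpu, oops_in_progress and printk_recursion_bug; locking, real per-CPU handling and the log buffer are omitted.

#include <stdio.h>
#include <stdarg.h>

static int printing_cpu = -1;     /* models printk_cpu: CPU inside my_printk()   */
static int in_oops = 0;           /* models oops_in_progress                     */
static int recursion_flagged = 0; /* models printk_recursion_bug                 */

static int this_cpu(void) { return 0; }   /* single-CPU model */

static int my_printk(const char *fmt, ...)
{
	va_list args;
	int len = 0;

	if (printing_cpu == this_cpu()) {
		/* Re-entered on the same CPU: only continue if a crash is in progress. */
		if (!in_oops) {
			recursion_flagged = 1;   /* report it at the next call */
			return 0;
		}
	}
	printing_cpu = this_cpu();

	if (recursion_flagged) {
		recursion_flagged = 0;
		len += printf("BUG: recent printk recursion!\n");
	}

	va_start(args, fmt);
	len += vprintf(fmt, args);
	va_end(args);

	printing_cpu = -1;           /* the real code resets printk_cpu to UINT_MAX */
	return len;
}

int main(void)
{
	my_printk("hello %d\n", 42);
	return 0;
}
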
Signed-off-by: Ingo Molnar Reviewed-by: Nick Piggin --- kernel/printk.c | 48 ++++++++++++++++++++++++++++++++++++++---------- 1 file changed, 38 insertions(+), 10 deletions(-) (limited to 'kernel') diff --git a/kernel/printk.c b/kernel/printk.c index 89011bf8c106..e6c1f36d8c3a 100644 --- a/kernel/printk.c +++ b/kernel/printk.c @@ -628,30 +628,57 @@ asmlinkage int printk(const char *fmt, ...) /* cpu currently holding logbuf_lock */ static volatile unsigned int printk_cpu = UINT_MAX; +const char printk_recursion_bug_msg [] = + KERN_CRIT "BUG: recent printk recursion!\n"; +static int printk_recursion_bug; + asmlinkage int vprintk(const char *fmt, va_list args) { + static int log_level_unknown = 1; + static char printk_buf[1024]; + unsigned long flags; - int printed_len; + int printed_len = 0; + int this_cpu; char *p; - static char printk_buf[1024]; - static int log_level_unknown = 1; boot_delay_msec(); preempt_disable(); - if (unlikely(oops_in_progress) && printk_cpu == smp_processor_id()) - /* If a crash is occurring during printk() on this CPU, - * make sure we can't deadlock */ - zap_locks(); - /* This stops the holder of console_sem just where we want him */ raw_local_irq_save(flags); + this_cpu = smp_processor_id(); + + /* + * Ouch, printk recursed into itself! + */ + if (unlikely(printk_cpu == this_cpu)) { + /* + * If a crash is occurring during printk() on this CPU, + * then try to get the crash message out but make sure + * we can't deadlock. Otherwise just return to avoid the + * recursion and return - but flag the recursion so that + * it can be printed at the next appropriate moment: + */ + if (!oops_in_progress) { + printk_recursion_bug = 1; + goto out_restore_irqs; + } + zap_locks(); + } + lockdep_off(); spin_lock(&logbuf_lock); - printk_cpu = smp_processor_id(); + printk_cpu = this_cpu; + if (printk_recursion_bug) { + printk_recursion_bug = 0; + strcpy(printk_buf, printk_recursion_bug_msg); + printed_len = sizeof(printk_recursion_bug_msg); + } /* Emit the output into the temporary buffer */ - printed_len = vscnprintf(printk_buf, sizeof(printk_buf), fmt, args); + printed_len += vscnprintf(printk_buf + printed_len, + sizeof(printk_buf), fmt, args); /* * Copy the output into log_buf. If the caller didn't provide @@ -744,6 +771,7 @@ asmlinkage int vprintk(const char *fmt, va_list args) printk_cpu = UINT_MAX; spin_unlock(&logbuf_lock); lockdep_on(); +out_restore_irqs: raw_local_irq_restore(flags); } -- cgit v1.2.3 From d713f519332e029d43eca8462629314eee1ded86 Mon Sep 17 00:00:00 2001 From: Ingo Molnar Date: Fri, 25 Jan 2008 21:07:58 +0100 Subject: sched: fix CONFIG_PRINT_TIME's reliance on sched_clock() Stefano Brivio reported weird printk timestamp behavior during CPU frequency changes: http://bugzilla.kernel.org/show_bug.cgi?id=9475 fix CONFIG_PRINT_TIME's reliance on sched_clock() and use cpu_clock() instead. 
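As a small illustration of what the timestamp path does with such a clock value (a sketch under assumed numbers, not part of the patch): the nanosecond counter returned by the clock is split into seconds and microseconds exactly as the do_div() in vprintk() does before printing the "[%5lu.%06lu]" prefix. The constant below is a made-up clock reading chosen to reproduce the "[ 41.057335]" style of prefix shown in the earlier log excerpt.

#include <stdio.h>

int main(void)
{
	unsigned long long t = 41057335000ULL;   /* pretend cpu_clock() value, in ns */
	unsigned long nanosec_rem = (unsigned long)(t % 1000000000ULL);

	t /= 1000000000ULL;                       /* do_div(t, 1000000000) in the kernel */
	printf("<%c>[%5lu.%06lu] printk timestamp demo\n",
	       '6', (unsigned long)t, nanosec_rem / 1000);
	return 0;
}
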
Reported-and-bisected-by: Stefano Brivio Signed-off-by: Ingo Molnar --- kernel/printk.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/printk.c b/kernel/printk.c index e6c1f36d8c3a..5f9d053699f9 100644 --- a/kernel/printk.c +++ b/kernel/printk.c @@ -707,7 +707,7 @@ asmlinkage int vprintk(const char *fmt, va_list args) loglev_char = default_message_loglevel + '0'; } - t = printk_clock(); + t = cpu_clock(printk_cpu); nanosec_rem = do_div(t, 1000000000); tlen = sprintf(tbuf, "<%c>[%5lu.%06lu] ", -- cgit v1.2.3 From b842271fbb9c8b5fd0e1c3e1895a3b67ba5bcc54 Mon Sep 17 00:00:00 2001 From: Ingo Molnar Date: Fri, 25 Jan 2008 21:07:59 +0100 Subject: sched: remove printk_clock() printk_clock() is obsolete - it has been replaced with cpu_clock(). Signed-off-by: Ingo Molnar --- arch/arm/kernel/time.c | 11 ----------- arch/ia64/kernel/time.c | 27 --------------------------- kernel/printk.c | 5 ----- 3 files changed, 43 deletions(-) (limited to 'kernel') diff --git a/arch/arm/kernel/time.c b/arch/arm/kernel/time.c index f6f3689a86ee..e59b5b84168d 100644 --- a/arch/arm/kernel/time.c +++ b/arch/arm/kernel/time.c @@ -79,17 +79,6 @@ static unsigned long dummy_gettimeoffset(void) } #endif -/* - * An implementation of printk_clock() independent from - * sched_clock(). This avoids non-bootable kernels when - * printk_clock is enabled. - */ -unsigned long long printk_clock(void) -{ - return (unsigned long long)(jiffies - INITIAL_JIFFIES) * - (1000000000 / HZ); -} - static unsigned long next_rtc_update; /* diff --git a/arch/ia64/kernel/time.c b/arch/ia64/kernel/time.c index 2bb84214e5f1..3ab042720970 100644 --- a/arch/ia64/kernel/time.c +++ b/arch/ia64/kernel/time.c @@ -344,33 +344,6 @@ udelay (unsigned long usecs) } EXPORT_SYMBOL(udelay); -static unsigned long long ia64_itc_printk_clock(void) -{ - if (ia64_get_kr(IA64_KR_PER_CPU_DATA)) - return sched_clock(); - return 0; -} - -static unsigned long long ia64_default_printk_clock(void) -{ - return (unsigned long long)(jiffies_64 - INITIAL_JIFFIES) * - (1000000000/HZ); -} - -unsigned long long (*ia64_printk_clock)(void) = &ia64_default_printk_clock; - -unsigned long long printk_clock(void) -{ - return ia64_printk_clock(); -} - -void __init -ia64_setup_printk_clock(void) -{ - if (!(sal_platform_features & IA64_SAL_PLATFORM_FEATURE_ITC_DRIFT)) - ia64_printk_clock = ia64_itc_printk_clock; -} - /* IA64 doesn't cache the timezone */ void update_vsyscall_tz(void) { diff --git a/kernel/printk.c b/kernel/printk.c index 5f9d053699f9..3b7c968d0ef9 100644 --- a/kernel/printk.c +++ b/kernel/printk.c @@ -573,11 +573,6 @@ static int __init printk_time_setup(char *str) __setup("time", printk_time_setup); -__attribute__((weak)) unsigned long long printk_clock(void) -{ - return sched_clock(); -} - /* Check if we have any console registered that can be called early in boot. 
*/ static int have_callable_console(void) { -- cgit v1.2.3 From 93f992ccc008dd4030381caeebb252e85e66684b Mon Sep 17 00:00:00 2001 From: Srivatsa Vaddagiri Date: Fri, 25 Jan 2008 21:07:59 +0100 Subject: sched: group scheduling code cleanup Minor cleanups: - Fix coding style - remove obsolete comment Signed-off-by: Srivatsa Vaddagiri Signed-off-by: Ingo Molnar --- kernel/sched.c | 21 +++------------------ 1 file changed, 3 insertions(+), 18 deletions(-) (limited to 'kernel') diff --git a/kernel/sched.c b/kernel/sched.c index e76b11ca6df3..7f827b70ae02 100644 --- a/kernel/sched.c +++ b/kernel/sched.c @@ -191,12 +191,12 @@ struct task_group init_task_group = { }; #ifdef CONFIG_FAIR_USER_SCHED -# define INIT_TASK_GRP_LOAD 2*NICE_0_LOAD +# define INIT_TASK_GROUP_LOAD 2*NICE_0_LOAD #else -# define INIT_TASK_GRP_LOAD NICE_0_LOAD +# define INIT_TASK_GROUP_LOAD NICE_0_LOAD #endif -static int init_task_group_load = INIT_TASK_GRP_LOAD; +static int init_task_group_load = INIT_TASK_GROUP_LOAD; /* return group to which a task belongs */ static inline struct task_group *task_group(struct task_struct *p) @@ -881,21 +881,6 @@ static inline void cpuacct_charge(struct task_struct *tsk, u64 cputime) {} #define sched_class_highest (&rt_sched_class) -/* - * Update delta_exec, delta_fair fields for rq. - * - * delta_fair clock advances at a rate inversely proportional to - * total load (rq->load.weight) on the runqueue, while - * delta_exec advances at the same rate as wall-clock (provided - * cpu is not idle). - * - * delta_exec / delta_fair is a measure of the (smoothened) load on this - * runqueue over any given interval. This (smoothened) load is used - * during load balance. - * - * This function is called /before/ updating rq->load - * and when switching tasks. - */ static inline void inc_load(struct rq *rq, const struct task_struct *p) { update_load_add(&rq->load, p->se.load.weight); -- cgit v1.2.3 From ec2c507fe8c8fa3c04fc6cb99a382a965c477379 Mon Sep 17 00:00:00 2001 From: Srivatsa Vaddagiri Date: Fri, 25 Jan 2008 21:07:59 +0100 Subject: sched: group scheduling, minor fixes Minor bug fixes for the group scheduler: - Use a mutex to serialize add/remove of task groups and also when changing shares of a task group. Use the same mutex when printing cfs_rq debugging stats for various task groups. - Use list_for_each_entry_rcu in for_each_leaf_cfs_rq macro (when walking task group list) Signed-off-by: Srivatsa Vaddagiri Signed-off-by: Ingo Molnar --- kernel/sched.c | 33 +++++++++++++++++++++++++-------- kernel/sched_fair.c | 4 +++- 2 files changed, 28 insertions(+), 9 deletions(-) (limited to 'kernel') diff --git a/kernel/sched.c b/kernel/sched.c index 7f827b70ae02..cfa695819252 100644 --- a/kernel/sched.c +++ b/kernel/sched.c @@ -169,8 +169,6 @@ struct task_group { /* runqueue "owned" by this group on each cpu */ struct cfs_rq **cfs_rq; unsigned long shares; - /* spinlock to serialize modification to shares */ - spinlock_t lock; struct rcu_head rcu; }; @@ -182,6 +180,11 @@ static DEFINE_PER_CPU(struct cfs_rq, init_cfs_rq) ____cacheline_aligned_in_smp; static struct sched_entity *init_sched_entity_p[NR_CPUS]; static struct cfs_rq *init_cfs_rq_p[NR_CPUS]; +/* task_group_mutex serializes add/remove of task groups and also changes to + * a task group's cpu shares. + */ +static DEFINE_MUTEX(task_group_mutex); + /* Default task group. * Every task in system belong to this group at bootup. 
*/ @@ -221,9 +224,21 @@ static inline void set_task_cfs_rq(struct task_struct *p, unsigned int cpu) p->se.parent = task_group(p)->se[cpu]; } +static inline void lock_task_group_list(void) +{ + mutex_lock(&task_group_mutex); +} + +static inline void unlock_task_group_list(void) +{ + mutex_unlock(&task_group_mutex); +} + #else static inline void set_task_cfs_rq(struct task_struct *p, unsigned int cpu) { } +static inline void lock_task_group_list(void) { } +static inline void unlock_task_group_list(void) { } #endif /* CONFIG_FAIR_GROUP_SCHED */ @@ -6768,7 +6783,6 @@ void __init sched_init(void) se->parent = NULL; } init_task_group.shares = init_task_group_load; - spin_lock_init(&init_task_group.lock); #endif for (j = 0; j < CPU_LOAD_IDX_MAX; j++) @@ -7008,14 +7022,15 @@ struct task_group *sched_create_group(void) se->parent = NULL; } + tg->shares = NICE_0_LOAD; + + lock_task_group_list(); for_each_possible_cpu(i) { rq = cpu_rq(i); cfs_rq = tg->cfs_rq[i]; list_add_rcu(&cfs_rq->leaf_cfs_rq_list, &rq->leaf_cfs_rq_list); } - - tg->shares = NICE_0_LOAD; - spin_lock_init(&tg->lock); + unlock_task_group_list(); return tg; @@ -7061,10 +7076,12 @@ void sched_destroy_group(struct task_group *tg) struct cfs_rq *cfs_rq = NULL; int i; + lock_task_group_list(); for_each_possible_cpu(i) { cfs_rq = tg->cfs_rq[i]; list_del_rcu(&cfs_rq->leaf_cfs_rq_list); } + unlock_task_group_list(); BUG_ON(!cfs_rq); @@ -7146,7 +7163,7 @@ int sched_group_set_shares(struct task_group *tg, unsigned long shares) if (shares < 2) shares = 2; - spin_lock(&tg->lock); + lock_task_group_list(); if (tg->shares == shares) goto done; @@ -7155,7 +7172,7 @@ int sched_group_set_shares(struct task_group *tg, unsigned long shares) set_se_shares(tg->se[i], shares); done: - spin_unlock(&tg->lock); + unlock_task_group_list(); return 0; } diff --git a/kernel/sched_fair.c b/kernel/sched_fair.c index da7c061e7206..0c5fdce67228 100644 --- a/kernel/sched_fair.c +++ b/kernel/sched_fair.c @@ -690,7 +690,7 @@ static inline struct cfs_rq *cpu_cfs_rq(struct cfs_rq *cfs_rq, int this_cpu) /* Iterate thr' all leaf cfs_rq's on a runqueue */ #define for_each_leaf_cfs_rq(rq, cfs_rq) \ - list_for_each_entry(cfs_rq, &rq->leaf_cfs_rq_list, leaf_cfs_rq_list) + list_for_each_entry_rcu(cfs_rq, &rq->leaf_cfs_rq_list, leaf_cfs_rq_list) /* Do the two (enqueued) entities belong to the same group ? */ static inline int @@ -1132,7 +1132,9 @@ static void print_cfs_stats(struct seq_file *m, int cpu) #ifdef CONFIG_FAIR_GROUP_SCHED print_cfs_rq(m, cpu, &cpu_rq(cpu)->cfs); #endif + lock_task_group_list(); for_each_leaf_cfs_rq(cpu_rq(cpu), cfs_rq) print_cfs_rq(m, cpu, cfs_rq); + unlock_task_group_list(); } #endif -- cgit v1.2.3 From 58e2d4ca581167c2a079f4ee02be2f0bc52e8729 Mon Sep 17 00:00:00 2001 From: Srivatsa Vaddagiri Date: Fri, 25 Jan 2008 21:08:00 +0100 Subject: sched: group scheduling, change how cpu load is calculated This patch changes how the cpu load exerted by fair_sched_class tasks is calculated. Load exerted by fair_sched_class tasks on a cpu is now a summation of the group weights, rather than summation of task weights. Weight exerted by a group on a cpu is dependent on the shares allocated to it. This version of patch has a minor impact on code size, but should have no runtime/functional impact for !CONFIG_FAIR_GROUP_SCHED. 
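To make the accounting change concrete (an illustrative sketch with made-up numbers, not part of the patch): with one task group holding ten runnable tasks on a CPU and group shares equal to NICE_0_LOAD, the old scheme contributed the sum of the task weights to rq->load, while the new scheme contributes only the group's own weight on that CPU.

#include <stdio.h>

#define NICE_0_LOAD 1024UL

int main(void)
{
	unsigned long ntasks = 10;                 /* runnable tasks of one group on this CPU */
	unsigned long group_weight = NICE_0_LOAD;  /* the group's se->load.weight on this CPU */

	unsigned long old_load = ntasks * NICE_0_LOAD; /* summation of task weights  */
	unsigned long new_load = group_weight;         /* summation of group weights */

	printf("old rq->load contribution: %lu\n", old_load);
	printf("new rq->load contribution: %lu\n", new_load);
	return 0;
}
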
Signed-off-by: Srivatsa Vaddagiri Signed-off-by: Ingo Molnar --- kernel/sched.c | 27 +++++++++++---------------- kernel/sched_fair.c | 31 +++++++++++++++++++++++++++---- kernel/sched_rt.c | 2 ++ 3 files changed, 40 insertions(+), 20 deletions(-) (limited to 'kernel') diff --git a/kernel/sched.c b/kernel/sched.c index cfa695819252..c915f3e6e593 100644 --- a/kernel/sched.c +++ b/kernel/sched.c @@ -886,6 +886,16 @@ static void cpuacct_charge(struct task_struct *tsk, u64 cputime); static inline void cpuacct_charge(struct task_struct *tsk, u64 cputime) {} #endif +static inline void inc_cpu_load(struct rq *rq, unsigned long load) +{ + update_load_add(&rq->load, load); +} + +static inline void dec_cpu_load(struct rq *rq, unsigned long load) +{ + update_load_sub(&rq->load, load); +} + #include "sched_stats.h" #include "sched_idletask.c" #include "sched_fair.c" @@ -896,26 +906,14 @@ static inline void cpuacct_charge(struct task_struct *tsk, u64 cputime) {} #define sched_class_highest (&rt_sched_class) -static inline void inc_load(struct rq *rq, const struct task_struct *p) -{ - update_load_add(&rq->load, p->se.load.weight); -} - -static inline void dec_load(struct rq *rq, const struct task_struct *p) -{ - update_load_sub(&rq->load, p->se.load.weight); -} - static void inc_nr_running(struct task_struct *p, struct rq *rq) { rq->nr_running++; - inc_load(rq, p); } static void dec_nr_running(struct task_struct *p, struct rq *rq) { rq->nr_running--; - dec_load(rq, p); } static void set_load_weight(struct task_struct *p) @@ -4087,10 +4085,8 @@ void set_user_nice(struct task_struct *p, long nice) goto out_unlock; } on_rq = p->se.on_rq; - if (on_rq) { + if (on_rq) dequeue_task(rq, p, 0); - dec_load(rq, p); - } p->static_prio = NICE_TO_PRIO(nice); set_load_weight(p); @@ -4100,7 +4096,6 @@ void set_user_nice(struct task_struct *p, long nice) if (on_rq) { enqueue_task(rq, p, 0); - inc_load(rq, p); /* * If the task increased its priority or is running and * lowered its priority, then reschedule its CPU: diff --git a/kernel/sched_fair.c b/kernel/sched_fair.c index 0c5fdce67228..30ae9c2a2861 100644 --- a/kernel/sched_fair.c +++ b/kernel/sched_fair.c @@ -760,15 +760,26 @@ static inline struct sched_entity *parent_entity(struct sched_entity *se) static void enqueue_task_fair(struct rq *rq, struct task_struct *p, int wakeup) { struct cfs_rq *cfs_rq; - struct sched_entity *se = &p->se; + struct sched_entity *se = &p->se, + *topse = NULL; /* Highest schedulable entity */ + int incload = 1; for_each_sched_entity(se) { - if (se->on_rq) + topse = se; + if (se->on_rq) { + incload = 0; break; + } cfs_rq = cfs_rq_of(se); enqueue_entity(cfs_rq, se, wakeup); wakeup = 1; } + /* Increment cpu load if we just enqueued the first task of a group on + * 'rq->cpu'. 'topse' represents the group to which task 'p' belongs + * at the highest grouping level. 
+ */ + if (incload) + inc_cpu_load(rq, topse->load.weight); } /* @@ -779,16 +790,28 @@ static void enqueue_task_fair(struct rq *rq, struct task_struct *p, int wakeup) static void dequeue_task_fair(struct rq *rq, struct task_struct *p, int sleep) { struct cfs_rq *cfs_rq; - struct sched_entity *se = &p->se; + struct sched_entity *se = &p->se, + *topse = NULL; /* Highest schedulable entity */ + int decload = 1; for_each_sched_entity(se) { + topse = se; cfs_rq = cfs_rq_of(se); dequeue_entity(cfs_rq, se, sleep); /* Don't dequeue parent if it has other entities besides us */ - if (cfs_rq->load.weight) + if (cfs_rq->load.weight) { + if (parent_entity(se)) + decload = 0; break; + } sleep = 1; } + /* Decrement cpu load if we just dequeued the last task of a group on + * 'rq->cpu'. 'topse' represents the group to which task 'p' belongs + * at the highest grouping level. + */ + if (decload) + dec_cpu_load(rq, topse->load.weight); } /* diff --git a/kernel/sched_rt.c b/kernel/sched_rt.c index 9ba3daa03475..cefcd5105146 100644 --- a/kernel/sched_rt.c +++ b/kernel/sched_rt.c @@ -32,6 +32,7 @@ static void enqueue_task_rt(struct rq *rq, struct task_struct *p, int wakeup) list_add_tail(&p->run_list, array->queue + p->prio); __set_bit(p->prio, array->bitmap); + inc_cpu_load(rq, p->se.load.weight); } /* @@ -46,6 +47,7 @@ static void dequeue_task_rt(struct rq *rq, struct task_struct *p, int sleep) list_del(&p->run_list); if (list_empty(array->queue + p->prio)) __clear_bit(p->prio, array->bitmap); + dec_cpu_load(rq, p->se.load.weight); } /* -- cgit v1.2.3 From a183561567b5446d3362b4839bd4f744f4b2af1e Mon Sep 17 00:00:00 2001 From: Srivatsa Vaddagiri Date: Fri, 25 Jan 2008 21:08:00 +0100 Subject: sched: introduce a mutex and corresponding API to serialize access to doms_curarray doms_cur[] array represents various scheduling domains which are mutually exclusive. Currently cpusets code can modify this array (by calling partition_sched_domains()) as a result of user modifying sched_load_balance flag for various cpusets. This patch introduces a mutex and corresponding API (only when CONFIG_FAIR_GROUP_SCHED is defined) which allows a reader to safely read the doms_cur[] array w/o worrying abt concurrent modifications to the array. The fair group scheduler code (introduced in next patch of this series) makes use of this mutex to walk thr' doms_cur[] array while rebalancing shares of task groups across cpus. Signed-off-by: Srivatsa Vaddagiri Signed-off-by: Ingo Molnar --- kernel/sched.c | 19 +++++++++++++++++++ 1 file changed, 19 insertions(+) (limited to 'kernel') diff --git a/kernel/sched.c b/kernel/sched.c index c915f3e6e593..d9585f15043f 100644 --- a/kernel/sched.c +++ b/kernel/sched.c @@ -185,6 +185,9 @@ static struct cfs_rq *init_cfs_rq_p[NR_CPUS]; */ static DEFINE_MUTEX(task_group_mutex); +/* doms_cur_mutex serializes access to doms_cur[] array */ +static DEFINE_MUTEX(doms_cur_mutex); + /* Default task group. * Every task in system belong to this group at bootup. 
*/ @@ -234,11 +237,23 @@ static inline void unlock_task_group_list(void) mutex_unlock(&task_group_mutex); } +static inline void lock_doms_cur(void) +{ + mutex_lock(&doms_cur_mutex); +} + +static inline void unlock_doms_cur(void) +{ + mutex_unlock(&doms_cur_mutex); +} + #else static inline void set_task_cfs_rq(struct task_struct *p, unsigned int cpu) { } static inline void lock_task_group_list(void) { } static inline void unlock_task_group_list(void) { } +static inline void lock_doms_cur(void) { } +static inline void unlock_doms_cur(void) { } #endif /* CONFIG_FAIR_GROUP_SCHED */ @@ -6543,6 +6558,8 @@ void partition_sched_domains(int ndoms_new, cpumask_t *doms_new) { int i, j; + lock_doms_cur(); + /* always unregister in case we don't destroy any domains */ unregister_sched_domain_sysctl(); @@ -6583,6 +6600,8 @@ match2: ndoms_cur = ndoms_new; register_sched_domain_sysctl(); + + unlock_doms_cur(); } #if defined(CONFIG_SCHED_MC) || defined(CONFIG_SCHED_SMT) -- cgit v1.2.3 From 6b2d7700266b9402e12824e11e0099ae6a4a6a79 Mon Sep 17 00:00:00 2001 From: Srivatsa Vaddagiri Date: Fri, 25 Jan 2008 21:08:00 +0100 Subject: sched: group scheduler, fix fairness of cpu bandwidth allocation for task groups The current load balancing scheme isn't good enough for precise group fairness. For example: on a 8-cpu system, I created 3 groups as under: a = 8 tasks (cpu.shares = 1024) b = 4 tasks (cpu.shares = 1024) c = 3 tasks (cpu.shares = 1024) a, b and c are task groups that have equal weight. We would expect each of the groups to receive 33.33% of cpu bandwidth under a fair scheduler. This is what I get with the latest scheduler git tree: Signed-off-by: Ingo Molnar -------------------------------------------------------------------------------- Col1 | Col2 | Col3 | Col4 ------|---------|-------|------------------------------------------------------- a | 277.676 | 57.8% | 54.1% 54.1% 54.1% 54.2% 56.7% 62.2% 62.8% 64.5% b | 116.108 | 24.2% | 47.4% 48.1% 48.7% 49.3% c | 86.326 | 18.0% | 47.5% 47.9% 48.5% -------------------------------------------------------------------------------- Explanation of o/p: Col1 -> Group name Col2 -> Cumulative execution time (in seconds) received by all tasks of that group in a 60sec window across 8 cpus Col3 -> CPU bandwidth received by the group in the 60sec window, expressed in percentage. Col3 data is derived as: Col3 = 100 * Col2 / (NR_CPUS * 60) Col4 -> CPU bandwidth received by each individual task of the group. Col4 = 100 * cpu_time_recd_by_task / 60 [I can share the test case that produces a similar o/p if reqd] The deviation from desired group fairness is as below: a = +24.47% b = -9.13% c = -15.33% which is quite high. After the patch below is applied, here are the results: -------------------------------------------------------------------------------- Col1 | Col2 | Col3 | Col4 ------|---------|-------|------------------------------------------------------- a | 163.112 | 34.0% | 33.2% 33.4% 33.5% 33.5% 33.7% 34.4% 34.8% 35.3% b | 156.220 | 32.5% | 63.3% 64.5% 66.1% 66.5% c | 160.653 | 33.5% | 85.8% 90.6% 91.4% -------------------------------------------------------------------------------- Deviation from desired group fairness is as below: a = +0.67% b = -0.83% c = +0.17% which is far better IMO. Most of other runs have yielded a deviation within +-2% at the most, which is good. Why do we see bad (group) fairness with current scheuler? ========================================================= Currently cpu's weight is just the summation of individual task weights. 
This can yield incorrect results. For ex: consider three groups as below on a 2-cpu system: CPU0 CPU1 --------------------------- A (10) B(5) C(5) --------------------------- Group A has 10 tasks, all on CPU0, Group B and C have 5 tasks each all of which are on CPU1. Each task has the same weight (NICE_0_LOAD = 1024). The current scheme would yield a cpu weight of 10240 (10*1024) for each cpu and the load balancer will think both CPUs are perfectly balanced and won't move around any tasks. This, however, would yield this bandwidth: A = 50% B = 25% C = 25% which is not the desired result. What's changing in the patch? ============================= - How cpu weights are calculated when CONFIF_FAIR_GROUP_SCHED is defined (see below) - API Change - Two tunables introduced in sysfs (under SCHED_DEBUG) to control the frequency at which the load balance monitor thread runs. The basic change made in this patch is how cpu weight (rq->load.weight) is calculated. Its now calculated as the summation of group weights on a cpu, rather than summation of task weights. Weight exerted by a group on a cpu is dependent on the shares allocated to it and also the number of tasks the group has on that cpu compared to the total number of (runnable) tasks the group has in the system. Let, W(K,i) = Weight of group K on cpu i T(K,i) = Task load present in group K's cfs_rq on cpu i T(K) = Total task load of group K across various cpus S(K) = Shares allocated to group K NRCPUS = Number of online cpus in the scheduler domain to which group K is assigned. Then, W(K,i) = S(K) * NRCPUS * T(K,i) / T(K) A load balance monitor thread is created at bootup, which periodically runs and adjusts group's weight on each cpu. To avoid its overhead, two min/max tunables are introduced (under SCHED_DEBUG) to control the rate at which it runs. Fixes from: Peter Zijlstra - don't start the load_balance_monitor when there is only a single cpu. - rename the kthread because its currently longer than TASK_COMM_LEN Signed-off-by: Srivatsa Vaddagiri Signed-off-by: Ingo Molnar --- include/linux/sched.h | 4 + kernel/sched.c | 270 +++++++++++++++++++++++++++++++++++++++++++++++--- kernel/sched_fair.c | 84 ++++++++++------ kernel/sysctl.c | 18 ++++ 4 files changed, 331 insertions(+), 45 deletions(-) (limited to 'kernel') diff --git a/include/linux/sched.h b/include/linux/sched.h index d6eacda765ca..288245f83bd4 100644 --- a/include/linux/sched.h +++ b/include/linux/sched.h @@ -1453,6 +1453,10 @@ extern unsigned int sysctl_sched_child_runs_first; extern unsigned int sysctl_sched_features; extern unsigned int sysctl_sched_migration_cost; extern unsigned int sysctl_sched_nr_migrate; +#if defined(CONFIG_FAIR_GROUP_SCHED) && defined(CONFIG_SMP) +extern unsigned int sysctl_sched_min_bal_int_shares; +extern unsigned int sysctl_sched_max_bal_int_shares; +#endif int sched_nr_latency_handler(struct ctl_table *table, int write, struct file *file, void __user *buffer, size_t *length, diff --git a/kernel/sched.c b/kernel/sched.c index d9585f15043f..86e55a9c2de6 100644 --- a/kernel/sched.c +++ b/kernel/sched.c @@ -168,7 +168,43 @@ struct task_group { struct sched_entity **se; /* runqueue "owned" by this group on each cpu */ struct cfs_rq **cfs_rq; + + /* + * shares assigned to a task group governs how much of cpu bandwidth + * is allocated to the group. The more shares a group has, the more is + * the cpu bandwidth allocated to it. 
+ * + * For ex, lets say that there are three task groups, A, B and C which + * have been assigned shares 1000, 2000 and 3000 respectively. Then, + * cpu bandwidth allocated by the scheduler to task groups A, B and C + * should be: + * + * Bw(A) = 1000/(1000+2000+3000) * 100 = 16.66% + * Bw(B) = 2000/(1000+2000+3000) * 100 = 33.33% + * Bw(C) = 3000/(1000+2000+3000) * 100 = 50% + * + * The weight assigned to a task group's schedulable entities on every + * cpu (task_group.se[a_cpu]->load.weight) is derived from the task + * group's shares. For ex: lets say that task group A has been + * assigned shares of 1000 and there are two CPUs in a system. Then, + * + * tg_A->se[0]->load.weight = tg_A->se[1]->load.weight = 1000; + * + * Note: It's not necessary that each of a task's group schedulable + * entity have the same weight on all CPUs. If the group + * has 2 of its tasks on CPU0 and 1 task on CPU1, then a + * better distribution of weight could be: + * + * tg_A->se[0]->load.weight = 2/3 * 2000 = 1333 + * tg_A->se[1]->load.weight = 1/2 * 2000 = 667 + * + * rebalance_shares() is responsible for distributing the shares of a + * task groups like this among the group's schedulable entities across + * cpus. + * + */ unsigned long shares; + struct rcu_head rcu; }; @@ -188,6 +224,14 @@ static DEFINE_MUTEX(task_group_mutex); /* doms_cur_mutex serializes access to doms_cur[] array */ static DEFINE_MUTEX(doms_cur_mutex); +#ifdef CONFIG_SMP +/* kernel thread that runs rebalance_shares() periodically */ +static struct task_struct *lb_monitor_task; +static int load_balance_monitor(void *unused); +#endif + +static void set_se_shares(struct sched_entity *se, unsigned long shares); + /* Default task group. * Every task in system belong to this group at bootup. */ @@ -202,6 +246,8 @@ struct task_group init_task_group = { # define INIT_TASK_GROUP_LOAD NICE_0_LOAD #endif +#define MIN_GROUP_SHARES 2 + static int init_task_group_load = INIT_TASK_GROUP_LOAD; /* return group to which a task belongs */ @@ -6736,6 +6782,21 @@ void __init sched_init_smp(void) if (set_cpus_allowed(current, non_isolated_cpus) < 0) BUG(); sched_init_granularity(); + +#ifdef CONFIG_FAIR_GROUP_SCHED + if (nr_cpu_ids == 1) + return; + + lb_monitor_task = kthread_create(load_balance_monitor, NULL, + "group_balance"); + if (!IS_ERR(lb_monitor_task)) { + lb_monitor_task->flags |= PF_NOFREEZE; + wake_up_process(lb_monitor_task); + } else { + printk(KERN_ERR "Could not create load balance monitor thread" + "(error = %ld) \n", PTR_ERR(lb_monitor_task)); + } +#endif } #else void __init sched_init_smp(void) @@ -6988,6 +7049,157 @@ void set_curr_task(int cpu, struct task_struct *p) #ifdef CONFIG_FAIR_GROUP_SCHED +#ifdef CONFIG_SMP +/* + * distribute shares of all task groups among their schedulable entities, + * to reflect load distrbution across cpus. + */ +static int rebalance_shares(struct sched_domain *sd, int this_cpu) +{ + struct cfs_rq *cfs_rq; + struct rq *rq = cpu_rq(this_cpu); + cpumask_t sdspan = sd->span; + int balanced = 1; + + /* Walk thr' all the task groups that we have */ + for_each_leaf_cfs_rq(rq, cfs_rq) { + int i; + unsigned long total_load = 0, total_shares; + struct task_group *tg = cfs_rq->tg; + + /* Gather total task load of this group across cpus */ + for_each_cpu_mask(i, sdspan) + total_load += tg->cfs_rq[i]->load.weight; + + /* Nothing to do if this group has no load */ + if (!total_load) + continue; + + /* + * tg->shares represents the number of cpu shares the task group + * is eligible to hold on a single cpu. 
On N cpus, it is + * eligible to hold (N * tg->shares) number of cpu shares. + */ + total_shares = tg->shares * cpus_weight(sdspan); + + /* + * redistribute total_shares across cpus as per the task load + * distribution. + */ + for_each_cpu_mask(i, sdspan) { + unsigned long local_load, local_shares; + + local_load = tg->cfs_rq[i]->load.weight; + local_shares = (local_load * total_shares) / total_load; + if (!local_shares) + local_shares = MIN_GROUP_SHARES; + if (local_shares == tg->se[i]->load.weight) + continue; + + spin_lock_irq(&cpu_rq(i)->lock); + set_se_shares(tg->se[i], local_shares); + spin_unlock_irq(&cpu_rq(i)->lock); + balanced = 0; + } + } + + return balanced; +} + +/* + * How frequently should we rebalance_shares() across cpus? + * + * The more frequently we rebalance shares, the more accurate is the fairness + * of cpu bandwidth distribution between task groups. However higher frequency + * also implies increased scheduling overhead. + * + * sysctl_sched_min_bal_int_shares represents the minimum interval between + * consecutive calls to rebalance_shares() in the same sched domain. + * + * sysctl_sched_max_bal_int_shares represents the maximum interval between + * consecutive calls to rebalance_shares() in the same sched domain. + * + * These settings allows for the appropriate tradeoff between accuracy of + * fairness and the associated overhead. + * + */ + +/* default: 8ms, units: milliseconds */ +const_debug unsigned int sysctl_sched_min_bal_int_shares = 8; + +/* default: 128ms, units: milliseconds */ +const_debug unsigned int sysctl_sched_max_bal_int_shares = 128; + +/* kernel thread that runs rebalance_shares() periodically */ +static int load_balance_monitor(void *unused) +{ + unsigned int timeout = sysctl_sched_min_bal_int_shares; + struct sched_param schedparm; + int ret; + + /* + * We don't want this thread's execution to be limited by the shares + * assigned to default group (init_task_group). Hence make it run + * as a SCHED_RR RT task at the lowest priority. + */ + schedparm.sched_priority = 1; + ret = sched_setscheduler(current, SCHED_RR, &schedparm); + if (ret) + printk(KERN_ERR "Couldn't set SCHED_RR policy for load balance" + " monitor thread (error = %d) \n", ret); + + while (!kthread_should_stop()) { + int i, cpu, balanced = 1; + + /* Prevent cpus going down or coming up */ + lock_cpu_hotplug(); + /* lockout changes to doms_cur[] array */ + lock_doms_cur(); + /* + * Enter a rcu read-side critical section to safely walk rq->sd + * chain on various cpus and to walk task group list + * (rq->leaf_cfs_rq_list) in rebalance_shares(). + */ + rcu_read_lock(); + + for (i = 0; i < ndoms_cur; i++) { + cpumask_t cpumap = doms_cur[i]; + struct sched_domain *sd = NULL, *sd_prev = NULL; + + cpu = first_cpu(cpumap); + + /* Find the highest domain at which to balance shares */ + for_each_domain(cpu, sd) { + if (!(sd->flags & SD_LOAD_BALANCE)) + continue; + sd_prev = sd; + } + + sd = sd_prev; + /* sd == NULL? 
No load balance reqd in this domain */ + if (!sd) + continue; + + balanced &= rebalance_shares(sd, cpu); + } + + rcu_read_unlock(); + + unlock_doms_cur(); + unlock_cpu_hotplug(); + + if (!balanced) + timeout = sysctl_sched_min_bal_int_shares; + else if (timeout < sysctl_sched_max_bal_int_shares) + timeout *= 2; + + msleep_interruptible(timeout); + } + + return 0; +} +#endif /* CONFIG_SMP */ + /* allocate runqueue etc for a new task group */ struct task_group *sched_create_group(void) { @@ -7144,47 +7356,77 @@ done: task_rq_unlock(rq, &flags); } +/* rq->lock to be locked by caller */ static void set_se_shares(struct sched_entity *se, unsigned long shares) { struct cfs_rq *cfs_rq = se->cfs_rq; struct rq *rq = cfs_rq->rq; int on_rq; - spin_lock_irq(&rq->lock); + if (!shares) + shares = MIN_GROUP_SHARES; on_rq = se->on_rq; - if (on_rq) + if (on_rq) { dequeue_entity(cfs_rq, se, 0); + dec_cpu_load(rq, se->load.weight); + } se->load.weight = shares; se->load.inv_weight = div64_64((1ULL<<32), shares); - if (on_rq) + if (on_rq) { enqueue_entity(cfs_rq, se, 0); - - spin_unlock_irq(&rq->lock); + inc_cpu_load(rq, se->load.weight); + } } int sched_group_set_shares(struct task_group *tg, unsigned long shares) { int i; - - /* - * A weight of 0 or 1 can cause arithmetics problems. - * (The default weight is 1024 - so there's no practical - * limitation from this.) - */ - if (shares < 2) - shares = 2; + struct cfs_rq *cfs_rq; + struct rq *rq; lock_task_group_list(); if (tg->shares == shares) goto done; + if (shares < MIN_GROUP_SHARES) + shares = MIN_GROUP_SHARES; + + /* + * Prevent any load balance activity (rebalance_shares, + * load_balance_fair) from referring to this group first, + * by taking it off the rq->leaf_cfs_rq_list on each cpu. + */ + for_each_possible_cpu(i) { + cfs_rq = tg->cfs_rq[i]; + list_del_rcu(&cfs_rq->leaf_cfs_rq_list); + } + + /* wait for any ongoing reference to this group to finish */ + synchronize_sched(); + + /* + * Now we are free to modify the group's share on each cpu + * w/o tripping rebalance_share or load_balance_fair. + */ tg->shares = shares; - for_each_possible_cpu(i) + for_each_possible_cpu(i) { + spin_lock_irq(&cpu_rq(i)->lock); set_se_shares(tg->se[i], shares); + spin_unlock_irq(&cpu_rq(i)->lock); + } + /* + * Enable load balance activity on this group, by inserting it back on + * each cpu's rq->leaf_cfs_rq_list. 
+ */ + for_each_possible_cpu(i) { + rq = cpu_rq(i); + cfs_rq = tg->cfs_rq[i]; + list_add_rcu(&cfs_rq->leaf_cfs_rq_list, &rq->leaf_cfs_rq_list); + } done: unlock_task_group_list(); return 0; diff --git a/kernel/sched_fair.c b/kernel/sched_fair.c index 30ae9c2a2861..5c208e090ae4 100644 --- a/kernel/sched_fair.c +++ b/kernel/sched_fair.c @@ -707,6 +707,8 @@ static inline struct sched_entity *parent_entity(struct sched_entity *se) return se->parent; } +#define GROUP_IMBALANCE_PCT 20 + #else /* CONFIG_FAIR_GROUP_SCHED */ #define for_each_sched_entity(se) \ @@ -967,25 +969,6 @@ static struct task_struct *load_balance_next_fair(void *arg) return __load_balance_iterator(cfs_rq, cfs_rq->rb_load_balance_curr); } -#ifdef CONFIG_FAIR_GROUP_SCHED -static int cfs_rq_best_prio(struct cfs_rq *cfs_rq) -{ - struct sched_entity *curr; - struct task_struct *p; - - if (!cfs_rq->nr_running) - return MAX_PRIO; - - curr = cfs_rq->curr; - if (!curr) - curr = __pick_next_entity(cfs_rq); - - p = task_of(curr); - - return p->prio; -} -#endif - static unsigned long load_balance_fair(struct rq *this_rq, int this_cpu, struct rq *busiest, unsigned long max_load_move, @@ -995,28 +978,45 @@ load_balance_fair(struct rq *this_rq, int this_cpu, struct rq *busiest, struct cfs_rq *busy_cfs_rq; long rem_load_move = max_load_move; struct rq_iterator cfs_rq_iterator; + unsigned long load_moved; cfs_rq_iterator.start = load_balance_start_fair; cfs_rq_iterator.next = load_balance_next_fair; for_each_leaf_cfs_rq(busiest, busy_cfs_rq) { #ifdef CONFIG_FAIR_GROUP_SCHED - struct cfs_rq *this_cfs_rq; - long imbalance; - unsigned long maxload; + struct cfs_rq *this_cfs_rq = busy_cfs_rq->tg->cfs_rq[this_cpu]; + unsigned long maxload, task_load, group_weight; + unsigned long thisload, per_task_load; + struct sched_entity *se = busy_cfs_rq->tg->se[busiest->cpu]; - this_cfs_rq = cpu_cfs_rq(busy_cfs_rq, this_cpu); + task_load = busy_cfs_rq->load.weight; + group_weight = se->load.weight; - imbalance = busy_cfs_rq->load.weight - this_cfs_rq->load.weight; - /* Don't pull if this_cfs_rq has more load than busy_cfs_rq */ - if (imbalance <= 0) + /* + * 'group_weight' is contributed by tasks of total weight + * 'task_load'. To move 'rem_load_move' worth of weight only, + * we need to move a maximum task load of: + * + * maxload = (remload / group_weight) * task_load; + */ + maxload = (rem_load_move * task_load) / group_weight; + + if (!maxload || !task_load) continue; - /* Don't pull more than imbalance/2 */ - imbalance /= 2; - maxload = min(rem_load_move, imbalance); + per_task_load = task_load / busy_cfs_rq->nr_running; + /* + * balance_tasks will try to forcibly move atleast one task if + * possible (because of SCHED_LOAD_SCALE_FUZZ). Avoid that if + * maxload is less than GROUP_IMBALANCE_FUZZ% the per_task_load. + */ + if (100 * maxload < GROUP_IMBALANCE_PCT * per_task_load) + continue; - *this_best_prio = cfs_rq_best_prio(this_cfs_rq); + /* Disable priority-based load balance */ + *this_best_prio = 0; + thisload = this_cfs_rq->load.weight; #else # define maxload rem_load_move #endif @@ -1025,11 +1025,33 @@ load_balance_fair(struct rq *this_rq, int this_cpu, struct rq *busiest, * load_balance_[start|next]_fair iterators */ cfs_rq_iterator.arg = busy_cfs_rq; - rem_load_move -= balance_tasks(this_rq, this_cpu, busiest, + load_moved = balance_tasks(this_rq, this_cpu, busiest, maxload, sd, idle, all_pinned, this_best_prio, &cfs_rq_iterator); +#ifdef CONFIG_FAIR_GROUP_SCHED + /* + * load_moved holds the task load that was moved. 
The + * effective (group) weight moved would be: + * load_moved_eff = load_moved/task_load * group_weight; + */ + load_moved = (group_weight * load_moved) / task_load; + + /* Adjust shares on both cpus to reflect load_moved */ + group_weight -= load_moved; + set_se_shares(se, group_weight); + + se = busy_cfs_rq->tg->se[this_cpu]; + if (!thisload) + group_weight = load_moved; + else + group_weight = se->load.weight + load_moved; + set_se_shares(se, group_weight); +#endif + + rem_load_move -= load_moved; + if (rem_load_move <= 0) break; } diff --git a/kernel/sysctl.c b/kernel/sysctl.c index c68f68dcc605..c95f3ed34474 100644 --- a/kernel/sysctl.c +++ b/kernel/sysctl.c @@ -309,6 +309,24 @@ static struct ctl_table kern_table[] = { .mode = 644, .proc_handler = &proc_dointvec, }, +#if defined(CONFIG_FAIR_GROUP_SCHED) && defined(CONFIG_SMP) + { + .ctl_name = CTL_UNNUMBERED, + .procname = "sched_min_bal_int_shares", + .data = &sysctl_sched_min_bal_int_shares, + .maxlen = sizeof(unsigned int), + .mode = 0644, + .proc_handler = &proc_dointvec, + }, + { + .ctl_name = CTL_UNNUMBERED, + .procname = "sched_max_bal_int_shares", + .data = &sysctl_sched_max_bal_int_shares, + .maxlen = sizeof(unsigned int), + .mode = 0644, + .proc_handler = &proc_dointvec, + }, +#endif #endif { .ctl_name = CTL_UNNUMBERED, -- cgit v1.2.3 From d221938c049f4845da13c8593132595a6b9222a8 Mon Sep 17 00:00:00 2001 From: Gautham R Shenoy Date: Fri, 25 Jan 2008 21:08:01 +0100 Subject: cpu-hotplug: refcount based cpu hotplug This patch implements a Refcount + Waitqueue based model for cpu-hotplug. Now, a thread which wants to prevent cpu-hotplug, will bump up a global refcount and the thread which wants to perform a cpu-hotplug operation will block till the global refcount goes to zero. The readers, if any, during an ongoing cpu-hotplug operation are blocked until the cpu-hotplug operation is over. Signed-off-by: Gautham R Shenoy Signed-off-by: Paul Jackson [For !CONFIG_HOTPLUG_CPU ] Signed-off-by: Ingo Molnar --- include/linux/cpu.h | 3 ++ init/main.c | 1 + kernel/cpu.c | 152 ++++++++++++++++++++++++++++++++++++++-------------- 3 files changed, 115 insertions(+), 41 deletions(-) (limited to 'kernel') diff --git a/include/linux/cpu.h b/include/linux/cpu.h index 92f2029a34f3..a40247e4d462 100644 --- a/include/linux/cpu.h +++ b/include/linux/cpu.h @@ -83,6 +83,9 @@ static inline void unregister_cpu_notifier(struct notifier_block *nb) #endif /* CONFIG_SMP */ extern struct sysdev_class cpu_sysdev_class; +extern void cpu_hotplug_init(void); +extern void cpu_maps_update_begin(void); +extern void cpu_maps_update_done(void); #ifdef CONFIG_HOTPLUG_CPU /* Stop CPUs going up and down. */ diff --git a/init/main.c b/init/main.c index 80b04b6c5157..f287ca5862b9 100644 --- a/init/main.c +++ b/init/main.c @@ -607,6 +607,7 @@ asmlinkage void __init start_kernel(void) vfs_caches_init_early(); cpuset_init_early(); mem_init(); + cpu_hotplug_init(); kmem_cache_init(); setup_per_cpu_pageset(); numa_policy_init(); diff --git a/kernel/cpu.c b/kernel/cpu.c index 6b3a0c15144f..656dc3fcbbae 100644 --- a/kernel/cpu.c +++ b/kernel/cpu.c @@ -15,9 +15,8 @@ #include #include -/* This protects CPUs going up and down... 
*/ +/* Serializes the updates to cpu_online_map, cpu_present_map */ static DEFINE_MUTEX(cpu_add_remove_lock); -static DEFINE_MUTEX(cpu_bitmask_lock); static __cpuinitdata RAW_NOTIFIER_HEAD(cpu_chain); @@ -26,52 +25,123 @@ static __cpuinitdata RAW_NOTIFIER_HEAD(cpu_chain); */ static int cpu_hotplug_disabled; -#ifdef CONFIG_HOTPLUG_CPU +static struct { + struct task_struct *active_writer; + struct mutex lock; /* Synchronizes accesses to refcount, */ + /* + * Also blocks the new readers during + * an ongoing cpu hotplug operation. + */ + int refcount; + wait_queue_head_t writer_queue; +} cpu_hotplug; -/* Crappy recursive lock-takers in cpufreq! Complain loudly about idiots */ -static struct task_struct *recursive; -static int recursive_depth; +#define writer_exists() (cpu_hotplug.active_writer != NULL) + +void __init cpu_hotplug_init(void) +{ + cpu_hotplug.active_writer = NULL; + mutex_init(&cpu_hotplug.lock); + cpu_hotplug.refcount = 0; + init_waitqueue_head(&cpu_hotplug.writer_queue); +} + +#ifdef CONFIG_HOTPLUG_CPU void lock_cpu_hotplug(void) { - struct task_struct *tsk = current; - - if (tsk == recursive) { - static int warnings = 10; - if (warnings) { - printk(KERN_ERR "Lukewarm IQ detected in hotplug locking\n"); - WARN_ON(1); - warnings--; - } - recursive_depth++; + might_sleep(); + if (cpu_hotplug.active_writer == current) return; - } - mutex_lock(&cpu_bitmask_lock); - recursive = tsk; + mutex_lock(&cpu_hotplug.lock); + cpu_hotplug.refcount++; + mutex_unlock(&cpu_hotplug.lock); + } EXPORT_SYMBOL_GPL(lock_cpu_hotplug); void unlock_cpu_hotplug(void) { - WARN_ON(recursive != current); - if (recursive_depth) { - recursive_depth--; + if (cpu_hotplug.active_writer == current) return; - } - recursive = NULL; - mutex_unlock(&cpu_bitmask_lock); + mutex_lock(&cpu_hotplug.lock); + cpu_hotplug.refcount--; + + if (unlikely(writer_exists()) && !cpu_hotplug.refcount) + wake_up(&cpu_hotplug.writer_queue); + + mutex_unlock(&cpu_hotplug.lock); + } EXPORT_SYMBOL_GPL(unlock_cpu_hotplug); #endif /* CONFIG_HOTPLUG_CPU */ +/* + * The following two API's must be used when attempting + * to serialize the updates to cpu_online_map, cpu_present_map. + */ +void cpu_maps_update_begin(void) +{ + mutex_lock(&cpu_add_remove_lock); +} + +void cpu_maps_update_done(void) +{ + mutex_unlock(&cpu_add_remove_lock); +} + +/* + * This ensures that the hotplug operation can begin only when the + * refcount goes to zero. + * + * Note that during a cpu-hotplug operation, the new readers, if any, + * will be blocked by the cpu_hotplug.lock + * + * Since cpu_maps_update_begin is always called after invoking + * cpu_maps_update_begin, we can be sure that only one writer is active. + * + * Note that theoretically, there is a possibility of a livelock: + * - Refcount goes to zero, last reader wakes up the sleeping + * writer. + * - Last reader unlocks the cpu_hotplug.lock. + * - A new reader arrives at this moment, bumps up the refcount. + * - The writer acquires the cpu_hotplug.lock finds the refcount + * non zero and goes to sleep again. + * + * However, this is very difficult to achieve in practice since + * lock_cpu_hotplug() not an api which is called all that often. 
+ * + */ +static void cpu_hotplug_begin(void) +{ + DECLARE_WAITQUEUE(wait, current); + + mutex_lock(&cpu_hotplug.lock); + + cpu_hotplug.active_writer = current; + add_wait_queue_exclusive(&cpu_hotplug.writer_queue, &wait); + while (cpu_hotplug.refcount) { + set_current_state(TASK_UNINTERRUPTIBLE); + mutex_unlock(&cpu_hotplug.lock); + schedule(); + mutex_lock(&cpu_hotplug.lock); + } + remove_wait_queue_locked(&cpu_hotplug.writer_queue, &wait); +} + +static void cpu_hotplug_done(void) +{ + cpu_hotplug.active_writer = NULL; + mutex_unlock(&cpu_hotplug.lock); +} /* Need to know about CPUs going up/down? */ int __cpuinit register_cpu_notifier(struct notifier_block *nb) { int ret; - mutex_lock(&cpu_add_remove_lock); + cpu_maps_update_begin(); ret = raw_notifier_chain_register(&cpu_chain, nb); - mutex_unlock(&cpu_add_remove_lock); + cpu_maps_update_done(); return ret; } @@ -81,9 +151,9 @@ EXPORT_SYMBOL(register_cpu_notifier); void unregister_cpu_notifier(struct notifier_block *nb) { - mutex_lock(&cpu_add_remove_lock); + cpu_maps_update_begin(); raw_notifier_chain_unregister(&cpu_chain, nb); - mutex_unlock(&cpu_add_remove_lock); + cpu_maps_update_done(); } EXPORT_SYMBOL(unregister_cpu_notifier); @@ -147,6 +217,7 @@ static int _cpu_down(unsigned int cpu, int tasks_frozen) if (!cpu_online(cpu)) return -EINVAL; + cpu_hotplug_begin(); raw_notifier_call_chain(&cpu_chain, CPU_LOCK_ACQUIRE, hcpu); err = __raw_notifier_call_chain(&cpu_chain, CPU_DOWN_PREPARE | mod, hcpu, -1, &nr_calls); @@ -166,9 +237,7 @@ static int _cpu_down(unsigned int cpu, int tasks_frozen) cpu_clear(cpu, tmp); set_cpus_allowed(current, tmp); - mutex_lock(&cpu_bitmask_lock); p = __stop_machine_run(take_cpu_down, &tcd_param, cpu); - mutex_unlock(&cpu_bitmask_lock); if (IS_ERR(p) || cpu_online(cpu)) { /* CPU didn't die: tell everyone. Can't complain. */ @@ -203,6 +272,7 @@ out_allowed: set_cpus_allowed(current, old_allowed); out_release: raw_notifier_call_chain(&cpu_chain, CPU_LOCK_RELEASE, hcpu); + cpu_hotplug_done(); return err; } @@ -210,13 +280,13 @@ int cpu_down(unsigned int cpu) { int err = 0; - mutex_lock(&cpu_add_remove_lock); + cpu_maps_update_begin(); if (cpu_hotplug_disabled) err = -EBUSY; else err = _cpu_down(cpu, 0); - mutex_unlock(&cpu_add_remove_lock); + cpu_maps_update_done(); return err; } #endif /*CONFIG_HOTPLUG_CPU*/ @@ -231,6 +301,7 @@ static int __cpuinit _cpu_up(unsigned int cpu, int tasks_frozen) if (cpu_online(cpu) || !cpu_present(cpu)) return -EINVAL; + cpu_hotplug_begin(); raw_notifier_call_chain(&cpu_chain, CPU_LOCK_ACQUIRE, hcpu); ret = __raw_notifier_call_chain(&cpu_chain, CPU_UP_PREPARE | mod, hcpu, -1, &nr_calls); @@ -243,9 +314,7 @@ static int __cpuinit _cpu_up(unsigned int cpu, int tasks_frozen) } /* Arch-specific enabling code. 
*/ - mutex_lock(&cpu_bitmask_lock); ret = __cpu_up(cpu); - mutex_unlock(&cpu_bitmask_lock); if (ret != 0) goto out_notify; BUG_ON(!cpu_online(cpu)); @@ -258,6 +327,7 @@ out_notify: __raw_notifier_call_chain(&cpu_chain, CPU_UP_CANCELED | mod, hcpu, nr_calls, NULL); raw_notifier_call_chain(&cpu_chain, CPU_LOCK_RELEASE, hcpu); + cpu_hotplug_done(); return ret; } @@ -275,13 +345,13 @@ int __cpuinit cpu_up(unsigned int cpu) return -EINVAL; } - mutex_lock(&cpu_add_remove_lock); + cpu_maps_update_begin(); if (cpu_hotplug_disabled) err = -EBUSY; else err = _cpu_up(cpu, 0); - mutex_unlock(&cpu_add_remove_lock); + cpu_maps_update_done(); return err; } @@ -292,7 +362,7 @@ int disable_nonboot_cpus(void) { int cpu, first_cpu, error = 0; - mutex_lock(&cpu_add_remove_lock); + cpu_maps_update_begin(); first_cpu = first_cpu(cpu_online_map); /* We take down all of the non-boot CPUs in one shot to avoid races * with the userspace trying to use the CPU hotplug at the same time @@ -319,7 +389,7 @@ int disable_nonboot_cpus(void) } else { printk(KERN_ERR "Non-boot CPUs are not disabled\n"); } - mutex_unlock(&cpu_add_remove_lock); + cpu_maps_update_done(); return error; } @@ -328,7 +398,7 @@ void enable_nonboot_cpus(void) int cpu, error; /* Allow everyone to use the CPU hotplug again */ - mutex_lock(&cpu_add_remove_lock); + cpu_maps_update_begin(); cpu_hotplug_disabled = 0; if (cpus_empty(frozen_cpus)) goto out; @@ -344,6 +414,6 @@ void enable_nonboot_cpus(void) } cpus_clear(frozen_cpus); out: - mutex_unlock(&cpu_add_remove_lock); + cpu_maps_update_done(); } #endif /* CONFIG_PM_SLEEP_SMP */ -- cgit v1.2.3 From 86ef5c9a8edd78e6bf92879f32329d89b2d55b5a Mon Sep 17 00:00:00 2001 From: Gautham R Shenoy Date: Fri, 25 Jan 2008 21:08:02 +0100 Subject: cpu-hotplug: replace lock_cpu_hotplug() with get_online_cpus() Replace all lock_cpu_hotplug/unlock_cpu_hotplug from the kernel and use get_online_cpus and put_online_cpus instead as it highlights the refcount semantics in these operations. The new API guarantees protection against the cpu-hotplug operation, but it doesn't guarantee serialized access to any of the local data structures. Hence the changes needs to be reviewed. In case of pseries_add_processor/pseries_remove_processor, use cpu_maps_update_begin()/cpu_maps_update_done() as we're modifying the cpu_present_map there. Signed-off-by: Gautham R Shenoy Signed-off-by: Ingo Molnar --- Documentation/cpu-hotplug.txt | 11 ++++++----- arch/mips/kernel/mips-mt-fpaff.c | 10 +++++----- arch/powerpc/platforms/pseries/hotplug-cpu.c | 8 ++++---- arch/powerpc/platforms/pseries/rtasd.c | 8 ++++---- arch/x86/kernel/cpu/mtrr/main.c | 8 ++++---- arch/x86/kernel/microcode.c | 16 ++++++++-------- drivers/lguest/x86/core.c | 8 ++++---- drivers/s390/char/sclp_config.c | 4 ++-- include/linux/cpu.h | 8 ++++---- kernel/cpu.c | 10 +++++----- kernel/cpuset.c | 14 +++++++------- kernel/rcutorture.c | 6 +++--- kernel/sched.c | 4 ++-- kernel/stop_machine.c | 4 ++-- net/core/flow.c | 4 ++-- 15 files changed, 62 insertions(+), 61 deletions(-) (limited to 'kernel') diff --git a/Documentation/cpu-hotplug.txt b/Documentation/cpu-hotplug.txt index a741f658a3c9..fb94f5a71b68 100644 --- a/Documentation/cpu-hotplug.txt +++ b/Documentation/cpu-hotplug.txt @@ -109,12 +109,13 @@ Never use anything other than cpumask_t to represent bitmap of CPUs. for_each_cpu_mask(x,mask) - Iterate over some random collection of cpu mask. 
#include - lock_cpu_hotplug() and unlock_cpu_hotplug(): + get_online_cpus() and put_online_cpus(): -The above calls are used to inhibit cpu hotplug operations. While holding the -cpucontrol mutex, cpu_online_map will not change. If you merely need to avoid -cpus going away, you could also use preempt_disable() and preempt_enable() -for those sections. Just remember the critical section cannot call any +The above calls are used to inhibit cpu hotplug operations. While the +cpu_hotplug.refcount is non zero, the cpu_online_map will not change. +If you merely need to avoid cpus going away, you could also use +preempt_disable() and preempt_enable() for those sections. +Just remember the critical section cannot call any function that can sleep or schedule this process away. The preempt_disable() will work as long as stop_machine_run() is used to take a cpu down. diff --git a/arch/mips/kernel/mips-mt-fpaff.c b/arch/mips/kernel/mips-mt-fpaff.c index 892665bb12b1..bb4f00c0cbe9 100644 --- a/arch/mips/kernel/mips-mt-fpaff.c +++ b/arch/mips/kernel/mips-mt-fpaff.c @@ -58,13 +58,13 @@ asmlinkage long mipsmt_sys_sched_setaffinity(pid_t pid, unsigned int len, if (copy_from_user(&new_mask, user_mask_ptr, sizeof(new_mask))) return -EFAULT; - lock_cpu_hotplug(); + get_online_cpus(); read_lock(&tasklist_lock); p = find_process_by_pid(pid); if (!p) { read_unlock(&tasklist_lock); - unlock_cpu_hotplug(); + put_online_cpus(); return -ESRCH; } @@ -106,7 +106,7 @@ asmlinkage long mipsmt_sys_sched_setaffinity(pid_t pid, unsigned int len, out_unlock: put_task_struct(p); - unlock_cpu_hotplug(); + put_online_cpus(); return retval; } @@ -125,7 +125,7 @@ asmlinkage long mipsmt_sys_sched_getaffinity(pid_t pid, unsigned int len, if (len < real_len) return -EINVAL; - lock_cpu_hotplug(); + get_online_cpus(); read_lock(&tasklist_lock); retval = -ESRCH; @@ -140,7 +140,7 @@ asmlinkage long mipsmt_sys_sched_getaffinity(pid_t pid, unsigned int len, out_unlock: read_unlock(&tasklist_lock); - unlock_cpu_hotplug(); + put_online_cpus(); if (retval) return retval; if (copy_to_user(user_mask_ptr, &mask, real_len)) diff --git a/arch/powerpc/platforms/pseries/hotplug-cpu.c b/arch/powerpc/platforms/pseries/hotplug-cpu.c index 412e6b42986f..c4ad54e0f288 100644 --- a/arch/powerpc/platforms/pseries/hotplug-cpu.c +++ b/arch/powerpc/platforms/pseries/hotplug-cpu.c @@ -153,7 +153,7 @@ static int pseries_add_processor(struct device_node *np) for (i = 0; i < nthreads; i++) cpu_set(i, tmp); - lock_cpu_hotplug(); + cpu_maps_update_begin(); BUG_ON(!cpus_subset(cpu_present_map, cpu_possible_map)); @@ -190,7 +190,7 @@ static int pseries_add_processor(struct device_node *np) } err = 0; out_unlock: - unlock_cpu_hotplug(); + cpu_maps_update_done(); return err; } @@ -211,7 +211,7 @@ static void pseries_remove_processor(struct device_node *np) nthreads = len / sizeof(u32); - lock_cpu_hotplug(); + cpu_maps_update_begin(); for (i = 0; i < nthreads; i++) { for_each_present_cpu(cpu) { if (get_hard_smp_processor_id(cpu) != intserv[i]) @@ -225,7 +225,7 @@ static void pseries_remove_processor(struct device_node *np) printk(KERN_WARNING "Could not find cpu to remove " "with physical id 0x%x\n", intserv[i]); } - unlock_cpu_hotplug(); + cpu_maps_update_done(); } static int pseries_smp_notifier(struct notifier_block *nb, diff --git a/arch/powerpc/platforms/pseries/rtasd.c b/arch/powerpc/platforms/pseries/rtasd.c index 73401c820110..e3078ce41518 100644 --- a/arch/powerpc/platforms/pseries/rtasd.c +++ b/arch/powerpc/platforms/pseries/rtasd.c @@ -382,7 +382,7 @@ static 
void do_event_scan_all_cpus(long delay) { int cpu; - lock_cpu_hotplug(); + get_online_cpus(); cpu = first_cpu(cpu_online_map); for (;;) { set_cpus_allowed(current, cpumask_of_cpu(cpu)); @@ -390,15 +390,15 @@ static void do_event_scan_all_cpus(long delay) set_cpus_allowed(current, CPU_MASK_ALL); /* Drop hotplug lock, and sleep for the specified delay */ - unlock_cpu_hotplug(); + put_online_cpus(); msleep_interruptible(delay); - lock_cpu_hotplug(); + get_online_cpus(); cpu = next_cpu(cpu, cpu_online_map); if (cpu == NR_CPUS) break; } - unlock_cpu_hotplug(); + put_online_cpus(); } static int rtasd(void *unused) diff --git a/arch/x86/kernel/cpu/mtrr/main.c b/arch/x86/kernel/cpu/mtrr/main.c index 3b20613325dc..beb45c9c0835 100644 --- a/arch/x86/kernel/cpu/mtrr/main.c +++ b/arch/x86/kernel/cpu/mtrr/main.c @@ -349,7 +349,7 @@ int mtrr_add_page(unsigned long base, unsigned long size, replace = -1; /* No CPU hotplug when we change MTRR entries */ - lock_cpu_hotplug(); + get_online_cpus(); /* Search for existing MTRR */ mutex_lock(&mtrr_mutex); for (i = 0; i < num_var_ranges; ++i) { @@ -405,7 +405,7 @@ int mtrr_add_page(unsigned long base, unsigned long size, error = i; out: mutex_unlock(&mtrr_mutex); - unlock_cpu_hotplug(); + put_online_cpus(); return error; } @@ -495,7 +495,7 @@ int mtrr_del_page(int reg, unsigned long base, unsigned long size) max = num_var_ranges; /* No CPU hotplug when we change MTRR entries */ - lock_cpu_hotplug(); + get_online_cpus(); mutex_lock(&mtrr_mutex); if (reg < 0) { /* Search for existing MTRR */ @@ -536,7 +536,7 @@ int mtrr_del_page(int reg, unsigned long base, unsigned long size) error = reg; out: mutex_unlock(&mtrr_mutex); - unlock_cpu_hotplug(); + put_online_cpus(); return error; } /** diff --git a/arch/x86/kernel/microcode.c b/arch/x86/kernel/microcode.c index 09c315214a5e..40cfd5488719 100644 --- a/arch/x86/kernel/microcode.c +++ b/arch/x86/kernel/microcode.c @@ -436,7 +436,7 @@ static ssize_t microcode_write (struct file *file, const char __user *buf, size_ return -EINVAL; } - lock_cpu_hotplug(); + get_online_cpus(); mutex_lock(µcode_mutex); user_buffer = (void __user *) buf; @@ -447,7 +447,7 @@ static ssize_t microcode_write (struct file *file, const char __user *buf, size_ ret = (ssize_t)len; mutex_unlock(µcode_mutex); - unlock_cpu_hotplug(); + put_online_cpus(); return ret; } @@ -658,14 +658,14 @@ static ssize_t reload_store(struct sys_device *dev, const char *buf, size_t sz) old = current->cpus_allowed; - lock_cpu_hotplug(); + get_online_cpus(); set_cpus_allowed(current, cpumask_of_cpu(cpu)); mutex_lock(µcode_mutex); if (uci->valid) err = cpu_request_microcode(cpu); mutex_unlock(µcode_mutex); - unlock_cpu_hotplug(); + put_online_cpus(); set_cpus_allowed(current, old); } if (err) @@ -817,9 +817,9 @@ static int __init microcode_init (void) return PTR_ERR(microcode_pdev); } - lock_cpu_hotplug(); + get_online_cpus(); error = sysdev_driver_register(&cpu_sysdev_class, &mc_sysdev_driver); - unlock_cpu_hotplug(); + put_online_cpus(); if (error) { microcode_dev_exit(); platform_device_unregister(microcode_pdev); @@ -839,9 +839,9 @@ static void __exit microcode_exit (void) unregister_hotcpu_notifier(&mc_cpu_notifier); - lock_cpu_hotplug(); + get_online_cpus(); sysdev_driver_unregister(&cpu_sysdev_class, &mc_sysdev_driver); - unlock_cpu_hotplug(); + put_online_cpus(); platform_device_unregister(microcode_pdev); } diff --git a/drivers/lguest/x86/core.c b/drivers/lguest/x86/core.c index 482aec2a9631..96d0fd07c57d 100644 --- a/drivers/lguest/x86/core.c +++ 
b/drivers/lguest/x86/core.c @@ -459,7 +459,7 @@ void __init lguest_arch_host_init(void) /* We don't need the complexity of CPUs coming and going while we're * doing this. */ - lock_cpu_hotplug(); + get_online_cpus(); if (cpu_has_pge) { /* We have a broader idea of "global". */ /* Remember that this was originally set (for cleanup). */ cpu_had_pge = 1; @@ -469,20 +469,20 @@ void __init lguest_arch_host_init(void) /* Turn off the feature in the global feature set. */ clear_bit(X86_FEATURE_PGE, boot_cpu_data.x86_capability); } - unlock_cpu_hotplug(); + put_online_cpus(); }; /*:*/ void __exit lguest_arch_host_fini(void) { /* If we had PGE before we started, turn it back on now. */ - lock_cpu_hotplug(); + get_online_cpus(); if (cpu_had_pge) { set_bit(X86_FEATURE_PGE, boot_cpu_data.x86_capability); /* adjust_pge's argument "1" means set PGE. */ on_each_cpu(adjust_pge, (void *)1, 0, 1); } - unlock_cpu_hotplug(); + put_online_cpus(); } diff --git a/drivers/s390/char/sclp_config.c b/drivers/s390/char/sclp_config.c index 5322e5e54a98..9dc77f14fa52 100644 --- a/drivers/s390/char/sclp_config.c +++ b/drivers/s390/char/sclp_config.c @@ -29,12 +29,12 @@ static void sclp_cpu_capability_notify(struct work_struct *work) struct sys_device *sysdev; printk(KERN_WARNING TAG "cpu capability changed.\n"); - lock_cpu_hotplug(); + get_online_cpus(); for_each_online_cpu(cpu) { sysdev = get_cpu_sysdev(cpu); kobject_uevent(&sysdev->kobj, KOBJ_CHANGE); } - unlock_cpu_hotplug(); + put_online_cpus(); } static void sclp_conf_receiver_fn(struct evbuf_header *evbuf) diff --git a/include/linux/cpu.h b/include/linux/cpu.h index a40247e4d462..3a3ff1c5cbef 100644 --- a/include/linux/cpu.h +++ b/include/linux/cpu.h @@ -100,8 +100,8 @@ static inline void cpuhotplug_mutex_unlock(struct mutex *cpu_hp_mutex) mutex_unlock(cpu_hp_mutex); } -extern void lock_cpu_hotplug(void); -extern void unlock_cpu_hotplug(void); +extern void get_online_cpus(void); +extern void put_online_cpus(void); #define hotcpu_notifier(fn, pri) { \ static struct notifier_block fn##_nb = \ { .notifier_call = fn, .priority = pri }; \ @@ -118,8 +118,8 @@ static inline void cpuhotplug_mutex_lock(struct mutex *cpu_hp_mutex) static inline void cpuhotplug_mutex_unlock(struct mutex *cpu_hp_mutex) { } -#define lock_cpu_hotplug() do { } while (0) -#define unlock_cpu_hotplug() do { } while (0) +#define get_online_cpus() do { } while (0) +#define put_online_cpus() do { } while (0) #define hotcpu_notifier(fn, pri) do { (void)(fn); } while (0) /* These aren't inline functions due to a GCC bug. */ #define register_hotcpu_notifier(nb) ({ (void)(nb); 0; }) diff --git a/kernel/cpu.c b/kernel/cpu.c index 656dc3fcbbae..b0c4152995f8 100644 --- a/kernel/cpu.c +++ b/kernel/cpu.c @@ -48,7 +48,7 @@ void __init cpu_hotplug_init(void) #ifdef CONFIG_HOTPLUG_CPU -void lock_cpu_hotplug(void) +void get_online_cpus(void) { might_sleep(); if (cpu_hotplug.active_writer == current) @@ -58,9 +58,9 @@ void lock_cpu_hotplug(void) mutex_unlock(&cpu_hotplug.lock); } -EXPORT_SYMBOL_GPL(lock_cpu_hotplug); +EXPORT_SYMBOL_GPL(get_online_cpus); -void unlock_cpu_hotplug(void) +void put_online_cpus(void) { if (cpu_hotplug.active_writer == current) return; @@ -73,7 +73,7 @@ void unlock_cpu_hotplug(void) mutex_unlock(&cpu_hotplug.lock); } -EXPORT_SYMBOL_GPL(unlock_cpu_hotplug); +EXPORT_SYMBOL_GPL(put_online_cpus); #endif /* CONFIG_HOTPLUG_CPU */ @@ -110,7 +110,7 @@ void cpu_maps_update_done(void) * non zero and goes to sleep again. 
* * However, this is very difficult to achieve in practice since - * lock_cpu_hotplug() not an api which is called all that often. + * get_online_cpus() not an api which is called all that often. * */ static void cpu_hotplug_begin(void) diff --git a/kernel/cpuset.c b/kernel/cpuset.c index 50f5dc463688..cfaf6419d817 100644 --- a/kernel/cpuset.c +++ b/kernel/cpuset.c @@ -537,10 +537,10 @@ static int cpusets_overlap(struct cpuset *a, struct cpuset *b) * * Call with cgroup_mutex held. May take callback_mutex during * call due to the kfifo_alloc() and kmalloc() calls. May nest - * a call to the lock_cpu_hotplug()/unlock_cpu_hotplug() pair. + * a call to the get_online_cpus()/put_online_cpus() pair. * Must not be called holding callback_mutex, because we must not - * call lock_cpu_hotplug() while holding callback_mutex. Elsewhere - * the kernel nests callback_mutex inside lock_cpu_hotplug() calls. + * call get_online_cpus() while holding callback_mutex. Elsewhere + * the kernel nests callback_mutex inside get_online_cpus() calls. * So the reverse nesting would risk an ABBA deadlock. * * The three key local variables below are: @@ -691,9 +691,9 @@ restart: rebuild: /* Have scheduler rebuild sched domains */ - lock_cpu_hotplug(); + get_online_cpus(); partition_sched_domains(ndoms, doms); - unlock_cpu_hotplug(); + put_online_cpus(); done: if (q && !IS_ERR(q)) @@ -1617,10 +1617,10 @@ static struct cgroup_subsys_state *cpuset_create( * * If the cpuset being removed has its flag 'sched_load_balance' * enabled, then simulate turning sched_load_balance off, which - * will call rebuild_sched_domains(). The lock_cpu_hotplug() + * will call rebuild_sched_domains(). The get_online_cpus() * call in rebuild_sched_domains() must not be made while holding * callback_mutex. Elsewhere the kernel nests callback_mutex inside - * lock_cpu_hotplug() calls. So the reverse nesting would risk an + * get_online_cpus() calls. So the reverse nesting would risk an * ABBA deadlock. */ diff --git a/kernel/rcutorture.c b/kernel/rcutorture.c index c3e165c2318f..fd599829e72a 100644 --- a/kernel/rcutorture.c +++ b/kernel/rcutorture.c @@ -726,11 +726,11 @@ static void rcu_torture_shuffle_tasks(void) cpumask_t tmp_mask = CPU_MASK_ALL; int i; - lock_cpu_hotplug(); + get_online_cpus(); /* No point in shuffling if there is only one online CPU (ex: UP) */ if (num_online_cpus() == 1) { - unlock_cpu_hotplug(); + put_online_cpus(); return; } @@ -762,7 +762,7 @@ static void rcu_torture_shuffle_tasks(void) else rcu_idle_cpu--; - unlock_cpu_hotplug(); + put_online_cpus(); } /* Shuffle tasks across CPUs, with the intent of allowing each CPU in the diff --git a/kernel/sched.c b/kernel/sched.c index 86e55a9c2de6..672aa68bfeac 100644 --- a/kernel/sched.c +++ b/kernel/sched.c @@ -7152,7 +7152,7 @@ static int load_balance_monitor(void *unused) int i, cpu, balanced = 1; /* Prevent cpus going down or coming up */ - lock_cpu_hotplug(); + get_online_cpus(); /* lockout changes to doms_cur[] array */ lock_doms_cur(); /* @@ -7186,7 +7186,7 @@ static int load_balance_monitor(void *unused) rcu_read_unlock(); unlock_doms_cur(); - unlock_cpu_hotplug(); + put_online_cpus(); if (!balanced) timeout = sysctl_sched_min_bal_int_shares; diff --git a/kernel/stop_machine.c b/kernel/stop_machine.c index 319821ef78af..51b5ee53571a 100644 --- a/kernel/stop_machine.c +++ b/kernel/stop_machine.c @@ -203,13 +203,13 @@ int stop_machine_run(int (*fn)(void *), void *data, unsigned int cpu) int ret; /* No CPUs can come up or down during this. 
*/ - lock_cpu_hotplug(); + get_online_cpus(); p = __stop_machine_run(fn, data, cpu); if (!IS_ERR(p)) ret = kthread_stop(p); else ret = PTR_ERR(p); - unlock_cpu_hotplug(); + put_online_cpus(); return ret; } diff --git a/net/core/flow.c b/net/core/flow.c index 3ed2b4b1d6d4..6489f4e24ecf 100644 --- a/net/core/flow.c +++ b/net/core/flow.c @@ -293,7 +293,7 @@ void flow_cache_flush(void) static DEFINE_MUTEX(flow_flush_sem); /* Don't want cpus going down or up during this. */ - lock_cpu_hotplug(); + get_online_cpus(); mutex_lock(&flow_flush_sem); atomic_set(&info.cpuleft, num_online_cpus()); init_completion(&info.completion); @@ -305,7 +305,7 @@ void flow_cache_flush(void) wait_for_completion(&info.completion); mutex_unlock(&flow_flush_sem); - unlock_cpu_hotplug(); + put_online_cpus(); } static void __devinit flow_cache_cpu_prepare(int cpu) -- cgit v1.2.3 From 95402b3829010fe1e208f44e4a158ccade88969a Mon Sep 17 00:00:00 2001 From: Gautham R Shenoy Date: Fri, 25 Jan 2008 21:08:02 +0100 Subject: cpu-hotplug: replace per-subsystem mutexes with get_online_cpus() This patch converts the known per-subsystem mutexes to get_online_cpus put_online_cpus. It also eliminates the CPU_LOCK_ACQUIRE and CPU_LOCK_RELEASE hotplug notification events. Signed-off-by: Gautham R Shenoy Signed-off-by: Ingo Molnar --- include/linux/notifier.h | 4 +--- kernel/cpu.c | 4 ---- kernel/sched.c | 25 +++++++++---------------- kernel/workqueue.c | 35 +++++++++++++++-------------------- mm/slab.c | 18 +++++++++++------- 5 files changed, 36 insertions(+), 50 deletions(-) (limited to 'kernel') diff --git a/include/linux/notifier.h b/include/linux/notifier.h index 0c40cc0b4a36..5dfbc684ce7d 100644 --- a/include/linux/notifier.h +++ b/include/linux/notifier.h @@ -207,9 +207,7 @@ static inline int notifier_to_errno(int ret) #define CPU_DOWN_PREPARE 0x0005 /* CPU (unsigned)v going down */ #define CPU_DOWN_FAILED 0x0006 /* CPU (unsigned)v NOT going down */ #define CPU_DEAD 0x0007 /* CPU (unsigned)v dead */ -#define CPU_LOCK_ACQUIRE 0x0008 /* Acquire all hotcpu locks */ -#define CPU_LOCK_RELEASE 0x0009 /* Release all hotcpu locks */ -#define CPU_DYING 0x000A /* CPU (unsigned)v not running any task, +#define CPU_DYING 0x0008 /* CPU (unsigned)v not running any task, * not handling interrupts, soon dead */ /* Used for CPU hotplug events occuring while tasks are frozen due to a suspend diff --git a/kernel/cpu.c b/kernel/cpu.c index b0c4152995f8..e0d3a4f56ecb 100644 --- a/kernel/cpu.c +++ b/kernel/cpu.c @@ -218,7 +218,6 @@ static int _cpu_down(unsigned int cpu, int tasks_frozen) return -EINVAL; cpu_hotplug_begin(); - raw_notifier_call_chain(&cpu_chain, CPU_LOCK_ACQUIRE, hcpu); err = __raw_notifier_call_chain(&cpu_chain, CPU_DOWN_PREPARE | mod, hcpu, -1, &nr_calls); if (err == NOTIFY_BAD) { @@ -271,7 +270,6 @@ out_thread: out_allowed: set_cpus_allowed(current, old_allowed); out_release: - raw_notifier_call_chain(&cpu_chain, CPU_LOCK_RELEASE, hcpu); cpu_hotplug_done(); return err; } @@ -302,7 +300,6 @@ static int __cpuinit _cpu_up(unsigned int cpu, int tasks_frozen) return -EINVAL; cpu_hotplug_begin(); - raw_notifier_call_chain(&cpu_chain, CPU_LOCK_ACQUIRE, hcpu); ret = __raw_notifier_call_chain(&cpu_chain, CPU_UP_PREPARE | mod, hcpu, -1, &nr_calls); if (ret == NOTIFY_BAD) { @@ -326,7 +323,6 @@ out_notify: if (ret != 0) __raw_notifier_call_chain(&cpu_chain, CPU_UP_CANCELED | mod, hcpu, nr_calls, NULL); - raw_notifier_call_chain(&cpu_chain, CPU_LOCK_RELEASE, hcpu); cpu_hotplug_done(); return ret; diff --git a/kernel/sched.c b/kernel/sched.c 
index 672aa68bfeac..c0e2db683e29 100644 --- a/kernel/sched.c +++ b/kernel/sched.c @@ -439,7 +439,6 @@ struct rq { }; static DEFINE_PER_CPU_SHARED_ALIGNED(struct rq, runqueues); -static DEFINE_MUTEX(sched_hotcpu_mutex); static inline void check_preempt_curr(struct rq *rq, struct task_struct *p) { @@ -4546,13 +4545,13 @@ long sched_setaffinity(pid_t pid, cpumask_t new_mask) struct task_struct *p; int retval; - mutex_lock(&sched_hotcpu_mutex); + get_online_cpus(); read_lock(&tasklist_lock); p = find_process_by_pid(pid); if (!p) { read_unlock(&tasklist_lock); - mutex_unlock(&sched_hotcpu_mutex); + put_online_cpus(); return -ESRCH; } @@ -4592,7 +4591,7 @@ long sched_setaffinity(pid_t pid, cpumask_t new_mask) } out_unlock: put_task_struct(p); - mutex_unlock(&sched_hotcpu_mutex); + put_online_cpus(); return retval; } @@ -4649,7 +4648,7 @@ long sched_getaffinity(pid_t pid, cpumask_t *mask) struct task_struct *p; int retval; - mutex_lock(&sched_hotcpu_mutex); + get_online_cpus(); read_lock(&tasklist_lock); retval = -ESRCH; @@ -4665,7 +4664,7 @@ long sched_getaffinity(pid_t pid, cpumask_t *mask) out_unlock: read_unlock(&tasklist_lock); - mutex_unlock(&sched_hotcpu_mutex); + put_online_cpus(); return retval; } @@ -5625,9 +5624,6 @@ migration_call(struct notifier_block *nfb, unsigned long action, void *hcpu) struct rq *rq; switch (action) { - case CPU_LOCK_ACQUIRE: - mutex_lock(&sched_hotcpu_mutex); - break; case CPU_UP_PREPARE: case CPU_UP_PREPARE_FROZEN: @@ -5697,9 +5693,6 @@ migration_call(struct notifier_block *nfb, unsigned long action, void *hcpu) spin_unlock_irq(&rq->lock); break; #endif - case CPU_LOCK_RELEASE: - mutex_unlock(&sched_hotcpu_mutex); - break; } return NOTIFY_OK; } @@ -6655,10 +6648,10 @@ static int arch_reinit_sched_domains(void) { int err; - mutex_lock(&sched_hotcpu_mutex); + get_online_cpus(); detach_destroy_domains(&cpu_online_map); err = arch_init_sched_domains(&cpu_online_map); - mutex_unlock(&sched_hotcpu_mutex); + put_online_cpus(); return err; } @@ -6769,12 +6762,12 @@ void __init sched_init_smp(void) { cpumask_t non_isolated_cpus; - mutex_lock(&sched_hotcpu_mutex); + get_online_cpus(); arch_init_sched_domains(&cpu_online_map); cpus_andnot(non_isolated_cpus, cpu_possible_map, cpu_isolated_map); if (cpus_empty(non_isolated_cpus)) cpu_set(smp_processor_id(), non_isolated_cpus); - mutex_unlock(&sched_hotcpu_mutex); + put_online_cpus(); /* XXX: Theoretical race here - CPU may be hotplugged now */ hotcpu_notifier(update_sched_domains, 0); diff --git a/kernel/workqueue.c b/kernel/workqueue.c index 8db0b597509e..52db48e7f6e7 100644 --- a/kernel/workqueue.c +++ b/kernel/workqueue.c @@ -67,9 +67,8 @@ struct workqueue_struct { #endif }; -/* All the per-cpu workqueues on the system, for hotplug cpu to add/remove - threads to each one as cpus come/go. */ -static DEFINE_MUTEX(workqueue_mutex); +/* Serializes the accesses to the list of workqueues. */ +static DEFINE_SPINLOCK(workqueue_lock); static LIST_HEAD(workqueues); static int singlethread_cpu __read_mostly; @@ -592,8 +591,6 @@ EXPORT_SYMBOL(schedule_delayed_work_on); * Returns zero on success. * Returns -ve errno on failure. * - * Appears to be racy against CPU hotplug. - * * schedule_on_each_cpu() is very slow. 
*/ int schedule_on_each_cpu(work_func_t func) @@ -605,7 +602,7 @@ int schedule_on_each_cpu(work_func_t func) if (!works) return -ENOMEM; - preempt_disable(); /* CPU hotplug */ + get_online_cpus(); for_each_online_cpu(cpu) { struct work_struct *work = per_cpu_ptr(works, cpu); @@ -613,8 +610,8 @@ int schedule_on_each_cpu(work_func_t func) set_bit(WORK_STRUCT_PENDING, work_data_bits(work)); __queue_work(per_cpu_ptr(keventd_wq->cpu_wq, cpu), work); } - preempt_enable(); flush_workqueue(keventd_wq); + put_online_cpus(); free_percpu(works); return 0; } @@ -750,8 +747,10 @@ struct workqueue_struct *__create_workqueue_key(const char *name, err = create_workqueue_thread(cwq, singlethread_cpu); start_workqueue_thread(cwq, -1); } else { - mutex_lock(&workqueue_mutex); + get_online_cpus(); + spin_lock(&workqueue_lock); list_add(&wq->list, &workqueues); + spin_unlock(&workqueue_lock); for_each_possible_cpu(cpu) { cwq = init_cpu_workqueue(wq, cpu); @@ -760,7 +759,7 @@ struct workqueue_struct *__create_workqueue_key(const char *name, err = create_workqueue_thread(cwq, cpu); start_workqueue_thread(cwq, cpu); } - mutex_unlock(&workqueue_mutex); + put_online_cpus(); } if (err) { @@ -775,7 +774,7 @@ static void cleanup_workqueue_thread(struct cpu_workqueue_struct *cwq, int cpu) { /* * Our caller is either destroy_workqueue() or CPU_DEAD, - * workqueue_mutex protects cwq->thread + * get_online_cpus() protects cwq->thread. */ if (cwq->thread == NULL) return; @@ -810,9 +809,11 @@ void destroy_workqueue(struct workqueue_struct *wq) struct cpu_workqueue_struct *cwq; int cpu; - mutex_lock(&workqueue_mutex); + get_online_cpus(); + spin_lock(&workqueue_lock); list_del(&wq->list); - mutex_unlock(&workqueue_mutex); + spin_unlock(&workqueue_lock); + put_online_cpus(); for_each_cpu_mask(cpu, *cpu_map) { cwq = per_cpu_ptr(wq->cpu_wq, cpu); @@ -835,13 +836,6 @@ static int __devinit workqueue_cpu_callback(struct notifier_block *nfb, action &= ~CPU_TASKS_FROZEN; switch (action) { - case CPU_LOCK_ACQUIRE: - mutex_lock(&workqueue_mutex); - return NOTIFY_OK; - - case CPU_LOCK_RELEASE: - mutex_unlock(&workqueue_mutex); - return NOTIFY_OK; case CPU_UP_PREPARE: cpu_set(cpu, cpu_populated_map); @@ -854,7 +848,8 @@ static int __devinit workqueue_cpu_callback(struct notifier_block *nfb, case CPU_UP_PREPARE: if (!create_workqueue_thread(cwq, cpu)) break; - printk(KERN_ERR "workqueue for %i failed\n", cpu); + printk(KERN_ERR "workqueue [%s] for %i failed\n", + wq->name, cpu); return NOTIFY_BAD; case CPU_ONLINE: diff --git a/mm/slab.c b/mm/slab.c index ff31261fd24f..40c00dacbe4b 100644 --- a/mm/slab.c +++ b/mm/slab.c @@ -730,8 +730,7 @@ static inline void init_lock_keys(void) #endif /* - * 1. Guard access to the cache-chain. - * 2. Protect sanity of cpu_online_map against cpu hotplug events + * Guard access to the cache-chain. 
*/ static DEFINE_MUTEX(cache_chain_mutex); static struct list_head cache_chain; @@ -1331,12 +1330,11 @@ static int __cpuinit cpuup_callback(struct notifier_block *nfb, int err = 0; switch (action) { - case CPU_LOCK_ACQUIRE: - mutex_lock(&cache_chain_mutex); - break; case CPU_UP_PREPARE: case CPU_UP_PREPARE_FROZEN: + mutex_lock(&cache_chain_mutex); err = cpuup_prepare(cpu); + mutex_unlock(&cache_chain_mutex); break; case CPU_ONLINE: case CPU_ONLINE_FROZEN: @@ -1373,9 +1371,8 @@ static int __cpuinit cpuup_callback(struct notifier_block *nfb, #endif case CPU_UP_CANCELED: case CPU_UP_CANCELED_FROZEN: + mutex_lock(&cache_chain_mutex); cpuup_canceled(cpu); - break; - case CPU_LOCK_RELEASE: mutex_unlock(&cache_chain_mutex); break; } @@ -2170,6 +2167,7 @@ kmem_cache_create (const char *name, size_t size, size_t align, * We use cache_chain_mutex to ensure a consistent view of * cpu_online_map as well. Please see cpuup_callback */ + get_online_cpus(); mutex_lock(&cache_chain_mutex); list_for_each_entry(pc, &cache_chain, next) { @@ -2396,6 +2394,7 @@ oops: panic("kmem_cache_create(): failed to create slab `%s'\n", name); mutex_unlock(&cache_chain_mutex); + put_online_cpus(); return cachep; } EXPORT_SYMBOL(kmem_cache_create); @@ -2547,9 +2546,11 @@ int kmem_cache_shrink(struct kmem_cache *cachep) int ret; BUG_ON(!cachep || in_interrupt()); + get_online_cpus(); mutex_lock(&cache_chain_mutex); ret = __cache_shrink(cachep); mutex_unlock(&cache_chain_mutex); + put_online_cpus(); return ret; } EXPORT_SYMBOL(kmem_cache_shrink); @@ -2575,6 +2576,7 @@ void kmem_cache_destroy(struct kmem_cache *cachep) BUG_ON(!cachep || in_interrupt()); /* Find the cache in the chain of caches. */ + get_online_cpus(); mutex_lock(&cache_chain_mutex); /* * the chain is never empty, cache_cache is never destroyed @@ -2584,6 +2586,7 @@ void kmem_cache_destroy(struct kmem_cache *cachep) slab_error(cachep, "Can't free all objects"); list_add(&cachep->next, &cache_chain); mutex_unlock(&cache_chain_mutex); + put_online_cpus(); return; } @@ -2592,6 +2595,7 @@ void kmem_cache_destroy(struct kmem_cache *cachep) __kmem_cache_destroy(cachep); mutex_unlock(&cache_chain_mutex); + put_online_cpus(); } EXPORT_SYMBOL(kmem_cache_destroy); -- cgit v1.2.3 From 82a1fcb90287052aabfa235e7ffc693ea003fe69 Mon Sep 17 00:00:00 2001 From: Ingo Molnar Date: Fri, 25 Jan 2008 21:08:02 +0100 Subject: softlockup: automatically detect hung TASK_UNINTERRUPTIBLE tasks this patch extends the soft-lockup detector to automatically detect hung TASK_UNINTERRUPTIBLE tasks. Such hung tasks are printed the following way: ------------------> INFO: task prctl:3042 blocked for more than 120 seconds. "echo 0 > /proc/sys/kernel/hung_task_timeout_secs" disables this message prctl D fd5e3793 0 3042 2997 f6050f38 00000046 00000001 fd5e3793 00000009 c06d8264 c06dae80 00000286 f6050f40 f6050f00 f7d34d90 f7d34fc8 c1e1be80 00000001 f6050000 00000000 f7e92d00 00000286 f6050f18 c0489d1a f6050f40 00006605 00000000 c0133a5b Call Trace: [] schedule_timeout+0x6d/0x8b [] schedule_timeout_uninterruptible+0x15/0x17 [] msleep+0x10/0x16 [] sys_prctl+0x30/0x1e2 [] sysenter_past_esp+0x5f/0xa5 ======================= 2 locks held by prctl/3042: #0: (&sb->s_type->i_mutex_key#5){--..}, at: [] do_fsync+0x38/0x7a #1: (jbd_handle){--..}, at: [] journal_start+0xc7/0xe9 <------------------ the current default timeout is 120 seconds. Such messages are printed up to 10 times per bootup. If the system has crashed already then the messages are not printed. 
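(A rough user-space sketch of the detection rule behind these messages, using invented names toy_task and task_is_hung rather than the actual kernel code in the kernel/softlockup.c hunks further below: a task is reported once its context-switch count has stopped changing for longer than the configured timeout.)

/* Sketch only: simplified, stand-alone model of the hung-task check. */
#include <stdio.h>

struct toy_task {
	unsigned long nvcsw, nivcsw;         /* voluntary/involuntary switches */
	unsigned long last_switch_count;     /* counter value at last check */
	unsigned long last_switch_timestamp; /* seconds when it last changed */
};

/* Return 1 when the task has not been scheduled for timeout_secs or more. */
static int task_is_hung(struct toy_task *t, unsigned long now,
			unsigned long timeout_secs)
{
	unsigned long switch_count = t->nvcsw + t->nivcsw;

	if (switch_count != t->last_switch_count || !t->last_switch_timestamp) {
		/* the task made progress (or this is the first look): no warning */
		t->last_switch_count = switch_count;
		t->last_switch_timestamp = now;
		return 0;
	}
	return (long)(now - t->last_switch_timestamp) >= (long)timeout_secs;
}

int main(void)
{
	struct toy_task t = { .nvcsw = 5, .nivcsw = 2 };

	printf("%d\n", task_is_hung(&t, 10, 120));  /* baseline taken: prints 0 */
	printf("%d\n", task_is_hung(&t, 200, 120)); /* 190s without a switch: prints 1 */
	return 0;
}
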
if lockdep is enabled then all held locks are printed as well. this feature is a natural extension to the softlockup-detector (kernel locked up without scheduling) and to the NMI watchdog (kernel locked up with IRQs disabled). [ Gautham R Shenoy : CPU hotplug fixes. ] [ Andrew Morton : build warning fix. ] Signed-off-by: Ingo Molnar Signed-off-by: Arjan van de Ven --- include/linux/debug_locks.h | 5 ++ include/linux/sched.h | 10 ++++ kernel/fork.c | 5 ++ kernel/lockdep.c | 12 ++++- kernel/sched.c | 4 +- kernel/softlockup.c | 114 ++++++++++++++++++++++++++++++++++++++++---- kernel/sysctl.c | 27 +++++++++++ 7 files changed, 164 insertions(+), 13 deletions(-) (limited to 'kernel') diff --git a/include/linux/debug_locks.h b/include/linux/debug_locks.h index 1678a5de7013..f4a5871767f5 100644 --- a/include/linux/debug_locks.h +++ b/include/linux/debug_locks.h @@ -47,6 +47,7 @@ struct task_struct; #ifdef CONFIG_LOCKDEP extern void debug_show_all_locks(void); +extern void __debug_show_held_locks(struct task_struct *task); extern void debug_show_held_locks(struct task_struct *task); extern void debug_check_no_locks_freed(const void *from, unsigned long len); extern void debug_check_no_locks_held(struct task_struct *task); @@ -55,6 +56,10 @@ static inline void debug_show_all_locks(void) { } +static inline void __debug_show_held_locks(struct task_struct *task) +{ +} + static inline void debug_show_held_locks(struct task_struct *task) { } diff --git a/include/linux/sched.h b/include/linux/sched.h index 288245f83bd4..0846f1f9e196 100644 --- a/include/linux/sched.h +++ b/include/linux/sched.h @@ -258,12 +258,17 @@ extern void account_process_tick(struct task_struct *task, int user); extern void update_process_times(int user); extern void scheduler_tick(void); +extern void sched_show_task(struct task_struct *p); + #ifdef CONFIG_DETECT_SOFTLOCKUP extern void softlockup_tick(void); extern void spawn_softlockup_task(void); extern void touch_softlockup_watchdog(void); extern void touch_all_softlockup_watchdogs(void); extern int softlockup_thresh; +extern unsigned long sysctl_hung_task_check_count; +extern unsigned long sysctl_hung_task_timeout_secs; +extern long sysctl_hung_task_warnings; #else static inline void softlockup_tick(void) { @@ -1041,6 +1046,11 @@ struct task_struct { /* ipc stuff */ struct sysv_sem sysvsem; #endif +#ifdef CONFIG_DETECT_SOFTLOCKUP +/* hung task detection */ + unsigned long last_switch_timestamp; + unsigned long last_switch_count; +#endif /* CPU-specific state of this task */ struct thread_struct thread; /* filesystem information */ diff --git a/kernel/fork.c b/kernel/fork.c index 8dd8ff281009..09c0b90a69cc 100644 --- a/kernel/fork.c +++ b/kernel/fork.c @@ -1059,6 +1059,11 @@ static struct task_struct *copy_process(unsigned long clone_flags, p->prev_utime = cputime_zero; p->prev_stime = cputime_zero; +#ifdef CONFIG_DETECT_SOFTLOCKUP + p->last_switch_count = 0; + p->last_switch_timestamp = 0; +#endif + #ifdef CONFIG_TASK_XACCT p->rchar = 0; /* I/O counter: bytes read */ p->wchar = 0; /* I/O counter: bytes written */ diff --git a/kernel/lockdep.c b/kernel/lockdep.c index e2c07ece367d..3574379f4d62 100644 --- a/kernel/lockdep.c +++ b/kernel/lockdep.c @@ -3206,7 +3206,11 @@ retry: EXPORT_SYMBOL_GPL(debug_show_all_locks); -void debug_show_held_locks(struct task_struct *task) +/* + * Careful: only use this function if you are sure that + * the task cannot run in parallel! 
+ */ +void __debug_show_held_locks(struct task_struct *task) { if (unlikely(!debug_locks)) { printk("INFO: lockdep is turned off.\n"); @@ -3214,6 +3218,12 @@ void debug_show_held_locks(struct task_struct *task) } lockdep_print_held_locks(task); } +EXPORT_SYMBOL_GPL(__debug_show_held_locks); + +void debug_show_held_locks(struct task_struct *task) +{ + __debug_show_held_locks(task); +} EXPORT_SYMBOL_GPL(debug_show_held_locks); diff --git a/kernel/sched.c b/kernel/sched.c index c0e2db683e29..5b3d46574eeb 100644 --- a/kernel/sched.c +++ b/kernel/sched.c @@ -4945,7 +4945,7 @@ out_unlock: static const char stat_nam[] = "RSDTtZX"; -static void show_task(struct task_struct *p) +void sched_show_task(struct task_struct *p) { unsigned long free = 0; unsigned state; @@ -4998,7 +4998,7 @@ void show_state_filter(unsigned long state_filter) */ touch_nmi_watchdog(); if (!state_filter || (p->state & state_filter)) - show_task(p); + sched_show_task(p); } while_each_thread(g, p); touch_all_softlockup_watchdogs(); diff --git a/kernel/softlockup.c b/kernel/softlockup.c index 11df812263c8..02f0ad534441 100644 --- a/kernel/softlockup.c +++ b/kernel/softlockup.c @@ -8,6 +8,7 @@ */ #include #include +#include #include #include #include @@ -24,7 +25,7 @@ static DEFINE_PER_CPU(unsigned long, print_timestamp); static DEFINE_PER_CPU(struct task_struct *, watchdog_task); static int did_panic; -int softlockup_thresh = 10; +int softlockup_thresh = 60; static int softlock_panic(struct notifier_block *this, unsigned long event, void *ptr) @@ -45,7 +46,7 @@ static struct notifier_block panic_block = { */ static unsigned long get_timestamp(int this_cpu) { - return cpu_clock(this_cpu) >> 30; /* 2^30 ~= 10^9 */ + return cpu_clock(this_cpu) >> 30LL; /* 2^30 ~= 10^9 */ } void touch_softlockup_watchdog(void) @@ -100,11 +101,7 @@ void softlockup_tick(void) now = get_timestamp(this_cpu); - /* Wake up the high-prio watchdog task every second: */ - if (now > (touch_timestamp + 1)) - wake_up_process(per_cpu(watchdog_task, this_cpu)); - - /* Warn about unreasonable 10+ seconds delays: */ + /* Warn about unreasonable delays: */ if (now <= (touch_timestamp + softlockup_thresh)) return; @@ -121,12 +118,94 @@ void softlockup_tick(void) spin_unlock(&print_lock); } +/* + * Have a reasonable limit on the number of tasks checked: + */ +unsigned long sysctl_hung_task_check_count = 1024; + +/* + * Zero means infinite timeout - no checking done: + */ +unsigned long sysctl_hung_task_timeout_secs = 120; + +long sysctl_hung_task_warnings = 10; + +/* + * Only do the hung-tasks check on one CPU: + */ +static int check_cpu __read_mostly = -1; + +static void check_hung_task(struct task_struct *t, unsigned long now) +{ + unsigned long switch_count = t->nvcsw + t->nivcsw; + + if (t->flags & PF_FROZEN) + return; + + if (switch_count != t->last_switch_count || !t->last_switch_timestamp) { + t->last_switch_count = switch_count; + t->last_switch_timestamp = now; + return; + } + if ((long)(now - t->last_switch_timestamp) < + sysctl_hung_task_timeout_secs) + return; + if (sysctl_hung_task_warnings < 0) + return; + sysctl_hung_task_warnings--; + + /* + * Ok, the task did not get scheduled for more than 2 minutes, + * complain: + */ + printk(KERN_ERR "INFO: task %s:%d blocked for more than " + "%ld seconds.\n", t->comm, t->pid, + sysctl_hung_task_timeout_secs); + printk(KERN_ERR "\"echo 0 > /proc/sys/kernel/hung_task_timeout_secs\"" + " disables this message.\n"); + sched_show_task(t); + __debug_show_held_locks(t); + + t->last_switch_timestamp = now; + 
touch_nmi_watchdog(); +} + +/* + * Check whether a TASK_UNINTERRUPTIBLE does not get woken up for + * a really long time (120 seconds). If that happens, print out + * a warning. + */ +static void check_hung_uninterruptible_tasks(int this_cpu) +{ + int max_count = sysctl_hung_task_check_count; + unsigned long now = get_timestamp(this_cpu); + struct task_struct *g, *t; + + /* + * If the system crashed already then all bets are off, + * do not report extra hung tasks: + */ + if ((tainted & TAINT_DIE) || did_panic) + return; + + read_lock(&tasklist_lock); + do_each_thread(g, t) { + if (!--max_count) + break; + if (t->state & TASK_UNINTERRUPTIBLE) + check_hung_task(t, now); + } while_each_thread(g, t); + + read_unlock(&tasklist_lock); +} + /* * The watchdog thread - runs every second and touches the timestamp. */ static int watchdog(void *__bind_cpu) { struct sched_param param = { .sched_priority = MAX_RT_PRIO-1 }; + int this_cpu = (long)__bind_cpu; sched_setscheduler(current, SCHED_FIFO, ¶m); @@ -135,13 +214,18 @@ static int watchdog(void *__bind_cpu) /* * Run briefly once per second to reset the softlockup timestamp. - * If this gets delayed for more than 10 seconds then the + * If this gets delayed for more than 60 seconds then the * debug-printout triggers in softlockup_tick(). */ while (!kthread_should_stop()) { - set_current_state(TASK_INTERRUPTIBLE); touch_softlockup_watchdog(); - schedule(); + msleep_interruptible(10000); + + if (this_cpu != check_cpu) + continue; + + if (sysctl_hung_task_timeout_secs) + check_hung_uninterruptible_tasks(this_cpu); } return 0; @@ -171,6 +255,7 @@ cpu_callback(struct notifier_block *nfb, unsigned long action, void *hcpu) break; case CPU_ONLINE: case CPU_ONLINE_FROZEN: + check_cpu = any_online_cpu(cpu_online_map); wake_up_process(per_cpu(watchdog_task, hotcpu)); break; #ifdef CONFIG_HOTPLUG_CPU @@ -181,6 +266,15 @@ cpu_callback(struct notifier_block *nfb, unsigned long action, void *hcpu) /* Unbind so it can run. Fall thru. 
*/ kthread_bind(per_cpu(watchdog_task, hotcpu), any_online_cpu(cpu_online_map)); + case CPU_DOWN_PREPARE: + case CPU_DOWN_PREPARE_FROZEN: + if (hotcpu == check_cpu) { + cpumask_t temp_cpu_online_map = cpu_online_map; + + cpu_clear(hotcpu, temp_cpu_online_map); + check_cpu = any_online_cpu(temp_cpu_online_map); + } + break; case CPU_DEAD: case CPU_DEAD_FROZEN: p = per_cpu(watchdog_task, hotcpu); diff --git a/kernel/sysctl.c b/kernel/sysctl.c index c95f3ed34474..96f31c1bc4f0 100644 --- a/kernel/sysctl.c +++ b/kernel/sysctl.c @@ -753,6 +753,33 @@ static struct ctl_table kern_table[] = { .extra1 = &one, .extra2 = &sixty, }, + { + .ctl_name = CTL_UNNUMBERED, + .procname = "hung_task_check_count", + .data = &sysctl_hung_task_check_count, + .maxlen = sizeof(int), + .mode = 0644, + .proc_handler = &proc_dointvec_minmax, + .strategy = &sysctl_intvec, + }, + { + .ctl_name = CTL_UNNUMBERED, + .procname = "hung_task_timeout_secs", + .data = &sysctl_hung_task_timeout_secs, + .maxlen = sizeof(int), + .mode = 0644, + .proc_handler = &proc_dointvec_minmax, + .strategy = &sysctl_intvec, + }, + { + .ctl_name = CTL_UNNUMBERED, + .procname = "hung_task_warnings", + .data = &sysctl_hung_task_warnings, + .maxlen = sizeof(int), + .mode = 0644, + .proc_handler = &proc_dointvec_minmax, + .strategy = &sysctl_intvec, + }, #endif #ifdef CONFIG_COMPAT { -- cgit v1.2.3 From 63489e45e265f64c368882be1f01c42dec5d984c Mon Sep 17 00:00:00 2001 From: Steven Rostedt Date: Fri, 25 Jan 2008 21:08:03 +0100 Subject: sched: count # of queued RT tasks This patch adds accounting to keep track of the number of RT tasks running on a runqueue. This information will be used in later patches. Signed-off-by: Steven Rostedt Signed-off-by: Ingo Molnar --- kernel/sched.c | 1 + kernel/sched_rt.c | 17 +++++++++++++++++ 2 files changed, 18 insertions(+) (limited to 'kernel') diff --git a/kernel/sched.c b/kernel/sched.c index 5b3d46574eeb..9dd8d121eea6 100644 --- a/kernel/sched.c +++ b/kernel/sched.c @@ -342,6 +342,7 @@ struct rt_rq { struct rt_prio_array active; int rt_load_balance_idx; struct list_head *rt_load_balance_head, *rt_load_balance_curr; + unsigned long rt_nr_running; }; /* diff --git a/kernel/sched_rt.c b/kernel/sched_rt.c index cefcd5105146..e3d2ca8832af 100644 --- a/kernel/sched_rt.c +++ b/kernel/sched_rt.c @@ -26,6 +26,19 @@ static void update_curr_rt(struct rq *rq) cpuacct_charge(curr, delta_exec); } +static inline void inc_rt_tasks(struct task_struct *p, struct rq *rq) +{ + WARN_ON(!rt_task(p)); + rq->rt.rt_nr_running++; +} + +static inline void dec_rt_tasks(struct task_struct *p, struct rq *rq) +{ + WARN_ON(!rt_task(p)); + WARN_ON(!rq->rt.rt_nr_running); + rq->rt.rt_nr_running--; +} + static void enqueue_task_rt(struct rq *rq, struct task_struct *p, int wakeup) { struct rt_prio_array *array = &rq->rt.active; @@ -33,6 +46,8 @@ static void enqueue_task_rt(struct rq *rq, struct task_struct *p, int wakeup) list_add_tail(&p->run_list, array->queue + p->prio); __set_bit(p->prio, array->bitmap); inc_cpu_load(rq, p->se.load.weight); + + inc_rt_tasks(p, rq); } /* @@ -48,6 +63,8 @@ static void dequeue_task_rt(struct rq *rq, struct task_struct *p, int sleep) if (list_empty(array->queue + p->prio)) __clear_bit(p->prio, array->bitmap); dec_cpu_load(rq, p->se.load.weight); + + dec_rt_tasks(p, rq); } /* -- cgit v1.2.3 From 764a9d6fe4b52995c8aba277e3634385699354f4 Mon Sep 17 00:00:00 2001 From: Steven Rostedt Date: Fri, 25 Jan 2008 21:08:04 +0100 Subject: sched: track highest prio task queued This patch adds accounting to each runqueue to 
keep track of the highest prio task queued on the run queue. We only care about RT tasks, so if the run queue does not contain any active RT tasks its priority will be considered MAX_RT_PRIO. This information will be used for later patches. Signed-off-by: Steven Rostedt Signed-off-by: Ingo Molnar --- kernel/sched.c | 3 +++ kernel/sched_rt.c | 18 ++++++++++++++++++ 2 files changed, 21 insertions(+) (limited to 'kernel') diff --git a/kernel/sched.c b/kernel/sched.c index 9dd8d121eea6..6185fa080ec8 100644 --- a/kernel/sched.c +++ b/kernel/sched.c @@ -343,6 +343,8 @@ struct rt_rq { int rt_load_balance_idx; struct list_head *rt_load_balance_head, *rt_load_balance_curr; unsigned long rt_nr_running; + /* highest queued rt task prio */ + int highest_prio; }; /* @@ -6864,6 +6866,7 @@ void __init sched_init(void) rq->cpu = i; rq->migration_thread = NULL; INIT_LIST_HEAD(&rq->migration_queue); + rq->rt.highest_prio = MAX_RT_PRIO; #endif atomic_set(&rq->nr_iowait, 0); diff --git a/kernel/sched_rt.c b/kernel/sched_rt.c index e3d2ca8832af..136c2857a049 100644 --- a/kernel/sched_rt.c +++ b/kernel/sched_rt.c @@ -30,6 +30,10 @@ static inline void inc_rt_tasks(struct task_struct *p, struct rq *rq) { WARN_ON(!rt_task(p)); rq->rt.rt_nr_running++; +#ifdef CONFIG_SMP + if (p->prio < rq->rt.highest_prio) + rq->rt.highest_prio = p->prio; +#endif /* CONFIG_SMP */ } static inline void dec_rt_tasks(struct task_struct *p, struct rq *rq) @@ -37,6 +41,20 @@ static inline void dec_rt_tasks(struct task_struct *p, struct rq *rq) WARN_ON(!rt_task(p)); WARN_ON(!rq->rt.rt_nr_running); rq->rt.rt_nr_running--; +#ifdef CONFIG_SMP + if (rq->rt.rt_nr_running) { + struct rt_prio_array *array; + + WARN_ON(p->prio < rq->rt.highest_prio); + if (p->prio == rq->rt.highest_prio) { + /* recalculate */ + array = &rq->rt.active; + rq->rt.highest_prio = + sched_find_first_bit(array->bitmap); + } /* otherwise leave rq->highest prio alone */ + } else + rq->rt.highest_prio = MAX_RT_PRIO; +#endif /* CONFIG_SMP */ } static void enqueue_task_rt(struct rq *rq, struct task_struct *p, int wakeup) -- cgit v1.2.3 From e8fa136262e1121288bb93befe2295928ffd240d Mon Sep 17 00:00:00 2001 From: Steven Rostedt Date: Fri, 25 Jan 2008 21:08:05 +0100 Subject: sched: add RT task pushing This patch adds an algorithm to push extra RT tasks off a run queue to other CPU runqueues. When more than one RT task is added to a run queue, this algorithm takes an assertive approach to push the RT tasks that are not running onto other run queues that have lower priority. The way this works is that the highest RT task that is not running is looked at and we examine the runqueues on the CPUS for that tasks affinity mask. We find the runqueue with the lowest prio in the CPU affinity of the picked task, and if it is lower in prio than the picked task, we push the task onto that CPU runqueue. We continue pushing RT tasks off the current runqueue until we don't push any more. The algorithm stops when the next highest RT task can't preempt any other processes on other CPUS. TODO: The algorithm may stop when there are still RT tasks that can be migrated. Specifically, if the highest non running RT task CPU affinity is restricted to CPUs that are running higher priority tasks, there may be a lower priority task queued that has an affinity with a CPU that is running a lower priority task that it could be migrated to. This patch set does not address this issue. Note: checkpatch reveals two over 80 character instances. 
I'm not sure that breaking them up will help visually, so I left them as is. Signed-off-by: Steven Rostedt Signed-off-by: Ingo Molnar --- kernel/sched.c | 8 +- kernel/sched_rt.c | 225 +++++++++++++++++++++++++++++++++++++++++++++++++++++- 2 files changed, 231 insertions(+), 2 deletions(-) (limited to 'kernel') diff --git a/kernel/sched.c b/kernel/sched.c index 6185fa080ec8..97cab609fc31 100644 --- a/kernel/sched.c +++ b/kernel/sched.c @@ -1952,6 +1952,8 @@ static void finish_task_switch(struct rq *rq, struct task_struct *prev) prev_state = prev->state; finish_arch_switch(prev); finish_lock_switch(rq, prev); + schedule_tail_balance_rt(rq); + fire_sched_in_preempt_notifiers(current); if (mm) mmdrop(mm); @@ -2185,11 +2187,13 @@ static void double_rq_unlock(struct rq *rq1, struct rq *rq2) /* * double_lock_balance - lock the busiest runqueue, this_rq is locked already. */ -static void double_lock_balance(struct rq *this_rq, struct rq *busiest) +static int double_lock_balance(struct rq *this_rq, struct rq *busiest) __releases(this_rq->lock) __acquires(busiest->lock) __acquires(this_rq->lock) { + int ret = 0; + if (unlikely(!irqs_disabled())) { /* printk() doesn't work good under rq->lock */ spin_unlock(&this_rq->lock); @@ -2200,9 +2204,11 @@ static void double_lock_balance(struct rq *this_rq, struct rq *busiest) spin_unlock(&this_rq->lock); spin_lock(&busiest->lock); spin_lock(&this_rq->lock); + ret = 1; } else spin_lock(&busiest->lock); } + return ret; } /* diff --git a/kernel/sched_rt.c b/kernel/sched_rt.c index 136c2857a049..7815e90b1147 100644 --- a/kernel/sched_rt.c +++ b/kernel/sched_rt.c @@ -137,6 +137,227 @@ static void put_prev_task_rt(struct rq *rq, struct task_struct *p) } #ifdef CONFIG_SMP +/* Only try algorithms three times */ +#define RT_MAX_TRIES 3 + +static int double_lock_balance(struct rq *this_rq, struct rq *busiest); +static void deactivate_task(struct rq *rq, struct task_struct *p, int sleep); + +/* Return the second highest RT task, NULL otherwise */ +static struct task_struct *pick_next_highest_task_rt(struct rq *rq) +{ + struct rt_prio_array *array = &rq->rt.active; + struct task_struct *next; + struct list_head *queue; + int idx; + + assert_spin_locked(&rq->lock); + + if (likely(rq->rt.rt_nr_running < 2)) + return NULL; + + idx = sched_find_first_bit(array->bitmap); + if (unlikely(idx >= MAX_RT_PRIO)) { + WARN_ON(1); /* rt_nr_running is bad */ + return NULL; + } + + queue = array->queue + idx; + next = list_entry(queue->next, struct task_struct, run_list); + if (unlikely(next != rq->curr)) + return next; + + if (queue->next->next != queue) { + /* same prio task */ + next = list_entry(queue->next->next, struct task_struct, run_list); + return next; + } + + /* slower, but more flexible */ + idx = find_next_bit(array->bitmap, MAX_RT_PRIO, idx+1); + if (unlikely(idx >= MAX_RT_PRIO)) { + WARN_ON(1); /* rt_nr_running was 2 and above! */ + return NULL; + } + + queue = array->queue + idx; + next = list_entry(queue->next, struct task_struct, run_list); + + return next; +} + +static DEFINE_PER_CPU(cpumask_t, local_cpu_mask); + +/* Will lock the rq it finds */ +static struct rq *find_lock_lowest_rq(struct task_struct *task, + struct rq *this_rq) +{ + struct rq *lowest_rq = NULL; + int cpu; + int tries; + cpumask_t *cpu_mask = &__get_cpu_var(local_cpu_mask); + + cpus_and(*cpu_mask, cpu_online_map, task->cpus_allowed); + + for (tries = 0; tries < RT_MAX_TRIES; tries++) { + /* + * Scan each rq for the lowest prio. 
+ */ + for_each_cpu_mask(cpu, *cpu_mask) { + struct rq *rq = &per_cpu(runqueues, cpu); + + if (cpu == this_rq->cpu) + continue; + + /* We look for lowest RT prio or non-rt CPU */ + if (rq->rt.highest_prio >= MAX_RT_PRIO) { + lowest_rq = rq; + break; + } + + /* no locking for now */ + if (rq->rt.highest_prio > task->prio && + (!lowest_rq || rq->rt.highest_prio > lowest_rq->rt.highest_prio)) { + lowest_rq = rq; + } + } + + if (!lowest_rq) + break; + + /* if the prio of this runqueue changed, try again */ + if (double_lock_balance(this_rq, lowest_rq)) { + /* + * We had to unlock the run queue. In + * the mean time, task could have + * migrated already or had its affinity changed. + * Also make sure that it wasn't scheduled on its rq. + */ + if (unlikely(task_rq(task) != this_rq || + !cpu_isset(lowest_rq->cpu, task->cpus_allowed) || + task_running(this_rq, task) || + !task->se.on_rq)) { + spin_unlock(&lowest_rq->lock); + lowest_rq = NULL; + break; + } + } + + /* If this rq is still suitable use it. */ + if (lowest_rq->rt.highest_prio > task->prio) + break; + + /* try again */ + spin_unlock(&lowest_rq->lock); + lowest_rq = NULL; + } + + return lowest_rq; +} + +/* + * If the current CPU has more than one RT task, see if the non + * running task can migrate over to a CPU that is running a task + * of lesser priority. + */ +static int push_rt_task(struct rq *this_rq) +{ + struct task_struct *next_task; + struct rq *lowest_rq; + int ret = 0; + int paranoid = RT_MAX_TRIES; + + assert_spin_locked(&this_rq->lock); + + next_task = pick_next_highest_task_rt(this_rq); + if (!next_task) + return 0; + + retry: + if (unlikely(next_task == this_rq->curr)) + return 0; + + /* + * It's possible that the next_task slipped in of + * higher priority than current. If that's the case + * just reschedule current. + */ + if (unlikely(next_task->prio < this_rq->curr->prio)) { + resched_task(this_rq->curr); + return 0; + } + + /* We might release this_rq lock */ + get_task_struct(next_task); + + /* find_lock_lowest_rq locks the rq if found */ + lowest_rq = find_lock_lowest_rq(next_task, this_rq); + if (!lowest_rq) { + struct task_struct *task; + /* + * find lock_lowest_rq releases this_rq->lock + * so it is possible that next_task has changed. + * If it has, then try again. + */ + task = pick_next_highest_task_rt(this_rq); + if (unlikely(task != next_task) && task && paranoid--) { + put_task_struct(next_task); + next_task = task; + goto retry; + } + goto out; + } + + assert_spin_locked(&lowest_rq->lock); + + deactivate_task(this_rq, next_task, 0); + set_task_cpu(next_task, lowest_rq->cpu); + activate_task(lowest_rq, next_task, 0); + + resched_task(lowest_rq->curr); + + spin_unlock(&lowest_rq->lock); + + ret = 1; +out: + put_task_struct(next_task); + + return ret; +} + +/* + * TODO: Currently we just use the second highest prio task on + * the queue, and stop when it can't migrate (or there's + * no more RT tasks). There may be a case where a lower + * priority RT task has a different affinity than the + * higher RT task. In this case the lower RT task could + * possibly be able to migrate where as the higher priority + * RT task could not. We currently ignore this issue. + * Enhancements are welcome! + */ +static void push_rt_tasks(struct rq *rq) +{ + /* push_rt_task will return true if it moved an RT */ + while (push_rt_task(rq)) + ; +} + +static void schedule_tail_balance_rt(struct rq *rq) +{ + /* + * If we have more than one rt_task queued, then + * see if we can push the other rt_tasks off to other CPUS. 
+ * Note we may release the rq lock, and since + * the lock was owned by prev, we need to release it + * first via finish_lock_switch and then reaquire it here. + */ + if (unlikely(rq->rt.rt_nr_running > 1)) { + spin_lock_irq(&rq->lock); + push_rt_tasks(rq); + spin_unlock_irq(&rq->lock); + } +} + /* * Load-balancing iterator. Note: while the runqueue stays locked * during the whole iteration, the current task might be @@ -241,7 +462,9 @@ move_one_task_rt(struct rq *this_rq, int this_cpu, struct rq *busiest, return iter_move_one_task(this_rq, this_cpu, busiest, sd, idle, &rt_rq_iterator); } -#endif +#else /* CONFIG_SMP */ +# define schedule_tail_balance_rt(rq) do { } while (0) +#endif /* CONFIG_SMP */ static void task_tick_rt(struct rq *rq, struct task_struct *p) { -- cgit v1.2.3 From 4fd29176b7cd24909f8ceba2105cb3ae2857b90c Mon Sep 17 00:00:00 2001 From: Steven Rostedt Date: Fri, 25 Jan 2008 21:08:06 +0100 Subject: sched: add rt-overload tracking This patch adds an RT overload accounting system. When a runqueue has more than one RT task queued, it is marked as overloaded. That is that it is a candidate to have RT tasks pulled from it. Signed-off-by: Steven Rostedt Signed-off-by: Ingo Molnar --- kernel/sched_rt.c | 36 ++++++++++++++++++++++++++++++++++++ 1 file changed, 36 insertions(+) (limited to 'kernel') diff --git a/kernel/sched_rt.c b/kernel/sched_rt.c index 7815e90b1147..547f858b0752 100644 --- a/kernel/sched_rt.c +++ b/kernel/sched_rt.c @@ -3,6 +3,38 @@ * policies) */ +#ifdef CONFIG_SMP +static cpumask_t rt_overload_mask; +static atomic_t rto_count; +static inline int rt_overloaded(void) +{ + return atomic_read(&rto_count); +} +static inline cpumask_t *rt_overload(void) +{ + return &rt_overload_mask; +} +static inline void rt_set_overload(struct rq *rq) +{ + cpu_set(rq->cpu, rt_overload_mask); + /* + * Make sure the mask is visible before we set + * the overload count. That is checked to determine + * if we should look at the mask. It would be a shame + * if we looked at the mask, but the mask was not + * updated yet. + */ + wmb(); + atomic_inc(&rto_count); +} +static inline void rt_clear_overload(struct rq *rq) +{ + /* the order here really doesn't matter */ + atomic_dec(&rto_count); + cpu_clear(rq->cpu, rt_overload_mask); +} +#endif /* CONFIG_SMP */ + /* * Update the current task's runtime statistics. Skip current tasks that * are not in our scheduling class. @@ -33,6 +65,8 @@ static inline void inc_rt_tasks(struct task_struct *p, struct rq *rq) #ifdef CONFIG_SMP if (p->prio < rq->rt.highest_prio) rq->rt.highest_prio = p->prio; + if (rq->rt.rt_nr_running > 1) + rt_set_overload(rq); #endif /* CONFIG_SMP */ } @@ -54,6 +88,8 @@ static inline void dec_rt_tasks(struct task_struct *p, struct rq *rq) } /* otherwise leave rq->highest prio alone */ } else rq->rt.highest_prio = MAX_RT_PRIO; + if (rq->rt.rt_nr_running < 2) + rt_clear_overload(rq); #endif /* CONFIG_SMP */ } -- cgit v1.2.3 From f65eda4f789168ba5ff3fa75546c29efeed19f58 Mon Sep 17 00:00:00 2001 From: Steven Rostedt Date: Fri, 25 Jan 2008 21:08:07 +0100 Subject: sched: pull RT tasks from overloaded runqueues This patch adds the algorithm to pull tasks from RT overloaded runqueues. When a pull RT is initiated, all overloaded runqueues are examined for a RT task that is higher in prio than the highest prio task queued on the target runqueue. If another runqueue holds a RT task that is of higher prio than the highest prio task on the target runqueue is found it is pulled to the target runqueue. 
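A rough user-space sketch of that pull decision, with invented types toy_rq and should_pull (the real implementation, with locking and re-checks, is in the diff below); lower numeric value means higher priority, a runqueue counts as overloaded once it has more than one queued RT task, and treating the source's best queued priority as the pull candidate is a simplification of picking the best task that is not currently running:

/* Sketch only: which overloaded runqueues are worth pulling from. */
#include <stdio.h>

struct toy_rq {
	int rt_nr_running; /* queued RT tasks on this runqueue */
	int highest_prio;  /* best queued RT prio; lower number = higher prio */
};

/* Return 1 if src is overloaded and holds a better candidate than dst has. */
static int should_pull(const struct toy_rq *dst, const struct toy_rq *src)
{
	if (src->rt_nr_running <= 1) /* not overloaded: nothing spare to take */
		return 0;
	return src->highest_prio < dst->highest_prio;
}

int main(void)
{
	struct toy_rq target = { .rt_nr_running = 1, .highest_prio = 50 };
	struct toy_rq sources[] = {
		{ .rt_nr_running = 2, .highest_prio = 40 }, /* overloaded, better prio */
		{ .rt_nr_running = 2, .highest_prio = 60 }, /* overloaded, worse prio */
		{ .rt_nr_running = 1, .highest_prio = 10 }, /* not overloaded */
	};
	unsigned int i;

	for (i = 0; i < sizeof(sources) / sizeof(sources[0]); i++)
		printf("rq%u: %s\n", i,
		       should_pull(&target, &sources[i]) ? "pull" : "skip");
	return 0;
}

In the diff below the same comparison is re-validated after double_lock_balance(), since taking the source runqueue's lock may require dropping this_rq->lock, and the candidate picture can change in the meantime.
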
Signed-off-by: Steven Rostedt Signed-off-by: Ingo Molnar --- kernel/sched.c | 2 + kernel/sched_rt.c | 187 ++++++++++++++++++++++++++++++++++++++++++++++++++---- 2 files changed, 178 insertions(+), 11 deletions(-) (limited to 'kernel') diff --git a/kernel/sched.c b/kernel/sched.c index 97cab609fc31..c91797107913 100644 --- a/kernel/sched.c +++ b/kernel/sched.c @@ -3721,6 +3721,8 @@ need_resched_nonpreemptible: switch_count = &prev->nvcsw; } + schedule_balance_rt(rq, prev); + if (unlikely(!rq->nr_running)) idle_balance(cpu, rq); diff --git a/kernel/sched_rt.c b/kernel/sched_rt.c index 547f858b0752..bacb32039e95 100644 --- a/kernel/sched_rt.c +++ b/kernel/sched_rt.c @@ -179,8 +179,17 @@ static void put_prev_task_rt(struct rq *rq, struct task_struct *p) static int double_lock_balance(struct rq *this_rq, struct rq *busiest); static void deactivate_task(struct rq *rq, struct task_struct *p, int sleep); +static int pick_rt_task(struct rq *rq, struct task_struct *p, int cpu) +{ + if (!task_running(rq, p) && + (cpu < 0 || cpu_isset(cpu, p->cpus_allowed))) + return 1; + return 0; +} + /* Return the second highest RT task, NULL otherwise */ -static struct task_struct *pick_next_highest_task_rt(struct rq *rq) +static struct task_struct *pick_next_highest_task_rt(struct rq *rq, + int cpu) { struct rt_prio_array *array = &rq->rt.active; struct task_struct *next; @@ -199,26 +208,36 @@ static struct task_struct *pick_next_highest_task_rt(struct rq *rq) } queue = array->queue + idx; + BUG_ON(list_empty(queue)); + next = list_entry(queue->next, struct task_struct, run_list); - if (unlikely(next != rq->curr)) - return next; + if (unlikely(pick_rt_task(rq, next, cpu))) + goto out; if (queue->next->next != queue) { /* same prio task */ next = list_entry(queue->next->next, struct task_struct, run_list); - return next; + if (pick_rt_task(rq, next, cpu)) + goto out; } + retry: /* slower, but more flexible */ idx = find_next_bit(array->bitmap, MAX_RT_PRIO, idx+1); - if (unlikely(idx >= MAX_RT_PRIO)) { - WARN_ON(1); /* rt_nr_running was 2 and above! */ + if (unlikely(idx >= MAX_RT_PRIO)) return NULL; - } queue = array->queue + idx; - next = list_entry(queue->next, struct task_struct, run_list); + BUG_ON(list_empty(queue)); + + list_for_each_entry(next, queue, run_list) { + if (pick_rt_task(rq, next, cpu)) + goto out; + } + + goto retry; + out: return next; } @@ -305,13 +324,15 @@ static int push_rt_task(struct rq *this_rq) assert_spin_locked(&this_rq->lock); - next_task = pick_next_highest_task_rt(this_rq); + next_task = pick_next_highest_task_rt(this_rq, -1); if (!next_task) return 0; retry: - if (unlikely(next_task == this_rq->curr)) + if (unlikely(next_task == this_rq->curr)) { + WARN_ON(1); return 0; + } /* * It's possible that the next_task slipped in of @@ -335,7 +356,7 @@ static int push_rt_task(struct rq *this_rq) * so it is possible that next_task has changed. * If it has, then try again. */ - task = pick_next_highest_task_rt(this_rq); + task = pick_next_highest_task_rt(this_rq, -1); if (unlikely(task != next_task) && task && paranoid--) { put_task_struct(next_task); next_task = task; @@ -378,6 +399,149 @@ static void push_rt_tasks(struct rq *rq) ; } +static int pull_rt_task(struct rq *this_rq) +{ + struct task_struct *next; + struct task_struct *p; + struct rq *src_rq; + cpumask_t *rto_cpumask; + int this_cpu = this_rq->cpu; + int cpu; + int ret = 0; + + assert_spin_locked(&this_rq->lock); + + /* + * If cpusets are used, and we have overlapping + * run queue cpusets, then this algorithm may not catch all. 
+ * This is just the price you pay on trying to keep + * dirtying caches down on large SMP machines. + */ + if (likely(!rt_overloaded())) + return 0; + + next = pick_next_task_rt(this_rq); + + rto_cpumask = rt_overload(); + + for_each_cpu_mask(cpu, *rto_cpumask) { + if (this_cpu == cpu) + continue; + + src_rq = cpu_rq(cpu); + if (unlikely(src_rq->rt.rt_nr_running <= 1)) { + /* + * It is possible that overlapping cpusets + * will miss clearing a non overloaded runqueue. + * Clear it now. + */ + if (double_lock_balance(this_rq, src_rq)) { + /* unlocked our runqueue lock */ + struct task_struct *old_next = next; + next = pick_next_task_rt(this_rq); + if (next != old_next) + ret = 1; + } + if (likely(src_rq->rt.rt_nr_running <= 1)) + /* + * Small chance that this_rq->curr changed + * but it's really harmless here. + */ + rt_clear_overload(this_rq); + else + /* + * Heh, the src_rq is now overloaded, since + * we already have the src_rq lock, go straight + * to pulling tasks from it. + */ + goto try_pulling; + spin_unlock(&src_rq->lock); + continue; + } + + /* + * We can potentially drop this_rq's lock in + * double_lock_balance, and another CPU could + * steal our next task - hence we must cause + * the caller to recalculate the next task + * in that case: + */ + if (double_lock_balance(this_rq, src_rq)) { + struct task_struct *old_next = next; + next = pick_next_task_rt(this_rq); + if (next != old_next) + ret = 1; + } + + /* + * Are there still pullable RT tasks? + */ + if (src_rq->rt.rt_nr_running <= 1) { + spin_unlock(&src_rq->lock); + continue; + } + + try_pulling: + p = pick_next_highest_task_rt(src_rq, this_cpu); + + /* + * Do we have an RT task that preempts + * the to-be-scheduled task? + */ + if (p && (!next || (p->prio < next->prio))) { + WARN_ON(p == src_rq->curr); + WARN_ON(!p->se.on_rq); + + /* + * There's a chance that p is higher in priority + * than what's currently running on its cpu. + * This is just that p is wakeing up and hasn't + * had a chance to schedule. We only pull + * p if it is lower in priority than the + * current task on the run queue or + * this_rq next task is lower in prio than + * the current task on that rq. + */ + if (p->prio < src_rq->curr->prio || + (next && next->prio < src_rq->curr->prio)) + goto bail; + + ret = 1; + + deactivate_task(src_rq, p, 0); + set_task_cpu(p, this_cpu); + activate_task(this_rq, p, 0); + /* + * We continue with the search, just in + * case there's an even higher prio task + * in another runqueue. (low likelyhood + * but possible) + */ + + /* + * Update next so that we won't pick a task + * on another cpu with a priority lower (or equal) + * than the one we just picked. 
+ */ + next = p; + + } + bail: + spin_unlock(&src_rq->lock); + } + + return ret; +} + +static void schedule_balance_rt(struct rq *rq, + struct task_struct *prev) +{ + /* Try to pull RT tasks here if we lower this rq's prio */ + if (unlikely(rt_task(prev)) && + rq->rt.highest_prio > prev->prio) + pull_rt_task(rq); +} + static void schedule_tail_balance_rt(struct rq *rq) { /* @@ -500,6 +664,7 @@ move_one_task_rt(struct rq *this_rq, int this_cpu, struct rq *busiest, } #else /* CONFIG_SMP */ # define schedule_tail_balance_rt(rq) do { } while (0) +# define schedule_balance_rt(rq, prev) do { } while (0) #endif /* CONFIG_SMP */ static void task_tick_rt(struct rq *rq, struct task_struct *p) -- cgit v1.2.3 From 4642dafdf93dc7d66ee33437b93a5e6b8cea20d2 Mon Sep 17 00:00:00 2001 From: Steven Rostedt Date: Fri, 25 Jan 2008 21:08:07 +0100 Subject: sched: push RT tasks from overloaded CPUs This patch adds pushing of overloaded RT tasks from a runqueue that is having tasks (most likely RT tasks) added to the run queue. TODO: We don't cover the case of waking of new RT tasks (yet). Signed-off-by: Steven Rostedt Signed-off-by: Ingo Molnar --- kernel/sched.c | 1 + kernel/sched_rt.c | 10 ++++++++++ 2 files changed, 11 insertions(+) (limited to 'kernel') diff --git a/kernel/sched.c b/kernel/sched.c index c91797107913..357d3a084de8 100644 --- a/kernel/sched.c +++ b/kernel/sched.c @@ -1710,6 +1710,7 @@ out_activate: out_running: p->state = TASK_RUNNING; + wakeup_balance_rt(rq, p); out: task_rq_unlock(rq, &flags); diff --git a/kernel/sched_rt.c b/kernel/sched_rt.c index bacb32039e95..d38a8a559aa5 100644 --- a/kernel/sched_rt.c +++ b/kernel/sched_rt.c @@ -558,6 +558,15 @@ static void schedule_tail_balance_rt(struct rq *rq) } } + +static void wakeup_balance_rt(struct rq *rq, struct task_struct *p) +{ + if (unlikely(rt_task(p)) && + !task_running(rq, p) && + (p->prio >= rq->curr->prio)) + push_rt_tasks(rq); +} + /* * Load-balancing iterator. Note: while the runqueue stays locked * during the whole iteration, the current task might be @@ -665,6 +674,7 @@ move_one_task_rt(struct rq *this_rq, int this_cpu, struct rq *busiest, #else /* CONFIG_SMP */ # define schedule_tail_balance_rt(rq) do { } while (0) # define schedule_balance_rt(rq, prev) do { } while (0) +# define wakeup_balance_rt(rq, p) do { } while (0) #endif /* CONFIG_SMP */ static void task_tick_rt(struct rq *rq, struct task_struct *p) -- cgit v1.2.3 From c7a1e46aa9782a947cf2ed506245d43396dbf991 Mon Sep 17 00:00:00 2001 From: Steven Rostedt Date: Fri, 25 Jan 2008 21:08:07 +0100 Subject: sched: disable standard balancer for RT tasks Since we now take an active approach to load balancing, we don't need to balance RT tasks via the normal task balancer. In fact, this code was found to pull RT tasks away from CPUS that the active movement performed, resulting in large latencies. Signed-off-by: Steven Rostedt Signed-off-by: Ingo Molnar --- kernel/sched_rt.c | 95 +++---------------------------------------------------- 1 file changed, 4 insertions(+), 91 deletions(-) (limited to 'kernel') diff --git a/kernel/sched_rt.c b/kernel/sched_rt.c index d38a8a559aa5..c492fd2b2eec 100644 --- a/kernel/sched_rt.c +++ b/kernel/sched_rt.c @@ -567,109 +567,22 @@ static void wakeup_balance_rt(struct rq *rq, struct task_struct *p) push_rt_tasks(rq); } -/* - * Load-balancing iterator. Note: while the runqueue stays locked - * during the whole iteration, the current task might be - * dequeued so the iterator has to be dequeue-safe. 
Here we - * achieve that by always pre-iterating before returning - * the current task: - */ -static struct task_struct *load_balance_start_rt(void *arg) -{ - struct rq *rq = arg; - struct rt_prio_array *array = &rq->rt.active; - struct list_head *head, *curr; - struct task_struct *p; - int idx; - - idx = sched_find_first_bit(array->bitmap); - if (idx >= MAX_RT_PRIO) - return NULL; - - head = array->queue + idx; - curr = head->prev; - - p = list_entry(curr, struct task_struct, run_list); - - curr = curr->prev; - - rq->rt.rt_load_balance_idx = idx; - rq->rt.rt_load_balance_head = head; - rq->rt.rt_load_balance_curr = curr; - - return p; -} - -static struct task_struct *load_balance_next_rt(void *arg) -{ - struct rq *rq = arg; - struct rt_prio_array *array = &rq->rt.active; - struct list_head *head, *curr; - struct task_struct *p; - int idx; - - idx = rq->rt.rt_load_balance_idx; - head = rq->rt.rt_load_balance_head; - curr = rq->rt.rt_load_balance_curr; - - /* - * If we arrived back to the head again then - * iterate to the next queue (if any): - */ - if (unlikely(head == curr)) { - int next_idx = find_next_bit(array->bitmap, MAX_RT_PRIO, idx+1); - - if (next_idx >= MAX_RT_PRIO) - return NULL; - - idx = next_idx; - head = array->queue + idx; - curr = head->prev; - - rq->rt.rt_load_balance_idx = idx; - rq->rt.rt_load_balance_head = head; - } - - p = list_entry(curr, struct task_struct, run_list); - - curr = curr->prev; - - rq->rt.rt_load_balance_curr = curr; - - return p; -} - static unsigned long load_balance_rt(struct rq *this_rq, int this_cpu, struct rq *busiest, unsigned long max_load_move, struct sched_domain *sd, enum cpu_idle_type idle, int *all_pinned, int *this_best_prio) { - struct rq_iterator rt_rq_iterator; - - rt_rq_iterator.start = load_balance_start_rt; - rt_rq_iterator.next = load_balance_next_rt; - /* pass 'busiest' rq argument into - * load_balance_[start|next]_rt iterators - */ - rt_rq_iterator.arg = busiest; - - return balance_tasks(this_rq, this_cpu, busiest, max_load_move, sd, - idle, all_pinned, this_best_prio, &rt_rq_iterator); + /* don't touch RT tasks */ + return 0; } static int move_one_task_rt(struct rq *this_rq, int this_cpu, struct rq *busiest, struct sched_domain *sd, enum cpu_idle_type idle) { - struct rq_iterator rt_rq_iterator; - - rt_rq_iterator.start = load_balance_start_rt; - rt_rq_iterator.next = load_balance_next_rt; - rt_rq_iterator.arg = busiest; - - return iter_move_one_task(this_rq, this_cpu, busiest, sd, idle, - &rt_rq_iterator); + /* don't touch RT tasks */ + return 0; } #else /* CONFIG_SMP */ # define schedule_tail_balance_rt(rq) do { } while (0) -- cgit v1.2.3 From 73fe6aae84400e2b475e2a1dc4e8592cd3ed6e69 Mon Sep 17 00:00:00 2001 From: Gregory Haskins Date: Fri, 25 Jan 2008 21:08:07 +0100 Subject: sched: add RT-balance cpu-weight Some RT tasks (particularly kthreads) are bound to one specific CPU. It is fairly common for two or more bound tasks to get queued up at the same time. Consider, for instance, softirq_timer and softirq_sched. A timer goes off in an ISR which schedules softirq_thread to run at RT50. Then the timer handler determines that it's time to smp-rebalance the system so it schedules softirq_sched to run. So we are in a situation where we have two RT50 tasks queued, and the system will go into rt-overload condition to request other CPUs for help. 
This causes two problems in the current code: 1) If a high-priority bound task and a low-priority unbounded task queue up behind the running task, we will fail to ever relocate the unbounded task because we terminate the search on the first unmovable task. 2) We spend precious futile cycles in the fast-path trying to pull overloaded tasks over. It is therefore optimal to strive to avoid the overhead altogether if we can cheaply detect the condition before overload even occurs. This patch tries to achieve this optimization by utilizing the Hamming weight of the task->cpus_allowed mask. A weight of 1 indicates that the task cannot be migrated. We will then utilize this information to skip non-migratable tasks and to eliminate unnecessary rebalance attempts. We introduce a per-rq variable to count the number of migratable tasks that are currently running. We only go into overload if we have more than one rt task, AND at least one of them is migratable. In addition, we introduce a per-task variable to cache the cpus_allowed weight, since the Hamming calculation is probably relatively expensive. We only update the cached value when the mask is updated which should be relatively infrequent, especially compared to scheduling frequency in the fast path. Signed-off-by: Gregory Haskins Signed-off-by: Steven Rostedt Signed-off-by: Ingo Molnar --- include/linux/init_task.h | 1 + include/linux/sched.h | 2 ++ kernel/fork.c | 1 + kernel/sched.c | 9 ++++++++- kernel/sched_rt.c | 50 ++++++++++++++++++++++++++++++++++++++++++----- 5 files changed, 57 insertions(+), 6 deletions(-) (limited to 'kernel') diff --git a/include/linux/init_task.h b/include/linux/init_task.h index cae35b6b9aec..572c65bcc80f 100644 --- a/include/linux/init_task.h +++ b/include/linux/init_task.h @@ -130,6 +130,7 @@ extern struct group_info init_groups; .normal_prio = MAX_PRIO-20, \ .policy = SCHED_NORMAL, \ .cpus_allowed = CPU_MASK_ALL, \ + .nr_cpus_allowed = NR_CPUS, \ .mm = NULL, \ .active_mm = &init_mm, \ .run_list = LIST_HEAD_INIT(tsk.run_list), \ diff --git a/include/linux/sched.h b/include/linux/sched.h index 0846f1f9e196..b07a2cf76401 100644 --- a/include/linux/sched.h +++ b/include/linux/sched.h @@ -847,6 +847,7 @@ struct sched_class { void (*set_curr_task) (struct rq *rq); void (*task_tick) (struct rq *rq, struct task_struct *p); void (*task_new) (struct rq *rq, struct task_struct *p); + void (*set_cpus_allowed)(struct task_struct *p, cpumask_t *newmask); }; struct load_weight { @@ -956,6 +957,7 @@ struct task_struct { unsigned int policy; cpumask_t cpus_allowed; + int nr_cpus_allowed; unsigned int time_slice; #if defined(CONFIG_SCHEDSTATS) || defined(CONFIG_TASK_DELAY_ACCT) diff --git a/kernel/fork.c b/kernel/fork.c index 09c0b90a69cc..930c51865ab4 100644 --- a/kernel/fork.c +++ b/kernel/fork.c @@ -1242,6 +1242,7 @@ static struct task_struct *copy_process(unsigned long clone_flags, * parent's CPU). This avoids alot of nasty races.
*/ p->cpus_allowed = current->cpus_allowed; + p->nr_cpus_allowed = current->nr_cpus_allowed; if (unlikely(!cpu_isset(task_cpu(p), p->cpus_allowed) || !cpu_online(task_cpu(p)))) set_task_cpu(p, smp_processor_id()); diff --git a/kernel/sched.c b/kernel/sched.c index 357d3a084de8..66e99b419b31 100644 --- a/kernel/sched.c +++ b/kernel/sched.c @@ -343,6 +343,7 @@ struct rt_rq { int rt_load_balance_idx; struct list_head *rt_load_balance_head, *rt_load_balance_curr; unsigned long rt_nr_running; + unsigned long rt_nr_migratory; /* highest queued rt task prio */ int highest_prio; }; @@ -5144,7 +5145,13 @@ int set_cpus_allowed(struct task_struct *p, cpumask_t new_mask) goto out; } - p->cpus_allowed = new_mask; + if (p->sched_class->set_cpus_allowed) + p->sched_class->set_cpus_allowed(p, &new_mask); + else { + p->cpus_allowed = new_mask; + p->nr_cpus_allowed = cpus_weight(new_mask); + } + /* Can the task run on the task's current CPU? If so, we're done */ if (cpu_isset(task_cpu(p), new_mask)) goto out; diff --git a/kernel/sched_rt.c b/kernel/sched_rt.c index c492fd2b2eec..ae4995c09aac 100644 --- a/kernel/sched_rt.c +++ b/kernel/sched_rt.c @@ -33,6 +33,14 @@ static inline void rt_clear_overload(struct rq *rq) atomic_dec(&rto_count); cpu_clear(rq->cpu, rt_overload_mask); } + +static void update_rt_migration(struct rq *rq) +{ + if (rq->rt.rt_nr_migratory && (rq->rt.rt_nr_running > 1)) + rt_set_overload(rq); + else + rt_clear_overload(rq); +} #endif /* CONFIG_SMP */ /* @@ -65,8 +73,10 @@ static inline void inc_rt_tasks(struct task_struct *p, struct rq *rq) #ifdef CONFIG_SMP if (p->prio < rq->rt.highest_prio) rq->rt.highest_prio = p->prio; - if (rq->rt.rt_nr_running > 1) - rt_set_overload(rq); + if (p->nr_cpus_allowed > 1) + rq->rt.rt_nr_migratory++; + + update_rt_migration(rq); #endif /* CONFIG_SMP */ } @@ -88,8 +98,10 @@ static inline void dec_rt_tasks(struct task_struct *p, struct rq *rq) } /* otherwise leave rq->highest prio alone */ } else rq->rt.highest_prio = MAX_RT_PRIO; - if (rq->rt.rt_nr_running < 2) - rt_clear_overload(rq); + if (p->nr_cpus_allowed > 1) + rq->rt.rt_nr_migratory--; + + update_rt_migration(rq); #endif /* CONFIG_SMP */ } @@ -182,7 +194,8 @@ static void deactivate_task(struct rq *rq, struct task_struct *p, int sleep); static int pick_rt_task(struct rq *rq, struct task_struct *p, int cpu) { if (!task_running(rq, p) && - (cpu < 0 || cpu_isset(cpu, p->cpus_allowed))) + (cpu < 0 || cpu_isset(cpu, p->cpus_allowed)) && + (p->nr_cpus_allowed > 1)) return 1; return 0; } @@ -584,6 +597,32 @@ move_one_task_rt(struct rq *this_rq, int this_cpu, struct rq *busiest, /* don't touch RT tasks */ return 0; } +static void set_cpus_allowed_rt(struct task_struct *p, cpumask_t *new_mask) +{ + int weight = cpus_weight(*new_mask); + + BUG_ON(!rt_task(p)); + + /* + * Update the migration status of the RQ if we have an RT task + * which is running AND changing its weight value. 
+ */ + if (p->se.on_rq && (weight != p->nr_cpus_allowed)) { + struct rq *rq = task_rq(p); + + if ((p->nr_cpus_allowed <= 1) && (weight > 1)) + rq->rt.rt_nr_migratory++; + else if((p->nr_cpus_allowed > 1) && (weight <= 1)) { + BUG_ON(!rq->rt.rt_nr_migratory); + rq->rt.rt_nr_migratory--; + } + + update_rt_migration(rq); + } + + p->cpus_allowed = *new_mask; + p->nr_cpus_allowed = weight; +} #else /* CONFIG_SMP */ # define schedule_tail_balance_rt(rq) do { } while (0) # define schedule_balance_rt(rq, prev) do { } while (0) @@ -637,6 +676,7 @@ const struct sched_class rt_sched_class = { #ifdef CONFIG_SMP .load_balance = load_balance_rt, .move_one_task = move_one_task_rt, + .set_cpus_allowed = set_cpus_allowed_rt, #endif .set_curr_task = set_curr_task_rt, -- cgit v1.2.3 From 697f0a487f294e634a342764472b79375bb3158a Mon Sep 17 00:00:00 2001 From: Gregory Haskins Date: Fri, 25 Jan 2008 21:08:09 +0100 Subject: sched: clean up this_rq use in kernel/sched_rt.c "this_rq" is normally used to denote the RQ on the current cpu (i.e. "cpu_rq(this_cpu)"). So clean up the usage of this_rq to be more consistent with the rest of the code. Signed-off-by: Gregory Haskins Signed-off-by: Steven Rostedt Signed-off-by: Ingo Molnar --- kernel/sched_rt.c | 22 +++++++++++----------- 1 file changed, 11 insertions(+), 11 deletions(-) (limited to 'kernel') diff --git a/kernel/sched_rt.c b/kernel/sched_rt.c index ae4995c09aac..b788e35ffd3f 100644 --- a/kernel/sched_rt.c +++ b/kernel/sched_rt.c @@ -328,21 +328,21 @@ static struct rq *find_lock_lowest_rq(struct task_struct *task, * running task can migrate over to a CPU that is running a task * of lesser priority. */ -static int push_rt_task(struct rq *this_rq) +static int push_rt_task(struct rq *rq) { struct task_struct *next_task; struct rq *lowest_rq; int ret = 0; int paranoid = RT_MAX_TRIES; - assert_spin_locked(&this_rq->lock); + assert_spin_locked(&rq->lock); - next_task = pick_next_highest_task_rt(this_rq, -1); + next_task = pick_next_highest_task_rt(rq, -1); if (!next_task) return 0; retry: - if (unlikely(next_task == this_rq->curr)) { + if (unlikely(next_task == rq->curr)) { WARN_ON(1); return 0; } @@ -352,24 +352,24 @@ static int push_rt_task(struct rq *this_rq) * higher priority than current. If that's the case * just reschedule current. */ - if (unlikely(next_task->prio < this_rq->curr->prio)) { - resched_task(this_rq->curr); + if (unlikely(next_task->prio < rq->curr->prio)) { + resched_task(rq->curr); return 0; } - /* We might release this_rq lock */ + /* We might release rq lock */ get_task_struct(next_task); /* find_lock_lowest_rq locks the rq if found */ - lowest_rq = find_lock_lowest_rq(next_task, this_rq); + lowest_rq = find_lock_lowest_rq(next_task, rq); if (!lowest_rq) { struct task_struct *task; /* - * find lock_lowest_rq releases this_rq->lock + * find lock_lowest_rq releases rq->lock * so it is possible that next_task has changed. * If it has, then try again. 
*/ - task = pick_next_highest_task_rt(this_rq, -1); + task = pick_next_highest_task_rt(rq, -1); if (unlikely(task != next_task) && task && paranoid--) { put_task_struct(next_task); next_task = task; @@ -380,7 +380,7 @@ static int push_rt_task(struct rq *this_rq) assert_spin_locked(&lowest_rq->lock); - deactivate_task(this_rq, next_task, 0); + deactivate_task(rq, next_task, 0); set_task_cpu(next_task, lowest_rq->cpu); activate_task(lowest_rq, next_task, 0); -- cgit v1.2.3 From e7693a362ec84bb5b6fd441d8a8b4b9d568a7a0c Mon Sep 17 00:00:00 2001 From: Gregory Haskins Date: Fri, 25 Jan 2008 21:08:09 +0100 Subject: sched: de-SCHED_OTHER-ize the RT path The current wake-up code path tries to determine if it can optimize the wake-up to "this_cpu" by computing load calculations. The problem is that these calculations are only relevant to SCHED_OTHER tasks where load is king. For RT tasks, priority is king. So the load calculation is completely wasted bandwidth. Therefore, we create a new sched_class interface to help with pre-wakeup routing decisions and move the load calculation as a function of CFS task's class. Signed-off-by: Gregory Haskins Signed-off-by: Steven Rostedt Signed-off-by: Ingo Molnar --- include/linux/sched.h | 1 + kernel/sched.c | 167 ++++++++---------------------------------------- kernel/sched_fair.c | 148 ++++++++++++++++++++++++++++++++++++++++++ kernel/sched_idletask.c | 9 +++ kernel/sched_rt.c | 10 +++ 5 files changed, 195 insertions(+), 140 deletions(-) (limited to 'kernel') diff --git a/include/linux/sched.h b/include/linux/sched.h index b07a2cf76401..6fbbf38555ac 100644 --- a/include/linux/sched.h +++ b/include/linux/sched.h @@ -827,6 +827,7 @@ struct sched_class { void (*enqueue_task) (struct rq *rq, struct task_struct *p, int wakeup); void (*dequeue_task) (struct rq *rq, struct task_struct *p, int sleep); void (*yield_task) (struct rq *rq); + int (*select_task_rq)(struct task_struct *p, int sync); void (*check_preempt_curr) (struct rq *rq, struct task_struct *p); diff --git a/kernel/sched.c b/kernel/sched.c index 66e99b419b31..3344ba776b97 100644 --- a/kernel/sched.c +++ b/kernel/sched.c @@ -960,6 +960,13 @@ static inline void dec_cpu_load(struct rq *rq, unsigned long load) update_load_sub(&rq->load, load); } +#ifdef CONFIG_SMP +static unsigned long source_load(int cpu, int type); +static unsigned long target_load(int cpu, int type); +static unsigned long cpu_avg_load_per_task(int cpu); +static int task_hot(struct task_struct *p, u64 now, struct sched_domain *sd); +#endif /* CONFIG_SMP */ + #include "sched_stats.h" #include "sched_idletask.c" #include "sched_fair.c" @@ -1118,7 +1125,7 @@ static inline void __set_task_cpu(struct task_struct *p, unsigned int cpu) /* * Is this task likely cache-hot: */ -static inline int +static int task_hot(struct task_struct *p, u64 now, struct sched_domain *sd) { s64 delta; @@ -1343,7 +1350,7 @@ static unsigned long target_load(int cpu, int type) /* * Return the average load per task on the cpu's run queue */ -static inline unsigned long cpu_avg_load_per_task(int cpu) +static unsigned long cpu_avg_load_per_task(int cpu) { struct rq *rq = cpu_rq(cpu); unsigned long total = weighted_cpuload(cpu); @@ -1500,58 +1507,6 @@ static int sched_balance_self(int cpu, int flag) #endif /* CONFIG_SMP */ -/* - * wake_idle() will wake a task on an idle cpu if task->cpu is - * not idle and an idle cpu is available. The span of cpus to - * search starts with cpus closest then further out as needed, - * so we always favor a closer, idle cpu. 
- * - * Returns the CPU we should wake onto. - */ -#if defined(ARCH_HAS_SCHED_WAKE_IDLE) -static int wake_idle(int cpu, struct task_struct *p) -{ - cpumask_t tmp; - struct sched_domain *sd; - int i; - - /* - * If it is idle, then it is the best cpu to run this task. - * - * This cpu is also the best, if it has more than one task already. - * Siblings must be also busy(in most cases) as they didn't already - * pickup the extra load from this cpu and hence we need not check - * sibling runqueue info. This will avoid the checks and cache miss - * penalities associated with that. - */ - if (idle_cpu(cpu) || cpu_rq(cpu)->nr_running > 1) - return cpu; - - for_each_domain(cpu, sd) { - if (sd->flags & SD_WAKE_IDLE) { - cpus_and(tmp, sd->span, p->cpus_allowed); - for_each_cpu_mask(i, tmp) { - if (idle_cpu(i)) { - if (i != task_cpu(p)) { - schedstat_inc(p, - se.nr_wakeups_idle); - } - return i; - } - } - } else { - break; - } - } - return cpu; -} -#else -static inline int wake_idle(int cpu, struct task_struct *p) -{ - return cpu; -} -#endif - /*** * try_to_wake_up - wake up a thread * @p: the to-be-woken-up thread @@ -1573,8 +1528,6 @@ static int try_to_wake_up(struct task_struct *p, unsigned int state, int sync) long old_state; struct rq *rq; #ifdef CONFIG_SMP - struct sched_domain *sd, *this_sd = NULL; - unsigned long load, this_load; int new_cpu; #endif @@ -1594,90 +1547,7 @@ static int try_to_wake_up(struct task_struct *p, unsigned int state, int sync) if (unlikely(task_running(rq, p))) goto out_activate; - new_cpu = cpu; - - schedstat_inc(rq, ttwu_count); - if (cpu == this_cpu) { - schedstat_inc(rq, ttwu_local); - goto out_set_cpu; - } - - for_each_domain(this_cpu, sd) { - if (cpu_isset(cpu, sd->span)) { - schedstat_inc(sd, ttwu_wake_remote); - this_sd = sd; - break; - } - } - - if (unlikely(!cpu_isset(this_cpu, p->cpus_allowed))) - goto out_set_cpu; - - /* - * Check for affine wakeup and passive balancing possibilities. - */ - if (this_sd) { - int idx = this_sd->wake_idx; - unsigned int imbalance; - - imbalance = 100 + (this_sd->imbalance_pct - 100) / 2; - - load = source_load(cpu, idx); - this_load = target_load(this_cpu, idx); - - new_cpu = this_cpu; /* Wake to this CPU if we can */ - - if (this_sd->flags & SD_WAKE_AFFINE) { - unsigned long tl = this_load; - unsigned long tl_per_task; - - /* - * Attract cache-cold tasks on sync wakeups: - */ - if (sync && !task_hot(p, rq->clock, this_sd)) - goto out_set_cpu; - - schedstat_inc(p, se.nr_wakeups_affine_attempts); - tl_per_task = cpu_avg_load_per_task(this_cpu); - - /* - * If sync wakeup then subtract the (maximum possible) - * effect of the currently running task from the load - * of the current CPU: - */ - if (sync) - tl -= current->se.load.weight; - - if ((tl <= load && - tl + target_load(cpu, idx) <= tl_per_task) || - 100*(tl + p->se.load.weight) <= imbalance*load) { - /* - * This domain has SD_WAKE_AFFINE and - * p is cache cold in this domain, and - * there is no bad imbalance. - */ - schedstat_inc(this_sd, ttwu_move_affine); - schedstat_inc(p, se.nr_wakeups_affine); - goto out_set_cpu; - } - } - - /* - * Start passive balancing when half the imbalance_pct - * limit is reached. - */ - if (this_sd->flags & SD_WAKE_BALANCE) { - if (imbalance*this_load <= 100*load) { - schedstat_inc(this_sd, ttwu_move_balance); - schedstat_inc(p, se.nr_wakeups_passive); - goto out_set_cpu; - } - } - } - - new_cpu = cpu; /* Could not wake to this_cpu. 
Wake to cpu instead */ -out_set_cpu: - new_cpu = wake_idle(new_cpu, p); + new_cpu = p->sched_class->select_task_rq(p, sync); if (new_cpu != cpu) { set_task_cpu(p, new_cpu); task_rq_unlock(rq, &flags); @@ -1693,6 +1563,23 @@ out_set_cpu: cpu = task_cpu(p); } +#ifdef CONFIG_SCHEDSTATS + schedstat_inc(rq, ttwu_count); + if (cpu == this_cpu) + schedstat_inc(rq, ttwu_local); + else { + struct sched_domain *sd; + for_each_domain(this_cpu, sd) { + if (cpu_isset(cpu, sd->span)) { + schedstat_inc(sd, ttwu_wake_remote); + break; + } + } + } + +#endif + + out_activate: #endif /* CONFIG_SMP */ schedstat_inc(p, se.nr_wakeups); diff --git a/kernel/sched_fair.c b/kernel/sched_fair.c index 5c208e090ae4..f881fc5e035c 100644 --- a/kernel/sched_fair.c +++ b/kernel/sched_fair.c @@ -860,6 +860,151 @@ static void yield_task_fair(struct rq *rq) se->vruntime = rightmost->vruntime + 1; } +/* + * wake_idle() will wake a task on an idle cpu if task->cpu is + * not idle and an idle cpu is available. The span of cpus to + * search starts with cpus closest then further out as needed, + * so we always favor a closer, idle cpu. + * + * Returns the CPU we should wake onto. + */ +#if defined(ARCH_HAS_SCHED_WAKE_IDLE) +static int wake_idle(int cpu, struct task_struct *p) +{ + cpumask_t tmp; + struct sched_domain *sd; + int i; + + /* + * If it is idle, then it is the best cpu to run this task. + * + * This cpu is also the best, if it has more than one task already. + * Siblings must be also busy(in most cases) as they didn't already + * pickup the extra load from this cpu and hence we need not check + * sibling runqueue info. This will avoid the checks and cache miss + * penalities associated with that. + */ + if (idle_cpu(cpu) || cpu_rq(cpu)->nr_running > 1) + return cpu; + + for_each_domain(cpu, sd) { + if (sd->flags & SD_WAKE_IDLE) { + cpus_and(tmp, sd->span, p->cpus_allowed); + for_each_cpu_mask(i, tmp) { + if (idle_cpu(i)) { + if (i != task_cpu(p)) { + schedstat_inc(p, + se.nr_wakeups_idle); + } + return i; + } + } + } else { + break; + } + } + return cpu; +} +#else +static inline int wake_idle(int cpu, struct task_struct *p) +{ + return cpu; +} +#endif + +#ifdef CONFIG_SMP +static int select_task_rq_fair(struct task_struct *p, int sync) +{ + int cpu, this_cpu; + struct rq *rq; + struct sched_domain *sd, *this_sd = NULL; + int new_cpu; + + cpu = task_cpu(p); + rq = task_rq(p); + this_cpu = smp_processor_id(); + new_cpu = cpu; + + for_each_domain(this_cpu, sd) { + if (cpu_isset(cpu, sd->span)) { + this_sd = sd; + break; + } + } + + if (unlikely(!cpu_isset(this_cpu, p->cpus_allowed))) + goto out_set_cpu; + + /* + * Check for affine wakeup and passive balancing possibilities. 
+ */ + if (this_sd) { + int idx = this_sd->wake_idx; + unsigned int imbalance; + unsigned long load, this_load; + + imbalance = 100 + (this_sd->imbalance_pct - 100) / 2; + + load = source_load(cpu, idx); + this_load = target_load(this_cpu, idx); + + new_cpu = this_cpu; /* Wake to this CPU if we can */ + + if (this_sd->flags & SD_WAKE_AFFINE) { + unsigned long tl = this_load; + unsigned long tl_per_task; + + /* + * Attract cache-cold tasks on sync wakeups: + */ + if (sync && !task_hot(p, rq->clock, this_sd)) + goto out_set_cpu; + + schedstat_inc(p, se.nr_wakeups_affine_attempts); + tl_per_task = cpu_avg_load_per_task(this_cpu); + + /* + * If sync wakeup then subtract the (maximum possible) + * effect of the currently running task from the load + * of the current CPU: + */ + if (sync) + tl -= current->se.load.weight; + + if ((tl <= load && + tl + target_load(cpu, idx) <= tl_per_task) || + 100*(tl + p->se.load.weight) <= imbalance*load) { + /* + * This domain has SD_WAKE_AFFINE and + * p is cache cold in this domain, and + * there is no bad imbalance. + */ + schedstat_inc(this_sd, ttwu_move_affine); + schedstat_inc(p, se.nr_wakeups_affine); + goto out_set_cpu; + } + } + + /* + * Start passive balancing when half the imbalance_pct + * limit is reached. + */ + if (this_sd->flags & SD_WAKE_BALANCE) { + if (imbalance*this_load <= 100*load) { + schedstat_inc(this_sd, ttwu_move_balance); + schedstat_inc(p, se.nr_wakeups_passive); + goto out_set_cpu; + } + } + } + + new_cpu = cpu; /* Could not wake to this_cpu. Wake to cpu instead */ +out_set_cpu: + return wake_idle(new_cpu, p); +} +#endif /* CONFIG_SMP */ + + /* * Preempt the current task with a newly woken task if needed: */ @@ -1153,6 +1298,9 @@ static const struct sched_class fair_sched_class = { .enqueue_task = enqueue_task_fair, .dequeue_task = dequeue_task_fair, .yield_task = yield_task_fair, +#ifdef CONFIG_SMP + .select_task_rq = select_task_rq_fair, +#endif /* CONFIG_SMP */ .check_preempt_curr = check_preempt_wakeup, diff --git a/kernel/sched_idletask.c b/kernel/sched_idletask.c index bf9c25c15b8b..ca5374860aef 100644 --- a/kernel/sched_idletask.c +++ b/kernel/sched_idletask.c @@ -5,6 +5,12 @@ * handled in sched_fair.c) */ +#ifdef CONFIG_SMP +static int select_task_rq_idle(struct task_struct *p, int sync) +{ + return task_cpu(p); /* IDLE tasks as never migrated */ +} +#endif /* CONFIG_SMP */ /* * Idle tasks are unconditionally rescheduled: */ @@ -72,6 +78,9 @@ const struct sched_class idle_sched_class = { /* dequeue is not valid, we print a debug message there: */ .dequeue_task = dequeue_task_idle, +#ifdef CONFIG_SMP + .select_task_rq = select_task_rq_idle, +#endif /* CONFIG_SMP */ .check_preempt_curr = check_preempt_curr_idle, diff --git a/kernel/sched_rt.c b/kernel/sched_rt.c index b788e35ffd3f..5de1aebdbd1b 100644 --- a/kernel/sched_rt.c +++ b/kernel/sched_rt.c @@ -150,6 +150,13 @@ yield_task_rt(struct rq *rq) requeue_task_rt(rq, rq->curr); } +#ifdef CONFIG_SMP +static int select_task_rq_rt(struct task_struct *p, int sync) +{ + return task_cpu(p); +} +#endif /* CONFIG_SMP */ + /* * Preempt the current task with a newly woken task if needed: */ @@ -667,6 +674,9 @@ const struct sched_class rt_sched_class = { .enqueue_task = enqueue_task_rt, .dequeue_task = dequeue_task_rt, .yield_task = yield_task_rt, +#ifdef CONFIG_SMP + .select_task_rq = select_task_rq_rt, +#endif /* CONFIG_SMP */ .check_preempt_curr = check_preempt_curr_rt, -- cgit v1.2.3 From 07b4032c9e505e2a1fbe7703aff64a153c3249be Mon Sep 17 00:00:00 2001 From: Gregory Haskins Date: 
Fri, 25 Jan 2008 21:08:10 +0100 Subject: sched: break out search for RT tasks Isolate the search logic into a function so that it can be used later in places other than find_locked_lowest_rq(). Signed-off-by: Gregory Haskins Signed-off-by: Steven Rostedt Signed-off-by: Ingo Molnar --- kernel/sched_rt.c | 66 ++++++++++++++++++++++++++++++++----------------------- 1 file changed, 39 insertions(+), 27 deletions(-) (limited to 'kernel') diff --git a/kernel/sched_rt.c b/kernel/sched_rt.c index 5de1aebdbd1b..ffd02720b58f 100644 --- a/kernel/sched_rt.c +++ b/kernel/sched_rt.c @@ -263,54 +263,66 @@ static struct task_struct *pick_next_highest_task_rt(struct rq *rq, static DEFINE_PER_CPU(cpumask_t, local_cpu_mask); -/* Will lock the rq it finds */ -static struct rq *find_lock_lowest_rq(struct task_struct *task, - struct rq *this_rq) +static int find_lowest_rq(struct task_struct *task) { - struct rq *lowest_rq = NULL; int cpu; - int tries; cpumask_t *cpu_mask = &__get_cpu_var(local_cpu_mask); + struct rq *lowest_rq = NULL; cpus_and(*cpu_mask, cpu_online_map, task->cpus_allowed); - for (tries = 0; tries < RT_MAX_TRIES; tries++) { - /* - * Scan each rq for the lowest prio. - */ - for_each_cpu_mask(cpu, *cpu_mask) { - struct rq *rq = &per_cpu(runqueues, cpu); + /* + * Scan each rq for the lowest prio. + */ + for_each_cpu_mask(cpu, *cpu_mask) { + struct rq *rq = cpu_rq(cpu); - if (cpu == this_rq->cpu) - continue; + if (cpu == rq->cpu) + continue; - /* We look for lowest RT prio or non-rt CPU */ - if (rq->rt.highest_prio >= MAX_RT_PRIO) { - lowest_rq = rq; - break; - } + /* We look for lowest RT prio or non-rt CPU */ + if (rq->rt.highest_prio >= MAX_RT_PRIO) { + lowest_rq = rq; + break; + } - /* no locking for now */ - if (rq->rt.highest_prio > task->prio && - (!lowest_rq || rq->rt.highest_prio > lowest_rq->rt.highest_prio)) { - lowest_rq = rq; - } + /* no locking for now */ + if (rq->rt.highest_prio > task->prio && + (!lowest_rq || rq->rt.highest_prio > lowest_rq->rt.highest_prio)) { + lowest_rq = rq; } + } + + return lowest_rq ? lowest_rq->cpu : -1; +} + +/* Will lock the rq it finds */ +static struct rq *find_lock_lowest_rq(struct task_struct *task, + struct rq *rq) +{ + struct rq *lowest_rq = NULL; + int cpu; + int tries; - if (!lowest_rq) + for (tries = 0; tries < RT_MAX_TRIES; tries++) { + cpu = find_lowest_rq(task); + + if (cpu == -1) break; + lowest_rq = cpu_rq(cpu); + /* if the prio of this runqueue changed, try again */ - if (double_lock_balance(this_rq, lowest_rq)) { + if (double_lock_balance(rq, lowest_rq)) { /* * We had to unlock the run queue. In * the mean time, task could have * migrated already or had its affinity changed. * Also make sure that it wasn't scheduled on its rq. */ - if (unlikely(task_rq(task) != this_rq || + if (unlikely(task_rq(task) != rq || !cpu_isset(lowest_rq->cpu, task->cpus_allowed) || - task_running(this_rq, task) || + task_running(rq, task) || !task->se.on_rq)) { spin_unlock(&lowest_rq->lock); lowest_rq = NULL; -- cgit v1.2.3 From 2de0b4639f4b1b6bfe31f795e5855f041f177170 Mon Sep 17 00:00:00 2001 From: Gregory Haskins Date: Fri, 25 Jan 2008 21:08:10 +0100 Subject: sched: RT balancing: include current CPU It doesn't hurt if we allow the current CPU to be included in the search. We will just simply skip it later if the current CPU turns out to be the lowest. 
We will use this later in the series. Signed-off-by: Gregory Haskins Signed-off-by: Steven Rostedt Signed-off-by: Ingo Molnar --- kernel/sched_rt.c | 5 +---- 1 file changed, 1 insertion(+), 4 deletions(-) (limited to 'kernel') diff --git a/kernel/sched_rt.c b/kernel/sched_rt.c index ffd02720b58f..95f36f61ae1d 100644 --- a/kernel/sched_rt.c +++ b/kernel/sched_rt.c @@ -277,9 +277,6 @@ static int find_lowest_rq(struct task_struct *task) for_each_cpu_mask(cpu, *cpu_mask) { struct rq *rq = cpu_rq(cpu); - if (cpu == rq->cpu) - continue; - /* We look for lowest RT prio or non-rt CPU */ if (rq->rt.highest_prio >= MAX_RT_PRIO) { lowest_rq = rq; @@ -307,7 +304,7 @@ static struct rq *find_lock_lowest_rq(struct task_struct *task, for (tries = 0; tries < RT_MAX_TRIES; tries++) { cpu = find_lowest_rq(task); - if (cpu == -1) + if ((cpu == -1) || (cpu == rq->cpu)) break; lowest_rq = cpu_rq(cpu); -- cgit v1.2.3 From 318e0893ce3f524ca045f9fd9dfd567c0a6f9446 Mon Sep 17 00:00:00 2001 From: Gregory Haskins Date: Fri, 25 Jan 2008 21:08:10 +0100 Subject: sched: pre-route RT tasks on wakeup In the original patch series that Steven Rostedt and I worked on together, we both took different approaches to the low-priority wakeup path. I utilized a "pre-routing" (push the task away to a less important RQ before activating) approach, while Steve utilized a "post-routing" approach. The advantage of my approach is that you avoid the overhead of a wasted activate/deactivate cycle and peripherally related burdens. The advantage of Steve's method is that it neatly solves an issue preventing a "pull" optimization from being deployed. In the end, we ended up deploying Steve's idea. But it later dawned on me that we could get the best of both worlds by deploying both ideas together, albeit slightly modified. The idea is simple: Use a "light-weight" lookup for pre-routing, since we only need to approximate a good home for the task. And we also retain the post-routing push logic to clean up any inaccuracies caused by a condition of "priority mistargeting" caused by the lightweight lookup. Most of the time, the pre-routing should work and yield lower overhead. In the cases where it doesn't, the post-router will bat cleanup. Signed-off-by: Gregory Haskins Signed-off-by: Steven Rostedt Signed-off-by: Ingo Molnar --- kernel/sched_rt.c | 19 +++++++++++++++++++ 1 file changed, 19 insertions(+) (limited to 'kernel') diff --git a/kernel/sched_rt.c b/kernel/sched_rt.c index 95f36f61ae1d..ac7d06786454 100644 --- a/kernel/sched_rt.c +++ b/kernel/sched_rt.c @@ -151,8 +151,27 @@ yield_task_rt(struct rq *rq) } #ifdef CONFIG_SMP +static int find_lowest_rq(struct task_struct *task); + static int select_task_rq_rt(struct task_struct *p, int sync) { + struct rq *rq = task_rq(p); + + /* + * If the task will not preempt the RQ, try to find a better RQ + * before we even activate the task + */ + if ((p->prio >= rq->rt.highest_prio) + && (p->nr_cpus_allowed > 1)) { + int cpu = find_lowest_rq(p); + + return (cpu == -1) ? task_cpu(p) : cpu; + } + + /* + * Otherwise, just let it ride on the affined RQ and the + * post-schedule router will push the preempted task away + */ return task_cpu(p); } #endif /* CONFIG_SMP */ -- cgit v1.2.3 From 6e1254d2c41215da27025add8900ed187bca121d Mon Sep 17 00:00:00 2001 From: Gregory Haskins Date: Fri, 25 Jan 2008 21:08:11 +0100 Subject: sched: optimize RT affinity The current code base assumes a relatively flat CPU/core topology and will route RT tasks to any CPU fairly equally.
In the real world, there are various toplogies and affinities that govern where a task is best suited to run with the smallest amount of overhead. NUMA and multi-core CPUs are prime examples of topologies that can impact cache performance. Fortunately, linux is already structured to represent these topologies via the sched_domains interface. So we change our RT router to consult a combination of topology and affinity policy to best place tasks during migration. Signed-off-by: Gregory Haskins Signed-off-by: Steven Rostedt Signed-off-by: Ingo Molnar --- kernel/sched_rt.c | 100 +++++++++++++++++++++++++++++++++++++++++++++++------- 1 file changed, 88 insertions(+), 12 deletions(-) (limited to 'kernel') diff --git a/kernel/sched_rt.c b/kernel/sched_rt.c index ac7d06786454..a9d7d4408160 100644 --- a/kernel/sched_rt.c +++ b/kernel/sched_rt.c @@ -281,35 +281,111 @@ static struct task_struct *pick_next_highest_task_rt(struct rq *rq, } static DEFINE_PER_CPU(cpumask_t, local_cpu_mask); +static DEFINE_PER_CPU(cpumask_t, valid_cpu_mask); -static int find_lowest_rq(struct task_struct *task) +static int find_lowest_cpus(struct task_struct *task, cpumask_t *lowest_mask) { - int cpu; - cpumask_t *cpu_mask = &__get_cpu_var(local_cpu_mask); - struct rq *lowest_rq = NULL; + int cpu; + cpumask_t *valid_mask = &__get_cpu_var(valid_cpu_mask); + int lowest_prio = -1; + int ret = 0; - cpus_and(*cpu_mask, cpu_online_map, task->cpus_allowed); + cpus_clear(*lowest_mask); + cpus_and(*valid_mask, cpu_online_map, task->cpus_allowed); /* * Scan each rq for the lowest prio. */ - for_each_cpu_mask(cpu, *cpu_mask) { + for_each_cpu_mask(cpu, *valid_mask) { struct rq *rq = cpu_rq(cpu); /* We look for lowest RT prio or non-rt CPU */ if (rq->rt.highest_prio >= MAX_RT_PRIO) { - lowest_rq = rq; - break; + if (ret) + cpus_clear(*lowest_mask); + cpu_set(rq->cpu, *lowest_mask); + return 1; } /* no locking for now */ - if (rq->rt.highest_prio > task->prio && - (!lowest_rq || rq->rt.highest_prio > lowest_rq->rt.highest_prio)) { - lowest_rq = rq; + if ((rq->rt.highest_prio > task->prio) + && (rq->rt.highest_prio >= lowest_prio)) { + if (rq->rt.highest_prio > lowest_prio) { + /* new low - clear old data */ + lowest_prio = rq->rt.highest_prio; + cpus_clear(*lowest_mask); + } + cpu_set(rq->cpu, *lowest_mask); + ret = 1; + } + } + + return ret; +} + +static inline int pick_optimal_cpu(int this_cpu, cpumask_t *mask) +{ + int first; + + /* "this_cpu" is cheaper to preempt than a remote processor */ + if ((this_cpu != -1) && cpu_isset(this_cpu, *mask)) + return this_cpu; + + first = first_cpu(*mask); + if (first != NR_CPUS) + return first; + + return -1; +} + +static int find_lowest_rq(struct task_struct *task) +{ + struct sched_domain *sd; + cpumask_t *lowest_mask = &__get_cpu_var(local_cpu_mask); + int this_cpu = smp_processor_id(); + int cpu = task_cpu(task); + + if (!find_lowest_cpus(task, lowest_mask)) + return -1; + + /* + * At this point we have built a mask of cpus representing the + * lowest priority tasks in the system. Now we want to elect + * the best one based on our affinity and topology. + * + * We prioritize the last cpu that the task executed on since + * it is most likely cache-hot in that location. + */ + if (cpu_isset(cpu, *lowest_mask)) + return cpu; + + /* + * Otherwise, we consult the sched_domains span maps to figure + * out which cpu is logically closest to our hot cache data. 
+ */ + if (this_cpu == cpu) + this_cpu = -1; /* Skip this_cpu opt if the same */ + + for_each_domain(cpu, sd) { + if (sd->flags & SD_WAKE_AFFINE) { + cpumask_t domain_mask; + int best_cpu; + + cpus_and(domain_mask, sd->span, *lowest_mask); + + best_cpu = pick_optimal_cpu(this_cpu, + &domain_mask); + if (best_cpu != -1) + return best_cpu; } } - return lowest_rq ? lowest_rq->cpu : -1; + /* + * And finally, if there were no matches within the domains + * just give the caller *something* to work with from the compatible + * locations. + */ + return pick_optimal_cpu(this_cpu, lowest_mask); } /* Will lock the rq it finds */ -- cgit v1.2.3 From a22d7fc187ed996b66d8439db27b2303f79a8e7b Mon Sep 17 00:00:00 2001 From: Gregory Haskins Date: Fri, 25 Jan 2008 21:08:12 +0100 Subject: sched: wake-balance fixes We have logic to detect whether the system has migratable tasks, but we are not using it when deciding whether to push tasks away. So we add support for considering this new information. Signed-off-by: Gregory Haskins Signed-off-by: Steven Rostedt Signed-off-by: Ingo Molnar --- kernel/sched.c | 2 ++ kernel/sched_rt.c | 10 ++++++++-- 2 files changed, 10 insertions(+), 2 deletions(-) (limited to 'kernel') diff --git a/kernel/sched.c b/kernel/sched.c index 3344ba776b97..c591abd9ca38 100644 --- a/kernel/sched.c +++ b/kernel/sched.c @@ -346,6 +346,7 @@ struct rt_rq { unsigned long rt_nr_migratory; /* highest queued rt task prio */ int highest_prio; + int overloaded; }; /* @@ -6770,6 +6771,7 @@ void __init sched_init(void) rq->migration_thread = NULL; INIT_LIST_HEAD(&rq->migration_queue); rq->rt.highest_prio = MAX_RT_PRIO; + rq->rt.overloaded = 0; #endif atomic_set(&rq->nr_iowait, 0); diff --git a/kernel/sched_rt.c b/kernel/sched_rt.c index a9d7d4408160..87d7b3ff3861 100644 --- a/kernel/sched_rt.c +++ b/kernel/sched_rt.c @@ -16,6 +16,7 @@ static inline cpumask_t *rt_overload(void) } static inline void rt_set_overload(struct rq *rq) { + rq->rt.overloaded = 1; cpu_set(rq->cpu, rt_overload_mask); /* * Make sure the mask is visible before we set @@ -32,6 +33,7 @@ static inline void rt_clear_overload(struct rq *rq) /* the order here really doesn't matter */ atomic_dec(&rto_count); cpu_clear(rq->cpu, rt_overload_mask); + rq->rt.overloaded = 0; } static void update_rt_migration(struct rq *rq) @@ -448,6 +450,9 @@ static int push_rt_task(struct rq *rq) assert_spin_locked(&rq->lock); + if (!rq->rt.overloaded) + return 0; + next_task = pick_next_highest_task_rt(rq, -1); if (!next_task) return 0; @@ -675,7 +680,7 @@ static void schedule_tail_balance_rt(struct rq *rq) * the lock was owned by prev, we need to release it * first via finish_lock_switch and then reaquire it here. */ - if (unlikely(rq->rt.rt_nr_running > 1)) { + if (unlikely(rq->rt.overloaded)) { spin_lock_irq(&rq->lock); push_rt_tasks(rq); spin_unlock_irq(&rq->lock); @@ -687,7 +692,8 @@ static void wakeup_balance_rt(struct rq *rq, struct task_struct *p) { if (unlikely(rt_task(p)) && !task_running(rq, p) && - (p->prio >= rq->curr->prio)) + (p->prio >= rq->rt.highest_prio) && + rq->rt.overloaded) push_rt_tasks(rq); } -- cgit v1.2.3 From e1f47d891c0f00769d6d40ac5740f943e998d089 Mon Sep 17 00:00:00 2001 From: Steven Rostedt Date: Fri, 25 Jan 2008 21:08:12 +0100 Subject: sched: RT-balance, avoid overloading This patch changes the searching for a run queue by a waking RT task to try to pick another runqueue if the currently running task is an RT task. The reason is that RT tasks behave different than normal tasks. 
Preempting a normal task to run an RT task to keep its cache hot is fine, because the preempted non-RT task may wait on that same runqueue to run again unless the migration thread comes along and pulls it off. RT tasks behave differently. If one is preempted, it makes an active effort to continue to run. So by having a high priority task preempt a lower priority RT task, that lower RT task will then quickly try to run on another runqueue. This will cause that lower RT task to replace its nice hot cache (and TLB) with a completely cold one. This is done in the hope that the new high priority RT task will keep its cache hot. Remember that this high priority RT task was just woken up. So it may likely have been sleeping for several milliseconds, and will end up with a cold cache anyway. RT tasks run till they voluntarily stop, or are preempted by a higher priority task. This means that it is unlikely that the woken RT task will have a hot cache to wake up to. So pushing off a lower RT task is just killing its cache for no good reason. Signed-off-by: Steven Rostedt Signed-off-by: Ingo Molnar --- kernel/sched_rt.c | 20 ++++++++++++++++---- 1 file changed, 16 insertions(+), 4 deletions(-) (limited to 'kernel') diff --git a/kernel/sched_rt.c b/kernel/sched_rt.c index 87d7b3ff3861..9becc3710b60 100644 --- a/kernel/sched_rt.c +++ b/kernel/sched_rt.c @@ -160,11 +160,23 @@ static int select_task_rq_rt(struct task_struct *p, int sync) struct rq *rq = task_rq(p); /* - * If the task will not preempt the RQ, try to find a better RQ - * before we even activate the task + * If the current task is an RT task, then + * try to see if we can wake this RT task up on another + * runqueue. Otherwise simply start this RT task + * on its current runqueue. + * + * We want to avoid overloading runqueues. Even if + * the RT task is of higher priority than the current RT task. + * RT tasks behave differently than other tasks. If + * one gets preempted, we try to push it off to another queue. + * So trying to keep a preempting RT task on the same + * cache hot CPU will force the running RT task to + * a cold CPU. So we waste all the cache for the lower + * RT task in hopes of saving some of a RT task + * that is just being woken and probably will have + * cold cache anyway. */ - if ((p->prio >= rq->rt.highest_prio) - && (p->nr_cpus_allowed > 1)) { + if (unlikely(rt_task(rq->curr))) { int cpu = find_lowest_rq(p); return (cpu == -1) ? task_cpu(p) : cpu; -- cgit v1.2.3 From 17b3279b48835eb522d842eae16f541da3729c8a Mon Sep 17 00:00:00 2001 From: Gregory Haskins Date: Fri, 25 Jan 2008 21:08:13 +0100 Subject: sched: break out early if RT task cannot be migrated We don't need to bother searching if the task cannot be migrated Signed-off-by: Gregory Haskins Signed-off-by: Steven Rostedt Signed-off-by: Ingo Molnar --- kernel/sched_rt.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/sched_rt.c b/kernel/sched_rt.c index 9becc3710b60..72c81322fb9a 100644 --- a/kernel/sched_rt.c +++ b/kernel/sched_rt.c @@ -176,7 +176,8 @@ static int select_task_rq_rt(struct task_struct *p, int sync) * that is just being woken and probably will have * cold cache anyway. */ - if (unlikely(rt_task(rq->curr))) { + if (unlikely(rt_task(rq->curr)) && + (p->nr_cpus_allowed > 1)) { int cpu = find_lowest_rq(p); return (cpu == -1) ?
task_cpu(p) : cpu; -- cgit v1.2.3 From 06f90dbd7610d51549004ea9c2ada337831eb292 Mon Sep 17 00:00:00 2001 From: Gregory Haskins Date: Fri, 25 Jan 2008 21:08:13 +0100 Subject: sched: RT-balance, optimize We can cheaply track the number of bits set in the cpumask for the lowest priority CPUs. Therefore, compute the mask's weight and use it to skip the optimal domain search logic when there is only one CPU available. Signed-off-by: Gregory Haskins Signed-off-by: Ingo Molnar --- kernel/sched_rt.c | 25 ++++++++++++++++++------- 1 file changed, 18 insertions(+), 7 deletions(-) (limited to 'kernel') diff --git a/kernel/sched_rt.c b/kernel/sched_rt.c index 72c81322fb9a..52d88f193afc 100644 --- a/kernel/sched_rt.c +++ b/kernel/sched_rt.c @@ -303,7 +303,7 @@ static int find_lowest_cpus(struct task_struct *task, cpumask_t *lowest_mask) int cpu; cpumask_t *valid_mask = &__get_cpu_var(valid_cpu_mask); int lowest_prio = -1; - int ret = 0; + int count = 0; cpus_clear(*lowest_mask); cpus_and(*valid_mask, cpu_online_map, task->cpus_allowed); @@ -316,7 +316,7 @@ static int find_lowest_cpus(struct task_struct *task, cpumask_t *lowest_mask) /* We look for lowest RT prio or non-rt CPU */ if (rq->rt.highest_prio >= MAX_RT_PRIO) { - if (ret) + if (count) cpus_clear(*lowest_mask); cpu_set(rq->cpu, *lowest_mask); return 1; @@ -328,14 +328,17 @@ static int find_lowest_cpus(struct task_struct *task, cpumask_t *lowest_mask) if (rq->rt.highest_prio > lowest_prio) { /* new low - clear old data */ lowest_prio = rq->rt.highest_prio; - cpus_clear(*lowest_mask); + if (count) { + cpus_clear(*lowest_mask); + count = 0; + } } cpu_set(rq->cpu, *lowest_mask); - ret = 1; + count++; } } - return ret; + return count; } static inline int pick_optimal_cpu(int this_cpu, cpumask_t *mask) @@ -359,9 +362,17 @@ static int find_lowest_rq(struct task_struct *task) cpumask_t *lowest_mask = &__get_cpu_var(local_cpu_mask); int this_cpu = smp_processor_id(); int cpu = task_cpu(task); + int count = find_lowest_cpus(task, lowest_mask); - if (!find_lowest_cpus(task, lowest_mask)) - return -1; + if (!count) + return -1; /* No targets found */ + + /* + * There is no sense in performing an optimal search if only one + * target is found. + */ + if (count == 1) + return first_cpu(*lowest_mask); /* * At this point we have built a mask of cpus representing the -- cgit v1.2.3 From 610bf05645a7ac6ea104a474e328eeaaea148870 Mon Sep 17 00:00:00 2001 From: Steven Rostedt Date: Fri, 25 Jan 2008 21:08:13 +0100 Subject: sched: RT-balance, optimize cpu search This patch removes several cpumask operations by keeping track of the first of the CPUS that is of the lowest priority. When the search for the lowest priority runqueue is completed, all the bits up to the first CPU with the lowest priority runqueue is cleared. 
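A stand-alone illustration of the search described above may help. The sketch below is user-space C, not the kernel code: sketch_rq and keep_lowest_prio_cpus() are invented names, a bool array stands in for cpumask_t, and locking is ignored. As in the scheduler, a numerically larger highest_prio means a lower scheduling priority. The idea is that instead of clearing the whole mask whenever a new lowest priority is found, we remember the first CPU seen at that priority and clear all preceding bits in one trailing pass.

#include <stdbool.h>

#define NR_CPUS 8

struct sketch_rq {
        int highest_prio;       /* larger value == lower scheduling priority */
};

/*
 * Keep only the CPUs whose runqueue priority is the lowest seen so far.
 * Rather than clearing the whole mask every time a new low is found,
 * remember where the run of lowest-priority CPUs starts and clear the
 * preceding bits once at the end. Returns how many CPUs remain set.
 */
int keep_lowest_prio_cpus(struct sketch_rq rq[NR_CPUS], bool mask[NR_CPUS])
{
        int lowest_prio = -1, lowest_cpu = -1, count = 0, cpu;

        for (cpu = 0; cpu < NR_CPUS; cpu++) {
                if (!mask[cpu])
                        continue;
                if (rq[cpu].highest_prio > lowest_prio) {
                        /* new low: restart the count from this CPU */
                        lowest_prio = rq[cpu].highest_prio;
                        lowest_cpu = cpu;
                        count = 1;
                } else if (rq[cpu].highest_prio == lowest_prio) {
                        count++;
                } else {
                        /* higher-priority runqueue: not a candidate */
                        mask[cpu] = false;
                }
        }

        /* one trailing pass replaces repeated "clear the whole mask" calls */
        for (cpu = 0; cpu < lowest_cpu; cpu++)
                mask[cpu] = false;

        return count;
}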
Signed-off-by: Steven Rostedt Signed-off-by: Ingo Molnar --- kernel/sched_rt.c | 49 ++++++++++++++++++++++++++++++++++++------------- 1 file changed, 36 insertions(+), 13 deletions(-) (limited to 'kernel') diff --git a/kernel/sched_rt.c b/kernel/sched_rt.c index 52d88f193afc..61d198845f00 100644 --- a/kernel/sched_rt.c +++ b/kernel/sched_rt.c @@ -296,29 +296,36 @@ static struct task_struct *pick_next_highest_task_rt(struct rq *rq, } static DEFINE_PER_CPU(cpumask_t, local_cpu_mask); -static DEFINE_PER_CPU(cpumask_t, valid_cpu_mask); static int find_lowest_cpus(struct task_struct *task, cpumask_t *lowest_mask) { - int cpu; - cpumask_t *valid_mask = &__get_cpu_var(valid_cpu_mask); int lowest_prio = -1; + int lowest_cpu = -1; int count = 0; + int cpu; - cpus_clear(*lowest_mask); - cpus_and(*valid_mask, cpu_online_map, task->cpus_allowed); + cpus_and(*lowest_mask, cpu_online_map, task->cpus_allowed); /* * Scan each rq for the lowest prio. */ - for_each_cpu_mask(cpu, *valid_mask) { + for_each_cpu_mask(cpu, *lowest_mask) { struct rq *rq = cpu_rq(cpu); /* We look for lowest RT prio or non-rt CPU */ if (rq->rt.highest_prio >= MAX_RT_PRIO) { - if (count) + /* + * if we already found a low RT queue + * and now we found this non-rt queue + * clear the mask and set our bit. + * Otherwise just return the queue as is + * and the count==1 will cause the algorithm + * to use the first bit found. + */ + if (lowest_cpu != -1) { cpus_clear(*lowest_mask); - cpu_set(rq->cpu, *lowest_mask); + cpu_set(rq->cpu, *lowest_mask); + } return 1; } @@ -328,13 +335,29 @@ static int find_lowest_cpus(struct task_struct *task, cpumask_t *lowest_mask) if (rq->rt.highest_prio > lowest_prio) { /* new low - clear old data */ lowest_prio = rq->rt.highest_prio; - if (count) { - cpus_clear(*lowest_mask); - count = 0; - } + lowest_cpu = cpu; + count = 0; } - cpu_set(rq->cpu, *lowest_mask); count++; + } else + cpu_clear(cpu, *lowest_mask); + } + + /* + * Clear out all the set bits that represent + * runqueues that were of higher prio than + * the lowest_prio. + */ + if (lowest_cpu > 0) { + /* + * Perhaps we could add another cpumask op to + * zero out bits. Like cpu_zero_bits(cpumask, nrbits); + * Then that could be optimized to use memset and such. + */ + for_each_cpu_mask(cpu, *lowest_mask) { + if (cpu >= lowest_cpu) + break; + cpu_clear(cpu, *lowest_mask); } } -- cgit v1.2.3 From 0d1311a536a0face6267e7346223f2e68b002018 Mon Sep 17 00:00:00 2001 From: Steven Rostedt Date: Fri, 25 Jan 2008 21:08:14 +0100 Subject: sched: RT-balance on new task rt-balance when creating new tasks. Signed-off-by: Ingo Molnar --- kernel/sched.c | 1 + 1 file changed, 1 insertion(+) (limited to 'kernel') diff --git a/kernel/sched.c b/kernel/sched.c index c591abd9ca38..36bd8ff2a669 100644 --- a/kernel/sched.c +++ b/kernel/sched.c @@ -1722,6 +1722,7 @@ void fastcall wake_up_new_task(struct task_struct *p, unsigned long clone_flags) inc_nr_running(p, rq); } check_preempt_curr(rq, p); + wakeup_balance_rt(rq, p); task_rq_unlock(rq, &flags); } -- cgit v1.2.3 From 79064fbf75796c4c6a53e40729dbe52f789a91fd Mon Sep 17 00:00:00 2001 From: Ingo Molnar Date: Fri, 25 Jan 2008 21:08:14 +0100 Subject: sched: clean up pick_next_highest_task_rt() clean up pick_next_highest_task_rt(). 
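For context on the function being tidied up here, the following rough approximation (illustrative only; plain arrays replace the kernel's rt_prio_array, and the affinity checks done by pick_rt_task() are omitted) shows its job: scan priority levels from highest to lowest and return the first queued task that is not currently running, which in practice is the next-highest task that could be pushed to another CPU.

#include <stddef.h>

#define MAX_RT_PRIO     100
#define MAX_PER_PRIO    8

struct sketch_task {
        int prio;
        int is_current;         /* stands in for task_running(rq, p) */
};

/*
 * Walk the priority levels from highest (index 0) to lowest and return
 * the first queued task that is not the one currently running on this
 * runqueue. Since the running task is normally the highest-priority
 * queued task, the result is effectively the second-highest task, i.e.
 * the best candidate to push elsewhere.
 */
struct sketch_task *pick_next_highest(struct sketch_task *queues[MAX_RT_PRIO][MAX_PER_PRIO],
                                      int nr_queued[MAX_RT_PRIO])
{
        int idx, i;

        for (idx = 0; idx < MAX_RT_PRIO; idx++)
                for (i = 0; i < nr_queued[idx]; i++)
                        if (!queues[idx][i]->is_current)
                                return queues[idx][i];

        return NULL;            /* nothing pushable */
}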
Signed-off-by: Ingo Molnar --- kernel/sched_rt.c | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) (limited to 'kernel') diff --git a/kernel/sched_rt.c b/kernel/sched_rt.c index 61d198845f00..b8435fd47f78 100644 --- a/kernel/sched_rt.c +++ b/kernel/sched_rt.c @@ -242,8 +242,7 @@ static int pick_rt_task(struct rq *rq, struct task_struct *p, int cpu) } /* Return the second highest RT task, NULL otherwise */ -static struct task_struct *pick_next_highest_task_rt(struct rq *rq, - int cpu) +static struct task_struct *pick_next_highest_task_rt(struct rq *rq, int cpu) { struct rt_prio_array *array = &rq->rt.active; struct task_struct *next; @@ -270,7 +269,8 @@ static struct task_struct *pick_next_highest_task_rt(struct rq *rq, if (queue->next->next != queue) { /* same prio task */ - next = list_entry(queue->next->next, struct task_struct, run_list); + next = list_entry(queue->next->next, struct task_struct, + run_list); if (pick_rt_task(rq, next, cpu)) goto out; } -- cgit v1.2.3 From 4df64c0bfb7e0e260d10ebc005f7d0ba1308eed7 Mon Sep 17 00:00:00 2001 From: Ingo Molnar Date: Fri, 25 Jan 2008 21:08:15 +0100 Subject: sched: clean up find_lock_lowest_rq() clean up find_lock_lowest_rq(). Signed-off-by: Ingo Molnar --- kernel/sched_rt.c | 9 +++++---- 1 file changed, 5 insertions(+), 4 deletions(-) (limited to 'kernel') diff --git a/kernel/sched_rt.c b/kernel/sched_rt.c index b8435fd47f78..0749c1837b10 100644 --- a/kernel/sched_rt.c +++ b/kernel/sched_rt.c @@ -438,12 +438,11 @@ static int find_lowest_rq(struct task_struct *task) } /* Will lock the rq it finds */ -static struct rq *find_lock_lowest_rq(struct task_struct *task, - struct rq *rq) +static struct rq *find_lock_lowest_rq(struct task_struct *task, struct rq *rq) { struct rq *lowest_rq = NULL; - int cpu; int tries; + int cpu; for (tries = 0; tries < RT_MAX_TRIES; tries++) { cpu = find_lowest_rq(task); @@ -462,9 +461,11 @@ static struct rq *find_lock_lowest_rq(struct task_struct *task, * Also make sure that it wasn't scheduled on its rq. */ if (unlikely(task_rq(task) != rq || - !cpu_isset(lowest_rq->cpu, task->cpus_allowed) || + !cpu_isset(lowest_rq->cpu, + task->cpus_allowed) || task_running(rq, task) || !task->se.on_rq)) { + spin_unlock(&lowest_rq->lock); lowest_rq = NULL; break; -- cgit v1.2.3 From deeeccd41bd94a9db133d7b923f9a7479a47305d Mon Sep 17 00:00:00 2001 From: Ingo Molnar Date: Fri, 25 Jan 2008 21:08:15 +0100 Subject: sched: clean up overlong line in kernel/sched_debug.c clean up overlong line in kernel/sched_debug.c. 
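The hunk below only reformats set_cpus_allowed_rt(), but it is a convenient place to recall what that function's bookkeeping achieves. The condensed sketch that follows is not kernel code: types are simplified, there is no locking, and the global overload mask and counter are folded into a single flag. It captures the rule introduced by the cpu-weight patch earlier in the series: a runqueue is overloaded only if it has more than one queued RT task and at least one of them can migrate.

struct sketch_rt_rq {
        unsigned long rt_nr_running;    /* queued RT tasks on this rq */
        unsigned long rt_nr_migratory;  /* of those, how many may move */
        int overloaded;
};

/*
 * A runqueue is "RT overloaded" only when it has more than one queued
 * RT task and at least one of them is allowed to run somewhere else.
 */
void update_rt_migration(struct sketch_rt_rq *rt)
{
        rt->overloaded = rt->rt_nr_migratory && rt->rt_nr_running > 1;
}

/*
 * Called when a queued RT task's affinity weight crosses the "can it
 * migrate at all?" boundary, i.e. between exactly one allowed CPU and
 * more than one.
 */
void account_affinity_change(struct sketch_rt_rq *rt, int old_weight, int new_weight)
{
        if (old_weight <= 1 && new_weight > 1)
                rt->rt_nr_migratory++;
        else if (old_weight > 1 && new_weight <= 1)
                rt->rt_nr_migratory--;

        update_rt_migration(rt);
}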
Signed-off-by: Ingo Molnar --- kernel/sched_rt.c | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) (limited to 'kernel') diff --git a/kernel/sched_rt.c b/kernel/sched_rt.c index 0749c1837b10..b591b89710a4 100644 --- a/kernel/sched_rt.c +++ b/kernel/sched_rt.c @@ -762,6 +762,7 @@ move_one_task_rt(struct rq *this_rq, int this_cpu, struct rq *busiest, /* don't touch RT tasks */ return 0; } + static void set_cpus_allowed_rt(struct task_struct *p, cpumask_t *new_mask) { int weight = cpus_weight(*new_mask); @@ -775,9 +776,9 @@ static void set_cpus_allowed_rt(struct task_struct *p, cpumask_t *new_mask) if (p->se.on_rq && (weight != p->nr_cpus_allowed)) { struct rq *rq = task_rq(p); - if ((p->nr_cpus_allowed <= 1) && (weight > 1)) + if ((p->nr_cpus_allowed <= 1) && (weight > 1)) { rq->rt.rt_nr_migratory++; - else if((p->nr_cpus_allowed > 1) && (weight <= 1)) { + } else if ((p->nr_cpus_allowed > 1) && (weight <= 1)) { BUG_ON(!rq->rt.rt_nr_migratory); rq->rt.rt_nr_migratory--; } @@ -788,6 +789,7 @@ static void set_cpus_allowed_rt(struct task_struct *p, cpumask_t *new_mask) p->cpus_allowed = *new_mask; p->nr_cpus_allowed = weight; } + #else /* CONFIG_SMP */ # define schedule_tail_balance_rt(rq) do { } while (0) # define schedule_balance_rt(rq, prev) do { } while (0) -- cgit v1.2.3 From 84de4274893691aa8c471a1f7336d51e555d23a0 Mon Sep 17 00:00:00 2001 From: Ingo Molnar Date: Fri, 25 Jan 2008 21:08:15 +0100 Subject: sched: clean up kernel/sched_rt.c clean up whitespace damage and missing comments in kernel/sched_rt.c. Signed-off-by: Ingo Molnar --- kernel/sched_rt.c | 9 +++++++++ 1 file changed, 9 insertions(+) (limited to 'kernel') diff --git a/kernel/sched_rt.c b/kernel/sched_rt.c index b591b89710a4..1a2d8f0aa659 100644 --- a/kernel/sched_rt.c +++ b/kernel/sched_rt.c @@ -4,16 +4,24 @@ */ #ifdef CONFIG_SMP + +/* + * The "RT overload" flag: it gets set if a CPU has more than + * one runnable RT task. + */ static cpumask_t rt_overload_mask; static atomic_t rto_count; + static inline int rt_overloaded(void) { return atomic_read(&rto_count); } + static inline cpumask_t *rt_overload(void) { return &rt_overload_mask; } + static inline void rt_set_overload(struct rq *rq) { rq->rt.overloaded = 1; @@ -28,6 +36,7 @@ static inline void rt_set_overload(struct rq *rq) wmb(); atomic_inc(&rto_count); } + static inline void rt_clear_overload(struct rq *rq) { /* the order here really doesn't matter */ -- cgit v1.2.3 From 6e1938d3ad58c940ec4119d387dd92a787cb238c Mon Sep 17 00:00:00 2001 From: Ingo Molnar Date: Fri, 25 Jan 2008 21:08:16 +0100 Subject: sched: remove rt_overload() remove rt_overload() - it's an unnecessary indirection. 
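The state that rt_overload() wrapped is small enough to show in full. The following user-space analogue (C11 atomics in place of atomic_t, a plain array in place of cpumask_t; not the kernel code) shows why the counter exists: the pull path can test rt_overloaded() with a single atomic read instead of scanning the mask.

#include <stdatomic.h>
#include <stdbool.h>

#define NR_CPUS 8

static bool rt_overload_mask[NR_CPUS]; /* stands in for the kernel cpumask */
static atomic_int rto_count;

/* Fast-path test used by the pull logic: one atomic read, no mask scan. */
int rt_overloaded(void)
{
        return atomic_load(&rto_count);
}

void rt_set_overload(int cpu)
{
        rt_overload_mask[cpu] = true;
        /*
         * Publish the mask bit before bumping the counter, so a CPU that
         * sees a non-zero count also sees the bit (the kernel patch uses
         * wmb() for the same reason).
         */
        atomic_thread_fence(memory_order_release);
        atomic_fetch_add(&rto_count, 1);
}

void rt_clear_overload(int cpu)
{
        /* the ordering matters much less on the clearing side */
        atomic_fetch_sub(&rto_count, 1);
        rt_overload_mask[cpu] = false;
}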
Signed-off-by: Ingo Molnar --- kernel/sched_rt.c | 10 +--------- 1 file changed, 1 insertion(+), 9 deletions(-) (limited to 'kernel') diff --git a/kernel/sched_rt.c b/kernel/sched_rt.c index 1a2d8f0aa659..deff0c77d705 100644 --- a/kernel/sched_rt.c +++ b/kernel/sched_rt.c @@ -17,11 +17,6 @@ static inline int rt_overloaded(void) return atomic_read(&rto_count); } -static inline cpumask_t *rt_overload(void) -{ - return &rt_overload_mask; -} - static inline void rt_set_overload(struct rq *rq) { rq->rt.overloaded = 1; @@ -590,7 +585,6 @@ static int pull_rt_task(struct rq *this_rq) struct task_struct *next; struct task_struct *p; struct rq *src_rq; - cpumask_t *rto_cpumask; int this_cpu = this_rq->cpu; int cpu; int ret = 0; @@ -608,9 +602,7 @@ static int pull_rt_task(struct rq *this_rq) next = pick_next_task_rt(this_rq); - rto_cpumask = rt_overload(); - - for_each_cpu_mask(cpu, *rto_cpumask) { + for_each_cpu_mask(cpu, rt_overload_mask) { if (this_cpu == cpu) continue; -- cgit v1.2.3 From 00597c3ed78e424bdafff123565c078d8b6088cf Mon Sep 17 00:00:00 2001 From: Ingo Molnar Date: Fri, 25 Jan 2008 21:08:16 +0100 Subject: sched: remove leftover debugging remove leftover debugging. Signed-off-by: Ingo Molnar --- kernel/sched_rt.c | 8 -------- 1 file changed, 8 deletions(-) (limited to 'kernel') diff --git a/kernel/sched_rt.c b/kernel/sched_rt.c index deff0c77d705..cc38521c5723 100644 --- a/kernel/sched_rt.c +++ b/kernel/sched_rt.c @@ -253,8 +253,6 @@ static struct task_struct *pick_next_highest_task_rt(struct rq *rq, int cpu) struct list_head *queue; int idx; - assert_spin_locked(&rq->lock); - if (likely(rq->rt.rt_nr_running < 2)) return NULL; @@ -500,8 +498,6 @@ static int push_rt_task(struct rq *rq) int ret = 0; int paranoid = RT_MAX_TRIES; - assert_spin_locked(&rq->lock); - if (!rq->rt.overloaded) return 0; @@ -546,8 +542,6 @@ static int push_rt_task(struct rq *rq) goto out; } - assert_spin_locked(&lowest_rq->lock); - deactivate_task(rq, next_task, 0); set_task_cpu(next_task, lowest_rq->cpu); activate_task(lowest_rq, next_task, 0); @@ -589,8 +583,6 @@ static int pull_rt_task(struct rq *this_rq) int cpu; int ret = 0; - assert_spin_locked(&this_rq->lock); - /* * If cpusets are used, and we have overlapping * run queue cpusets, then this algorithm may not catch all. -- cgit v1.2.3 From 80bf3171dcdf0f5d236e2e48afe2a95c7ce23879 Mon Sep 17 00:00:00 2001 From: Ingo Molnar Date: Fri, 25 Jan 2008 21:08:17 +0100 Subject: sched: clean up pull_rt_task() clean up pull_rt_task(). 
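Much of pull_rt_task()'s complexity comes from double_lock_balance() possibly dropping this_rq's lock, which is why the code re-picks "next" whenever that helper returns 1. A small pthread analogue (an approximation of the helper's shape, not the kernel implementation) makes the contract clearer: take both locks in a fixed order to avoid ABBA deadlock, and report whether the caller's lock was temporarily released so the caller can revalidate its state.

#include <pthread.h>
#include <stdint.h>

struct lock_rq {
        pthread_mutex_t lock;
};

/*
 * Lock both runqueues, always acquiring the lower-addressed lock first
 * to avoid ABBA deadlocks. Returns 1 if this_rq's lock had to be dropped
 * and re-taken; the caller must then revalidate anything it computed
 * under the old critical section. The caller is assumed to already hold
 * this_rq->lock.
 */
int double_lock_balance_sketch(struct lock_rq *this_rq, struct lock_rq *busiest)
{
        int dropped = 0;

        if (pthread_mutex_trylock(&busiest->lock) != 0) {
                if ((uintptr_t)busiest < (uintptr_t)this_rq) {
                        pthread_mutex_unlock(&this_rq->lock);
                        pthread_mutex_lock(&busiest->lock);
                        pthread_mutex_lock(&this_rq->lock);
                        dropped = 1;
                } else {
                        pthread_mutex_lock(&busiest->lock);
                }
        }
        return dropped;
}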
Signed-off-by: Ingo Molnar --- kernel/sched_rt.c | 22 ++++++++++------------ 1 file changed, 10 insertions(+), 12 deletions(-) (limited to 'kernel') diff --git a/kernel/sched_rt.c b/kernel/sched_rt.c index cc38521c5723..05ada7d44800 100644 --- a/kernel/sched_rt.c +++ b/kernel/sched_rt.c @@ -576,12 +576,9 @@ static void push_rt_tasks(struct rq *rq) static int pull_rt_task(struct rq *this_rq) { - struct task_struct *next; - struct task_struct *p; + int this_cpu = this_rq->cpu, ret = 0, cpu; + struct task_struct *p, *next; struct rq *src_rq; - int this_cpu = this_rq->cpu; - int cpu; - int ret = 0; /* * If cpusets are used, and we have overlapping @@ -608,23 +605,25 @@ static int pull_rt_task(struct rq *this_rq) if (double_lock_balance(this_rq, src_rq)) { /* unlocked our runqueue lock */ struct task_struct *old_next = next; + next = pick_next_task_rt(this_rq); if (next != old_next) ret = 1; } - if (likely(src_rq->rt.rt_nr_running <= 1)) + if (likely(src_rq->rt.rt_nr_running <= 1)) { /* * Small chance that this_rq->curr changed * but it's really harmless here. */ rt_clear_overload(this_rq); - else + } else { /* * Heh, the src_rq is now overloaded, since * we already have the src_rq lock, go straight * to pulling tasks from it. */ goto try_pulling; + } spin_unlock(&src_rq->lock); continue; } @@ -638,6 +637,7 @@ static int pull_rt_task(struct rq *this_rq) */ if (double_lock_balance(this_rq, src_rq)) { struct task_struct *old_next = next; + next = pick_next_task_rt(this_rq); if (next != old_next) ret = 1; @@ -674,7 +674,7 @@ static int pull_rt_task(struct rq *this_rq) */ if (p->prio < src_rq->curr->prio || (next && next->prio < src_rq->curr->prio)) - goto bail; + goto out; ret = 1; @@ -686,9 +686,7 @@ static int pull_rt_task(struct rq *this_rq) * case there's an even higher prio task * in another runqueue. (low likelyhood * but possible) - */ - - /* + * * Update next so that we won't pick a task * on another cpu with a priority lower (or equal) * than the one we just picked. @@ -696,7 +694,7 @@ static int pull_rt_task(struct rq *this_rq) next = p; } - bail: + out: spin_unlock(&src_rq->lock); } -- cgit v1.2.3 From 7f51f298204ec0528422cd9b23feac12612c5665 Mon Sep 17 00:00:00 2001 From: Ingo Molnar Date: Fri, 25 Jan 2008 21:08:17 +0100 Subject: sched: clean up schedule_balance_rt() clean up schedule_balance_rt(). Signed-off-by: Ingo Molnar --- kernel/sched_rt.c | 6 ++---- 1 file changed, 2 insertions(+), 4 deletions(-) (limited to 'kernel') diff --git a/kernel/sched_rt.c b/kernel/sched_rt.c index 05ada7d44800..4d0a60e47dfa 100644 --- a/kernel/sched_rt.c +++ b/kernel/sched_rt.c @@ -701,12 +701,10 @@ static int pull_rt_task(struct rq *this_rq) return ret; } -static void schedule_balance_rt(struct rq *rq, - struct task_struct *prev) +static void schedule_balance_rt(struct rq *rq, struct task_struct *prev) { /* Try to pull RT tasks here if we lower this rq's prio */ - if (unlikely(rt_task(prev)) && - rq->rt.highest_prio > prev->prio) + if (unlikely(rt_task(prev)) && rq->rt.highest_prio > prev->prio) pull_rt_task(rq); } -- cgit v1.2.3 From 57d885fea0da0e9541d7730a9e1dcf734981a173 Mon Sep 17 00:00:00 2001 From: Gregory Haskins Date: Fri, 25 Jan 2008 21:08:18 +0100 Subject: sched: add sched-domain roots We add the notion of a root-domain which will be used later to rescope global variables to per-domain variables. Each exclusive cpuset essentially defines an island domain by fully partitioning the member cpus from any other cpuset. 
However, we currently still maintain some policy/state as global variables which transcend all cpusets. Consider, for instance, rt-overload state. Whenever a new exclusive cpuset is created, we also create a new root-domain object and move each cpu member to the root-domain's span. By default the system creates a single root-domain with all cpus as members (mimicking the global state we have today). We add some plumbing for storing class specific data in our root-domain. Whenever a RQ is switching root-domains (because of repartitioning) we give each sched_class the opportunity to remove any state from its old domain and add state to the new one. This logic doesn't have any clients yet but it will later in the series. Signed-off-by: Gregory Haskins CC: Christoph Lameter CC: Paul Jackson CC: Simon Derr Signed-off-by: Ingo Molnar --- include/linux/sched.h | 3 ++ kernel/sched.c | 121 ++++++++++++++++++++++++++++++++++++++++++++++++-- 2 files changed, 121 insertions(+), 3 deletions(-) (limited to 'kernel') diff --git a/include/linux/sched.h b/include/linux/sched.h index 6fbbf38555ac..2e69f19369e4 100644 --- a/include/linux/sched.h +++ b/include/linux/sched.h @@ -849,6 +849,9 @@ struct sched_class { void (*task_tick) (struct rq *rq, struct task_struct *p); void (*task_new) (struct rq *rq, struct task_struct *p); void (*set_cpus_allowed)(struct task_struct *p, cpumask_t *newmask); + + void (*join_domain)(struct rq *rq); + void (*leave_domain)(struct rq *rq); }; struct load_weight { diff --git a/kernel/sched.c b/kernel/sched.c index 36bd8ff2a669..34b7d721d735 100644 --- a/kernel/sched.c +++ b/kernel/sched.c @@ -349,6 +349,28 @@ struct rt_rq { int overloaded; }; +#ifdef CONFIG_SMP + +/* + * We add the notion of a root-domain which will be used to define per-domain + * variables. Each exclusive cpuset essentially defines an island domain by + * fully partitioning the member cpus from any other cpuset. Whenever a new + * exclusive cpuset is created, we also create and attach a new root-domain + * object. + * + * By default the system creates a single root-domain with all cpus as + * members (mimicking the global state we have today). + */ +struct root_domain { + atomic_t refcount; + cpumask_t span; + cpumask_t online; +}; + +static struct root_domain def_root_domain; + +#endif + /* * This is the main, per-CPU runqueue data structure. * @@ -406,6 +428,7 @@ struct rq { atomic_t nr_iowait; #ifdef CONFIG_SMP + struct root_domain *rd; struct sched_domain *sd; /* For active balancing */ @@ -5550,6 +5573,15 @@ migration_call(struct notifier_block *nfb, unsigned long action, void *hcpu) case CPU_ONLINE_FROZEN: /* Strictly unnecessary, as first user will wake it. 
*/ wake_up_process(cpu_rq(cpu)->migration_thread); + + /* Update our root-domain */ + rq = cpu_rq(cpu); + spin_lock_irqsave(&rq->lock, flags); + if (rq->rd) { + BUG_ON(!cpu_isset(cpu, rq->rd->span)); + cpu_set(cpu, rq->rd->online); + } + spin_unlock_irqrestore(&rq->lock, flags); break; #ifdef CONFIG_HOTPLUG_CPU @@ -5600,6 +5632,17 @@ migration_call(struct notifier_block *nfb, unsigned long action, void *hcpu) } spin_unlock_irq(&rq->lock); break; + + case CPU_DOWN_PREPARE: + /* Update our root-domain */ + rq = cpu_rq(cpu); + spin_lock_irqsave(&rq->lock, flags); + if (rq->rd) { + BUG_ON(!cpu_isset(cpu, rq->rd->span)); + cpu_clear(cpu, rq->rd->online); + } + spin_unlock_irqrestore(&rq->lock, flags); + break; #endif } return NOTIFY_OK; @@ -5788,11 +5831,69 @@ sd_parent_degenerate(struct sched_domain *sd, struct sched_domain *parent) return 1; } +static void rq_attach_root(struct rq *rq, struct root_domain *rd) +{ + unsigned long flags; + const struct sched_class *class; + + spin_lock_irqsave(&rq->lock, flags); + + if (rq->rd) { + struct root_domain *old_rd = rq->rd; + + for (class = sched_class_highest; class; class = class->next) + if (class->leave_domain) + class->leave_domain(rq); + + if (atomic_dec_and_test(&old_rd->refcount)) + kfree(old_rd); + } + + atomic_inc(&rd->refcount); + rq->rd = rd; + + for (class = sched_class_highest; class; class = class->next) + if (class->join_domain) + class->join_domain(rq); + + spin_unlock_irqrestore(&rq->lock, flags); +} + +static void init_rootdomain(struct root_domain *rd, const cpumask_t *map) +{ + memset(rd, 0, sizeof(*rd)); + + rd->span = *map; + cpus_and(rd->online, rd->span, cpu_online_map); +} + +static void init_defrootdomain(void) +{ + cpumask_t cpus = CPU_MASK_ALL; + + init_rootdomain(&def_root_domain, &cpus); + atomic_set(&def_root_domain.refcount, 1); +} + +static struct root_domain *alloc_rootdomain(const cpumask_t *map) +{ + struct root_domain *rd; + + rd = kmalloc(sizeof(*rd), GFP_KERNEL); + if (!rd) + return NULL; + + init_rootdomain(rd, map); + + return rd; +} + /* * Attach the domain 'sd' to 'cpu' as its base domain. Callers must * hold the hotplug lock. */ -static void cpu_attach_domain(struct sched_domain *sd, int cpu) +static void cpu_attach_domain(struct sched_domain *sd, + struct root_domain *rd, int cpu) { struct rq *rq = cpu_rq(cpu); struct sched_domain *tmp; @@ -5817,6 +5918,7 @@ static void cpu_attach_domain(struct sched_domain *sd, int cpu) sched_domain_debug(sd, cpu); + rq_attach_root(rq, rd); rcu_assign_pointer(rq->sd, sd); } @@ -6185,6 +6287,7 @@ static void init_sched_groups_power(int cpu, struct sched_domain *sd) static int build_sched_domains(const cpumask_t *cpu_map) { int i; + struct root_domain *rd; #ifdef CONFIG_NUMA struct sched_group **sched_group_nodes = NULL; int sd_allnodes = 0; @@ -6201,6 +6304,12 @@ static int build_sched_domains(const cpumask_t *cpu_map) sched_group_nodes_bycpu[first_cpu(*cpu_map)] = sched_group_nodes; #endif + rd = alloc_rootdomain(cpu_map); + if (!rd) { + printk(KERN_WARNING "Cannot alloc root domain\n"); + return -ENOMEM; + } + /* * Set up domains for cpus specified by the cpu_map. 
*/ @@ -6417,7 +6526,7 @@ static int build_sched_domains(const cpumask_t *cpu_map) #else sd = &per_cpu(phys_domains, i); #endif - cpu_attach_domain(sd, i); + cpu_attach_domain(sd, rd, i); } return 0; @@ -6475,7 +6584,7 @@ static void detach_destroy_domains(const cpumask_t *cpu_map) unregister_sched_domain_sysctl(); for_each_cpu_mask(i, *cpu_map) - cpu_attach_domain(NULL, i); + cpu_attach_domain(NULL, &def_root_domain, i); synchronize_sched(); arch_destroy_sched_domains(cpu_map); } @@ -6727,6 +6836,10 @@ void __init sched_init(void) int highest_cpu = 0; int i, j; +#ifdef CONFIG_SMP + init_defrootdomain(); +#endif + for_each_possible_cpu(i) { struct rt_prio_array *array; struct rq *rq; @@ -6765,6 +6878,8 @@ void __init sched_init(void) rq->cpu_load[j] = 0; #ifdef CONFIG_SMP rq->sd = NULL; + rq->rd = NULL; + rq_attach_root(rq, &def_root_domain); rq->active_balance = 0; rq->next_balance = jiffies; rq->push_cpu = 0; -- cgit v1.2.3 From 637f50851b57a32f7ec67c50fc16f1601ab1a87a Mon Sep 17 00:00:00 2001 From: Gregory Haskins Date: Fri, 25 Jan 2008 21:08:18 +0100 Subject: sched: only balance our RT tasks within our domain We move the rt-overload data as the first global to per-domain reclassification. This limits the scope of overload related cache-line bouncing to stay with a specified partition instead of affecting all cpus in the system. Finally, we limit the scope of find_lowest_cpu searches to the domain instead of the entire system. Note that we would always respect domain boundaries even without this patch, but we first would scan potentially all cpus before whittling the list down. Now we can avoid looking at RQs that are out of scope, again reducing cache-line hits. Note: In some cases, task->cpus_allowed will effectively reduce our search to within our domain. However, I believe there are cases where the cpus_allowed mask may be all ones and therefore we err on the side of caution. If it can be optimized later, so be it. Signed-off-by: Gregory Haskins CC: Christoph Lameter Signed-off-by: Ingo Molnar --- kernel/sched.c | 7 +++++++ kernel/sched_rt.c | 57 ++++++++++++++++++++++++++++++------------------------- 2 files changed, 38 insertions(+), 26 deletions(-) (limited to 'kernel') diff --git a/kernel/sched.c b/kernel/sched.c index 34b7d721d735..35ef06c99214 100644 --- a/kernel/sched.c +++ b/kernel/sched.c @@ -365,6 +365,13 @@ struct root_domain { atomic_t refcount; cpumask_t span; cpumask_t online; + + /* + * The "RT overload" flag: it gets set if a CPU has more than + * one runnable RT task. + */ + cpumask_t rto_mask; + atomic_t rto_count; }; static struct root_domain def_root_domain; diff --git a/kernel/sched_rt.c b/kernel/sched_rt.c index 4d0a60e47dfa..b049e5110eea 100644 --- a/kernel/sched_rt.c +++ b/kernel/sched_rt.c @@ -5,22 +5,14 @@ #ifdef CONFIG_SMP -/* - * The "RT overload" flag: it gets set if a CPU has more than - * one runnable RT task. - */ -static cpumask_t rt_overload_mask; -static atomic_t rto_count; - -static inline int rt_overloaded(void) +static inline int rt_overloaded(struct rq *rq) { - return atomic_read(&rto_count); + return atomic_read(&rq->rd->rto_count); } static inline void rt_set_overload(struct rq *rq) { - rq->rt.overloaded = 1; - cpu_set(rq->cpu, rt_overload_mask); + cpu_set(rq->cpu, rq->rd->rto_mask); /* * Make sure the mask is visible before we set * the overload count. That is checked to determine @@ -29,23 +21,25 @@ static inline void rt_set_overload(struct rq *rq) * updated yet. 
*/ wmb(); - atomic_inc(&rto_count); + atomic_inc(&rq->rd->rto_count); } static inline void rt_clear_overload(struct rq *rq) { /* the order here really doesn't matter */ - atomic_dec(&rto_count); - cpu_clear(rq->cpu, rt_overload_mask); - rq->rt.overloaded = 0; + atomic_dec(&rq->rd->rto_count); + cpu_clear(rq->cpu, rq->rd->rto_mask); } static void update_rt_migration(struct rq *rq) { - if (rq->rt.rt_nr_migratory && (rq->rt.rt_nr_running > 1)) + if (rq->rt.rt_nr_migratory && (rq->rt.rt_nr_running > 1)) { rt_set_overload(rq); - else + rq->rt.overloaded = 1; + } else { rt_clear_overload(rq); + rq->rt.overloaded = 0; + } } #endif /* CONFIG_SMP */ @@ -306,7 +300,7 @@ static int find_lowest_cpus(struct task_struct *task, cpumask_t *lowest_mask) int count = 0; int cpu; - cpus_and(*lowest_mask, cpu_online_map, task->cpus_allowed); + cpus_and(*lowest_mask, task_rq(task)->rd->online, task->cpus_allowed); /* * Scan each rq for the lowest prio. @@ -580,18 +574,12 @@ static int pull_rt_task(struct rq *this_rq) struct task_struct *p, *next; struct rq *src_rq; - /* - * If cpusets are used, and we have overlapping - * run queue cpusets, then this algorithm may not catch all. - * This is just the price you pay on trying to keep - * dirtying caches down on large SMP machines. - */ - if (likely(!rt_overloaded())) + if (likely(!rt_overloaded(this_rq))) return 0; next = pick_next_task_rt(this_rq); - for_each_cpu_mask(cpu, rt_overload_mask) { + for_each_cpu_mask(cpu, this_rq->rd->rto_mask) { if (this_cpu == cpu) continue; @@ -811,6 +799,20 @@ static void task_tick_rt(struct rq *rq, struct task_struct *p) } } +/* Assumes rq->lock is held */ +static void join_domain_rt(struct rq *rq) +{ + if (rq->rt.overloaded) + rt_set_overload(rq); +} + +/* Assumes rq->lock is held */ +static void leave_domain_rt(struct rq *rq) +{ + if (rq->rt.overloaded) + rt_clear_overload(rq); +} + static void set_curr_task_rt(struct rq *rq) { struct task_struct *p = rq->curr; @@ -840,4 +842,7 @@ const struct sched_class rt_sched_class = { .set_curr_task = set_curr_task_rt, .task_tick = task_tick_rt, + + .join_domain = join_domain_rt, + .leave_domain = leave_domain_rt, }; -- cgit v1.2.3 From bdd7c81b4973e72b670eff6b5725bab189b723d6 Mon Sep 17 00:00:00 2001 From: Ingo Molnar Date: Fri, 25 Jan 2008 21:08:18 +0100 Subject: sched: fix sched_rt.c:join/leave_domain fix build bug in sched_rt.c:join/leave_domain and make them only be included on SMP builds. 
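The per-domain overload tracking introduced above pairs a cpumask (rd->rto_mask) with an atomic counter (rd->rto_count) so that the common case, no overloaded runqueue in this root-domain, costs a single counter read before any mask scan. A minimal userspace sketch of that pattern, assuming C11 atomics and a plain unsigned long standing in for cpumask_t; the names here are illustrative, not taken from the patch:

        /* overload_sketch.c -- toy model of per-domain RT-overload tracking.
         * Build: cc -std=c11 overload_sketch.c
         */
        #include <stdatomic.h>
        #include <stdio.h>

        #define NR_CPUS 8

        struct toy_root_domain {
                atomic_ulong rto_mask;  /* one bit per overloaded CPU      */
                atomic_int   rto_count; /* fast-path "anything overloaded" */
        };

        static struct toy_root_domain rd;

        static void toy_set_overload(int cpu)
        {
                atomic_fetch_or(&rd.rto_mask, 1UL << cpu);
                /*
                 * Publish the mask bit before bumping the count, so a reader
                 * that sees rto_count > 0 also sees the bit.  Release ordering
                 * here plays the role of the kernel's wmb().
                 */
                atomic_fetch_add_explicit(&rd.rto_count, 1, memory_order_release);
        }

        static void toy_clear_overload(int cpu)
        {
                /* the order here really doesn't matter, as the patch notes */
                atomic_fetch_sub(&rd.rto_count, 1);
                atomic_fetch_and(&rd.rto_mask, ~(1UL << cpu));
        }

        static int toy_overloaded(void)
        {
                return atomic_load_explicit(&rd.rto_count, memory_order_acquire);
        }

        int main(void)
        {
                toy_set_overload(3);
                toy_set_overload(5);

                if (toy_overloaded()) {
                        unsigned long mask = atomic_load(&rd.rto_mask);
                        for (int cpu = 0; cpu < NR_CPUS; cpu++)
                                if (mask & (1UL << cpu))
                                        printf("would try to pull RT tasks from cpu %d\n", cpu);
                }

                toy_clear_overload(3);
                toy_clear_overload(5);
                printf("rto_count now %d\n", atomic_load(&rd.rto_count));
                return 0;
        }

The release/acquire pair stands in for the kernel's wmb(): a CPU that observes a non-zero count is guaranteed to also observe the mask bit it is about to scan for.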
Signed-off-by: Ingo Molnar --- kernel/sched_rt.c | 33 ++++++++++++++++----------------- 1 file changed, 16 insertions(+), 17 deletions(-) (limited to 'kernel') diff --git a/kernel/sched_rt.c b/kernel/sched_rt.c index b049e5110eea..3ea0cae513d2 100644 --- a/kernel/sched_rt.c +++ b/kernel/sched_rt.c @@ -767,6 +767,20 @@ static void set_cpus_allowed_rt(struct task_struct *p, cpumask_t *new_mask) p->nr_cpus_allowed = weight; } +/* Assumes rq->lock is held */ +static void join_domain_rt(struct rq *rq) +{ + if (rq->rt.overloaded) + rt_set_overload(rq); +} + +/* Assumes rq->lock is held */ +static void leave_domain_rt(struct rq *rq) +{ + if (rq->rt.overloaded) + rt_clear_overload(rq); +} + #else /* CONFIG_SMP */ # define schedule_tail_balance_rt(rq) do { } while (0) # define schedule_balance_rt(rq, prev) do { } while (0) @@ -799,20 +813,6 @@ static void task_tick_rt(struct rq *rq, struct task_struct *p) } } -/* Assumes rq->lock is held */ -static void join_domain_rt(struct rq *rq) -{ - if (rq->rt.overloaded) - rt_set_overload(rq); -} - -/* Assumes rq->lock is held */ -static void leave_domain_rt(struct rq *rq) -{ - if (rq->rt.overloaded) - rt_clear_overload(rq); -} - static void set_curr_task_rt(struct rq *rq) { struct task_struct *p = rq->curr; @@ -838,11 +838,10 @@ const struct sched_class rt_sched_class = { .load_balance = load_balance_rt, .move_one_task = move_one_task_rt, .set_cpus_allowed = set_cpus_allowed_rt, + .join_domain = join_domain_rt, + .leave_domain = leave_domain_rt, #endif .set_curr_task = set_curr_task_rt, .task_tick = task_tick_rt, - - .join_domain = join_domain_rt, - .leave_domain = leave_domain_rt, }; -- cgit v1.2.3 From d7876a08db50895ed9808ede4a259cccf65eba47 Mon Sep 17 00:00:00 2001 From: Ingo Molnar Date: Fri, 25 Jan 2008 21:08:19 +0100 Subject: sched: remove unused JIFFIES_TO_NS() macro remove unused JIFFIES_TO_NS() macro. Signed-off-by: Ingo Molnar --- kernel/sched.c | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) (limited to 'kernel') diff --git a/kernel/sched.c b/kernel/sched.c index 35ef06c99214..461ee900d1ac 100644 --- a/kernel/sched.c +++ b/kernel/sched.c @@ -96,10 +96,9 @@ unsigned long long __attribute__((weak)) sched_clock(void) #define MAX_USER_PRIO (USER_PRIO(MAX_PRIO)) /* - * Some helpers for converting nanosecond timing to jiffy resolution + * Helpers for converting nanosecond timing to jiffy resolution */ #define NS_TO_JIFFIES(TIME) ((unsigned long)(TIME) / (NSEC_PER_SEC / HZ)) -#define JIFFIES_TO_NS(TIME) ((TIME) * (NSEC_PER_SEC / HZ)) #define NICE_0_LOAD SCHED_LOAD_SCALE #define NICE_0_SHIFT SCHED_LOAD_SHIFT -- cgit v1.2.3 From 0eab9146571dfa9b50ea952ec2ab27d591f26b63 Mon Sep 17 00:00:00 2001 From: Ingo Molnar Date: Fri, 25 Jan 2008 21:08:19 +0100 Subject: sched: style cleanup, #2 style cleanup of various changes that were done recently. no code changed: text data bss dec hex filename 26399 2578 48 29025 7161 sched.o.before 26399 2578 48 29025 7161 sched.o.after Signed-off-by: Ingo Molnar --- kernel/sched.c | 32 +++++++++++++++++--------------- 1 file changed, 17 insertions(+), 15 deletions(-) (limited to 'kernel') diff --git a/kernel/sched.c b/kernel/sched.c index 461ee900d1ac..23b9925a1dfb 100644 --- a/kernel/sched.c +++ b/kernel/sched.c @@ -235,17 +235,17 @@ static void set_se_shares(struct sched_entity *se, unsigned long shares); * Every task in system belong to this group at bootup. 
*/ struct task_group init_task_group = { - .se = init_sched_entity_p, + .se = init_sched_entity_p, .cfs_rq = init_cfs_rq_p, }; #ifdef CONFIG_FAIR_USER_SCHED -# define INIT_TASK_GROUP_LOAD 2*NICE_0_LOAD +# define INIT_TASK_GROUP_LOAD (2*NICE_0_LOAD) #else # define INIT_TASK_GROUP_LOAD NICE_0_LOAD #endif -#define MIN_GROUP_SHARES 2 +#define MIN_GROUP_SHARES 2 static int init_task_group_load = INIT_TASK_GROUP_LOAD; @@ -352,8 +352,8 @@ struct rt_rq { /* * We add the notion of a root-domain which will be used to define per-domain - * variables. Each exclusive cpuset essentially defines an island domain by - * fully partitioning the member cpus from any other cpuset. Whenever a new + * variables. Each exclusive cpuset essentially defines an island domain by + * fully partitioning the member cpus from any other cpuset. Whenever a new * exclusive cpuset is created, we also create and attach a new root-domain * object. * @@ -365,12 +365,12 @@ struct root_domain { cpumask_t span; cpumask_t online; - /* + /* * The "RT overload" flag: it gets set if a CPU has more than * one runnable RT task. */ cpumask_t rto_mask; - atomic_t rto_count; + atomic_t rto_count; }; static struct root_domain def_root_domain; @@ -434,7 +434,7 @@ struct rq { atomic_t nr_iowait; #ifdef CONFIG_SMP - struct root_domain *rd; + struct root_domain *rd; struct sched_domain *sd; /* For active balancing */ @@ -5066,7 +5066,7 @@ int set_cpus_allowed(struct task_struct *p, cpumask_t new_mask) if (p->sched_class->set_cpus_allowed) p->sched_class->set_cpus_allowed(p, &new_mask); else { - p->cpus_allowed = new_mask; + p->cpus_allowed = new_mask; p->nr_cpus_allowed = cpus_weight(new_mask); } @@ -5847,9 +5847,10 @@ static void rq_attach_root(struct rq *rq, struct root_domain *rd) if (rq->rd) { struct root_domain *old_rd = rq->rd; - for (class = sched_class_highest; class; class = class->next) + for (class = sched_class_highest; class; class = class->next) { if (class->leave_domain) class->leave_domain(rq); + } if (atomic_dec_and_test(&old_rd->refcount)) kfree(old_rd); @@ -5858,9 +5859,10 @@ static void rq_attach_root(struct rq *rq, struct root_domain *rd) atomic_inc(&rd->refcount); rq->rd = rd; - for (class = sched_class_highest; class; class = class->next) + for (class = sched_class_highest; class; class = class->next) { if (class->join_domain) class->join_domain(rq); + } spin_unlock_irqrestore(&rq->lock, flags); } @@ -5895,11 +5897,11 @@ static struct root_domain *alloc_rootdomain(const cpumask_t *map) } /* - * Attach the domain 'sd' to 'cpu' as its base domain. Callers must + * Attach the domain 'sd' to 'cpu' as its base domain. Callers must * hold the hotplug lock. */ -static void cpu_attach_domain(struct sched_domain *sd, - struct root_domain *rd, int cpu) +static void +cpu_attach_domain(struct sched_domain *sd, struct root_domain *rd, int cpu) { struct rq *rq = cpu_rq(cpu); struct sched_domain *tmp; @@ -7095,7 +7097,7 @@ static int rebalance_shares(struct sched_domain *sd, int this_cpu) for_each_cpu_mask(i, sdspan) total_load += tg->cfs_rq[i]->load.weight; - /* Nothing to do if this group has no load */ + /* Nothing to do if this group has no load */ if (!total_load) continue; -- cgit v1.2.3 From b913176917399e92e6f741672038c73d7ce93be5 Mon Sep 17 00:00:00 2001 From: Ingo Molnar Date: Fri, 25 Jan 2008 21:08:19 +0100 Subject: sched: add credits for RT balancing improvements add credits for RT balancing improvements. 
Signed-off-by: Ingo Molnar --- kernel/sched.c | 2 ++ 1 file changed, 2 insertions(+) (limited to 'kernel') diff --git a/kernel/sched.c b/kernel/sched.c index 23b9925a1dfb..55c521780f93 100644 --- a/kernel/sched.c +++ b/kernel/sched.c @@ -22,6 +22,8 @@ * by Peter Williams * 2007-05-06 Interactivity improvements to CFS by Mike Galbraith * 2007-07-01 Group scheduling enhancements by Srivatsa Vaddagiri + * 2007-11-29 RT balancing improvements by Steven Rostedt, Gregory Haskins, + * Thomas Gleixner, Mike Kravetz */ #include -- cgit v1.2.3 From 9ec3b77e11b9398ab40b492c4fde7d8aac04a718 Mon Sep 17 00:00:00 2001 From: Dmitry Adamushko Date: Fri, 25 Jan 2008 21:08:21 +0100 Subject: sched: no need for 'affine wakeup' balancing No need to do a check for 'affine wakeup and passive balancing possibilities' in select_task_rq_fair() when task_cpu(p) == this_cpu. I guess, this part got missed upon introduction of per-sched_class select_task_rq() in try_to_wake_up(). Signed-off-by: Dmitry Adamushko Signed-off-by: Ingo Molnar --- kernel/sched_fair.c | 3 +++ 1 file changed, 3 insertions(+) (limited to 'kernel') diff --git a/kernel/sched_fair.c b/kernel/sched_fair.c index f881fc5e035c..2208692dc4a0 100644 --- a/kernel/sched_fair.c +++ b/kernel/sched_fair.c @@ -925,6 +925,9 @@ static int select_task_rq_fair(struct task_struct *p, int sync) this_cpu = smp_processor_id(); new_cpu = cpu; + if (cpu == this_cpu) + goto out_set_cpu; + for_each_domain(this_cpu, sd) { if (cpu_isset(cpu, sd->span)) { this_sd = sd; -- cgit v1.2.3 From 5d2f5a616d65e3c08acde3195694c4ab8afbc1b7 Mon Sep 17 00:00:00 2001 From: Dmitry Adamushko Date: Fri, 25 Jan 2008 21:08:21 +0100 Subject: sched: get rid of 'new_cpu' in try_to_wake_up() Clean-up try_to_wake_up(). Get rid of the 'new_cpu' variable in try_to_wake_up() [ that's, one #ifdef section less ]. Also remove a few redundant blank lines. Signed-off-by: Dmitry Adamushko Signed-off-by: Ingo Molnar --- kernel/sched.c | 11 +++-------- 1 file changed, 3 insertions(+), 8 deletions(-) (limited to 'kernel') diff --git a/kernel/sched.c b/kernel/sched.c index 55c521780f93..9d6fb731559b 100644 --- a/kernel/sched.c +++ b/kernel/sched.c @@ -1559,9 +1559,6 @@ static int try_to_wake_up(struct task_struct *p, unsigned int state, int sync) unsigned long flags; long old_state; struct rq *rq; -#ifdef CONFIG_SMP - int new_cpu; -#endif rq = task_rq_lock(p, &flags); old_state = p->state; @@ -1579,9 +1576,9 @@ static int try_to_wake_up(struct task_struct *p, unsigned int state, int sync) if (unlikely(task_running(rq, p))) goto out_activate; - new_cpu = p->sched_class->select_task_rq(p, sync); - if (new_cpu != cpu) { - set_task_cpu(p, new_cpu); + cpu = p->sched_class->select_task_rq(p, sync); + if (cpu != orig_cpu) { + set_task_cpu(p, cpu); task_rq_unlock(rq, &flags); /* might preempt at this point */ rq = task_rq_lock(p, &flags); @@ -1608,10 +1605,8 @@ static int try_to_wake_up(struct task_struct *p, unsigned int state, int sync) } } } - #endif - out_activate: #endif /* CONFIG_SMP */ schedstat_inc(p, se.nr_wakeups); -- cgit v1.2.3 From 4bf0b77158d581c952af237aec79d0604b78fe27 Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Fri, 25 Jan 2008 21:08:21 +0100 Subject: sched: remove do_div() from __sched_slice() Yanmin Zhang noticed a nice optimization: p = l * nr / nl, nl = l/g -> p = g * nr which eliminates a do_div() from __sched_period(). 
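The algebra behind the change: the period is p = l * nr / nl with nl = l / g, so as long as the latency l is kept an exact multiple of the minimum granularity g this collapses to p = g * nr and the 64-bit division drops out. A quick sketch with made-up tunable values (not the kernel's sysctls) checking that the two forms agree:

        /* sched_period_sketch.c -- check that l * nr / (l / g) == g * nr.
         * The values below are illustrative; the identity only needs l to be
         * an exact multiple of g, which is how nr_latency is derived.
         */
        #include <assert.h>
        #include <stdio.h>

        int main(void)
        {
                unsigned long long latency  = 20000000ULL;      /* l, in ns */
                unsigned long long min_gran =  4000000ULL;      /* g, in ns */
                unsigned long nr_latency = latency / min_gran;  /* nl = l/g */

                for (unsigned long nr = nr_latency + 1; nr < nr_latency + 64; nr++) {
                        unsigned long long old_way = latency * nr / nr_latency; /* needs a div */
                        unsigned long long new_way = min_gran * nr;             /* div-free    */
                        assert(old_way == new_way);
                }
                printf("both forms agree: period(nr) = min_granularity * nr\n");
                return 0;
        }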
Signed-off-by: Peter Zijlstra Signed-off-by: Ingo Molnar --- kernel/sched_fair.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/sched_fair.c b/kernel/sched_fair.c index 2208692dc4a0..10aa6e1ae3dd 100644 --- a/kernel/sched_fair.c +++ b/kernel/sched_fair.c @@ -248,8 +248,8 @@ static u64 __sched_period(unsigned long nr_running) unsigned long nr_latency = sched_nr_latency; if (unlikely(nr_running > nr_latency)) { + period = sysctl_sched_min_granularity; period *= nr_running; - do_div(period, nr_latency); } return period; -- cgit v1.2.3 From 9a897c5a6701bcb6f099f7ca20194999102729fd Mon Sep 17 00:00:00 2001 From: Steven Rostedt Date: Fri, 25 Jan 2008 21:08:22 +0100 Subject: sched: RT-balance, replace hooks with pre/post schedule and wakeup methods To make the main sched.c code more agnostic to the schedule classes. Instead of having specific hooks in the schedule code for the RT class balancing. They are replaced with a pre_schedule, post_schedule and task_wake_up methods. These methods may be used by any of the classes but currently, only the sched_rt class implements them. Signed-off-by: Steven Rostedt Signed-off-by: Ingo Molnar --- include/linux/sched.h | 3 +++ kernel/sched.c | 20 ++++++++++++++++---- kernel/sched_rt.c | 17 +++++++---------- 3 files changed, 26 insertions(+), 14 deletions(-) (limited to 'kernel') diff --git a/include/linux/sched.h b/include/linux/sched.h index 2e69f19369e4..c67d2c2f0111 100644 --- a/include/linux/sched.h +++ b/include/linux/sched.h @@ -843,6 +843,9 @@ struct sched_class { int (*move_one_task) (struct rq *this_rq, int this_cpu, struct rq *busiest, struct sched_domain *sd, enum cpu_idle_type idle); + void (*pre_schedule) (struct rq *this_rq, struct task_struct *task); + void (*post_schedule) (struct rq *this_rq); + void (*task_wake_up) (struct rq *this_rq, struct task_struct *task); #endif void (*set_curr_task) (struct rq *rq); diff --git a/kernel/sched.c b/kernel/sched.c index 9d6fb731559b..2368a0d882e3 100644 --- a/kernel/sched.c +++ b/kernel/sched.c @@ -1625,7 +1625,10 @@ out_activate: out_running: p->state = TASK_RUNNING; - wakeup_balance_rt(rq, p); +#ifdef CONFIG_SMP + if (p->sched_class->task_wake_up) + p->sched_class->task_wake_up(rq, p); +#endif out: task_rq_unlock(rq, &flags); @@ -1748,7 +1751,10 @@ void fastcall wake_up_new_task(struct task_struct *p, unsigned long clone_flags) inc_nr_running(p, rq); } check_preempt_curr(rq, p); - wakeup_balance_rt(rq, p); +#ifdef CONFIG_SMP + if (p->sched_class->task_wake_up) + p->sched_class->task_wake_up(rq, p); +#endif task_rq_unlock(rq, &flags); } @@ -1869,7 +1875,10 @@ static void finish_task_switch(struct rq *rq, struct task_struct *prev) prev_state = prev->state; finish_arch_switch(prev); finish_lock_switch(rq, prev); - schedule_tail_balance_rt(rq); +#ifdef CONFIG_SMP + if (current->sched_class->post_schedule) + current->sched_class->post_schedule(rq); +#endif fire_sched_in_preempt_notifiers(current); if (mm) @@ -3638,7 +3647,10 @@ need_resched_nonpreemptible: switch_count = &prev->nvcsw; } - schedule_balance_rt(rq, prev); +#ifdef CONFIG_SMP + if (prev->sched_class->pre_schedule) + prev->sched_class->pre_schedule(rq, prev); +#endif if (unlikely(!rq->nr_running)) idle_balance(cpu, rq); diff --git a/kernel/sched_rt.c b/kernel/sched_rt.c index 3ea0cae513d2..a5a45104603a 100644 --- a/kernel/sched_rt.c +++ b/kernel/sched_rt.c @@ -689,14 +689,14 @@ static int pull_rt_task(struct rq *this_rq) return ret; } -static void schedule_balance_rt(struct rq *rq, struct 
task_struct *prev) +static void pre_schedule_rt(struct rq *rq, struct task_struct *prev) { /* Try to pull RT tasks here if we lower this rq's prio */ if (unlikely(rt_task(prev)) && rq->rt.highest_prio > prev->prio) pull_rt_task(rq); } -static void schedule_tail_balance_rt(struct rq *rq) +static void post_schedule_rt(struct rq *rq) { /* * If we have more than one rt_task queued, then @@ -713,10 +713,9 @@ static void schedule_tail_balance_rt(struct rq *rq) } -static void wakeup_balance_rt(struct rq *rq, struct task_struct *p) +static void task_wake_up_rt(struct rq *rq, struct task_struct *p) { - if (unlikely(rt_task(p)) && - !task_running(rq, p) && + if (!task_running(rq, p) && (p->prio >= rq->rt.highest_prio) && rq->rt.overloaded) push_rt_tasks(rq); @@ -780,11 +779,6 @@ static void leave_domain_rt(struct rq *rq) if (rq->rt.overloaded) rt_clear_overload(rq); } - -#else /* CONFIG_SMP */ -# define schedule_tail_balance_rt(rq) do { } while (0) -# define schedule_balance_rt(rq, prev) do { } while (0) -# define wakeup_balance_rt(rq, p) do { } while (0) #endif /* CONFIG_SMP */ static void task_tick_rt(struct rq *rq, struct task_struct *p) @@ -840,6 +834,9 @@ const struct sched_class rt_sched_class = { .set_cpus_allowed = set_cpus_allowed_rt, .join_domain = join_domain_rt, .leave_domain = leave_domain_rt, + .pre_schedule = pre_schedule_rt, + .post_schedule = post_schedule_rt, + .task_wake_up = task_wake_up_rt, #endif .set_curr_task = set_curr_task_rt, -- cgit v1.2.3 From cb46984504048db946cd551c261df4e70d59a8ea Mon Sep 17 00:00:00 2001 From: Steven Rostedt Date: Fri, 25 Jan 2008 21:08:22 +0100 Subject: sched: RT-balance, add new methods to sched_class Dmitry Adamushko found that the current implementation of the RT balancing code left out changes to the sched_setscheduler and rt_mutex_setprio. This patch addresses this issue by adding methods to the schedule classes to handle being switched out of (switched_from) and being switched into (switched_to) a sched_class. Also a method for changing of priorities is also added (prio_changed). This patch also removes some duplicate logic between rt_mutex_setprio and sched_setscheduler. 
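A toy userspace model of the dispatch this patch introduces may help; the struct, helpers and priorities below are invented for illustration, and the kernel's real hook signatures are the ones in the diff that follows:

        /* class_change_sketch.c -- toy model of the switched_from/switched_to/
         * prio_changed dispatch.  Names and types are made up for this sketch.
         */
        #include <stdio.h>

        struct toy_class {
                const char *name;
                void (*switched_from)(const char *task);
                void (*switched_to)(const char *task);
                void (*prio_changed)(const char *task, int oldprio, int newprio);
        };

        static void rt_switched_from(const char *t) { printf("%s left RT: maybe pull RT tasks\n", t); }
        static void rt_switched_to(const char *t)   { printf("%s joined RT: maybe push/resched\n", t); }
        static void rt_prio_changed(const char *t, int o, int n)
        {
                printf("%s RT prio %d -> %d: push or pull as needed\n", t, o, n);
        }

        static void fair_switched_to(const char *t) { printf("%s joined CFS: check preemption\n", t); }
        static void fair_prio_changed(const char *t, int o, int n)
        {
                printf("%s CFS prio %d -> %d: resched if it dropped\n", t, o, n);
        }

        static const struct toy_class rt_class   = { "rt",   rt_switched_from, rt_switched_to,   rt_prio_changed };
        static const struct toy_class fair_class = { "fair", NULL,             fair_switched_to, fair_prio_changed };

        /* One shared helper replaces the duplicated resched/preempt logic in
         * the two priority-changing paths. */
        static void check_class_changed(const char *task, const struct toy_class *prev,
                                        const struct toy_class *now, int oldprio, int newprio)
        {
                if (prev != now) {
                        if (prev->switched_from)
                                prev->switched_from(task);
                        now->switched_to(task);
                } else {
                        now->prio_changed(task, oldprio, newprio);
                }
        }

        int main(void)
        {
                /* boosted via an rt_mutex: fair -> rt */
                check_class_changed("task-a", &fair_class, &rt_class, 120, 50);
                /* plain priority change within the same class */
                check_class_changed("task-b", &rt_class, &rt_class, 50, 40);
                return 0;
        }

The point of the shared helper is exactly the de-duplication mentioned above: rt_mutex_setprio and sched_setscheduler can both funnel through one class-change path instead of open-coding the resched and preemption checks.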
Signed-off-by: Steven Rostedt Signed-off-by: Ingo Molnar --- include/linux/sched.h | 7 ++++ kernel/sched.c | 42 +++++++++++------------ kernel/sched_fair.c | 39 ++++++++++++++++++++++ kernel/sched_idletask.c | 31 +++++++++++++++++ kernel/sched_rt.c | 89 +++++++++++++++++++++++++++++++++++++++++++++++++ 5 files changed, 186 insertions(+), 22 deletions(-) (limited to 'kernel') diff --git a/include/linux/sched.h b/include/linux/sched.h index c67d2c2f0111..f2044e707004 100644 --- a/include/linux/sched.h +++ b/include/linux/sched.h @@ -855,6 +855,13 @@ struct sched_class { void (*join_domain)(struct rq *rq); void (*leave_domain)(struct rq *rq); + + void (*switched_from) (struct rq *this_rq, struct task_struct *task, + int running); + void (*switched_to) (struct rq *this_rq, struct task_struct *task, + int running); + void (*prio_changed) (struct rq *this_rq, struct task_struct *task, + int oldprio, int running); }; struct load_weight { diff --git a/kernel/sched.c b/kernel/sched.c index 2368a0d882e3..5834c7fb79a5 100644 --- a/kernel/sched.c +++ b/kernel/sched.c @@ -1152,6 +1152,18 @@ static inline void __set_task_cpu(struct task_struct *p, unsigned int cpu) #endif } +static inline void check_class_changed(struct rq *rq, struct task_struct *p, + const struct sched_class *prev_class, + int oldprio, int running) +{ + if (prev_class != p->sched_class) { + if (prev_class->switched_from) + prev_class->switched_from(rq, p, running); + p->sched_class->switched_to(rq, p, running); + } else + p->sched_class->prio_changed(rq, p, oldprio, running); +} + #ifdef CONFIG_SMP /* @@ -4017,6 +4029,7 @@ void rt_mutex_setprio(struct task_struct *p, int prio) unsigned long flags; int oldprio, on_rq, running; struct rq *rq; + const struct sched_class *prev_class = p->sched_class; BUG_ON(prio < 0 || prio > MAX_PRIO); @@ -4042,18 +4055,10 @@ void rt_mutex_setprio(struct task_struct *p, int prio) if (on_rq) { if (running) p->sched_class->set_curr_task(rq); + enqueue_task(rq, p, 0); - /* - * Reschedule if we are currently running on this runqueue and - * our priority decreased, or if we are not currently running on - * this runqueue and our priority is higher than the current's - */ - if (running) { - if (p->prio > oldprio) - resched_task(rq->curr); - } else { - check_preempt_curr(rq, p); - } + + check_class_changed(rq, p, prev_class, oldprio, running); } task_rq_unlock(rq, &flags); } @@ -4253,6 +4258,7 @@ int sched_setscheduler(struct task_struct *p, int policy, { int retval, oldprio, oldpolicy = -1, on_rq, running; unsigned long flags; + const struct sched_class *prev_class = p->sched_class; struct rq *rq; /* may grab non-irq protected spin_locks */ @@ -4346,18 +4352,10 @@ recheck: if (on_rq) { if (running) p->sched_class->set_curr_task(rq); + activate_task(rq, p, 0); - /* - * Reschedule if we are currently running on this runqueue and - * our priority decreased, or if we are not currently running on - * this runqueue and our priority is higher than the current's - */ - if (running) { - if (p->prio > oldprio) - resched_task(rq->curr); - } else { - check_preempt_curr(rq, p); - } + + check_class_changed(rq, p, prev_class, oldprio, running); } __task_rq_unlock(rq); spin_unlock_irqrestore(&p->pi_lock, flags); diff --git a/kernel/sched_fair.c b/kernel/sched_fair.c index 10aa6e1ae3dd..dfa18d55561d 100644 --- a/kernel/sched_fair.c +++ b/kernel/sched_fair.c @@ -1280,6 +1280,42 @@ static void task_new_fair(struct rq *rq, struct task_struct *p) resched_task(rq->curr); } +/* + * Priority of the task has changed. 
Check to see if we preempt + * the current task. + */ +static void prio_changed_fair(struct rq *rq, struct task_struct *p, + int oldprio, int running) +{ + /* + * Reschedule if we are currently running on this runqueue and + * our priority decreased, or if we are not currently running on + * this runqueue and our priority is higher than the current's + */ + if (running) { + if (p->prio > oldprio) + resched_task(rq->curr); + } else + check_preempt_curr(rq, p); +} + +/* + * We switched to the sched_fair class. + */ +static void switched_to_fair(struct rq *rq, struct task_struct *p, + int running) +{ + /* + * We were most likely switched from sched_rt, so + * kick off the schedule if running, otherwise just see + * if we can still preempt the current task. + */ + if (running) + resched_task(rq->curr); + else + check_preempt_curr(rq, p); +} + /* Account for a task changing its policy or group. * * This routine is mostly called to set cfs_rq->curr field when a task @@ -1318,6 +1354,9 @@ static const struct sched_class fair_sched_class = { .set_curr_task = set_curr_task_fair, .task_tick = task_tick_fair, .task_new = task_new_fair, + + .prio_changed = prio_changed_fair, + .switched_to = switched_to_fair, }; #ifdef CONFIG_SCHED_DEBUG diff --git a/kernel/sched_idletask.c b/kernel/sched_idletask.c index ca5374860aef..ef7a2661fa10 100644 --- a/kernel/sched_idletask.c +++ b/kernel/sched_idletask.c @@ -69,6 +69,33 @@ static void set_curr_task_idle(struct rq *rq) { } +static void switched_to_idle(struct rq *rq, struct task_struct *p, + int running) +{ + /* Can this actually happen?? */ + if (running) + resched_task(rq->curr); + else + check_preempt_curr(rq, p); +} + +static void prio_changed_idle(struct rq *rq, struct task_struct *p, + int oldprio, int running) +{ + /* This can happen for hot plug CPUS */ + + /* + * Reschedule if we are currently running on this runqueue and + * our priority decreased, or if we are not currently running on + * this runqueue and our priority is higher than the current's + */ + if (running) { + if (p->prio > oldprio) + resched_task(rq->curr); + } else + check_preempt_curr(rq, p); +} + /* * Simple, special scheduling class for the per-CPU idle tasks: */ @@ -94,5 +121,9 @@ const struct sched_class idle_sched_class = { .set_curr_task = set_curr_task_idle, .task_tick = task_tick_idle, + + .prio_changed = prio_changed_idle, + .switched_to = switched_to_idle, + /* no .task_new for idle tasks */ }; diff --git a/kernel/sched_rt.c b/kernel/sched_rt.c index a5a45104603a..57fa3d96847b 100644 --- a/kernel/sched_rt.c +++ b/kernel/sched_rt.c @@ -779,7 +779,92 @@ static void leave_domain_rt(struct rq *rq) if (rq->rt.overloaded) rt_clear_overload(rq); } + +/* + * When switch from the rt queue, we bring ourselves to a position + * that we might want to pull RT tasks from other runqueues. + */ +static void switched_from_rt(struct rq *rq, struct task_struct *p, + int running) +{ + /* + * If there are other RT tasks then we will reschedule + * and the scheduling of the other RT tasks will handle + * the balancing. But if we are the last RT task + * we may need to handle the pulling of RT tasks + * now. + */ + if (!rq->rt.rt_nr_running) + pull_rt_task(rq); +} +#endif /* CONFIG_SMP */ + +/* + * When switching a task to RT, we may overload the runqueue + * with RT tasks. In this case we try to push them off to + * other runqueues. 
+ */ +static void switched_to_rt(struct rq *rq, struct task_struct *p, + int running) +{ + int check_resched = 1; + + /* + * If we are already running, then there's nothing + * that needs to be done. But if we are not running + * we may need to preempt the current running task. + * If that current running task is also an RT task + * then see if we can move to another run queue. + */ + if (!running) { +#ifdef CONFIG_SMP + if (rq->rt.overloaded && push_rt_task(rq) && + /* Don't resched if we changed runqueues */ + rq != task_rq(p)) + check_resched = 0; +#endif /* CONFIG_SMP */ + if (check_resched && p->prio < rq->curr->prio) + resched_task(rq->curr); + } +} + +/* + * Priority of the task has changed. This may cause + * us to initiate a push or pull. + */ +static void prio_changed_rt(struct rq *rq, struct task_struct *p, + int oldprio, int running) +{ + if (running) { +#ifdef CONFIG_SMP + /* + * If our priority decreases while running, we + * may need to pull tasks to this runqueue. + */ + if (oldprio < p->prio) + pull_rt_task(rq); + /* + * If there's a higher priority task waiting to run + * then reschedule. + */ + if (p->prio > rq->rt.highest_prio) + resched_task(p); +#else + /* For UP simply resched on drop of prio */ + if (oldprio < p->prio) + resched_task(p); #endif /* CONFIG_SMP */ + } else { + /* + * This task is not running, but if it is + * greater than the current running task + * then reschedule. + */ + if (p->prio < rq->curr->prio) + resched_task(rq->curr); + } +} + static void task_tick_rt(struct rq *rq, struct task_struct *p) { @@ -837,8 +922,12 @@ const struct sched_class rt_sched_class = { .pre_schedule = pre_schedule_rt, .post_schedule = post_schedule_rt, .task_wake_up = task_wake_up_rt, + .switched_from = switched_from_rt, #endif .set_curr_task = set_curr_task_rt, .task_tick = task_tick_rt, + + .prio_changed = prio_changed_rt, + .switched_to = switched_to_rt, }; -- cgit v1.2.3 From cdc8eb984ce47a7c90a049f45229f7b0d59ba781 Mon Sep 17 00:00:00 2001 From: Gregory Haskins Date: Fri, 25 Jan 2008 21:08:23 +0100 Subject: sched: RT-balance, only adjust overload state when changing The overload set/clears were originally idempotent when this logic was first implemented. But that is no longer true due to the addition of the atomic counter and this logic was never updated to work properly with that change. So only adjust the overload state if it is actually changing to avoid getting out of sync. Signed-off-by: Gregory Haskins Signed-off-by: Steven Rostedt Signed-off-by: Ingo Molnar --- kernel/sched_rt.c | 8 +++++--- 1 file changed, 5 insertions(+), 3 deletions(-) (limited to 'kernel') diff --git a/kernel/sched_rt.c b/kernel/sched_rt.c index 57fa3d96847b..a386758ffebb 100644 --- a/kernel/sched_rt.c +++ b/kernel/sched_rt.c @@ -34,9 +34,11 @@ static inline void rt_clear_overload(struct rq *rq) static void update_rt_migration(struct rq *rq) { if (rq->rt.rt_nr_migratory && (rq->rt.rt_nr_running > 1)) { - rt_set_overload(rq); - rq->rt.overloaded = 1; - } else { + if (!rq->rt.overloaded) { + rt_set_overload(rq); + rq->rt.overloaded = 1; + } + } else if (rq->rt.overloaded) { rt_clear_overload(rq); rq->rt.overloaded = 0; } -- cgit v1.2.3 From c49443c538c1bbf50eda27e4a3711e9fc15176b0 Mon Sep 17 00:00:00 2001 From: Gregory Haskins Date: Fri, 25 Jan 2008 21:08:23 +0100 Subject: sched: remove some old cpuset logic We had support for overlapping cpuset based rto logic in early prototypes that is no longer used, so remove it. 
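The overload-state fix above is easiest to see with the counter written out: once an atomic count backs the overload mask, calling the set path on every update (rather than only on a real transition) inflates the count past the number of overloaded runqueues. A minimal sketch of the guarded transition, with invented names and no locking:

        /* overload_idempotency_sketch.c -- why set/clear must be guarded once
         * an atomic counter is involved.  Purely illustrative names.
         */
        #include <assert.h>
        #include <stdio.h>

        static int rto_count;      /* global count of overloaded runqueues       */
        static int rq_overloaded;  /* per-runqueue flag guarding the transition  */

        static void set_overload(void)   { rto_count++; }
        static void clear_overload(void) { rto_count--; }

        /* Called on every enqueue/dequeue; may see the same state repeatedly. */
        static void update_rt_migration(int should_be_overloaded)
        {
                if (should_be_overloaded) {
                        if (!rq_overloaded) {     /* only act on a real transition */
                                set_overload();
                                rq_overloaded = 1;
                        }
                } else if (rq_overloaded) {
                        clear_overload();
                        rq_overloaded = 0;
                }
        }

        int main(void)
        {
                /* Three consecutive "still overloaded" updates must count once. */
                update_rt_migration(1);
                update_rt_migration(1);
                update_rt_migration(1);
                assert(rto_count == 1);

                update_rt_migration(0);
                update_rt_migration(0);
                assert(rto_count == 0);

                printf("guarded transitions keep rto_count in sync\n");
                return 0;
        }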
Signed-off-by: Gregory Haskins Signed-off-by: Steven Rostedt Signed-off-by: Ingo Molnar --- kernel/sched_rt.c | 33 --------------------------------- 1 file changed, 33 deletions(-) (limited to 'kernel') diff --git a/kernel/sched_rt.c b/kernel/sched_rt.c index a386758ffebb..9affb3c9d3db 100644 --- a/kernel/sched_rt.c +++ b/kernel/sched_rt.c @@ -586,38 +586,6 @@ static int pull_rt_task(struct rq *this_rq) continue; src_rq = cpu_rq(cpu); - if (unlikely(src_rq->rt.rt_nr_running <= 1)) { - /* - * It is possible that overlapping cpusets - * will miss clearing a non overloaded runqueue. - * Clear it now. - */ - if (double_lock_balance(this_rq, src_rq)) { - /* unlocked our runqueue lock */ - struct task_struct *old_next = next; - - next = pick_next_task_rt(this_rq); - if (next != old_next) - ret = 1; - } - if (likely(src_rq->rt.rt_nr_running <= 1)) { - /* - * Small chance that this_rq->curr changed - * but it's really harmless here. - */ - rt_clear_overload(this_rq); - } else { - /* - * Heh, the src_rq is now overloaded, since - * we already have the src_rq lock, go straight - * to pulling tasks from it. - */ - goto try_pulling; - } - spin_unlock(&src_rq->lock); - continue; - } - /* * We can potentially drop this_rq's lock in * double_lock_balance, and another CPU could @@ -641,7 +609,6 @@ static int pull_rt_task(struct rq *this_rq) continue; } - try_pulling: p = pick_next_highest_task_rt(src_rq, this_cpu); /* -- cgit v1.2.3 From c2d727aa2ff17a1c8e5ed1e5e231bb8579b27e82 Mon Sep 17 00:00:00 2001 From: Dipankar Sarma Date: Fri, 25 Jan 2008 21:08:23 +0100 Subject: Preempt-RCU: Use softirq instead of tasklets for This patch makes RCU use softirq instead of tasklets. It also adds a memory barrier after raising the softirq inorder to ensure that the cpu sees the most recently updated value of rcu->cur while processing callbacks. The discussion of the related theoretical race pointed out by James Huang can be found here --> http://lkml.org/lkml/2007/11/20/603 Signed-off-by: Gautham R Shenoy Signed-off-by: Steven Rostedt Signed-off-by: Dipankar Sarma Reviewed-by: Steven Rostedt Signed-off-by: Ingo Molnar --- include/linux/interrupt.h | 1 + kernel/rcupdate.c | 25 +++++++++++++++++-------- 2 files changed, 18 insertions(+), 8 deletions(-) (limited to 'kernel') diff --git a/include/linux/interrupt.h b/include/linux/interrupt.h index 2306920fa388..c3db4a00f1fa 100644 --- a/include/linux/interrupt.h +++ b/include/linux/interrupt.h @@ -256,6 +256,7 @@ enum #ifdef CONFIG_HIGH_RES_TIMERS HRTIMER_SOFTIRQ, #endif + RCU_SOFTIRQ, /* Preferable RCU should always be the last softirq */ }; /* softirq mask and active fields moved to irq_cpustat_t in diff --git a/kernel/rcupdate.c b/kernel/rcupdate.c index f2c1a04e9b18..4dfa0b792efa 100644 --- a/kernel/rcupdate.c +++ b/kernel/rcupdate.c @@ -73,8 +73,6 @@ static struct rcu_ctrlblk rcu_bh_ctrlblk = { DEFINE_PER_CPU(struct rcu_data, rcu_data) = { 0L }; DEFINE_PER_CPU(struct rcu_data, rcu_bh_data) = { 0L }; -/* Fake initialization required by compiler */ -static DEFINE_PER_CPU(struct tasklet_struct, rcu_tasklet) = {NULL}; static int blimit = 10; static int qhimark = 10000; static int qlowmark = 100; @@ -231,6 +229,18 @@ void rcu_barrier(void) } EXPORT_SYMBOL_GPL(rcu_barrier); +/* Raises the softirq for processing rcu_callbacks. */ +static inline void raise_rcu_softirq(void) +{ + raise_softirq(RCU_SOFTIRQ); + /* + * The smp_mb() here is required to ensure that this cpu's + * __rcu_process_callbacks() reads the most recently updated + * value of rcu->cur. 
+ */ + smp_mb(); +} + /* * Invoke the completed RCU callbacks. They are expected to be in * a per-cpu list. @@ -260,7 +270,7 @@ static void rcu_do_batch(struct rcu_data *rdp) if (!rdp->donelist) rdp->donetail = &rdp->donelist; else - tasklet_schedule(&per_cpu(rcu_tasklet, rdp->cpu)); + raise_rcu_softirq(); } /* @@ -412,7 +422,6 @@ static void rcu_offline_cpu(int cpu) &per_cpu(rcu_bh_data, cpu)); put_cpu_var(rcu_data); put_cpu_var(rcu_bh_data); - tasklet_kill_immediate(&per_cpu(rcu_tasklet, cpu), cpu); } #else @@ -424,7 +433,7 @@ static void rcu_offline_cpu(int cpu) #endif /* - * This does the RCU processing work from tasklet context. + * This does the RCU processing work from softirq context. */ static void __rcu_process_callbacks(struct rcu_ctrlblk *rcp, struct rcu_data *rdp) @@ -469,7 +478,7 @@ static void __rcu_process_callbacks(struct rcu_ctrlblk *rcp, rcu_do_batch(rdp); } -static void rcu_process_callbacks(unsigned long unused) +static void rcu_process_callbacks(struct softirq_action *unused) { __rcu_process_callbacks(&rcu_ctrlblk, &__get_cpu_var(rcu_data)); __rcu_process_callbacks(&rcu_bh_ctrlblk, &__get_cpu_var(rcu_bh_data)); @@ -533,7 +542,7 @@ void rcu_check_callbacks(int cpu, int user) rcu_bh_qsctr_inc(cpu); } else if (!in_softirq()) rcu_bh_qsctr_inc(cpu); - tasklet_schedule(&per_cpu(rcu_tasklet, cpu)); + raise_rcu_softirq(); } static void rcu_init_percpu_data(int cpu, struct rcu_ctrlblk *rcp, @@ -556,7 +565,7 @@ static void __cpuinit rcu_online_cpu(int cpu) rcu_init_percpu_data(cpu, &rcu_ctrlblk, rdp); rcu_init_percpu_data(cpu, &rcu_bh_ctrlblk, bh_rdp); - tasklet_init(&per_cpu(rcu_tasklet, cpu), rcu_process_callbacks, 0UL); + open_softirq(RCU_SOFTIRQ, rcu_process_callbacks, NULL); } static int __cpuinit rcu_cpu_notify(struct notifier_block *self, -- cgit v1.2.3 From 01c1c660f4b8086cad7a62345fd04290f3d82c8f Mon Sep 17 00:00:00 2001 From: "Paul E. McKenney" Date: Fri, 25 Jan 2008 21:08:24 +0100 Subject: Preempt-RCU: reorganize RCU code into rcuclassic.c and rcupdate.c This patch re-organizes the RCU code to enable multiple implementations of RCU. Users of RCU continues to include rcupdate.h and the RCU interfaces remain the same. This is in preparation for subsequently merging the preemptible RCU implementation. Signed-off-by: Gautham R Shenoy Signed-off-by: Dipankar Sarma Signed-off-by: Paul E. McKenney Reviewed-by: Steven Rostedt Signed-off-by: Ingo Molnar --- include/linux/rcuclassic.h | 161 +++++++++++++ include/linux/rcupdate.h | 168 ++++--------- kernel/Makefile | 2 +- kernel/rcuclassic.c | 576 +++++++++++++++++++++++++++++++++++++++++++++ kernel/rcupdate.c | 575 ++------------------------------------------ 5 files changed, 812 insertions(+), 670 deletions(-) create mode 100644 include/linux/rcuclassic.h create mode 100644 kernel/rcuclassic.c (limited to 'kernel') diff --git a/include/linux/rcuclassic.h b/include/linux/rcuclassic.h new file mode 100644 index 000000000000..2b8b045a51d5 --- /dev/null +++ b/include/linux/rcuclassic.h @@ -0,0 +1,161 @@ +/* + * Read-Copy Update mechanism for mutual exclusion (classic version) + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation; either version 2 of the License, or + * (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. 
See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA. + * + * Copyright IBM Corporation, 2001 + * + * Author: Dipankar Sarma + * + * Based on the original work by Paul McKenney + * and inputs from Rusty Russell, Andrea Arcangeli and Andi Kleen. + * Papers: + * http://www.rdrop.com/users/paulmck/paper/rclockpdcsproof.pdf + * http://lse.sourceforge.net/locking/rclock_OLS.2001.05.01c.sc.pdf (OLS2001) + * + * For detailed explanation of Read-Copy Update mechanism see - + * Documentation/RCU + * + */ + +#ifndef __LINUX_RCUCLASSIC_H +#define __LINUX_RCUCLASSIC_H + +#ifdef __KERNEL__ + +#include +#include +#include +#include +#include +#include + + +/* Global control variables for rcupdate callback mechanism. */ +struct rcu_ctrlblk { + long cur; /* Current batch number. */ + long completed; /* Number of the last completed batch */ + int next_pending; /* Is the next batch already waiting? */ + + int signaled; + + spinlock_t lock ____cacheline_internodealigned_in_smp; + cpumask_t cpumask; /* CPUs that need to switch in order */ + /* for current batch to proceed. */ +} ____cacheline_internodealigned_in_smp; + +/* Is batch a before batch b ? */ +static inline int rcu_batch_before(long a, long b) +{ + return (a - b) < 0; +} + +/* Is batch a after batch b ? */ +static inline int rcu_batch_after(long a, long b) +{ + return (a - b) > 0; +} + +/* + * Per-CPU data for Read-Copy UPdate. + * nxtlist - new callbacks are added here + * curlist - current batch for which quiescent cycle started if any + */ +struct rcu_data { + /* 1) quiescent state handling : */ + long quiescbatch; /* Batch # for grace period */ + int passed_quiesc; /* User-mode/idle loop etc. */ + int qs_pending; /* core waits for quiesc state */ + + /* 2) batch handling */ + long batch; /* Batch # for current RCU batch */ + struct rcu_head *nxtlist; + struct rcu_head **nxttail; + long qlen; /* # of queued callbacks */ + struct rcu_head *curlist; + struct rcu_head **curtail; + struct rcu_head *donelist; + struct rcu_head **donetail; + long blimit; /* Upper limit on a processed batch */ + int cpu; + struct rcu_head barrier; +}; + +DECLARE_PER_CPU(struct rcu_data, rcu_data); +DECLARE_PER_CPU(struct rcu_data, rcu_bh_data); + +/* + * Increment the quiescent state counter. + * The counter is a bit degenerated: We do not need to know + * how many quiescent states passed, just if there was at least + * one since the start of the grace period. Thus just a flag. 
+ */ +static inline void rcu_qsctr_inc(int cpu) +{ + struct rcu_data *rdp = &per_cpu(rcu_data, cpu); + rdp->passed_quiesc = 1; +} +static inline void rcu_bh_qsctr_inc(int cpu) +{ + struct rcu_data *rdp = &per_cpu(rcu_bh_data, cpu); + rdp->passed_quiesc = 1; +} + +extern int rcu_pending(int cpu); +extern int rcu_needs_cpu(int cpu); + +#ifdef CONFIG_DEBUG_LOCK_ALLOC +extern struct lockdep_map rcu_lock_map; +# define rcu_read_acquire() \ + lock_acquire(&rcu_lock_map, 0, 0, 2, 1, _THIS_IP_) +# define rcu_read_release() lock_release(&rcu_lock_map, 1, _THIS_IP_) +#else +# define rcu_read_acquire() do { } while (0) +# define rcu_read_release() do { } while (0) +#endif + +#define __rcu_read_lock() \ + do { \ + preempt_disable(); \ + __acquire(RCU); \ + rcu_read_acquire(); \ + } while (0) +#define __rcu_read_unlock() \ + do { \ + rcu_read_release(); \ + __release(RCU); \ + preempt_enable(); \ + } while (0) +#define __rcu_read_lock_bh() \ + do { \ + local_bh_disable(); \ + __acquire(RCU_BH); \ + rcu_read_acquire(); \ + } while (0) +#define __rcu_read_unlock_bh() \ + do { \ + rcu_read_release(); \ + __release(RCU_BH); \ + local_bh_enable(); \ + } while (0) + +#define __synchronize_sched() synchronize_rcu() + +extern void __rcu_init(void); +extern void rcu_check_callbacks(int cpu, int user); +extern void rcu_restart_cpu(int cpu); + +#endif /* __KERNEL__ */ +#endif /* __LINUX_RCUCLASSIC_H */ diff --git a/include/linux/rcupdate.h b/include/linux/rcupdate.h index cc24a01df940..12aa13e13150 100644 --- a/include/linux/rcupdate.h +++ b/include/linux/rcupdate.h @@ -15,7 +15,7 @@ * along with this program; if not, write to the Free Software * Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA. * - * Copyright (C) IBM Corporation, 2001 + * Copyright IBM Corporation, 2001 * * Author: Dipankar Sarma * @@ -53,96 +53,14 @@ struct rcu_head { void (*func)(struct rcu_head *head); }; +#include + #define RCU_HEAD_INIT { .next = NULL, .func = NULL } #define RCU_HEAD(head) struct rcu_head head = RCU_HEAD_INIT #define INIT_RCU_HEAD(ptr) do { \ (ptr)->next = NULL; (ptr)->func = NULL; \ } while (0) - - -/* Global control variables for rcupdate callback mechanism. */ -struct rcu_ctrlblk { - long cur; /* Current batch number. */ - long completed; /* Number of the last completed batch */ - int next_pending; /* Is the next batch already waiting? */ - - int signaled; - - spinlock_t lock ____cacheline_internodealigned_in_smp; - cpumask_t cpumask; /* CPUs that need to switch in order */ - /* for current batch to proceed. */ -} ____cacheline_internodealigned_in_smp; - -/* Is batch a before batch b ? */ -static inline int rcu_batch_before(long a, long b) -{ - return (a - b) < 0; -} - -/* Is batch a after batch b ? */ -static inline int rcu_batch_after(long a, long b) -{ - return (a - b) > 0; -} - -/* - * Per-CPU data for Read-Copy UPdate. - * nxtlist - new callbacks are added here - * curlist - current batch for which quiescent cycle started if any - */ -struct rcu_data { - /* 1) quiescent state handling : */ - long quiescbatch; /* Batch # for grace period */ - int passed_quiesc; /* User-mode/idle loop etc. 
*/ - int qs_pending; /* core waits for quiesc state */ - - /* 2) batch handling */ - long batch; /* Batch # for current RCU batch */ - struct rcu_head *nxtlist; - struct rcu_head **nxttail; - long qlen; /* # of queued callbacks */ - struct rcu_head *curlist; - struct rcu_head **curtail; - struct rcu_head *donelist; - struct rcu_head **donetail; - long blimit; /* Upper limit on a processed batch */ - int cpu; - struct rcu_head barrier; -}; - -DECLARE_PER_CPU(struct rcu_data, rcu_data); -DECLARE_PER_CPU(struct rcu_data, rcu_bh_data); - -/* - * Increment the quiescent state counter. - * The counter is a bit degenerated: We do not need to know - * how many quiescent states passed, just if there was at least - * one since the start of the grace period. Thus just a flag. - */ -static inline void rcu_qsctr_inc(int cpu) -{ - struct rcu_data *rdp = &per_cpu(rcu_data, cpu); - rdp->passed_quiesc = 1; -} -static inline void rcu_bh_qsctr_inc(int cpu) -{ - struct rcu_data *rdp = &per_cpu(rcu_bh_data, cpu); - rdp->passed_quiesc = 1; -} - -extern int rcu_pending(int cpu); -extern int rcu_needs_cpu(int cpu); - -#ifdef CONFIG_DEBUG_LOCK_ALLOC -extern struct lockdep_map rcu_lock_map; -# define rcu_read_acquire() lock_acquire(&rcu_lock_map, 0, 0, 2, 1, _THIS_IP_) -# define rcu_read_release() lock_release(&rcu_lock_map, 1, _THIS_IP_) -#else -# define rcu_read_acquire() do { } while (0) -# define rcu_read_release() do { } while (0) -#endif - /** * rcu_read_lock - mark the beginning of an RCU read-side critical section. * @@ -172,24 +90,13 @@ extern struct lockdep_map rcu_lock_map; * * It is illegal to block while in an RCU read-side critical section. */ -#define rcu_read_lock() \ - do { \ - preempt_disable(); \ - __acquire(RCU); \ - rcu_read_acquire(); \ - } while(0) +#define rcu_read_lock() __rcu_read_lock() /** * rcu_read_unlock - marks the end of an RCU read-side critical section. * * See rcu_read_lock() for more information. */ -#define rcu_read_unlock() \ - do { \ - rcu_read_release(); \ - __release(RCU); \ - preempt_enable(); \ - } while(0) /* * So where is rcu_write_lock()? It does not exist, as there is no @@ -200,6 +107,7 @@ extern struct lockdep_map rcu_lock_map; * used as well. RCU does not care how the writers keep out of each * others' way, as long as they do so. */ +#define rcu_read_unlock() __rcu_read_unlock() /** * rcu_read_lock_bh - mark the beginning of a softirq-only RCU critical section @@ -212,24 +120,14 @@ extern struct lockdep_map rcu_lock_map; * can use just rcu_read_lock(). * */ -#define rcu_read_lock_bh() \ - do { \ - local_bh_disable(); \ - __acquire(RCU_BH); \ - rcu_read_acquire(); \ - } while(0) +#define rcu_read_lock_bh() __rcu_read_lock_bh() /* * rcu_read_unlock_bh - marks the end of a softirq-only RCU critical section * * See rcu_read_lock_bh() for more information. */ -#define rcu_read_unlock_bh() \ - do { \ - rcu_read_release(); \ - __release(RCU_BH); \ - local_bh_enable(); \ - } while(0) +#define rcu_read_unlock_bh() __rcu_read_unlock_bh() /* * Prevent the compiler from merging or refetching accesses. The compiler @@ -293,21 +191,53 @@ extern struct lockdep_map rcu_lock_map; * In "classic RCU", these two guarantees happen to be one and * the same, but can differ in realtime RCU implementations. */ -#define synchronize_sched() synchronize_rcu() +#define synchronize_sched() __synchronize_sched() + +/** + * call_rcu - Queue an RCU callback for invocation after a grace period. + * @head: structure to be used for queueing the RCU updates. 
+ * @func: actual update function to be invoked after the grace period + * + * The update function will be invoked some time after a full grace + * period elapses, in other words after all currently executing RCU + * read-side critical sections have completed. RCU read-side critical + * sections are delimited by rcu_read_lock() and rcu_read_unlock(), + * and may be nested. + */ +extern void call_rcu(struct rcu_head *head, + void (*func)(struct rcu_head *head)); + +/** + * call_rcu_bh - Queue an RCU for invocation after a quicker grace period. + * @head: structure to be used for queueing the RCU updates. + * @func: actual update function to be invoked after the grace period + * + * The update function will be invoked some time after a full grace + * period elapses, in other words after all currently executing RCU + * read-side critical sections have completed. call_rcu_bh() assumes + * that the read-side critical sections end on completion of a softirq + * handler. This means that read-side critical sections in process + * context must not be interrupted by softirqs. This interface is to be + * used when most of the read-side critical sections are in softirq context. + * RCU read-side critical sections are delimited by : + * - rcu_read_lock() and rcu_read_unlock(), if in interrupt context. + * OR + * - rcu_read_lock_bh() and rcu_read_unlock_bh(), if in process context. + * These may be nested. + */ +extern void call_rcu_bh(struct rcu_head *head, + void (*func)(struct rcu_head *head)); + +/* Exported common interfaces */ +extern void synchronize_rcu(void); +extern void rcu_barrier(void); +/* Internal to kernel */ extern void rcu_init(void); extern void rcu_check_callbacks(int cpu, int user); -extern void rcu_restart_cpu(int cpu); + extern long rcu_batches_completed(void); extern long rcu_batches_completed_bh(void); -/* Exported interfaces */ -extern void FASTCALL(call_rcu(struct rcu_head *head, - void (*func)(struct rcu_head *head))); -extern void FASTCALL(call_rcu_bh(struct rcu_head *head, - void (*func)(struct rcu_head *head))); -extern void synchronize_rcu(void); -extern void rcu_barrier(void); - #endif /* __KERNEL__ */ #endif /* __LINUX_RCUPDATE_H */ diff --git a/kernel/Makefile b/kernel/Makefile index dfa96956dae0..def5dd6097a0 100644 --- a/kernel/Makefile +++ b/kernel/Makefile @@ -6,7 +6,7 @@ obj-y = sched.o fork.o exec_domain.o panic.o printk.o profile.o \ exit.o itimer.o time.o softirq.o resource.o \ sysctl.o capability.o ptrace.o timer.o user.o user_namespace.o \ signal.o sys.o kmod.o workqueue.o pid.o \ - rcupdate.o extable.o params.o posix-timers.o \ + rcupdate.o rcuclassic.o extable.o params.o posix-timers.o \ kthread.o wait.o kfifo.o sys_ni.o posix-cpu-timers.o mutex.o \ hrtimer.o rwsem.o latency.o nsproxy.o srcu.o \ utsname.o notifier.o diff --git a/kernel/rcuclassic.c b/kernel/rcuclassic.c new file mode 100644 index 000000000000..18369e3386e2 --- /dev/null +++ b/kernel/rcuclassic.c @@ -0,0 +1,576 @@ +/* + * Read-Copy Update mechanism for mutual exclusion + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation; either version 2 of the License, or + * (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. 
+ * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA. + * + * Copyright IBM Corporation, 2001 + * + * Authors: Dipankar Sarma + * Manfred Spraul + * + * Based on the original work by Paul McKenney + * and inputs from Rusty Russell, Andrea Arcangeli and Andi Kleen. + * Papers: + * http://www.rdrop.com/users/paulmck/paper/rclockpdcsproof.pdf + * http://lse.sourceforge.net/locking/rclock_OLS.2001.05.01c.sc.pdf (OLS2001) + * + * For detailed explanation of Read-Copy Update mechanism see - + * Documentation/RCU + * + */ +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +/* #include @@@ */ +#include +#include + +#ifdef CONFIG_DEBUG_LOCK_ALLOC +static struct lock_class_key rcu_lock_key; +struct lockdep_map rcu_lock_map = + STATIC_LOCKDEP_MAP_INIT("rcu_read_lock", &rcu_lock_key); +EXPORT_SYMBOL_GPL(rcu_lock_map); +#endif + + +/* Definition for rcupdate control block. */ +static struct rcu_ctrlblk rcu_ctrlblk = { + .cur = -300, + .completed = -300, + .lock = __SPIN_LOCK_UNLOCKED(&rcu_ctrlblk.lock), + .cpumask = CPU_MASK_NONE, +}; +static struct rcu_ctrlblk rcu_bh_ctrlblk = { + .cur = -300, + .completed = -300, + .lock = __SPIN_LOCK_UNLOCKED(&rcu_bh_ctrlblk.lock), + .cpumask = CPU_MASK_NONE, +}; + +DEFINE_PER_CPU(struct rcu_data, rcu_data) = { 0L }; +DEFINE_PER_CPU(struct rcu_data, rcu_bh_data) = { 0L }; + +static int blimit = 10; +static int qhimark = 10000; +static int qlowmark = 100; + +#ifdef CONFIG_SMP +static void force_quiescent_state(struct rcu_data *rdp, + struct rcu_ctrlblk *rcp) +{ + int cpu; + cpumask_t cpumask; + set_need_resched(); + if (unlikely(!rcp->signaled)) { + rcp->signaled = 1; + /* + * Don't send IPI to itself. With irqs disabled, + * rdp->cpu is the current cpu. + */ + cpumask = rcp->cpumask; + cpu_clear(rdp->cpu, cpumask); + for_each_cpu_mask(cpu, cpumask) + smp_send_reschedule(cpu); + } +} +#else +static inline void force_quiescent_state(struct rcu_data *rdp, + struct rcu_ctrlblk *rcp) +{ + set_need_resched(); +} +#endif + +/** + * call_rcu - Queue an RCU callback for invocation after a grace period. + * @head: structure to be used for queueing the RCU updates. + * @func: actual update function to be invoked after the grace period + * + * The update function will be invoked some time after a full grace + * period elapses, in other words after all currently executing RCU + * read-side critical sections have completed. RCU read-side critical + * sections are delimited by rcu_read_lock() and rcu_read_unlock(), + * and may be nested. + */ +void call_rcu(struct rcu_head *head, + void (*func)(struct rcu_head *rcu)) +{ + unsigned long flags; + struct rcu_data *rdp; + + head->func = func; + head->next = NULL; + local_irq_save(flags); + rdp = &__get_cpu_var(rcu_data); + *rdp->nxttail = head; + rdp->nxttail = &head->next; + if (unlikely(++rdp->qlen > qhimark)) { + rdp->blimit = INT_MAX; + force_quiescent_state(rdp, &rcu_ctrlblk); + } + local_irq_restore(flags); +} +EXPORT_SYMBOL_GPL(call_rcu); + +/** + * call_rcu_bh - Queue an RCU for invocation after a quicker grace period. + * @head: structure to be used for queueing the RCU updates. 
+ * @func: actual update function to be invoked after the grace period + * + * The update function will be invoked some time after a full grace + * period elapses, in other words after all currently executing RCU + * read-side critical sections have completed. call_rcu_bh() assumes + * that the read-side critical sections end on completion of a softirq + * handler. This means that read-side critical sections in process + * context must not be interrupted by softirqs. This interface is to be + * used when most of the read-side critical sections are in softirq context. + * RCU read-side critical sections are delimited by rcu_read_lock() and + * rcu_read_unlock(), * if in interrupt context or rcu_read_lock_bh() + * and rcu_read_unlock_bh(), if in process context. These may be nested. + */ +void call_rcu_bh(struct rcu_head *head, + void (*func)(struct rcu_head *rcu)) +{ + unsigned long flags; + struct rcu_data *rdp; + + head->func = func; + head->next = NULL; + local_irq_save(flags); + rdp = &__get_cpu_var(rcu_bh_data); + *rdp->nxttail = head; + rdp->nxttail = &head->next; + + if (unlikely(++rdp->qlen > qhimark)) { + rdp->blimit = INT_MAX; + force_quiescent_state(rdp, &rcu_bh_ctrlblk); + } + + local_irq_restore(flags); +} +EXPORT_SYMBOL_GPL(call_rcu_bh); + +/* + * Return the number of RCU batches processed thus far. Useful + * for debug and statistics. + */ +long rcu_batches_completed(void) +{ + return rcu_ctrlblk.completed; +} +EXPORT_SYMBOL_GPL(rcu_batches_completed); + +/* + * Return the number of RCU batches processed thus far. Useful + * for debug and statistics. + */ +long rcu_batches_completed_bh(void) +{ + return rcu_bh_ctrlblk.completed; +} +EXPORT_SYMBOL_GPL(rcu_batches_completed_bh); + +/* Raises the softirq for processing rcu_callbacks. */ +static inline void raise_rcu_softirq(void) +{ + raise_softirq(RCU_SOFTIRQ); + /* + * The smp_mb() here is required to ensure that this cpu's + * __rcu_process_callbacks() reads the most recently updated + * value of rcu->cur. + */ + smp_mb(); +} + +/* + * Invoke the completed RCU callbacks. They are expected to be in + * a per-cpu list. + */ +static void rcu_do_batch(struct rcu_data *rdp) +{ + struct rcu_head *next, *list; + int count = 0; + + list = rdp->donelist; + while (list) { + next = list->next; + prefetch(next); + list->func(list); + list = next; + if (++count >= rdp->blimit) + break; + } + rdp->donelist = list; + + local_irq_disable(); + rdp->qlen -= count; + local_irq_enable(); + if (rdp->blimit == INT_MAX && rdp->qlen <= qlowmark) + rdp->blimit = blimit; + + if (!rdp->donelist) + rdp->donetail = &rdp->donelist; + else + raise_rcu_softirq(); +} + +/* + * Grace period handling: + * The grace period handling consists out of two steps: + * - A new grace period is started. + * This is done by rcu_start_batch. The start is not broadcasted to + * all cpus, they must pick this up by comparing rcp->cur with + * rdp->quiescbatch. All cpus are recorded in the + * rcu_ctrlblk.cpumask bitmap. + * - All cpus must go through a quiescent state. + * Since the start of the grace period is not broadcasted, at least two + * calls to rcu_check_quiescent_state are required: + * The first call just notices that a new grace period is running. The + * following calls check if there was a quiescent state since the beginning + * of the grace period. If so, it updates rcu_ctrlblk.cpumask. If + * the bitmap is empty, then the grace period is completed. + * rcu_check_quiescent_state calls rcu_start_batch(0) to start the next grace + * period (if necessary). 
+ */ +/* + * Register a new batch of callbacks, and start it up if there is currently no + * active batch and the batch to be registered has not already occurred. + * Caller must hold rcu_ctrlblk.lock. + */ +static void rcu_start_batch(struct rcu_ctrlblk *rcp) +{ + if (rcp->next_pending && + rcp->completed == rcp->cur) { + rcp->next_pending = 0; + /* + * next_pending == 0 must be visible in + * __rcu_process_callbacks() before it can see new value of cur. + */ + smp_wmb(); + rcp->cur++; + + /* + * Accessing nohz_cpu_mask before incrementing rcp->cur needs a + * Barrier Otherwise it can cause tickless idle CPUs to be + * included in rcp->cpumask, which will extend graceperiods + * unnecessarily. + */ + smp_mb(); + cpus_andnot(rcp->cpumask, cpu_online_map, nohz_cpu_mask); + + rcp->signaled = 0; + } +} + +/* + * cpu went through a quiescent state since the beginning of the grace period. + * Clear it from the cpu mask and complete the grace period if it was the last + * cpu. Start another grace period if someone has further entries pending + */ +static void cpu_quiet(int cpu, struct rcu_ctrlblk *rcp) +{ + cpu_clear(cpu, rcp->cpumask); + if (cpus_empty(rcp->cpumask)) { + /* batch completed ! */ + rcp->completed = rcp->cur; + rcu_start_batch(rcp); + } +} + +/* + * Check if the cpu has gone through a quiescent state (say context + * switch). If so and if it already hasn't done so in this RCU + * quiescent cycle, then indicate that it has done so. + */ +static void rcu_check_quiescent_state(struct rcu_ctrlblk *rcp, + struct rcu_data *rdp) +{ + if (rdp->quiescbatch != rcp->cur) { + /* start new grace period: */ + rdp->qs_pending = 1; + rdp->passed_quiesc = 0; + rdp->quiescbatch = rcp->cur; + return; + } + + /* Grace period already completed for this cpu? + * qs_pending is checked instead of the actual bitmap to avoid + * cacheline trashing. + */ + if (!rdp->qs_pending) + return; + + /* + * Was there a quiescent state since the beginning of the grace + * period? If no, then exit and wait for the next call. + */ + if (!rdp->passed_quiesc) + return; + rdp->qs_pending = 0; + + spin_lock(&rcp->lock); + /* + * rdp->quiescbatch/rcp->cur and the cpu bitmap can come out of sync + * during cpu startup. Ignore the quiescent state. + */ + if (likely(rdp->quiescbatch == rcp->cur)) + cpu_quiet(rdp->cpu, rcp); + + spin_unlock(&rcp->lock); +} + + +#ifdef CONFIG_HOTPLUG_CPU + +/* warning! helper for rcu_offline_cpu. do not use elsewhere without reviewing + * locking requirements, the list it's pulling from has to belong to a cpu + * which is dead and hence not processing interrupts. 
+ */ +static void rcu_move_batch(struct rcu_data *this_rdp, struct rcu_head *list, + struct rcu_head **tail) +{ + local_irq_disable(); + *this_rdp->nxttail = list; + if (list) + this_rdp->nxttail = tail; + local_irq_enable(); +} + +static void __rcu_offline_cpu(struct rcu_data *this_rdp, + struct rcu_ctrlblk *rcp, struct rcu_data *rdp) +{ + /* if the cpu going offline owns the grace period + * we can block indefinitely waiting for it, so flush + * it here + */ + spin_lock_bh(&rcp->lock); + if (rcp->cur != rcp->completed) + cpu_quiet(rdp->cpu, rcp); + spin_unlock_bh(&rcp->lock); + rcu_move_batch(this_rdp, rdp->curlist, rdp->curtail); + rcu_move_batch(this_rdp, rdp->nxtlist, rdp->nxttail); + rcu_move_batch(this_rdp, rdp->donelist, rdp->donetail); +} + +static void rcu_offline_cpu(int cpu) +{ + struct rcu_data *this_rdp = &get_cpu_var(rcu_data); + struct rcu_data *this_bh_rdp = &get_cpu_var(rcu_bh_data); + + __rcu_offline_cpu(this_rdp, &rcu_ctrlblk, + &per_cpu(rcu_data, cpu)); + __rcu_offline_cpu(this_bh_rdp, &rcu_bh_ctrlblk, + &per_cpu(rcu_bh_data, cpu)); + put_cpu_var(rcu_data); + put_cpu_var(rcu_bh_data); +} + +#else + +static void rcu_offline_cpu(int cpu) +{ +} + +#endif + +/* + * This does the RCU processing work from softirq context. + */ +static void __rcu_process_callbacks(struct rcu_ctrlblk *rcp, + struct rcu_data *rdp) +{ + if (rdp->curlist && !rcu_batch_before(rcp->completed, rdp->batch)) { + *rdp->donetail = rdp->curlist; + rdp->donetail = rdp->curtail; + rdp->curlist = NULL; + rdp->curtail = &rdp->curlist; + } + + if (rdp->nxtlist && !rdp->curlist) { + local_irq_disable(); + rdp->curlist = rdp->nxtlist; + rdp->curtail = rdp->nxttail; + rdp->nxtlist = NULL; + rdp->nxttail = &rdp->nxtlist; + local_irq_enable(); + + /* + * start the next batch of callbacks + */ + + /* determine batch number */ + rdp->batch = rcp->cur + 1; + /* see the comment and corresponding wmb() in + * the rcu_start_batch() + */ + smp_rmb(); + + if (!rcp->next_pending) { + /* and start it/schedule start if it's a new batch */ + spin_lock(&rcp->lock); + rcp->next_pending = 1; + rcu_start_batch(rcp); + spin_unlock(&rcp->lock); + } + } + + rcu_check_quiescent_state(rcp, rdp); + if (rdp->donelist) + rcu_do_batch(rdp); +} + +static void rcu_process_callbacks(struct softirq_action *unused) +{ + __rcu_process_callbacks(&rcu_ctrlblk, &__get_cpu_var(rcu_data)); + __rcu_process_callbacks(&rcu_bh_ctrlblk, &__get_cpu_var(rcu_bh_data)); +} + +static int __rcu_pending(struct rcu_ctrlblk *rcp, struct rcu_data *rdp) +{ + /* This cpu has pending rcu entries and the grace period + * for them has completed. + */ + if (rdp->curlist && !rcu_batch_before(rcp->completed, rdp->batch)) + return 1; + + /* This cpu has no pending entries, but there are new entries */ + if (!rdp->curlist && rdp->nxtlist) + return 1; + + /* This cpu has finished callbacks to invoke */ + if (rdp->donelist) + return 1; + + /* The rcu core waits for a quiescent state from the cpu */ + if (rdp->quiescbatch != rcp->cur || rdp->qs_pending) + return 1; + + /* nothing to do */ + return 0; +} + +/* + * Check to see if there is any immediate RCU-related work to be done + * by the current CPU, returning 1 if so. This function is part of the + * RCU implementation; it is -not- an exported member of the RCU API. 
+ */ +int rcu_pending(int cpu) +{ + return __rcu_pending(&rcu_ctrlblk, &per_cpu(rcu_data, cpu)) || + __rcu_pending(&rcu_bh_ctrlblk, &per_cpu(rcu_bh_data, cpu)); +} + +/* + * Check to see if any future RCU-related work will need to be done + * by the current CPU, even if none need be done immediately, returning + * 1 if so. This function is part of the RCU implementation; it is -not- + * an exported member of the RCU API. + */ +int rcu_needs_cpu(int cpu) +{ + struct rcu_data *rdp = &per_cpu(rcu_data, cpu); + struct rcu_data *rdp_bh = &per_cpu(rcu_bh_data, cpu); + + return (!!rdp->curlist || !!rdp_bh->curlist || rcu_pending(cpu)); +} + +void rcu_check_callbacks(int cpu, int user) +{ + if (user || + (idle_cpu(cpu) && !in_softirq() && + hardirq_count() <= (1 << HARDIRQ_SHIFT))) { + rcu_qsctr_inc(cpu); + rcu_bh_qsctr_inc(cpu); + } else if (!in_softirq()) + rcu_bh_qsctr_inc(cpu); + raise_rcu_softirq(); +} + +static void rcu_init_percpu_data(int cpu, struct rcu_ctrlblk *rcp, + struct rcu_data *rdp) +{ + memset(rdp, 0, sizeof(*rdp)); + rdp->curtail = &rdp->curlist; + rdp->nxttail = &rdp->nxtlist; + rdp->donetail = &rdp->donelist; + rdp->quiescbatch = rcp->completed; + rdp->qs_pending = 0; + rdp->cpu = cpu; + rdp->blimit = blimit; +} + +static void __cpuinit rcu_online_cpu(int cpu) +{ + struct rcu_data *rdp = &per_cpu(rcu_data, cpu); + struct rcu_data *bh_rdp = &per_cpu(rcu_bh_data, cpu); + + rcu_init_percpu_data(cpu, &rcu_ctrlblk, rdp); + rcu_init_percpu_data(cpu, &rcu_bh_ctrlblk, bh_rdp); + open_softirq(RCU_SOFTIRQ, rcu_process_callbacks, NULL); +} + +static int __cpuinit rcu_cpu_notify(struct notifier_block *self, + unsigned long action, void *hcpu) +{ + long cpu = (long)hcpu; + + switch (action) { + case CPU_UP_PREPARE: + case CPU_UP_PREPARE_FROZEN: + rcu_online_cpu(cpu); + break; + case CPU_DEAD: + case CPU_DEAD_FROZEN: + rcu_offline_cpu(cpu); + break; + default: + break; + } + return NOTIFY_OK; +} + +static struct notifier_block __cpuinitdata rcu_nb = { + .notifier_call = rcu_cpu_notify, +}; + +/* + * Initializes rcu mechanism. Assumed to be called early. + * That is before local timer(SMP) or jiffie timer (uniproc) is setup. + * Note that rcu_qsctr and friends are implicitly + * initialized due to the choice of ``0'' for RCU_CTR_INVALID. + */ +void __init __rcu_init(void) +{ + rcu_cpu_notify(&rcu_nb, CPU_UP_PREPARE, + (void *)(long)smp_processor_id()); + /* Register notifier for non-boot CPUs */ + register_cpu_notifier(&rcu_nb); +} + +module_param(blimit, int, 0); +module_param(qhimark, int, 0); +module_param(qlowmark, int, 0); diff --git a/kernel/rcupdate.c b/kernel/rcupdate.c index 4dfa0b792efa..0ccd0095ebdc 100644 --- a/kernel/rcupdate.c +++ b/kernel/rcupdate.c @@ -15,7 +15,7 @@ * along with this program; if not, write to the Free Software * Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA. * - * Copyright (C) IBM Corporation, 2001 + * Copyright IBM Corporation, 2001 * * Authors: Dipankar Sarma * Manfred Spraul @@ -35,163 +35,57 @@ #include #include #include -#include #include #include #include #include -#include #include -#include #include #include #include #include +#include -#ifdef CONFIG_DEBUG_LOCK_ALLOC -static struct lock_class_key rcu_lock_key; -struct lockdep_map rcu_lock_map = - STATIC_LOCKDEP_MAP_INIT("rcu_read_lock", &rcu_lock_key); - -EXPORT_SYMBOL_GPL(rcu_lock_map); -#endif - -/* Definition for rcupdate control block. 
*/ -static struct rcu_ctrlblk rcu_ctrlblk = { - .cur = -300, - .completed = -300, - .lock = __SPIN_LOCK_UNLOCKED(&rcu_ctrlblk.lock), - .cpumask = CPU_MASK_NONE, -}; -static struct rcu_ctrlblk rcu_bh_ctrlblk = { - .cur = -300, - .completed = -300, - .lock = __SPIN_LOCK_UNLOCKED(&rcu_bh_ctrlblk.lock), - .cpumask = CPU_MASK_NONE, +struct rcu_synchronize { + struct rcu_head head; + struct completion completion; }; -DEFINE_PER_CPU(struct rcu_data, rcu_data) = { 0L }; -DEFINE_PER_CPU(struct rcu_data, rcu_bh_data) = { 0L }; - -static int blimit = 10; -static int qhimark = 10000; -static int qlowmark = 100; - +static DEFINE_PER_CPU(struct rcu_head, rcu_barrier_head) = {NULL}; static atomic_t rcu_barrier_cpu_count; static DEFINE_MUTEX(rcu_barrier_mutex); static struct completion rcu_barrier_completion; -#ifdef CONFIG_SMP -static void force_quiescent_state(struct rcu_data *rdp, - struct rcu_ctrlblk *rcp) -{ - int cpu; - cpumask_t cpumask; - set_need_resched(); - if (unlikely(!rcp->signaled)) { - rcp->signaled = 1; - /* - * Don't send IPI to itself. With irqs disabled, - * rdp->cpu is the current cpu. - */ - cpumask = rcp->cpumask; - cpu_clear(rdp->cpu, cpumask); - for_each_cpu_mask(cpu, cpumask) - smp_send_reschedule(cpu); - } -} -#else -static inline void force_quiescent_state(struct rcu_data *rdp, - struct rcu_ctrlblk *rcp) +/* Because of FASTCALL declaration of complete, we use this wrapper */ +static void wakeme_after_rcu(struct rcu_head *head) { - set_need_resched(); + struct rcu_synchronize *rcu; + + rcu = container_of(head, struct rcu_synchronize, head); + complete(&rcu->completion); } -#endif /** - * call_rcu - Queue an RCU callback for invocation after a grace period. - * @head: structure to be used for queueing the RCU updates. - * @func: actual update function to be invoked after the grace period + * synchronize_rcu - wait until a grace period has elapsed. * - * The update function will be invoked some time after a full grace - * period elapses, in other words after all currently executing RCU + * Control will return to the caller some time after a full grace + * period has elapsed, in other words after all currently executing RCU * read-side critical sections have completed. RCU read-side critical * sections are delimited by rcu_read_lock() and rcu_read_unlock(), * and may be nested. */ -void fastcall call_rcu(struct rcu_head *head, - void (*func)(struct rcu_head *rcu)) -{ - unsigned long flags; - struct rcu_data *rdp; - - head->func = func; - head->next = NULL; - local_irq_save(flags); - rdp = &__get_cpu_var(rcu_data); - *rdp->nxttail = head; - rdp->nxttail = &head->next; - if (unlikely(++rdp->qlen > qhimark)) { - rdp->blimit = INT_MAX; - force_quiescent_state(rdp, &rcu_ctrlblk); - } - local_irq_restore(flags); -} - -/** - * call_rcu_bh - Queue an RCU for invocation after a quicker grace period. - * @head: structure to be used for queueing the RCU updates. - * @func: actual update function to be invoked after the grace period - * - * The update function will be invoked some time after a full grace - * period elapses, in other words after all currently executing RCU - * read-side critical sections have completed. call_rcu_bh() assumes - * that the read-side critical sections end on completion of a softirq - * handler. This means that read-side critical sections in process - * context must not be interrupted by softirqs. This interface is to be - * used when most of the read-side critical sections are in softirq context. 
- * RCU read-side critical sections are delimited by rcu_read_lock() and - * rcu_read_unlock(), * if in interrupt context or rcu_read_lock_bh() - * and rcu_read_unlock_bh(), if in process context. These may be nested. - */ -void fastcall call_rcu_bh(struct rcu_head *head, - void (*func)(struct rcu_head *rcu)) +void synchronize_rcu(void) { - unsigned long flags; - struct rcu_data *rdp; - - head->func = func; - head->next = NULL; - local_irq_save(flags); - rdp = &__get_cpu_var(rcu_bh_data); - *rdp->nxttail = head; - rdp->nxttail = &head->next; - - if (unlikely(++rdp->qlen > qhimark)) { - rdp->blimit = INT_MAX; - force_quiescent_state(rdp, &rcu_bh_ctrlblk); - } - - local_irq_restore(flags); -} + struct rcu_synchronize rcu; -/* - * Return the number of RCU batches processed thus far. Useful - * for debug and statistics. - */ -long rcu_batches_completed(void) -{ - return rcu_ctrlblk.completed; -} + init_completion(&rcu.completion); + /* Will wake me after RCU finished */ + call_rcu(&rcu.head, wakeme_after_rcu); -/* - * Return the number of RCU batches processed thus far. Useful - * for debug and statistics. - */ -long rcu_batches_completed_bh(void) -{ - return rcu_bh_ctrlblk.completed; + /* Wait for it */ + wait_for_completion(&rcu.completion); } +EXPORT_SYMBOL_GPL(synchronize_rcu); static void rcu_barrier_callback(struct rcu_head *notused) { @@ -205,10 +99,8 @@ static void rcu_barrier_callback(struct rcu_head *notused) static void rcu_barrier_func(void *notused) { int cpu = smp_processor_id(); - struct rcu_data *rdp = &per_cpu(rcu_data, cpu); - struct rcu_head *head; + struct rcu_head *head = &per_cpu(rcu_barrier_head, cpu); - head = &rdp->barrier; atomic_inc(&rcu_barrier_cpu_count); call_rcu(head, rcu_barrier_callback); } @@ -229,425 +121,8 @@ void rcu_barrier(void) } EXPORT_SYMBOL_GPL(rcu_barrier); -/* Raises the softirq for processing rcu_callbacks. */ -static inline void raise_rcu_softirq(void) -{ - raise_softirq(RCU_SOFTIRQ); - /* - * The smp_mb() here is required to ensure that this cpu's - * __rcu_process_callbacks() reads the most recently updated - * value of rcu->cur. - */ - smp_mb(); -} - -/* - * Invoke the completed RCU callbacks. They are expected to be in - * a per-cpu list. - */ -static void rcu_do_batch(struct rcu_data *rdp) -{ - struct rcu_head *next, *list; - int count = 0; - - list = rdp->donelist; - while (list) { - next = list->next; - prefetch(next); - list->func(list); - list = next; - if (++count >= rdp->blimit) - break; - } - rdp->donelist = list; - - local_irq_disable(); - rdp->qlen -= count; - local_irq_enable(); - if (rdp->blimit == INT_MAX && rdp->qlen <= qlowmark) - rdp->blimit = blimit; - - if (!rdp->donelist) - rdp->donetail = &rdp->donelist; - else - raise_rcu_softirq(); -} - -/* - * Grace period handling: - * The grace period handling consists out of two steps: - * - A new grace period is started. - * This is done by rcu_start_batch. The start is not broadcasted to - * all cpus, they must pick this up by comparing rcp->cur with - * rdp->quiescbatch. All cpus are recorded in the - * rcu_ctrlblk.cpumask bitmap. - * - All cpus must go through a quiescent state. - * Since the start of the grace period is not broadcasted, at least two - * calls to rcu_check_quiescent_state are required: - * The first call just notices that a new grace period is running. The - * following calls check if there was a quiescent state since the beginning - * of the grace period. If so, it updates rcu_ctrlblk.cpumask. If - * the bitmap is empty, then the grace period is completed. 
- * rcu_check_quiescent_state calls rcu_start_batch(0) to start the next grace - * period (if necessary). - */ -/* - * Register a new batch of callbacks, and start it up if there is currently no - * active batch and the batch to be registered has not already occurred. - * Caller must hold rcu_ctrlblk.lock. - */ -static void rcu_start_batch(struct rcu_ctrlblk *rcp) -{ - if (rcp->next_pending && - rcp->completed == rcp->cur) { - rcp->next_pending = 0; - /* - * next_pending == 0 must be visible in - * __rcu_process_callbacks() before it can see new value of cur. - */ - smp_wmb(); - rcp->cur++; - - /* - * Accessing nohz_cpu_mask before incrementing rcp->cur needs a - * Barrier Otherwise it can cause tickless idle CPUs to be - * included in rcp->cpumask, which will extend graceperiods - * unnecessarily. - */ - smp_mb(); - cpus_andnot(rcp->cpumask, cpu_online_map, nohz_cpu_mask); - - rcp->signaled = 0; - } -} - -/* - * cpu went through a quiescent state since the beginning of the grace period. - * Clear it from the cpu mask and complete the grace period if it was the last - * cpu. Start another grace period if someone has further entries pending - */ -static void cpu_quiet(int cpu, struct rcu_ctrlblk *rcp) -{ - cpu_clear(cpu, rcp->cpumask); - if (cpus_empty(rcp->cpumask)) { - /* batch completed ! */ - rcp->completed = rcp->cur; - rcu_start_batch(rcp); - } -} - -/* - * Check if the cpu has gone through a quiescent state (say context - * switch). If so and if it already hasn't done so in this RCU - * quiescent cycle, then indicate that it has done so. - */ -static void rcu_check_quiescent_state(struct rcu_ctrlblk *rcp, - struct rcu_data *rdp) -{ - if (rdp->quiescbatch != rcp->cur) { - /* start new grace period: */ - rdp->qs_pending = 1; - rdp->passed_quiesc = 0; - rdp->quiescbatch = rcp->cur; - return; - } - - /* Grace period already completed for this cpu? - * qs_pending is checked instead of the actual bitmap to avoid - * cacheline trashing. - */ - if (!rdp->qs_pending) - return; - - /* - * Was there a quiescent state since the beginning of the grace - * period? If no, then exit and wait for the next call. - */ - if (!rdp->passed_quiesc) - return; - rdp->qs_pending = 0; - - spin_lock(&rcp->lock); - /* - * rdp->quiescbatch/rcp->cur and the cpu bitmap can come out of sync - * during cpu startup. Ignore the quiescent state. - */ - if (likely(rdp->quiescbatch == rcp->cur)) - cpu_quiet(rdp->cpu, rcp); - - spin_unlock(&rcp->lock); -} - - -#ifdef CONFIG_HOTPLUG_CPU - -/* warning! helper for rcu_offline_cpu. do not use elsewhere without reviewing - * locking requirements, the list it's pulling from has to belong to a cpu - * which is dead and hence not processing interrupts. 
- */ -static void rcu_move_batch(struct rcu_data *this_rdp, struct rcu_head *list, - struct rcu_head **tail) -{ - local_irq_disable(); - *this_rdp->nxttail = list; - if (list) - this_rdp->nxttail = tail; - local_irq_enable(); -} - -static void __rcu_offline_cpu(struct rcu_data *this_rdp, - struct rcu_ctrlblk *rcp, struct rcu_data *rdp) -{ - /* if the cpu going offline owns the grace period - * we can block indefinitely waiting for it, so flush - * it here - */ - spin_lock_bh(&rcp->lock); - if (rcp->cur != rcp->completed) - cpu_quiet(rdp->cpu, rcp); - spin_unlock_bh(&rcp->lock); - rcu_move_batch(this_rdp, rdp->curlist, rdp->curtail); - rcu_move_batch(this_rdp, rdp->nxtlist, rdp->nxttail); - rcu_move_batch(this_rdp, rdp->donelist, rdp->donetail); -} - -static void rcu_offline_cpu(int cpu) -{ - struct rcu_data *this_rdp = &get_cpu_var(rcu_data); - struct rcu_data *this_bh_rdp = &get_cpu_var(rcu_bh_data); - - __rcu_offline_cpu(this_rdp, &rcu_ctrlblk, - &per_cpu(rcu_data, cpu)); - __rcu_offline_cpu(this_bh_rdp, &rcu_bh_ctrlblk, - &per_cpu(rcu_bh_data, cpu)); - put_cpu_var(rcu_data); - put_cpu_var(rcu_bh_data); -} - -#else - -static void rcu_offline_cpu(int cpu) -{ -} - -#endif - -/* - * This does the RCU processing work from softirq context. - */ -static void __rcu_process_callbacks(struct rcu_ctrlblk *rcp, - struct rcu_data *rdp) -{ - if (rdp->curlist && !rcu_batch_before(rcp->completed, rdp->batch)) { - *rdp->donetail = rdp->curlist; - rdp->donetail = rdp->curtail; - rdp->curlist = NULL; - rdp->curtail = &rdp->curlist; - } - - if (rdp->nxtlist && !rdp->curlist) { - local_irq_disable(); - rdp->curlist = rdp->nxtlist; - rdp->curtail = rdp->nxttail; - rdp->nxtlist = NULL; - rdp->nxttail = &rdp->nxtlist; - local_irq_enable(); - - /* - * start the next batch of callbacks - */ - - /* determine batch number */ - rdp->batch = rcp->cur + 1; - /* see the comment and corresponding wmb() in - * the rcu_start_batch() - */ - smp_rmb(); - - if (!rcp->next_pending) { - /* and start it/schedule start if it's a new batch */ - spin_lock(&rcp->lock); - rcp->next_pending = 1; - rcu_start_batch(rcp); - spin_unlock(&rcp->lock); - } - } - - rcu_check_quiescent_state(rcp, rdp); - if (rdp->donelist) - rcu_do_batch(rdp); -} - -static void rcu_process_callbacks(struct softirq_action *unused) -{ - __rcu_process_callbacks(&rcu_ctrlblk, &__get_cpu_var(rcu_data)); - __rcu_process_callbacks(&rcu_bh_ctrlblk, &__get_cpu_var(rcu_bh_data)); -} - -static int __rcu_pending(struct rcu_ctrlblk *rcp, struct rcu_data *rdp) -{ - /* This cpu has pending rcu entries and the grace period - * for them has completed. - */ - if (rdp->curlist && !rcu_batch_before(rcp->completed, rdp->batch)) - return 1; - - /* This cpu has no pending entries, but there are new entries */ - if (!rdp->curlist && rdp->nxtlist) - return 1; - - /* This cpu has finished callbacks to invoke */ - if (rdp->donelist) - return 1; - - /* The rcu core waits for a quiescent state from the cpu */ - if (rdp->quiescbatch != rcp->cur || rdp->qs_pending) - return 1; - - /* nothing to do */ - return 0; -} - -/* - * Check to see if there is any immediate RCU-related work to be done - * by the current CPU, returning 1 if so. This function is part of the - * RCU implementation; it is -not- an exported member of the RCU API. 
- */ -int rcu_pending(int cpu) -{ - return __rcu_pending(&rcu_ctrlblk, &per_cpu(rcu_data, cpu)) || - __rcu_pending(&rcu_bh_ctrlblk, &per_cpu(rcu_bh_data, cpu)); -} - -/* - * Check to see if any future RCU-related work will need to be done - * by the current CPU, even if none need be done immediately, returning - * 1 if so. This function is part of the RCU implementation; it is -not- - * an exported member of the RCU API. - */ -int rcu_needs_cpu(int cpu) -{ - struct rcu_data *rdp = &per_cpu(rcu_data, cpu); - struct rcu_data *rdp_bh = &per_cpu(rcu_bh_data, cpu); - - return (!!rdp->curlist || !!rdp_bh->curlist || rcu_pending(cpu)); -} - -void rcu_check_callbacks(int cpu, int user) -{ - if (user || - (idle_cpu(cpu) && !in_softirq() && - hardirq_count() <= (1 << HARDIRQ_SHIFT))) { - rcu_qsctr_inc(cpu); - rcu_bh_qsctr_inc(cpu); - } else if (!in_softirq()) - rcu_bh_qsctr_inc(cpu); - raise_rcu_softirq(); -} - -static void rcu_init_percpu_data(int cpu, struct rcu_ctrlblk *rcp, - struct rcu_data *rdp) -{ - memset(rdp, 0, sizeof(*rdp)); - rdp->curtail = &rdp->curlist; - rdp->nxttail = &rdp->nxtlist; - rdp->donetail = &rdp->donelist; - rdp->quiescbatch = rcp->completed; - rdp->qs_pending = 0; - rdp->cpu = cpu; - rdp->blimit = blimit; -} - -static void __cpuinit rcu_online_cpu(int cpu) -{ - struct rcu_data *rdp = &per_cpu(rcu_data, cpu); - struct rcu_data *bh_rdp = &per_cpu(rcu_bh_data, cpu); - - rcu_init_percpu_data(cpu, &rcu_ctrlblk, rdp); - rcu_init_percpu_data(cpu, &rcu_bh_ctrlblk, bh_rdp); - open_softirq(RCU_SOFTIRQ, rcu_process_callbacks, NULL); -} - -static int __cpuinit rcu_cpu_notify(struct notifier_block *self, - unsigned long action, void *hcpu) -{ - long cpu = (long)hcpu; - switch (action) { - case CPU_UP_PREPARE: - case CPU_UP_PREPARE_FROZEN: - rcu_online_cpu(cpu); - break; - case CPU_DEAD: - case CPU_DEAD_FROZEN: - rcu_offline_cpu(cpu); - break; - default: - break; - } - return NOTIFY_OK; -} - -static struct notifier_block __cpuinitdata rcu_nb = { - .notifier_call = rcu_cpu_notify, -}; - -/* - * Initializes rcu mechanism. Assumed to be called early. - * That is before local timer(SMP) or jiffie timer (uniproc) is setup. - * Note that rcu_qsctr and friends are implicitly - * initialized due to the choice of ``0'' for RCU_CTR_INVALID. - */ void __init rcu_init(void) { - rcu_cpu_notify(&rcu_nb, CPU_UP_PREPARE, - (void *)(long)smp_processor_id()); - /* Register notifier for non-boot CPUs */ - register_cpu_notifier(&rcu_nb); -} - -struct rcu_synchronize { - struct rcu_head head; - struct completion completion; -}; - -/* Because of FASTCALL declaration of complete, we use this wrapper */ -static void wakeme_after_rcu(struct rcu_head *head) -{ - struct rcu_synchronize *rcu; - - rcu = container_of(head, struct rcu_synchronize, head); - complete(&rcu->completion); -} - -/** - * synchronize_rcu - wait until a grace period has elapsed. - * - * Control will return to the caller some time after a full grace - * period has elapsed, in other words after all currently executing RCU - * read-side critical sections have completed. RCU read-side critical - * sections are delimited by rcu_read_lock() and rcu_read_unlock(), - * and may be nested. - * - * If your read-side code is not protected by rcu_read_lock(), do -not- - * use synchronize_rcu(). 
- */ -void synchronize_rcu(void) -{ - struct rcu_synchronize rcu; - - init_completion(&rcu.completion); - /* Will wake me after RCU finished */ - call_rcu(&rcu.head, wakeme_after_rcu); - - /* Wait for it */ - wait_for_completion(&rcu.completion); + __rcu_init(); } -module_param(blimit, int, 0); -module_param(qhimark, int, 0); -module_param(qlowmark, int, 0); -EXPORT_SYMBOL_GPL(rcu_batches_completed); -EXPORT_SYMBOL_GPL(rcu_batches_completed_bh); -EXPORT_SYMBOL_GPL(call_rcu); -EXPORT_SYMBOL_GPL(call_rcu_bh); -EXPORT_SYMBOL_GPL(synchronize_rcu); -- cgit v1.2.3 From e0ecfa7917cafe72f4a75f87e8bb5d8d51dc534f Mon Sep 17 00:00:00 2001 From: "Paul E. McKenney" Date: Fri, 25 Jan 2008 21:08:24 +0100 Subject: Preempt-RCU: fix rcu_barrier for preemptive environment. Fix rcu_barrier() to work properly in preemptive kernel environment. Also, the ordering of callback must be preserved while moving callbacks to another CPU during CPU hotplug. Signed-off-by: Gautham R Shenoy Signed-off-by: Dipankar Sarma Signed-off-by: Paul E. McKenney Reviewed-by: Steven Rostedt Signed-off-by: Ingo Molnar --- kernel/rcuclassic.c | 2 +- kernel/rcupdate.c | 10 ++++++++++ 2 files changed, 11 insertions(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/rcuclassic.c b/kernel/rcuclassic.c index 18369e3386e2..ce0cf16cab67 100644 --- a/kernel/rcuclassic.c +++ b/kernel/rcuclassic.c @@ -371,9 +371,9 @@ static void __rcu_offline_cpu(struct rcu_data *this_rdp, if (rcp->cur != rcp->completed) cpu_quiet(rdp->cpu, rcp); spin_unlock_bh(&rcp->lock); + rcu_move_batch(this_rdp, rdp->donelist, rdp->donetail); rcu_move_batch(this_rdp, rdp->curlist, rdp->curtail); rcu_move_batch(this_rdp, rdp->nxtlist, rdp->nxttail); - rcu_move_batch(this_rdp, rdp->donelist, rdp->donetail); } static void rcu_offline_cpu(int cpu) diff --git a/kernel/rcupdate.c b/kernel/rcupdate.c index 0ccd0095ebdc..760dfc233a00 100644 --- a/kernel/rcupdate.c +++ b/kernel/rcupdate.c @@ -115,7 +115,17 @@ void rcu_barrier(void) mutex_lock(&rcu_barrier_mutex); init_completion(&rcu_barrier_completion); atomic_set(&rcu_barrier_cpu_count, 0); + /* + * The queueing of callbacks in all CPUs must be atomic with + * respect to RCU, otherwise one CPU may queue a callback, + * wait for a grace period, decrement barrier count and call + * complete(), while other CPUs have not yet queued anything. + * So, we need to make sure that grace periods cannot complete + * until all the callbacks are queued. + */ + rcu_read_lock(); on_each_cpu(rcu_barrier_func, NULL, 0, 1); + rcu_read_unlock(); wait_for_completion(&rcu_barrier_completion); mutex_unlock(&rcu_barrier_mutex); } -- cgit v1.2.3 From e260be673a15b6125068270e0216a3bfbfc12f87 Mon Sep 17 00:00:00 2001 From: "Paul E. McKenney" Date: Fri, 25 Jan 2008 21:08:24 +0100 Subject: Preempt-RCU: implementation This patch implements a new version of RCU which allows its read-side critical sections to be preempted. It uses a set of counter pairs to keep track of the read-side critical sections and flips them when all tasks exit read-side critical section. The details of this implementation can be found in this paper - http://www.rdrop.com/users/paulmck/RCU/OLSrtRCU.2006.08.11a.pdf and the article- http://lwn.net/Articles/253651/ This patch was developed as a part of the -rt kernel development and meant to provide better latencies when read-side critical sections of RCU don't disable preemption. As a consequence of keeping track of RCU readers, the readers have a slight overhead (optimizations in the paper). 
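For illustration only (not part of this patch): below is a minimal reader/updater sketch written against the common RCU API, assuming <linux/rcupdate.h>, <linux/slab.h> and <linux/kernel.h>; the names struct foo, gbl_foo, foo_reclaim, foo_get_val and foo_update are hypothetical. Such code is source-compatible with both the classic and the preemptible implementation; only the __rcu_read_lock()/__rcu_read_unlock() bookkeeping underneath differs, which is where the slight read-side overhead mentioned above comes from.

struct foo {
	struct rcu_head rcu;
	int val;
};
static struct foo *gbl_foo;

/* Invoked after a grace period; it is then safe to free the old element. */
static void foo_reclaim(struct rcu_head *head)
{
	kfree(container_of(head, struct foo, rcu));
}

/* Reader: with the preemptible implementation this section may be preempted. */
static int foo_get_val(void)
{
	int val;

	rcu_read_lock();
	val = rcu_dereference(gbl_foo)->val;
	rcu_read_unlock();
	return val;
}

/* Updater: callers are assumed to serialize updates among themselves. */
static void foo_update(struct foo *new_foo)
{
	struct foo *old = gbl_foo;

	rcu_assign_pointer(gbl_foo, new_foo);
	call_rcu(&old->rcu, foo_reclaim);
}

The sketch is only meant to show that existing RCU users need no source changes when the implementation is switched.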
This implementation co-exists with the "classic" RCU implementations and can be switched to at compiler. Also includes RCU tracing summarized in debugfs. [ akpm@linux-foundation.org: build fixes on non-preempt architectures ] Signed-off-by: Gautham R Shenoy Signed-off-by: Dipankar Sarma Signed-off-by: Paul E. McKenney Reviewed-by: Steven Rostedt Signed-off-by: Ingo Molnar --- fs/Kconfig | 1 - include/linux/rcuclassic.h | 3 + include/linux/rcupdate.h | 11 +- include/linux/rcupreempt.h | 86 +++++ include/linux/rcupreempt_trace.h | 99 +++++ include/linux/sched.h | 5 + init/Kconfig | 28 ++ kernel/Kconfig.preempt | 10 + kernel/Makefile | 7 +- kernel/fork.c | 4 + kernel/rcuclassic.c | 1 - kernel/rcupreempt.c | 816 +++++++++++++++++++++++++++++++++++++++ kernel/rcupreempt_trace.c | 330 ++++++++++++++++ 13 files changed, 1394 insertions(+), 7 deletions(-) create mode 100644 include/linux/rcupreempt.h create mode 100644 include/linux/rcupreempt_trace.h create mode 100644 kernel/rcupreempt.c create mode 100644 kernel/rcupreempt_trace.c (limited to 'kernel') diff --git a/fs/Kconfig b/fs/Kconfig index 781b47d2f9f2..b4799efaf9e8 100644 --- a/fs/Kconfig +++ b/fs/Kconfig @@ -2130,4 +2130,3 @@ source "fs/nls/Kconfig" source "fs/dlm/Kconfig" endmenu - diff --git a/include/linux/rcuclassic.h b/include/linux/rcuclassic.h index 2b8b045a51d5..4d6624260b4c 100644 --- a/include/linux/rcuclassic.h +++ b/include/linux/rcuclassic.h @@ -157,5 +157,8 @@ extern void __rcu_init(void); extern void rcu_check_callbacks(int cpu, int user); extern void rcu_restart_cpu(int cpu); +extern long rcu_batches_completed(void); +extern long rcu_batches_completed_bh(void); + #endif /* __KERNEL__ */ #endif /* __LINUX_RCUCLASSIC_H */ diff --git a/include/linux/rcupdate.h b/include/linux/rcupdate.h index 12aa13e13150..d32c14de270e 100644 --- a/include/linux/rcupdate.h +++ b/include/linux/rcupdate.h @@ -53,7 +53,11 @@ struct rcu_head { void (*func)(struct rcu_head *head); }; +#ifdef CONFIG_CLASSIC_RCU #include +#else /* #ifdef CONFIG_CLASSIC_RCU */ +#include +#endif /* #else #ifdef CONFIG_CLASSIC_RCU */ #define RCU_HEAD_INIT { .next = NULL, .func = NULL } #define RCU_HEAD(head) struct rcu_head head = RCU_HEAD_INIT @@ -231,13 +235,12 @@ extern void call_rcu_bh(struct rcu_head *head, /* Exported common interfaces */ extern void synchronize_rcu(void); extern void rcu_barrier(void); +extern long rcu_batches_completed(void); +extern long rcu_batches_completed_bh(void); /* Internal to kernel */ extern void rcu_init(void); -extern void rcu_check_callbacks(int cpu, int user); - -extern long rcu_batches_completed(void); -extern long rcu_batches_completed_bh(void); +extern int rcu_needs_cpu(int cpu); #endif /* __KERNEL__ */ #endif /* __LINUX_RCUPDATE_H */ diff --git a/include/linux/rcupreempt.h b/include/linux/rcupreempt.h new file mode 100644 index 000000000000..ece8eb3e4151 --- /dev/null +++ b/include/linux/rcupreempt.h @@ -0,0 +1,86 @@ +/* + * Read-Copy Update mechanism for mutual exclusion (RT implementation) + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation; either version 2 of the License, or + * (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. 
+ * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA. + * + * Copyright (C) IBM Corporation, 2006 + * + * Author: Paul McKenney + * + * Based on the original work by Paul McKenney + * and inputs from Rusty Russell, Andrea Arcangeli and Andi Kleen. + * Papers: + * http://www.rdrop.com/users/paulmck/paper/rclockpdcsproof.pdf + * http://lse.sourceforge.net/locking/rclock_OLS.2001.05.01c.sc.pdf (OLS2001) + * + * For detailed explanation of Read-Copy Update mechanism see - + * Documentation/RCU + * + */ + +#ifndef __LINUX_RCUPREEMPT_H +#define __LINUX_RCUPREEMPT_H + +#ifdef __KERNEL__ + +#include +#include +#include +#include +#include +#include + +#define rcu_qsctr_inc(cpu) +#define rcu_bh_qsctr_inc(cpu) +#define call_rcu_bh(head, rcu) call_rcu(head, rcu) + +extern void __rcu_read_lock(void); +extern void __rcu_read_unlock(void); +extern int rcu_pending(int cpu); +extern int rcu_needs_cpu(int cpu); + +#define __rcu_read_lock_bh() { rcu_read_lock(); local_bh_disable(); } +#define __rcu_read_unlock_bh() { local_bh_enable(); rcu_read_unlock(); } + +extern void __synchronize_sched(void); + +extern void __rcu_init(void); +extern void rcu_check_callbacks(int cpu, int user); +extern void rcu_restart_cpu(int cpu); +extern long rcu_batches_completed(void); + +/* + * Return the number of RCU batches processed thus far. Useful for debug + * and statistic. The _bh variant is identifcal to straight RCU + */ +static inline long rcu_batches_completed_bh(void) +{ + return rcu_batches_completed(); +} + +#ifdef CONFIG_RCU_TRACE +struct rcupreempt_trace; +extern long *rcupreempt_flipctr(int cpu); +extern long rcupreempt_data_completed(void); +extern int rcupreempt_flip_flag(int cpu); +extern int rcupreempt_mb_flag(int cpu); +extern char *rcupreempt_try_flip_state_name(void); +extern struct rcupreempt_trace *rcupreempt_trace_cpu(int cpu); +#endif + +struct softirq_action; + +#endif /* __KERNEL__ */ +#endif /* __LINUX_RCUPREEMPT_H */ diff --git a/include/linux/rcupreempt_trace.h b/include/linux/rcupreempt_trace.h new file mode 100644 index 000000000000..21cd6b2a5c42 --- /dev/null +++ b/include/linux/rcupreempt_trace.h @@ -0,0 +1,99 @@ +/* + * Read-Copy Update mechanism for mutual exclusion (RT implementation) + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation; either version 2 of the License, or + * (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA. + * + * Copyright (C) IBM Corporation, 2006 + * + * Author: Paul McKenney + * + * Based on the original work by Paul McKenney + * and inputs from Rusty Russell, Andrea Arcangeli and Andi Kleen. 
+ * Papers: + * http://www.rdrop.com/users/paulmck/paper/rclockpdcsproof.pdf + * http://lse.sourceforge.net/locking/rclock_OLS.2001.05.01c.sc.pdf (OLS2001) + * + * For detailed explanation of the Preemptible Read-Copy Update mechanism see - + * http://lwn.net/Articles/253651/ + */ + +#ifndef __LINUX_RCUPREEMPT_TRACE_H +#define __LINUX_RCUPREEMPT_TRACE_H + +#ifdef __KERNEL__ +#include +#include + +#include + +/* + * PREEMPT_RCU data structures. + */ + +struct rcupreempt_trace { + long next_length; + long next_add; + long wait_length; + long wait_add; + long done_length; + long done_add; + long done_remove; + atomic_t done_invoked; + long rcu_check_callbacks; + atomic_t rcu_try_flip_1; + atomic_t rcu_try_flip_e1; + long rcu_try_flip_i1; + long rcu_try_flip_ie1; + long rcu_try_flip_g1; + long rcu_try_flip_a1; + long rcu_try_flip_ae1; + long rcu_try_flip_a2; + long rcu_try_flip_z1; + long rcu_try_flip_ze1; + long rcu_try_flip_z2; + long rcu_try_flip_m1; + long rcu_try_flip_me1; + long rcu_try_flip_m2; +}; + +#ifdef CONFIG_RCU_TRACE +#define RCU_TRACE(fn, arg) fn(arg); +#else +#define RCU_TRACE(fn, arg) +#endif + +extern void rcupreempt_trace_move2done(struct rcupreempt_trace *trace); +extern void rcupreempt_trace_move2wait(struct rcupreempt_trace *trace); +extern void rcupreempt_trace_try_flip_1(struct rcupreempt_trace *trace); +extern void rcupreempt_trace_try_flip_e1(struct rcupreempt_trace *trace); +extern void rcupreempt_trace_try_flip_i1(struct rcupreempt_trace *trace); +extern void rcupreempt_trace_try_flip_ie1(struct rcupreempt_trace *trace); +extern void rcupreempt_trace_try_flip_g1(struct rcupreempt_trace *trace); +extern void rcupreempt_trace_try_flip_a1(struct rcupreempt_trace *trace); +extern void rcupreempt_trace_try_flip_ae1(struct rcupreempt_trace *trace); +extern void rcupreempt_trace_try_flip_a2(struct rcupreempt_trace *trace); +extern void rcupreempt_trace_try_flip_z1(struct rcupreempt_trace *trace); +extern void rcupreempt_trace_try_flip_ze1(struct rcupreempt_trace *trace); +extern void rcupreempt_trace_try_flip_z2(struct rcupreempt_trace *trace); +extern void rcupreempt_trace_try_flip_m1(struct rcupreempt_trace *trace); +extern void rcupreempt_trace_try_flip_me1(struct rcupreempt_trace *trace); +extern void rcupreempt_trace_try_flip_m2(struct rcupreempt_trace *trace); +extern void rcupreempt_trace_check_callbacks(struct rcupreempt_trace *trace); +extern void rcupreempt_trace_done_remove(struct rcupreempt_trace *trace); +extern void rcupreempt_trace_invoke(struct rcupreempt_trace *trace); +extern void rcupreempt_trace_next_add(struct rcupreempt_trace *trace); + +#endif /* __KERNEL__ */ +#endif /* __LINUX_RCUPREEMPT_TRACE_H */ diff --git a/include/linux/sched.h b/include/linux/sched.h index f2044e707004..72e1b8ecfbe1 100644 --- a/include/linux/sched.h +++ b/include/linux/sched.h @@ -974,6 +974,11 @@ struct task_struct { int nr_cpus_allowed; unsigned int time_slice; +#ifdef CONFIG_PREEMPT_RCU + int rcu_read_lock_nesting; + int rcu_flipctr_idx; +#endif /* #ifdef CONFIG_PREEMPT_RCU */ + #if defined(CONFIG_SCHEDSTATS) || defined(CONFIG_TASK_DELAY_ACCT) struct sched_info sched_info; #endif diff --git a/init/Kconfig b/init/Kconfig index f5becd2a12f6..0eda68f0ad54 100644 --- a/init/Kconfig +++ b/init/Kconfig @@ -763,3 +763,31 @@ source "block/Kconfig" config PREEMPT_NOTIFIERS bool + +choice + prompt "RCU implementation type:" + default CLASSIC_RCU + +config CLASSIC_RCU + bool "Classic RCU" + help + This option selects the classic RCU implementation that is + designed for best 
read-side performance on non-realtime + systems. + + Say Y if you are unsure. + +config PREEMPT_RCU + bool "Preemptible RCU" + depends on PREEMPT + help + This option reduces the latency of the kernel by making certain + RCU sections preemptible. Normally RCU code is non-preemptible, if + this option is selected then read-only RCU sections become + preemptible. This helps latency, but may expose bugs due to + now-naive assumptions about each RCU read-side critical section + remaining on a given CPU through its execution. + + Say N if you are unsure. + +endchoice diff --git a/kernel/Kconfig.preempt b/kernel/Kconfig.preempt index c64ce9c14207..61fa116efcde 100644 --- a/kernel/Kconfig.preempt +++ b/kernel/Kconfig.preempt @@ -63,3 +63,13 @@ config PREEMPT_BKL Say Y here if you are building a kernel for a desktop system. Say N if you are unsure. +config RCU_TRACE + bool "Enable tracing for RCU - currently stats in debugfs" + select DEBUG_FS + default y + help + This option provides tracing in RCU which presents stats + in debugfs for debugging RCU implementation. + + Say Y here if you want to enable RCU tracing + Say N if you are unsure. diff --git a/kernel/Makefile b/kernel/Makefile index def5dd6097a0..68755cd9a7e4 100644 --- a/kernel/Makefile +++ b/kernel/Makefile @@ -6,7 +6,7 @@ obj-y = sched.o fork.o exec_domain.o panic.o printk.o profile.o \ exit.o itimer.o time.o softirq.o resource.o \ sysctl.o capability.o ptrace.o timer.o user.o user_namespace.o \ signal.o sys.o kmod.o workqueue.o pid.o \ - rcupdate.o rcuclassic.o extable.o params.o posix-timers.o \ + rcupdate.o extable.o params.o posix-timers.o \ kthread.o wait.o kfifo.o sys_ni.o posix-cpu-timers.o mutex.o \ hrtimer.o rwsem.o latency.o nsproxy.o srcu.o \ utsname.o notifier.o @@ -52,6 +52,11 @@ obj-$(CONFIG_DETECT_SOFTLOCKUP) += softlockup.o obj-$(CONFIG_GENERIC_HARDIRQS) += irq/ obj-$(CONFIG_SECCOMP) += seccomp.o obj-$(CONFIG_RCU_TORTURE_TEST) += rcutorture.o +obj-$(CONFIG_CLASSIC_RCU) += rcuclassic.o +obj-$(CONFIG_PREEMPT_RCU) += rcupreempt.o +ifeq ($(CONFIG_PREEMPT_RCU),y) +obj-$(CONFIG_RCU_TRACE) += rcupreempt_trace.o +endif obj-$(CONFIG_RELAY) += relay.o obj-$(CONFIG_SYSCTL) += utsname_sysctl.o obj-$(CONFIG_TASK_DELAY_ACCT) += delayacct.o diff --git a/kernel/fork.c b/kernel/fork.c index 930c51865ab4..9f8ef32cbc7a 100644 --- a/kernel/fork.c +++ b/kernel/fork.c @@ -1045,6 +1045,10 @@ static struct task_struct *copy_process(unsigned long clone_flags, copy_flags(clone_flags, p); INIT_LIST_HEAD(&p->children); INIT_LIST_HEAD(&p->sibling); +#ifdef CONFIG_PREEMPT_RCU + p->rcu_read_lock_nesting = 0; + p->rcu_flipctr_idx = 0; +#endif /* #ifdef CONFIG_PREEMPT_RCU */ p->vfork_done = NULL; spin_lock_init(&p->alloc_lock); diff --git a/kernel/rcuclassic.c b/kernel/rcuclassic.c index ce0cf16cab67..f4ffbd0f306f 100644 --- a/kernel/rcuclassic.c +++ b/kernel/rcuclassic.c @@ -45,7 +45,6 @@ #include #include #include -/* #include @@@ */ #include #include diff --git a/kernel/rcupreempt.c b/kernel/rcupreempt.c new file mode 100644 index 000000000000..a5aabb1677f8 --- /dev/null +++ b/kernel/rcupreempt.c @@ -0,0 +1,816 @@ +/* + * Read-Copy Update mechanism for mutual exclusion, realtime implementation + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation; either version 2 of the License, or + * (at your option) any later version. 
+ * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA. + * + * Copyright IBM Corporation, 2006 + * + * Authors: Paul E. McKenney + * With thanks to Esben Nielsen, Bill Huey, and Ingo Molnar + * for pushing me away from locks and towards counters, and + * to Suparna Bhattacharya for pushing me completely away + * from atomic instructions on the read side. + * + * Papers: http://www.rdrop.com/users/paulmck/RCU + * + * Design Document: http://lwn.net/Articles/253651/ + * + * For detailed explanation of Read-Copy Update mechanism see - + * Documentation/RCU/ *.txt + * + */ +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +/* + * Macro that prevents the compiler from reordering accesses, but does + * absolutely -nothing- to prevent CPUs from reordering. This is used + * only to mediate communication between mainline code and hardware + * interrupt and NMI handlers. + */ +#define ACCESS_ONCE(x) (*(volatile typeof(x) *)&(x)) + +/* + * PREEMPT_RCU data structures. + */ + +/* + * GP_STAGES specifies the number of times the state machine has + * to go through the all the rcu_try_flip_states (see below) + * in a single Grace Period. + * + * GP in GP_STAGES stands for Grace Period ;) + */ +#define GP_STAGES 2 +struct rcu_data { + spinlock_t lock; /* Protect rcu_data fields. */ + long completed; /* Number of last completed batch. */ + int waitlistcount; + struct tasklet_struct rcu_tasklet; + struct rcu_head *nextlist; + struct rcu_head **nexttail; + struct rcu_head *waitlist[GP_STAGES]; + struct rcu_head **waittail[GP_STAGES]; + struct rcu_head *donelist; + struct rcu_head **donetail; + long rcu_flipctr[2]; +#ifdef CONFIG_RCU_TRACE + struct rcupreempt_trace trace; +#endif /* #ifdef CONFIG_RCU_TRACE */ +}; + +/* + * States for rcu_try_flip() and friends. + */ + +enum rcu_try_flip_states { + + /* + * Stay here if nothing is happening. Flip the counter if somthing + * starts happening. Denoted by "I" + */ + rcu_try_flip_idle_state, + + /* + * Wait here for all CPUs to notice that the counter has flipped. This + * prevents the old set of counters from ever being incremented once + * we leave this state, which in turn is necessary because we cannot + * test any individual counter for zero -- we can only check the sum. + * Denoted by "A". + */ + rcu_try_flip_waitack_state, + + /* + * Wait here for the sum of the old per-CPU counters to reach zero. + * Denoted by "Z". + */ + rcu_try_flip_waitzero_state, + + /* + * Wait here for each of the other CPUs to execute a memory barrier. + * This is necessary to ensure that these other CPUs really have + * completed executing their RCU read-side critical sections, despite + * their CPUs wildly reordering memory. Denoted by "M". + */ + rcu_try_flip_waitmb_state, +}; + +struct rcu_ctrlblk { + spinlock_t fliplock; /* Protect state-machine transitions. */ + long completed; /* Number of last completed batch. 
*/ + enum rcu_try_flip_states rcu_try_flip_state; /* The current state of + the rcu state machine */ +}; + +static DEFINE_PER_CPU(struct rcu_data, rcu_data); +static struct rcu_ctrlblk rcu_ctrlblk = { + .fliplock = __SPIN_LOCK_UNLOCKED(rcu_ctrlblk.fliplock), + .completed = 0, + .rcu_try_flip_state = rcu_try_flip_idle_state, +}; + + +#ifdef CONFIG_RCU_TRACE +static char *rcu_try_flip_state_names[] = + { "idle", "waitack", "waitzero", "waitmb" }; +#endif /* #ifdef CONFIG_RCU_TRACE */ + +/* + * Enum and per-CPU flag to determine when each CPU has seen + * the most recent counter flip. + */ + +enum rcu_flip_flag_values { + rcu_flip_seen, /* Steady/initial state, last flip seen. */ + /* Only GP detector can update. */ + rcu_flipped /* Flip just completed, need confirmation. */ + /* Only corresponding CPU can update. */ +}; +static DEFINE_PER_CPU_SHARED_ALIGNED(enum rcu_flip_flag_values, rcu_flip_flag) + = rcu_flip_seen; + +/* + * Enum and per-CPU flag to determine when each CPU has executed the + * needed memory barrier to fence in memory references from its last RCU + * read-side critical section in the just-completed grace period. + */ + +enum rcu_mb_flag_values { + rcu_mb_done, /* Steady/initial state, no mb()s required. */ + /* Only GP detector can update. */ + rcu_mb_needed /* Flip just completed, need an mb(). */ + /* Only corresponding CPU can update. */ +}; +static DEFINE_PER_CPU_SHARED_ALIGNED(enum rcu_mb_flag_values, rcu_mb_flag) + = rcu_mb_done; + +/* + * RCU_DATA_ME: find the current CPU's rcu_data structure. + * RCU_DATA_CPU: find the specified CPU's rcu_data structure. + */ +#define RCU_DATA_ME() (&__get_cpu_var(rcu_data)) +#define RCU_DATA_CPU(cpu) (&per_cpu(rcu_data, cpu)) + +/* + * Helper macro for tracing when the appropriate rcu_data is not + * cached in a local variable, but where the CPU number is so cached. + */ +#define RCU_TRACE_CPU(f, cpu) RCU_TRACE(f, &(RCU_DATA_CPU(cpu)->trace)); + +/* + * Helper macro for tracing when the appropriate rcu_data is not + * cached in a local variable. + */ +#define RCU_TRACE_ME(f) RCU_TRACE(f, &(RCU_DATA_ME()->trace)); + +/* + * Helper macro for tracing when the appropriate rcu_data is pointed + * to by a local variable. + */ +#define RCU_TRACE_RDP(f, rdp) RCU_TRACE(f, &((rdp)->trace)); + +/* + * Return the number of RCU batches processed thus far. Useful + * for debug and statistics. + */ +long rcu_batches_completed(void) +{ + return rcu_ctrlblk.completed; +} +EXPORT_SYMBOL_GPL(rcu_batches_completed); + +EXPORT_SYMBOL_GPL(rcu_batches_completed_bh); + +void __rcu_read_lock(void) +{ + int idx; + struct task_struct *t = current; + int nesting; + + nesting = ACCESS_ONCE(t->rcu_read_lock_nesting); + if (nesting != 0) { + + /* An earlier rcu_read_lock() covers us, just count it. */ + + t->rcu_read_lock_nesting = nesting + 1; + + } else { + unsigned long flags; + + /* + * We disable interrupts for the following reasons: + * - If we get scheduling clock interrupt here, and we + * end up acking the counter flip, it's like a promise + * that we will never increment the old counter again. + * Thus we will break that promise if that + * scheduling clock interrupt happens between the time + * we pick the .completed field and the time that we + * increment our counter. + * + * - We don't want to be preempted out here. + * + * NMIs can still occur, of course, and might themselves + * contain rcu_read_lock(). + */ + + local_irq_save(flags); + + /* + * Outermost nesting of rcu_read_lock(), so increment + * the current counter for the current CPU. 
Use volatile + * casts to prevent the compiler from reordering. + */ + + idx = ACCESS_ONCE(rcu_ctrlblk.completed) & 0x1; + ACCESS_ONCE(RCU_DATA_ME()->rcu_flipctr[idx])++; + + /* + * Now that the per-CPU counter has been incremented, we + * are protected from races with rcu_read_lock() invoked + * from NMI handlers on this CPU. We can therefore safely + * increment the nesting counter, relieving further NMIs + * of the need to increment the per-CPU counter. + */ + + ACCESS_ONCE(t->rcu_read_lock_nesting) = nesting + 1; + + /* + * Now that we have preventing any NMIs from storing + * to the ->rcu_flipctr_idx, we can safely use it to + * remember which counter to decrement in the matching + * rcu_read_unlock(). + */ + + ACCESS_ONCE(t->rcu_flipctr_idx) = idx; + local_irq_restore(flags); + } +} +EXPORT_SYMBOL_GPL(__rcu_read_lock); + +void __rcu_read_unlock(void) +{ + int idx; + struct task_struct *t = current; + int nesting; + + nesting = ACCESS_ONCE(t->rcu_read_lock_nesting); + if (nesting > 1) { + + /* + * We are still protected by the enclosing rcu_read_lock(), + * so simply decrement the counter. + */ + + t->rcu_read_lock_nesting = nesting - 1; + + } else { + unsigned long flags; + + /* + * Disable local interrupts to prevent the grace-period + * detection state machine from seeing us half-done. + * NMIs can still occur, of course, and might themselves + * contain rcu_read_lock() and rcu_read_unlock(). + */ + + local_irq_save(flags); + + /* + * Outermost nesting of rcu_read_unlock(), so we must + * decrement the current counter for the current CPU. + * This must be done carefully, because NMIs can + * occur at any point in this code, and any rcu_read_lock() + * and rcu_read_unlock() pairs in the NMI handlers + * must interact non-destructively with this code. + * Lots of volatile casts, and -very- careful ordering. + * + * Changes to this code, including this one, must be + * inspected, validated, and tested extremely carefully!!! + */ + + /* + * First, pick up the index. + */ + + idx = ACCESS_ONCE(t->rcu_flipctr_idx); + + /* + * Now that we have fetched the counter index, it is + * safe to decrement the per-task RCU nesting counter. + * After this, any interrupts or NMIs will increment and + * decrement the per-CPU counters. + */ + ACCESS_ONCE(t->rcu_read_lock_nesting) = nesting - 1; + + /* + * It is now safe to decrement this task's nesting count. + * NMIs that occur after this statement will route their + * rcu_read_lock() calls through this "else" clause, and + * will thus start incrementing the per-CPU counter on + * their own. They will also clobber ->rcu_flipctr_idx, + * but that is OK, since we have already fetched it. + */ + + ACCESS_ONCE(RCU_DATA_ME()->rcu_flipctr[idx])--; + local_irq_restore(flags); + } +} +EXPORT_SYMBOL_GPL(__rcu_read_unlock); + +/* + * If a global counter flip has occurred since the last time that we + * advanced callbacks, advance them. Hardware interrupts must be + * disabled when calling this function. 
+ */ +static void __rcu_advance_callbacks(struct rcu_data *rdp) +{ + int cpu; + int i; + int wlc = 0; + + if (rdp->completed != rcu_ctrlblk.completed) { + if (rdp->waitlist[GP_STAGES - 1] != NULL) { + *rdp->donetail = rdp->waitlist[GP_STAGES - 1]; + rdp->donetail = rdp->waittail[GP_STAGES - 1]; + RCU_TRACE_RDP(rcupreempt_trace_move2done, rdp); + } + for (i = GP_STAGES - 2; i >= 0; i--) { + if (rdp->waitlist[i] != NULL) { + rdp->waitlist[i + 1] = rdp->waitlist[i]; + rdp->waittail[i + 1] = rdp->waittail[i]; + wlc++; + } else { + rdp->waitlist[i + 1] = NULL; + rdp->waittail[i + 1] = + &rdp->waitlist[i + 1]; + } + } + if (rdp->nextlist != NULL) { + rdp->waitlist[0] = rdp->nextlist; + rdp->waittail[0] = rdp->nexttail; + wlc++; + rdp->nextlist = NULL; + rdp->nexttail = &rdp->nextlist; + RCU_TRACE_RDP(rcupreempt_trace_move2wait, rdp); + } else { + rdp->waitlist[0] = NULL; + rdp->waittail[0] = &rdp->waitlist[0]; + } + rdp->waitlistcount = wlc; + rdp->completed = rcu_ctrlblk.completed; + } + + /* + * Check to see if this CPU needs to report that it has seen + * the most recent counter flip, thereby declaring that all + * subsequent rcu_read_lock() invocations will respect this flip. + */ + + cpu = raw_smp_processor_id(); + if (per_cpu(rcu_flip_flag, cpu) == rcu_flipped) { + smp_mb(); /* Subsequent counter accesses must see new value */ + per_cpu(rcu_flip_flag, cpu) = rcu_flip_seen; + smp_mb(); /* Subsequent RCU read-side critical sections */ + /* seen -after- acknowledgement. */ + } +} + +/* + * Get here when RCU is idle. Decide whether we need to + * move out of idle state, and return non-zero if so. + * "Straightforward" approach for the moment, might later + * use callback-list lengths, grace-period duration, or + * some such to determine when to exit idle state. + * Might also need a pre-idle test that does not acquire + * the lock, but let's get the simple case working first... + */ + +static int +rcu_try_flip_idle(void) +{ + int cpu; + + RCU_TRACE_ME(rcupreempt_trace_try_flip_i1); + if (!rcu_pending(smp_processor_id())) { + RCU_TRACE_ME(rcupreempt_trace_try_flip_ie1); + return 0; + } + + /* + * Do the flip. + */ + + RCU_TRACE_ME(rcupreempt_trace_try_flip_g1); + rcu_ctrlblk.completed++; /* stands in for rcu_try_flip_g2 */ + + /* + * Need a memory barrier so that other CPUs see the new + * counter value before they see the subsequent change of all + * the rcu_flip_flag instances to rcu_flipped. + */ + + smp_mb(); /* see above block comment. */ + + /* Now ask each CPU for acknowledgement of the flip. */ + + for_each_possible_cpu(cpu) + per_cpu(rcu_flip_flag, cpu) = rcu_flipped; + + return 1; +} + +/* + * Wait for CPUs to acknowledge the flip. + */ + +static int +rcu_try_flip_waitack(void) +{ + int cpu; + + RCU_TRACE_ME(rcupreempt_trace_try_flip_a1); + for_each_possible_cpu(cpu) + if (per_cpu(rcu_flip_flag, cpu) != rcu_flip_seen) { + RCU_TRACE_ME(rcupreempt_trace_try_flip_ae1); + return 0; + } + + /* + * Make sure our checks above don't bleed into subsequent + * waiting for the sum of the counters to reach zero. + */ + + smp_mb(); /* see above block comment. */ + RCU_TRACE_ME(rcupreempt_trace_try_flip_a2); + return 1; +} + +/* + * Wait for collective ``last'' counter to reach zero, + * then tell all CPUs to do an end-of-grace-period memory barrier. + */ + +static int +rcu_try_flip_waitzero(void) +{ + int cpu; + int lastidx = !(rcu_ctrlblk.completed & 0x1); + int sum = 0; + + /* Check to see if the sum of the "last" counters is zero. 
*/ + + RCU_TRACE_ME(rcupreempt_trace_try_flip_z1); + for_each_possible_cpu(cpu) + sum += RCU_DATA_CPU(cpu)->rcu_flipctr[lastidx]; + if (sum != 0) { + RCU_TRACE_ME(rcupreempt_trace_try_flip_ze1); + return 0; + } + + /* + * This ensures that the other CPUs see the call for + * memory barriers -after- the sum to zero has been + * detected here + */ + smp_mb(); /* ^^^^^^^^^^^^ */ + + /* Call for a memory barrier from each CPU. */ + for_each_possible_cpu(cpu) + per_cpu(rcu_mb_flag, cpu) = rcu_mb_needed; + + RCU_TRACE_ME(rcupreempt_trace_try_flip_z2); + return 1; +} + +/* + * Wait for all CPUs to do their end-of-grace-period memory barrier. + * Return 0 once all CPUs have done so. + */ + +static int +rcu_try_flip_waitmb(void) +{ + int cpu; + + RCU_TRACE_ME(rcupreempt_trace_try_flip_m1); + for_each_possible_cpu(cpu) + if (per_cpu(rcu_mb_flag, cpu) != rcu_mb_done) { + RCU_TRACE_ME(rcupreempt_trace_try_flip_me1); + return 0; + } + + smp_mb(); /* Ensure that the above checks precede any following flip. */ + RCU_TRACE_ME(rcupreempt_trace_try_flip_m2); + return 1; +} + +/* + * Attempt a single flip of the counters. Remember, a single flip does + * -not- constitute a grace period. Instead, the interval between + * at least GP_STAGES consecutive flips is a grace period. + * + * If anyone is nuts enough to run this CONFIG_PREEMPT_RCU implementation + * on a large SMP, they might want to use a hierarchical organization of + * the per-CPU-counter pairs. + */ +static void rcu_try_flip(void) +{ + unsigned long flags; + + RCU_TRACE_ME(rcupreempt_trace_try_flip_1); + if (unlikely(!spin_trylock_irqsave(&rcu_ctrlblk.fliplock, flags))) { + RCU_TRACE_ME(rcupreempt_trace_try_flip_e1); + return; + } + + /* + * Take the next transition(s) through the RCU grace-period + * flip-counter state machine. + */ + + switch (rcu_ctrlblk.rcu_try_flip_state) { + case rcu_try_flip_idle_state: + if (rcu_try_flip_idle()) + rcu_ctrlblk.rcu_try_flip_state = + rcu_try_flip_waitack_state; + break; + case rcu_try_flip_waitack_state: + if (rcu_try_flip_waitack()) + rcu_ctrlblk.rcu_try_flip_state = + rcu_try_flip_waitzero_state; + break; + case rcu_try_flip_waitzero_state: + if (rcu_try_flip_waitzero()) + rcu_ctrlblk.rcu_try_flip_state = + rcu_try_flip_waitmb_state; + break; + case rcu_try_flip_waitmb_state: + if (rcu_try_flip_waitmb()) + rcu_ctrlblk.rcu_try_flip_state = + rcu_try_flip_idle_state; + } + spin_unlock_irqrestore(&rcu_ctrlblk.fliplock, flags); +} + +/* + * Check to see if this CPU needs to do a memory barrier in order to + * ensure that any prior RCU read-side critical sections have committed + * their counter manipulations and critical-section memory references + * before declaring the grace period to be completed. + */ +static void rcu_check_mb(int cpu) +{ + if (per_cpu(rcu_mb_flag, cpu) == rcu_mb_needed) { + smp_mb(); /* Ensure RCU read-side accesses are visible. 
*/ + per_cpu(rcu_mb_flag, cpu) = rcu_mb_done; + } +} + +void rcu_check_callbacks(int cpu, int user) +{ + unsigned long flags; + struct rcu_data *rdp = RCU_DATA_CPU(cpu); + + rcu_check_mb(cpu); + if (rcu_ctrlblk.completed == rdp->completed) + rcu_try_flip(); + spin_lock_irqsave(&rdp->lock, flags); + RCU_TRACE_RDP(rcupreempt_trace_check_callbacks, rdp); + __rcu_advance_callbacks(rdp); + if (rdp->donelist == NULL) { + spin_unlock_irqrestore(&rdp->lock, flags); + } else { + spin_unlock_irqrestore(&rdp->lock, flags); + raise_softirq(RCU_SOFTIRQ); + } +} + +/* + * Needed by dynticks, to make sure all RCU processing has finished + * when we go idle: + */ +void rcu_advance_callbacks(int cpu, int user) +{ + unsigned long flags; + struct rcu_data *rdp = RCU_DATA_CPU(cpu); + + if (rcu_ctrlblk.completed == rdp->completed) { + rcu_try_flip(); + if (rcu_ctrlblk.completed == rdp->completed) + return; + } + spin_lock_irqsave(&rdp->lock, flags); + RCU_TRACE_RDP(rcupreempt_trace_check_callbacks, rdp); + __rcu_advance_callbacks(rdp); + spin_unlock_irqrestore(&rdp->lock, flags); +} + +static void rcu_process_callbacks(struct softirq_action *unused) +{ + unsigned long flags; + struct rcu_head *next, *list; + struct rcu_data *rdp = RCU_DATA_ME(); + + spin_lock_irqsave(&rdp->lock, flags); + list = rdp->donelist; + if (list == NULL) { + spin_unlock_irqrestore(&rdp->lock, flags); + return; + } + rdp->donelist = NULL; + rdp->donetail = &rdp->donelist; + RCU_TRACE_RDP(rcupreempt_trace_done_remove, rdp); + spin_unlock_irqrestore(&rdp->lock, flags); + while (list) { + next = list->next; + list->func(list); + list = next; + RCU_TRACE_ME(rcupreempt_trace_invoke); + } +} + +void call_rcu(struct rcu_head *head, void (*func)(struct rcu_head *rcu)) +{ + unsigned long flags; + struct rcu_data *rdp; + + head->func = func; + head->next = NULL; + local_irq_save(flags); + rdp = RCU_DATA_ME(); + spin_lock(&rdp->lock); + __rcu_advance_callbacks(rdp); + *rdp->nexttail = head; + rdp->nexttail = &head->next; + RCU_TRACE_RDP(rcupreempt_trace_next_add, rdp); + spin_unlock(&rdp->lock); + local_irq_restore(flags); +} +EXPORT_SYMBOL_GPL(call_rcu); + +/* + * Wait until all currently running preempt_disable() code segments + * (including hardware-irq-disable segments) complete. Note that + * in -rt this does -not- necessarily result in all currently executing + * interrupt -handlers- having completed. + */ +void __synchronize_sched(void) +{ + cpumask_t oldmask; + int cpu; + + if (sched_getaffinity(0, &oldmask) < 0) + oldmask = cpu_possible_map; + for_each_online_cpu(cpu) { + sched_setaffinity(0, cpumask_of_cpu(cpu)); + schedule(); + } + sched_setaffinity(0, oldmask); +} +EXPORT_SYMBOL_GPL(__synchronize_sched); + +/* + * Check to see if any future RCU-related work will need to be done + * by the current CPU, even if none need be done immediately, returning + * 1 if so. Assumes that notifiers would take care of handling any + * outstanding requests from the RCU core. + * + * This function is part of the RCU implementation; it is -not- + * an exported member of the RCU API. + */ +int rcu_needs_cpu(int cpu) +{ + struct rcu_data *rdp = RCU_DATA_CPU(cpu); + + return (rdp->donelist != NULL || + !!rdp->waitlistcount || + rdp->nextlist != NULL); +} + +int rcu_pending(int cpu) +{ + struct rcu_data *rdp = RCU_DATA_CPU(cpu); + + /* The CPU has at least one callback queued somewhere. */ + + if (rdp->donelist != NULL || + !!rdp->waitlistcount || + rdp->nextlist != NULL) + return 1; + + /* The RCU core needs an acknowledgement from this CPU. 
*/ + + if ((per_cpu(rcu_flip_flag, cpu) == rcu_flipped) || + (per_cpu(rcu_mb_flag, cpu) == rcu_mb_needed)) + return 1; + + /* This CPU has fallen behind the global grace-period number. */ + + if (rdp->completed != rcu_ctrlblk.completed) + return 1; + + /* Nothing needed from this CPU. */ + + return 0; +} + +void __init __rcu_init(void) +{ + int cpu; + int i; + struct rcu_data *rdp; + + printk(KERN_NOTICE "Preemptible RCU implementation.\n"); + for_each_possible_cpu(cpu) { + rdp = RCU_DATA_CPU(cpu); + spin_lock_init(&rdp->lock); + rdp->completed = 0; + rdp->waitlistcount = 0; + rdp->nextlist = NULL; + rdp->nexttail = &rdp->nextlist; + for (i = 0; i < GP_STAGES; i++) { + rdp->waitlist[i] = NULL; + rdp->waittail[i] = &rdp->waitlist[i]; + } + rdp->donelist = NULL; + rdp->donetail = &rdp->donelist; + rdp->rcu_flipctr[0] = 0; + rdp->rcu_flipctr[1] = 0; + } + open_softirq(RCU_SOFTIRQ, rcu_process_callbacks, NULL); +} + +/* + * Deprecated, use synchronize_rcu() or synchronize_sched() instead. + */ +void synchronize_kernel(void) +{ + synchronize_rcu(); +} + +#ifdef CONFIG_RCU_TRACE +long *rcupreempt_flipctr(int cpu) +{ + return &RCU_DATA_CPU(cpu)->rcu_flipctr[0]; +} +EXPORT_SYMBOL_GPL(rcupreempt_flipctr); + +int rcupreempt_flip_flag(int cpu) +{ + return per_cpu(rcu_flip_flag, cpu); +} +EXPORT_SYMBOL_GPL(rcupreempt_flip_flag); + +int rcupreempt_mb_flag(int cpu) +{ + return per_cpu(rcu_mb_flag, cpu); +} +EXPORT_SYMBOL_GPL(rcupreempt_mb_flag); + +char *rcupreempt_try_flip_state_name(void) +{ + return rcu_try_flip_state_names[rcu_ctrlblk.rcu_try_flip_state]; +} +EXPORT_SYMBOL_GPL(rcupreempt_try_flip_state_name); + +struct rcupreempt_trace *rcupreempt_trace_cpu(int cpu) +{ + struct rcu_data *rdp = RCU_DATA_CPU(cpu); + + return &rdp->trace; +} +EXPORT_SYMBOL_GPL(rcupreempt_trace_cpu); + +#endif /* #ifdef RCU_TRACE */ diff --git a/kernel/rcupreempt_trace.c b/kernel/rcupreempt_trace.c new file mode 100644 index 000000000000..49ac4947af24 --- /dev/null +++ b/kernel/rcupreempt_trace.c @@ -0,0 +1,330 @@ +/* + * Read-Copy Update tracing for realtime implementation + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation; either version 2 of the License, or + * (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA. 
+ * + * Copyright IBM Corporation, 2006 + * + * Papers: http://www.rdrop.com/users/paulmck/RCU + * + * For detailed explanation of Read-Copy Update mechanism see - + * Documentation/RCU/ *.txt + * + */ +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +static struct mutex rcupreempt_trace_mutex; +static char *rcupreempt_trace_buf; +#define RCUPREEMPT_TRACE_BUF_SIZE 4096 + +void rcupreempt_trace_move2done(struct rcupreempt_trace *trace) +{ + trace->done_length += trace->wait_length; + trace->done_add += trace->wait_length; + trace->wait_length = 0; +} +void rcupreempt_trace_move2wait(struct rcupreempt_trace *trace) +{ + trace->wait_length += trace->next_length; + trace->wait_add += trace->next_length; + trace->next_length = 0; +} +void rcupreempt_trace_try_flip_1(struct rcupreempt_trace *trace) +{ + atomic_inc(&trace->rcu_try_flip_1); +} +void rcupreempt_trace_try_flip_e1(struct rcupreempt_trace *trace) +{ + atomic_inc(&trace->rcu_try_flip_e1); +} +void rcupreempt_trace_try_flip_i1(struct rcupreempt_trace *trace) +{ + trace->rcu_try_flip_i1++; +} +void rcupreempt_trace_try_flip_ie1(struct rcupreempt_trace *trace) +{ + trace->rcu_try_flip_ie1++; +} +void rcupreempt_trace_try_flip_g1(struct rcupreempt_trace *trace) +{ + trace->rcu_try_flip_g1++; +} +void rcupreempt_trace_try_flip_a1(struct rcupreempt_trace *trace) +{ + trace->rcu_try_flip_a1++; +} +void rcupreempt_trace_try_flip_ae1(struct rcupreempt_trace *trace) +{ + trace->rcu_try_flip_ae1++; +} +void rcupreempt_trace_try_flip_a2(struct rcupreempt_trace *trace) +{ + trace->rcu_try_flip_a2++; +} +void rcupreempt_trace_try_flip_z1(struct rcupreempt_trace *trace) +{ + trace->rcu_try_flip_z1++; +} +void rcupreempt_trace_try_flip_ze1(struct rcupreempt_trace *trace) +{ + trace->rcu_try_flip_ze1++; +} +void rcupreempt_trace_try_flip_z2(struct rcupreempt_trace *trace) +{ + trace->rcu_try_flip_z2++; +} +void rcupreempt_trace_try_flip_m1(struct rcupreempt_trace *trace) +{ + trace->rcu_try_flip_m1++; +} +void rcupreempt_trace_try_flip_me1(struct rcupreempt_trace *trace) +{ + trace->rcu_try_flip_me1++; +} +void rcupreempt_trace_try_flip_m2(struct rcupreempt_trace *trace) +{ + trace->rcu_try_flip_m2++; +} +void rcupreempt_trace_check_callbacks(struct rcupreempt_trace *trace) +{ + trace->rcu_check_callbacks++; +} +void rcupreempt_trace_done_remove(struct rcupreempt_trace *trace) +{ + trace->done_remove += trace->done_length; + trace->done_length = 0; +} +void rcupreempt_trace_invoke(struct rcupreempt_trace *trace) +{ + atomic_inc(&trace->done_invoked); +} +void rcupreempt_trace_next_add(struct rcupreempt_trace *trace) +{ + trace->next_add++; + trace->next_length++; +} + +static void rcupreempt_trace_sum(struct rcupreempt_trace *sp) +{ + struct rcupreempt_trace *cp; + int cpu; + + memset(sp, 0, sizeof(*sp)); + for_each_possible_cpu(cpu) { + cp = rcupreempt_trace_cpu(cpu); + sp->next_length += cp->next_length; + sp->next_add += cp->next_add; + sp->wait_length += cp->wait_length; + sp->wait_add += cp->wait_add; + sp->done_length += cp->done_length; + sp->done_add += cp->done_add; + sp->done_remove += cp->done_remove; + atomic_set(&sp->done_invoked, atomic_read(&cp->done_invoked)); + sp->rcu_check_callbacks += cp->rcu_check_callbacks; + atomic_set(&sp->rcu_try_flip_1, + atomic_read(&cp->rcu_try_flip_1)); + atomic_set(&sp->rcu_try_flip_e1, + atomic_read(&cp->rcu_try_flip_e1)); + sp->rcu_try_flip_i1 += 
cp->rcu_try_flip_i1; + sp->rcu_try_flip_ie1 += cp->rcu_try_flip_ie1; + sp->rcu_try_flip_g1 += cp->rcu_try_flip_g1; + sp->rcu_try_flip_a1 += cp->rcu_try_flip_a1; + sp->rcu_try_flip_ae1 += cp->rcu_try_flip_ae1; + sp->rcu_try_flip_a2 += cp->rcu_try_flip_a2; + sp->rcu_try_flip_z1 += cp->rcu_try_flip_z1; + sp->rcu_try_flip_ze1 += cp->rcu_try_flip_ze1; + sp->rcu_try_flip_z2 += cp->rcu_try_flip_z2; + sp->rcu_try_flip_m1 += cp->rcu_try_flip_m1; + sp->rcu_try_flip_me1 += cp->rcu_try_flip_me1; + sp->rcu_try_flip_m2 += cp->rcu_try_flip_m2; + } +} + +static ssize_t rcustats_read(struct file *filp, char __user *buffer, + size_t count, loff_t *ppos) +{ + struct rcupreempt_trace trace; + ssize_t bcount; + int cnt = 0; + + rcupreempt_trace_sum(&trace); + mutex_lock(&rcupreempt_trace_mutex); + snprintf(&rcupreempt_trace_buf[cnt], RCUPREEMPT_TRACE_BUF_SIZE - cnt, + "ggp=%ld rcc=%ld\n", + rcu_batches_completed(), + trace.rcu_check_callbacks); + snprintf(&rcupreempt_trace_buf[cnt], RCUPREEMPT_TRACE_BUF_SIZE - cnt, + "na=%ld nl=%ld wa=%ld wl=%ld da=%ld dl=%ld dr=%ld di=%d\n" + "1=%d e1=%d i1=%ld ie1=%ld g1=%ld a1=%ld ae1=%ld a2=%ld\n" + "z1=%ld ze1=%ld z2=%ld m1=%ld me1=%ld m2=%ld\n", + + trace.next_add, trace.next_length, + trace.wait_add, trace.wait_length, + trace.done_add, trace.done_length, + trace.done_remove, atomic_read(&trace.done_invoked), + atomic_read(&trace.rcu_try_flip_1), + atomic_read(&trace.rcu_try_flip_e1), + trace.rcu_try_flip_i1, trace.rcu_try_flip_ie1, + trace.rcu_try_flip_g1, + trace.rcu_try_flip_a1, trace.rcu_try_flip_ae1, + trace.rcu_try_flip_a2, + trace.rcu_try_flip_z1, trace.rcu_try_flip_ze1, + trace.rcu_try_flip_z2, + trace.rcu_try_flip_m1, trace.rcu_try_flip_me1, + trace.rcu_try_flip_m2); + bcount = simple_read_from_buffer(buffer, count, ppos, + rcupreempt_trace_buf, strlen(rcupreempt_trace_buf)); + mutex_unlock(&rcupreempt_trace_mutex); + return bcount; +} + +static ssize_t rcugp_read(struct file *filp, char __user *buffer, + size_t count, loff_t *ppos) +{ + long oldgp = rcu_batches_completed(); + ssize_t bcount; + + mutex_lock(&rcupreempt_trace_mutex); + synchronize_rcu(); + snprintf(rcupreempt_trace_buf, RCUPREEMPT_TRACE_BUF_SIZE, + "oldggp=%ld newggp=%ld\n", oldgp, rcu_batches_completed()); + bcount = simple_read_from_buffer(buffer, count, ppos, + rcupreempt_trace_buf, strlen(rcupreempt_trace_buf)); + mutex_unlock(&rcupreempt_trace_mutex); + return bcount; +} + +static ssize_t rcuctrs_read(struct file *filp, char __user *buffer, + size_t count, loff_t *ppos) +{ + int cnt = 0; + int cpu; + int f = rcu_batches_completed() & 0x1; + ssize_t bcount; + + mutex_lock(&rcupreempt_trace_mutex); + + cnt += snprintf(&rcupreempt_trace_buf[cnt], RCUPREEMPT_TRACE_BUF_SIZE, + "CPU last cur F M\n"); + for_each_online_cpu(cpu) { + long *flipctr = rcupreempt_flipctr(cpu); + cnt += snprintf(&rcupreempt_trace_buf[cnt], + RCUPREEMPT_TRACE_BUF_SIZE - cnt, + "%3d %4ld %3ld %d %d\n", + cpu, + flipctr[!f], + flipctr[f], + rcupreempt_flip_flag(cpu), + rcupreempt_mb_flag(cpu)); + } + cnt += snprintf(&rcupreempt_trace_buf[cnt], + RCUPREEMPT_TRACE_BUF_SIZE - cnt, + "ggp = %ld, state = %s\n", + rcu_batches_completed(), + rcupreempt_try_flip_state_name()); + cnt += snprintf(&rcupreempt_trace_buf[cnt], + RCUPREEMPT_TRACE_BUF_SIZE - cnt, + "\n"); + bcount = simple_read_from_buffer(buffer, count, ppos, + rcupreempt_trace_buf, strlen(rcupreempt_trace_buf)); + mutex_unlock(&rcupreempt_trace_mutex); + return bcount; +} + +static struct file_operations rcustats_fops = { + .owner = THIS_MODULE, + .read = 
rcustats_read, +}; + +static struct file_operations rcugp_fops = { + .owner = THIS_MODULE, + .read = rcugp_read, +}; + +static struct file_operations rcuctrs_fops = { + .owner = THIS_MODULE, + .read = rcuctrs_read, +}; + +static struct dentry *rcudir, *statdir, *ctrsdir, *gpdir; +static int rcupreempt_debugfs_init(void) +{ + rcudir = debugfs_create_dir("rcu", NULL); + if (!rcudir) + goto out; + statdir = debugfs_create_file("rcustats", 0444, rcudir, + NULL, &rcustats_fops); + if (!statdir) + goto free_out; + + gpdir = debugfs_create_file("rcugp", 0444, rcudir, NULL, &rcugp_fops); + if (!gpdir) + goto free_out; + + ctrsdir = debugfs_create_file("rcuctrs", 0444, rcudir, + NULL, &rcuctrs_fops); + if (!ctrsdir) + goto free_out; + return 0; +free_out: + if (statdir) + debugfs_remove(statdir); + if (gpdir) + debugfs_remove(gpdir); + debugfs_remove(rcudir); +out: + return 1; +} + +static int __init rcupreempt_trace_init(void) +{ + mutex_init(&rcupreempt_trace_mutex); + rcupreempt_trace_buf = kmalloc(RCUPREEMPT_TRACE_BUF_SIZE, GFP_KERNEL); + if (!rcupreempt_trace_buf) + return 1; + return rcupreempt_debugfs_init(); +} + +static void __exit rcupreempt_trace_cleanup(void) +{ + debugfs_remove(statdir); + debugfs_remove(gpdir); + debugfs_remove(ctrsdir); + debugfs_remove(rcudir); + kfree(rcupreempt_trace_buf); +} + + +module_init(rcupreempt_trace_init); +module_exit(rcupreempt_trace_cleanup); -- cgit v1.2.3 From eaf649e9fe6685f4c5a392cd0e16df5fd6660b7c Mon Sep 17 00:00:00 2001 From: "Paul E. McKenney" Date: Fri, 25 Jan 2008 21:08:25 +0100 Subject: Preempt-RCU: CPU Hotplug handling This patch allows preemptible RCU to tolerate CPU-hotplug operations. It accomplishes this by maintaining a local copy of a map of online CPUs, which it accesses under its own lock. Signed-off-by: Gautham R Shenoy Signed-off-by: Paul E. McKenney Reviewed-by: Steven Rostedt Signed-off-by: Ingo Molnar --- kernel/rcupreempt.c | 147 ++++++++++++++++++++++++++++++++++++++++++++++++++-- 1 file changed, 142 insertions(+), 5 deletions(-) (limited to 'kernel') diff --git a/kernel/rcupreempt.c b/kernel/rcupreempt.c index a5aabb1677f8..987cfb7ade89 100644 --- a/kernel/rcupreempt.c +++ b/kernel/rcupreempt.c @@ -147,6 +147,8 @@ static char *rcu_try_flip_state_names[] = { "idle", "waitack", "waitzero", "waitmb" }; #endif /* #ifdef CONFIG_RCU_TRACE */ +static cpumask_t rcu_cpu_online_map __read_mostly = CPU_MASK_NONE; + /* * Enum and per-CPU flag to determine when each CPU has seen * the most recent counter flip. @@ -445,7 +447,7 @@ rcu_try_flip_idle(void) /* Now ask each CPU for acknowledgement of the flip. */ - for_each_possible_cpu(cpu) + for_each_cpu_mask(cpu, rcu_cpu_online_map) per_cpu(rcu_flip_flag, cpu) = rcu_flipped; return 1; @@ -461,7 +463,7 @@ rcu_try_flip_waitack(void) int cpu; RCU_TRACE_ME(rcupreempt_trace_try_flip_a1); - for_each_possible_cpu(cpu) + for_each_cpu_mask(cpu, rcu_cpu_online_map) if (per_cpu(rcu_flip_flag, cpu) != rcu_flip_seen) { RCU_TRACE_ME(rcupreempt_trace_try_flip_ae1); return 0; @@ -492,7 +494,7 @@ rcu_try_flip_waitzero(void) /* Check to see if the sum of the "last" counters is zero. */ RCU_TRACE_ME(rcupreempt_trace_try_flip_z1); - for_each_possible_cpu(cpu) + for_each_cpu_mask(cpu, rcu_cpu_online_map) sum += RCU_DATA_CPU(cpu)->rcu_flipctr[lastidx]; if (sum != 0) { RCU_TRACE_ME(rcupreempt_trace_try_flip_ze1); @@ -507,7 +509,7 @@ rcu_try_flip_waitzero(void) smp_mb(); /* ^^^^^^^^^^^^ */ /* Call for a memory barrier from each CPU. 
*/ - for_each_possible_cpu(cpu) + for_each_cpu_mask(cpu, rcu_cpu_online_map) per_cpu(rcu_mb_flag, cpu) = rcu_mb_needed; RCU_TRACE_ME(rcupreempt_trace_try_flip_z2); @@ -525,7 +527,7 @@ rcu_try_flip_waitmb(void) int cpu; RCU_TRACE_ME(rcupreempt_trace_try_flip_m1); - for_each_possible_cpu(cpu) + for_each_cpu_mask(cpu, rcu_cpu_online_map) if (per_cpu(rcu_mb_flag, cpu) != rcu_mb_done) { RCU_TRACE_ME(rcupreempt_trace_try_flip_me1); return 0; @@ -637,6 +639,98 @@ void rcu_advance_callbacks(int cpu, int user) spin_unlock_irqrestore(&rdp->lock, flags); } +#ifdef CONFIG_HOTPLUG_CPU +#define rcu_offline_cpu_enqueue(srclist, srctail, dstlist, dsttail) do { \ + *dsttail = srclist; \ + if (srclist != NULL) { \ + dsttail = srctail; \ + srclist = NULL; \ + srctail = &srclist;\ + } \ + } while (0) + +void rcu_offline_cpu(int cpu) +{ + int i; + struct rcu_head *list = NULL; + unsigned long flags; + struct rcu_data *rdp = RCU_DATA_CPU(cpu); + struct rcu_head **tail = &list; + + /* + * Remove all callbacks from the newly dead CPU, retaining order. + * Otherwise rcu_barrier() will fail + */ + + spin_lock_irqsave(&rdp->lock, flags); + rcu_offline_cpu_enqueue(rdp->donelist, rdp->donetail, list, tail); + for (i = GP_STAGES - 1; i >= 0; i--) + rcu_offline_cpu_enqueue(rdp->waitlist[i], rdp->waittail[i], + list, tail); + rcu_offline_cpu_enqueue(rdp->nextlist, rdp->nexttail, list, tail); + spin_unlock_irqrestore(&rdp->lock, flags); + rdp->waitlistcount = 0; + + /* Disengage the newly dead CPU from the grace-period computation. */ + + spin_lock_irqsave(&rcu_ctrlblk.fliplock, flags); + rcu_check_mb(cpu); + if (per_cpu(rcu_flip_flag, cpu) == rcu_flipped) { + smp_mb(); /* Subsequent counter accesses must see new value */ + per_cpu(rcu_flip_flag, cpu) = rcu_flip_seen; + smp_mb(); /* Subsequent RCU read-side critical sections */ + /* seen -after- acknowledgement. */ + } + + RCU_DATA_ME()->rcu_flipctr[0] += RCU_DATA_CPU(cpu)->rcu_flipctr[0]; + RCU_DATA_ME()->rcu_flipctr[1] += RCU_DATA_CPU(cpu)->rcu_flipctr[1]; + + RCU_DATA_CPU(cpu)->rcu_flipctr[0] = 0; + RCU_DATA_CPU(cpu)->rcu_flipctr[1] = 0; + + cpu_clear(cpu, rcu_cpu_online_map); + + spin_unlock_irqrestore(&rcu_ctrlblk.fliplock, flags); + + /* + * Place the removed callbacks on the current CPU's queue. + * Make them all start a new grace period: simple approach, + * in theory could starve a given set of callbacks, but + * you would need to be doing some serious CPU hotplugging + * to make this happen. If this becomes a problem, adding + * a synchronize_rcu() to the hotplug path would be a simple + * fix. 
+ */ + + rdp = RCU_DATA_ME(); + spin_lock_irqsave(&rdp->lock, flags); + *rdp->nexttail = list; + if (list) + rdp->nexttail = tail; + spin_unlock_irqrestore(&rdp->lock, flags); +} + +void __devinit rcu_online_cpu(int cpu) +{ + unsigned long flags; + + spin_lock_irqsave(&rcu_ctrlblk.fliplock, flags); + cpu_set(cpu, rcu_cpu_online_map); + spin_unlock_irqrestore(&rcu_ctrlblk.fliplock, flags); +} + +#else /* #ifdef CONFIG_HOTPLUG_CPU */ + +void rcu_offline_cpu(int cpu) +{ +} + +void __devinit rcu_online_cpu(int cpu) +{ +} + +#endif /* #else #ifdef CONFIG_HOTPLUG_CPU */ + static void rcu_process_callbacks(struct softirq_action *unused) { unsigned long flags; @@ -746,6 +840,32 @@ int rcu_pending(int cpu) return 0; } +static int __cpuinit rcu_cpu_notify(struct notifier_block *self, + unsigned long action, void *hcpu) +{ + long cpu = (long)hcpu; + + switch (action) { + case CPU_UP_PREPARE: + case CPU_UP_PREPARE_FROZEN: + rcu_online_cpu(cpu); + break; + case CPU_UP_CANCELED: + case CPU_UP_CANCELED_FROZEN: + case CPU_DEAD: + case CPU_DEAD_FROZEN: + rcu_offline_cpu(cpu); + break; + default: + break; + } + return NOTIFY_OK; +} + +static struct notifier_block __cpuinitdata rcu_nb = { + .notifier_call = rcu_cpu_notify, +}; + void __init __rcu_init(void) { int cpu; @@ -769,6 +889,23 @@ void __init __rcu_init(void) rdp->rcu_flipctr[0] = 0; rdp->rcu_flipctr[1] = 0; } + register_cpu_notifier(&rcu_nb); + + /* + * We don't need protection against CPU-Hotplug here + * since + * a) If a CPU comes online while we are iterating over the + * cpu_online_map below, we would only end up making a + * duplicate call to rcu_online_cpu() which sets the corresponding + * CPU's mask in the rcu_cpu_online_map. + * + * b) A CPU cannot go offline at this point in time since the user + * does not have access to the sysfs interface, nor do we + * suspend the system. + */ + for_each_online_cpu(cpu) + rcu_cpu_notify(&rcu_nb, CPU_UP_PREPARE, (void *)(long) cpu); + open_softirq(RCU_SOFTIRQ, rcu_process_callbacks, NULL); } -- cgit v1.2.3 From dc938520d2bf343b239795cfa24e4f44649358dc Mon Sep 17 00:00:00 2001 From: Gregory Haskins Date: Fri, 25 Jan 2008 21:08:26 +0100 Subject: sched: dynamically update the root-domain span/online maps The baseline code statically builds the span maps when the domain is formed. Previous attempts at dynamically updating the maps caused a suspend-to-ram regression, which should now be fixed. Signed-off-by: Gregory Haskins CC: Gautham R Shenoy Signed-off-by: Ingo Molnar --- kernel/sched.c | 31 +++++++++++++++++++------------ 1 file changed, 19 insertions(+), 12 deletions(-) (limited to 'kernel') diff --git a/kernel/sched.c b/kernel/sched.c index 5834c7fb79a5..02d468844a91 100644 --- a/kernel/sched.c +++ b/kernel/sched.c @@ -359,8 +359,6 @@ struct rt_rq { * exclusive cpuset is created, we also create and attach a new root-domain * object. * - * By default the system creates a single root-domain with all cpus as - * members (mimicking the global state we have today). */ struct root_domain { atomic_t refcount; @@ -375,6 +373,10 @@ struct root_domain { atomic_t rto_count; }; +/* + * By default the system creates a single root-domain with all cpus as + * members (mimicking the global state we have today). 
+ */ static struct root_domain def_root_domain; #endif @@ -5859,6 +5861,9 @@ static void rq_attach_root(struct rq *rq, struct root_domain *rd) class->leave_domain(rq); } + cpu_clear(rq->cpu, old_rd->span); + cpu_clear(rq->cpu, old_rd->online); + if (atomic_dec_and_test(&old_rd->refcount)) kfree(old_rd); } @@ -5866,6 +5871,10 @@ static void rq_attach_root(struct rq *rq, struct root_domain *rd) atomic_inc(&rd->refcount); rq->rd = rd; + cpu_set(rq->cpu, rd->span); + if (cpu_isset(rq->cpu, cpu_online_map)) + cpu_set(rq->cpu, rd->online); + for (class = sched_class_highest; class; class = class->next) { if (class->join_domain) class->join_domain(rq); @@ -5874,23 +5883,21 @@ static void rq_attach_root(struct rq *rq, struct root_domain *rd) spin_unlock_irqrestore(&rq->lock, flags); } -static void init_rootdomain(struct root_domain *rd, const cpumask_t *map) +static void init_rootdomain(struct root_domain *rd) { memset(rd, 0, sizeof(*rd)); - rd->span = *map; - cpus_and(rd->online, rd->span, cpu_online_map); + cpus_clear(rd->span); + cpus_clear(rd->online); } static void init_defrootdomain(void) { - cpumask_t cpus = CPU_MASK_ALL; - - init_rootdomain(&def_root_domain, &cpus); + init_rootdomain(&def_root_domain); atomic_set(&def_root_domain.refcount, 1); } -static struct root_domain *alloc_rootdomain(const cpumask_t *map) +static struct root_domain *alloc_rootdomain(void) { struct root_domain *rd; @@ -5898,7 +5905,7 @@ static struct root_domain *alloc_rootdomain(const cpumask_t *map) if (!rd) return NULL; - init_rootdomain(rd, map); + init_rootdomain(rd); return rd; } @@ -6319,7 +6326,7 @@ static int build_sched_domains(const cpumask_t *cpu_map) sched_group_nodes_bycpu[first_cpu(*cpu_map)] = sched_group_nodes; #endif - rd = alloc_rootdomain(cpu_map); + rd = alloc_rootdomain(); if (!rd) { printk(KERN_WARNING "Cannot alloc root domain\n"); return -ENOMEM; @@ -6894,7 +6901,6 @@ void __init sched_init(void) #ifdef CONFIG_SMP rq->sd = NULL; rq->rd = NULL; - rq_attach_root(rq, &def_root_domain); rq->active_balance = 0; rq->next_balance = jiffies; rq->push_cpu = 0; @@ -6903,6 +6909,7 @@ void __init sched_init(void) INIT_LIST_HEAD(&rq->migration_queue); rq->rt.highest_prio = MAX_RT_PRIO; rq->rt.overloaded = 0; + rq_attach_root(rq, &def_root_domain); #endif atomic_set(&rq->nr_iowait, 0); -- cgit v1.2.3 From 8eb703e4f33488bf75829564d51d427e17f7cd4c Mon Sep 17 00:00:00 2001 From: Pavel Emelyanov Date: Fri, 25 Jan 2008 21:08:26 +0100 Subject: uids: merge multiple error paths in alloc_uid() into one There are already 4 error paths in alloc_uid() that do incremental rollbacks. I think it's time to merge them. This costs us 8 lines of code :) Maybe it would be better to merge this patch with the previous one, but I remember that some time ago I sent a similar patch (fixing the error path and cleaning it), but I was told to make two patches in such cases. 
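As a rough stand-alone sketch (not part of the patch), the pattern being adopted here is the usual single-exit goto ladder: every failing step jumps to a label that releases only what was acquired before it, so each rollback is written exactly once. The struct and the malloc() steps below are placeholders for the keyring, scheduler and sysfs setup that the real alloc_uid() performs.

#include <stdlib.h>

/* Placeholder aggregate; the fields stand in for the per-user state that
 * alloc_uid() actually sets up (keyrings, scheduler entity, sysfs entries). */
struct object {
	char *a;
	char *b;
	char *c;
};

/*
 * Single-exit error handling: each failing step jumps to the label that
 * undoes only what was acquired before it, so every rollback appears once
 * instead of being repeated in four separate branches.
 */
static struct object *object_create(void)
{
	struct object *obj = calloc(1, sizeof(*obj));

	if (!obj)
		goto out;

	obj->a = malloc(64);
	if (!obj->a)
		goto out_free_obj;

	obj->b = malloc(128);
	if (!obj->b)
		goto out_free_a;

	obj->c = malloc(256);
	if (!obj->c)
		goto out_free_b;

	return obj;

out_free_b:
	free(obj->b);
out_free_a:
	free(obj->a);
out_free_obj:
	free(obj);
out:
	return NULL;
}

int main(void)
{
	struct object *obj = object_create();

	if (!obj)
		return 1;
	free(obj->c);
	free(obj->b);
	free(obj->a);
	free(obj);
	return 0;
}
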
Signed-off-by: Pavel Emelyanov Acked-by: Dhaval Giani Signed-off-by: Andrew Morton Signed-off-by: Ingo Molnar --- kernel/user.c | 47 ++++++++++++++++++++--------------------------- 1 file changed, 20 insertions(+), 27 deletions(-) (limited to 'kernel') diff --git a/kernel/user.c b/kernel/user.c index ab4fd706993b..bc1c48d35cb3 100644 --- a/kernel/user.c +++ b/kernel/user.c @@ -319,7 +319,7 @@ void free_uid(struct user_struct *up) struct user_struct * alloc_uid(struct user_namespace *ns, uid_t uid) { struct hlist_head *hashent = uidhashentry(ns, uid); - struct user_struct *up; + struct user_struct *up, *new; /* Make uid_hash_find() + uids_user_create() + uid_hash_insert() * atomic. @@ -331,13 +331,9 @@ struct user_struct * alloc_uid(struct user_namespace *ns, uid_t uid) spin_unlock_irq(&uidhash_lock); if (!up) { - struct user_struct *new; - new = kmem_cache_alloc(uid_cachep, GFP_KERNEL); - if (!new) { - uids_mutex_unlock(); - return NULL; - } + if (!new) + goto out_unlock; new->uid = uid; atomic_set(&new->__count, 1); @@ -353,28 +349,14 @@ struct user_struct * alloc_uid(struct user_namespace *ns, uid_t uid) #endif new->locked_shm = 0; - if (alloc_uid_keyring(new, current) < 0) { - kmem_cache_free(uid_cachep, new); - uids_mutex_unlock(); - return NULL; - } + if (alloc_uid_keyring(new, current) < 0) + goto out_free_user; - if (sched_create_user(new) < 0) { - key_put(new->uid_keyring); - key_put(new->session_keyring); - kmem_cache_free(uid_cachep, new); - uids_mutex_unlock(); - return NULL; - } + if (sched_create_user(new) < 0) + goto out_put_keys; - if (uids_user_create(new)) { - sched_destroy_user(new); - key_put(new->uid_keyring); - key_put(new->session_keyring); - kmem_cache_free(uid_cachep, new); - uids_mutex_unlock(); - return NULL; - } + if (uids_user_create(new)) + goto out_destoy_sched; /* * Before adding this, check whether we raced @@ -402,6 +384,17 @@ struct user_struct * alloc_uid(struct user_namespace *ns, uid_t uid) uids_mutex_unlock(); return up; + +out_destoy_sched: + sched_destroy_user(new); +out_put_keys: + key_put(new->uid_keyring); + key_put(new->session_keyring); +out_free_user: + kmem_cache_free(uid_cachep, new); +out_unlock: + uids_mutex_unlock(); + return NULL; } void switch_uid(struct user_struct *new_user) -- cgit v1.2.3 From fa717060f1ab7eb6570f2fb49136f838fc9195a9 Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Fri, 25 Jan 2008 21:08:27 +0100 Subject: sched: sched_rt_entity Move the task_struct members specific to rt scheduling together. A future optimization could be to put sched_entity and sched_rt_entity into a union. 
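A hedged, stand-alone sketch of that layout (field names trimmed and list_head replaced by plain pointers so the snippet is self-contained); the union at the end is only the possible follow-up mentioned above, not something this patch implements.

/* Trimmed stand-ins for the two scheduling entities; the real structures
 * in include/linux/sched.h carry many more fields. */
struct se_sketch {				/* CFS per-task state */
	unsigned long load_weight;
	unsigned long long vruntime;
};

struct rt_se_sketch {				/* RT per-task state */
	struct { void *prev, *next; } run_list;	/* list_head stand-in */
	unsigned int time_slice;
};

/* What the patch does: group the rt fields and keep both entities
 * side by side in the task structure. */
struct task_sketch {
	int prio;
	struct se_sketch se;
	struct rt_se_sketch rt;
};

/*
 * The speculative follow-up from the changelog: since a task is scheduled
 * by one class at a time, the two entities could in principle overlap in
 * a union, saving space at the cost of re-initialising the active member
 * whenever the scheduling policy changes.
 */
struct task_sketch_union {
	int prio;
	union {
		struct se_sketch se;
		struct rt_se_sketch rt;
	} u;
};
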
Signed-off-by: Peter Zijlstra CC: Srivatsa Vaddagiri Signed-off-by: Ingo Molnar --- include/linux/init_task.h | 5 +++-- include/linux/sched.h | 8 ++++++-- kernel/sched.c | 2 +- kernel/sched_rt.c | 20 ++++++++++---------- mm/oom_kill.c | 2 +- 5 files changed, 21 insertions(+), 16 deletions(-) (limited to 'kernel') diff --git a/include/linux/init_task.h b/include/linux/init_task.h index 572c65bcc80f..ee65d87bedb7 100644 --- a/include/linux/init_task.h +++ b/include/linux/init_task.h @@ -133,9 +133,10 @@ extern struct group_info init_groups; .nr_cpus_allowed = NR_CPUS, \ .mm = NULL, \ .active_mm = &init_mm, \ - .run_list = LIST_HEAD_INIT(tsk.run_list), \ + .rt = { \ + .run_list = LIST_HEAD_INIT(tsk.rt.run_list), \ + .time_slice = HZ, }, \ .ioprio = 0, \ - .time_slice = HZ, \ .tasks = LIST_HEAD_INIT(tsk.tasks), \ .ptrace_children= LIST_HEAD_INIT(tsk.ptrace_children), \ .ptrace_list = LIST_HEAD_INIT(tsk.ptrace_list), \ diff --git a/include/linux/sched.h b/include/linux/sched.h index 72e1b8ecfbe1..a06d09ebd5c6 100644 --- a/include/linux/sched.h +++ b/include/linux/sched.h @@ -929,6 +929,11 @@ struct sched_entity { #endif }; +struct sched_rt_entity { + struct list_head run_list; + unsigned int time_slice; +}; + struct task_struct { volatile long state; /* -1 unrunnable, 0 runnable, >0 stopped */ void *stack; @@ -945,9 +950,9 @@ struct task_struct { #endif int prio, static_prio, normal_prio; - struct list_head run_list; const struct sched_class *sched_class; struct sched_entity se; + struct sched_rt_entity rt; #ifdef CONFIG_PREEMPT_NOTIFIERS /* list of struct preempt_notifier: */ @@ -972,7 +977,6 @@ struct task_struct { unsigned int policy; cpumask_t cpus_allowed; int nr_cpus_allowed; - unsigned int time_slice; #ifdef CONFIG_PREEMPT_RCU int rcu_read_lock_nesting; diff --git a/kernel/sched.c b/kernel/sched.c index 02d468844a91..c2cedd09d895 100644 --- a/kernel/sched.c +++ b/kernel/sched.c @@ -1685,7 +1685,7 @@ static void __sched_fork(struct task_struct *p) p->se.wait_max = 0; #endif - INIT_LIST_HEAD(&p->run_list); + INIT_LIST_HEAD(&p->rt.run_list); p->se.on_rq = 0; #ifdef CONFIG_PREEMPT_NOTIFIERS diff --git a/kernel/sched_rt.c b/kernel/sched_rt.c index 9affb3c9d3db..29963af782ae 100644 --- a/kernel/sched_rt.c +++ b/kernel/sched_rt.c @@ -111,7 +111,7 @@ static void enqueue_task_rt(struct rq *rq, struct task_struct *p, int wakeup) { struct rt_prio_array *array = &rq->rt.active; - list_add_tail(&p->run_list, array->queue + p->prio); + list_add_tail(&p->rt.run_list, array->queue + p->prio); __set_bit(p->prio, array->bitmap); inc_cpu_load(rq, p->se.load.weight); @@ -127,7 +127,7 @@ static void dequeue_task_rt(struct rq *rq, struct task_struct *p, int sleep) update_curr_rt(rq); - list_del(&p->run_list); + list_del(&p->rt.run_list); if (list_empty(array->queue + p->prio)) __clear_bit(p->prio, array->bitmap); dec_cpu_load(rq, p->se.load.weight); @@ -143,7 +143,7 @@ static void requeue_task_rt(struct rq *rq, struct task_struct *p) { struct rt_prio_array *array = &rq->rt.active; - list_move_tail(&p->run_list, array->queue + p->prio); + list_move_tail(&p->rt.run_list, array->queue + p->prio); } static void @@ -212,7 +212,7 @@ static struct task_struct *pick_next_task_rt(struct rq *rq) return NULL; queue = array->queue + idx; - next = list_entry(queue->next, struct task_struct, run_list); + next = list_entry(queue->next, struct task_struct, rt.run_list); next->se.exec_start = rq->clock; @@ -261,14 +261,14 @@ static struct task_struct *pick_next_highest_task_rt(struct rq *rq, int cpu) queue = array->queue + 
idx; BUG_ON(list_empty(queue)); - next = list_entry(queue->next, struct task_struct, run_list); + next = list_entry(queue->next, struct task_struct, rt.run_list); if (unlikely(pick_rt_task(rq, next, cpu))) goto out; if (queue->next->next != queue) { /* same prio task */ next = list_entry(queue->next->next, struct task_struct, - run_list); + rt.run_list); if (pick_rt_task(rq, next, cpu)) goto out; } @@ -282,7 +282,7 @@ static struct task_struct *pick_next_highest_task_rt(struct rq *rq, int cpu) queue = array->queue + idx; BUG_ON(list_empty(queue)); - list_for_each_entry(next, queue, run_list) { + list_for_each_entry(next, queue, rt.run_list) { if (pick_rt_task(rq, next, cpu)) goto out; } @@ -846,16 +846,16 @@ static void task_tick_rt(struct rq *rq, struct task_struct *p) if (p->policy != SCHED_RR) return; - if (--p->time_slice) + if (--p->rt.time_slice) return; - p->time_slice = DEF_TIMESLICE; + p->rt.time_slice = DEF_TIMESLICE; /* * Requeue to the end of queue if we are not the only element * on the queue: */ - if (p->run_list.prev != p->run_list.next) { + if (p->rt.run_list.prev != p->rt.run_list.next) { requeue_task_rt(rq, p); set_tsk_need_resched(p); } diff --git a/mm/oom_kill.c b/mm/oom_kill.c index 91a081a82f55..96473b482099 100644 --- a/mm/oom_kill.c +++ b/mm/oom_kill.c @@ -286,7 +286,7 @@ static void __oom_kill_task(struct task_struct *p, int verbose) * all the memory it needs. That way it should be able to * exit() and clear out its resources quickly... */ - p->time_slice = HZ; + p->rt.time_slice = HZ; set_tsk_thread_flag(p, TIF_MEMDIE); force_sig(SIGKILL, p); -- cgit v1.2.3 From 78f2c7db6068fd6ef75b8c120f04a388848eacb5 Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Fri, 25 Jan 2008 21:08:27 +0100 Subject: sched: SCHED_FIFO/SCHED_RR watchdog timer Introduce a new rlimit that allows the user to set a runtime timeout on real-time tasks their slice. Once this limit is exceeded the task will receive SIGXCPU. So it measures runtime since the last sleep. Input and ideas by Thomas Gleixner and Lennart Poettering. Signed-off-by: Peter Zijlstra CC: Lennart Poettering CC: Michael Kerrisk CC: Ulrich Drepper Signed-off-by: Ingo Molnar --- include/asm-generic/resource.h | 5 +++-- include/linux/sched.h | 1 + kernel/posix-cpu-timers.c | 29 +++++++++++++++++++++++++++++ kernel/sched_rt.c | 30 ++++++++++++++++++++++++++++++ 4 files changed, 63 insertions(+), 2 deletions(-) (limited to 'kernel') diff --git a/include/asm-generic/resource.h b/include/asm-generic/resource.h index a4a22cc35898..587566f95f6c 100644 --- a/include/asm-generic/resource.h +++ b/include/asm-generic/resource.h @@ -44,8 +44,8 @@ #define RLIMIT_NICE 13 /* max nice prio allowed to raise to 0-39 for nice level 19 .. -20 */ #define RLIMIT_RTPRIO 14 /* maximum realtime priority */ - -#define RLIM_NLIMITS 15 +#define RLIMIT_RTTIME 15 /* timeout for RT tasks in us */ +#define RLIM_NLIMITS 16 /* * SuS says limits have to be unsigned. 
@@ -86,6 +86,7 @@ [RLIMIT_MSGQUEUE] = { MQ_BYTES_MAX, MQ_BYTES_MAX }, \ [RLIMIT_NICE] = { 0, 0 }, \ [RLIMIT_RTPRIO] = { 0, 0 }, \ + [RLIMIT_RTTIME] = { RLIM_INFINITY, RLIM_INFINITY }, \ } #endif /* __KERNEL__ */ diff --git a/include/linux/sched.h b/include/linux/sched.h index a06d09ebd5c6..fe3f8fbc614e 100644 --- a/include/linux/sched.h +++ b/include/linux/sched.h @@ -932,6 +932,7 @@ struct sched_entity { struct sched_rt_entity { struct list_head run_list; unsigned int time_slice; + unsigned long timeout; }; struct task_struct { diff --git a/kernel/posix-cpu-timers.c b/kernel/posix-cpu-timers.c index 68c96376e84a..2c076b36c4f6 100644 --- a/kernel/posix-cpu-timers.c +++ b/kernel/posix-cpu-timers.c @@ -967,6 +967,7 @@ static void check_thread_timers(struct task_struct *tsk, { int maxfire; struct list_head *timers = tsk->cpu_timers; + struct signal_struct *const sig = tsk->signal; maxfire = 20; tsk->it_prof_expires = cputime_zero; @@ -1011,6 +1012,34 @@ static void check_thread_timers(struct task_struct *tsk, t->firing = 1; list_move_tail(&t->entry, firing); } + + /* + * Check for the special case thread timers. + */ + if (sig->rlim[RLIMIT_RTTIME].rlim_cur != RLIM_INFINITY) { + unsigned long hard = sig->rlim[RLIMIT_RTTIME].rlim_max; + unsigned long *soft = &sig->rlim[RLIMIT_RTTIME].rlim_cur; + + if (tsk->rt.timeout > DIV_ROUND_UP(hard, USEC_PER_SEC/HZ)) { + /* + * At the hard limit, we just die. + * No need to calculate anything else now. + */ + __group_send_sig_info(SIGKILL, SEND_SIG_PRIV, tsk); + return; + } + if (tsk->rt.timeout > DIV_ROUND_UP(*soft, USEC_PER_SEC/HZ)) { + /* + * At the soft limit, send a SIGXCPU every second. + */ + if (sig->rlim[RLIMIT_RTTIME].rlim_cur + < sig->rlim[RLIMIT_RTTIME].rlim_max) { + sig->rlim[RLIMIT_RTTIME].rlim_cur += + USEC_PER_SEC; + } + __group_send_sig_info(SIGXCPU, SEND_SIG_PRIV, tsk); + } + } } /* diff --git a/kernel/sched_rt.c b/kernel/sched_rt.c index 29963af782ae..f350f7b15158 100644 --- a/kernel/sched_rt.c +++ b/kernel/sched_rt.c @@ -116,6 +116,9 @@ static void enqueue_task_rt(struct rq *rq, struct task_struct *p, int wakeup) inc_cpu_load(rq, p->se.load.weight); inc_rt_tasks(p, rq); + + if (wakeup) + p->rt.timeout = 0; } /* @@ -834,11 +837,38 @@ static void prio_changed_rt(struct rq *rq, struct task_struct *p, } } +static void watchdog(struct rq *rq, struct task_struct *p) +{ + unsigned long soft, hard; + + if (!p->signal) + return; + + soft = p->signal->rlim[RLIMIT_RTTIME].rlim_cur; + hard = p->signal->rlim[RLIMIT_RTTIME].rlim_max; + + if (soft != RLIM_INFINITY) { + unsigned long next; + + p->rt.timeout++; + next = DIV_ROUND_UP(min(soft, hard), USEC_PER_SEC/HZ); + if (next > p->rt.timeout) { + u64 next_time = p->se.sum_exec_runtime; + + next_time += next * (NSEC_PER_SEC/HZ); + if (p->it_sched_expires > next_time) + p->it_sched_expires = next_time; + } else + p->it_sched_expires = p->se.sum_exec_runtime; + } +} static void task_tick_rt(struct rq *rq, struct task_struct *p) { update_curr_rt(rq); + watchdog(rq, p); + /* * RR tasks need a special form of timeslice management. * FIFO tasks have no timeslices. -- cgit v1.2.3 From 03319ec8b06849051747a17aa2a0f9aba9277980 Mon Sep 17 00:00:00 2001 From: Ingo Molnar Date: Fri, 25 Jan 2008 21:08:28 +0100 Subject: sched: documentation, whitespace fixes whitespace fixes. 
Signed-off-by: Ingo Molnar --- kernel/sched.c | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) (limited to 'kernel') diff --git a/kernel/sched.c b/kernel/sched.c index c2cedd09d895..b9ee0f4db66a 100644 --- a/kernel/sched.c +++ b/kernel/sched.c @@ -182,7 +182,7 @@ struct task_group { * * Bw(A) = 1000/(1000+2000+3000) * 100 = 16.66% * Bw(B) = 2000/(1000+2000+3000) * 100 = 33.33% - * Bw(C) = 3000/(1000+2000+3000) * 100 = 50% + * Bw(C) = 3000/(1000+2000+3000) * 100 = 50% * * The weight assigned to a task group's schedulable entities on every * cpu (task_group.se[a_cpu]->load.weight) is derived from the task @@ -192,9 +192,9 @@ struct task_group { * tg_A->se[0]->load.weight = tg_A->se[1]->load.weight = 1000; * * Note: It's not necessary that each of a task's group schedulable - * entity have the same weight on all CPUs. If the group - * has 2 of its tasks on CPU0 and 1 task on CPU1, then a - * better distribution of weight could be: + * entity have the same weight on all CPUs. If the group + * has 2 of its tasks on CPU0 and 1 task on CPU1, then a + * better distribution of weight could be: * * tg_A->se[0]->load.weight = 2/3 * 2000 = 1333 * tg_A->se[1]->load.weight = 1/2 * 2000 = 667 -- cgit v1.2.3 From 02b67cc3ba36bdba351d6c3a00593f4ec550d9d3 Mon Sep 17 00:00:00 2001 From: Herbert Xu Date: Fri, 25 Jan 2008 21:08:28 +0100 Subject: sched: do not do cond_resched() when CONFIG_PREEMPT Why do we even have cond_resched when real preemption is on? It seems to be a waste of space and time. remove cond_resched with CONFIG_PREEMPT on. Signed-off-by: Ingo Molnar --- include/linux/kernel.h | 4 ++-- include/linux/sched.h | 13 ++++++++++++- kernel/sched.c | 6 ++++-- 3 files changed, 18 insertions(+), 5 deletions(-) (limited to 'kernel') diff --git a/include/linux/kernel.h b/include/linux/kernel.h index 94bc99656963..a7283c9beadf 100644 --- a/include/linux/kernel.h +++ b/include/linux/kernel.h @@ -105,8 +105,8 @@ struct user; * supposed to. */ #ifdef CONFIG_PREEMPT_VOLUNTARY -extern int cond_resched(void); -# define might_resched() cond_resched() +extern int _cond_resched(void); +# define might_resched() _cond_resched() #else # define might_resched() do { } while (0) #endif diff --git a/include/linux/sched.h b/include/linux/sched.h index fe3f8fbc614e..7907845c2348 100644 --- a/include/linux/sched.h +++ b/include/linux/sched.h @@ -1885,7 +1885,18 @@ static inline int need_resched(void) * cond_resched_lock() will drop the spinlock before scheduling, * cond_resched_softirq() will enable bhs before scheduling. 
*/ -extern int cond_resched(void); +#ifdef CONFIG_PREEMPT +static inline int cond_resched(void) +{ + return 0; +} +#else +extern int _cond_resched(void); +static inline int cond_resched(void) +{ + return _cond_resched(); +} +#endif extern int cond_resched_lock(spinlock_t * lock); extern int cond_resched_softirq(void); diff --git a/kernel/sched.c b/kernel/sched.c index b9ee0f4db66a..6ee37602a6d8 100644 --- a/kernel/sched.c +++ b/kernel/sched.c @@ -4678,7 +4678,8 @@ static void __cond_resched(void) } while (need_resched()); } -int __sched cond_resched(void) +#if !defined(CONFIG_PREEMPT) || defined(CONFIG_PREEMPT_VOLUNTARY) +int __sched _cond_resched(void) { if (need_resched() && !(preempt_count() & PREEMPT_ACTIVE) && system_state == SYSTEM_RUNNING) { @@ -4687,7 +4688,8 @@ int __sched cond_resched(void) } return 0; } -EXPORT_SYMBOL(cond_resched); +EXPORT_SYMBOL(_cond_resched); +#endif /* * cond_resched_lock() - if a reschedule is pending, drop the given lock, -- cgit v1.2.3 From 8f4d37ec073c17e2d4aa8851df5837d798606d6f Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Fri, 25 Jan 2008 21:08:29 +0100 Subject: sched: high-res preemption tick Use HR-timers (when available) to deliver an accurate preemption tick. The regular scheduler tick that runs at 1/HZ can be too coarse when nice level are used. The fairness system will still keep the cpu utilisation 'fair' by then delaying the task that got an excessive amount of CPU time but try to minimize this by delivering preemption points spot-on. The average frequency of this extra interrupt is sched_latency / nr_latency. Which need not be higher than 1/HZ, its just that the distribution within the sched_latency period is important. Signed-off-by: Peter Zijlstra Signed-off-by: Ingo Molnar --- arch/x86/kernel/entry_64.S | 6 +- arch/x86/kernel/signal_32.c | 3 + arch/x86/kernel/signal_64.c | 3 + include/asm-x86/thread_info_32.h | 2 + include/asm-x86/thread_info_64.h | 5 + include/linux/hrtimer.h | 9 ++ include/linux/sched.h | 3 +- kernel/Kconfig.hz | 2 + kernel/sched.c | 210 +++++++++++++++++++++++++++++++++++++-- kernel/sched_fair.c | 69 ++++++++++++- kernel/sched_idletask.c | 2 +- kernel/sched_rt.c | 2 +- 12 files changed, 295 insertions(+), 21 deletions(-) (limited to 'kernel') diff --git a/arch/x86/kernel/entry_64.S b/arch/x86/kernel/entry_64.S index 3a058bb16409..e70f3881d7e4 100644 --- a/arch/x86/kernel/entry_64.S +++ b/arch/x86/kernel/entry_64.S @@ -283,7 +283,7 @@ sysret_careful: sysret_signal: TRACE_IRQS_ON sti - testl $(_TIF_SIGPENDING|_TIF_SINGLESTEP|_TIF_MCE_NOTIFY),%edx + testl $_TIF_DO_NOTIFY_MASK,%edx jz 1f /* Really a signal */ @@ -377,7 +377,7 @@ int_very_careful: jmp int_restore_rest int_signal: - testl $(_TIF_SIGPENDING|_TIF_SINGLESTEP|_TIF_MCE_NOTIFY),%edx + testl $_TIF_DO_NOTIFY_MASK,%edx jz 1f movq %rsp,%rdi # &ptregs -> arg1 xorl %esi,%esi # oldset -> arg2 @@ -603,7 +603,7 @@ retint_careful: jmp retint_check retint_signal: - testl $(_TIF_SIGPENDING|_TIF_SINGLESTEP|_TIF_MCE_NOTIFY),%edx + testl $_TIF_DO_NOTIFY_MASK,%edx jz retint_swapgs TRACE_IRQS_ON sti diff --git a/arch/x86/kernel/signal_32.c b/arch/x86/kernel/signal_32.c index 9bdd83022f5f..20f29e4c1d33 100644 --- a/arch/x86/kernel/signal_32.c +++ b/arch/x86/kernel/signal_32.c @@ -658,6 +658,9 @@ void do_notify_resume(struct pt_regs *regs, void *_unused, /* deal with pending signal delivery */ if (thread_info_flags & (_TIF_SIGPENDING | _TIF_RESTORE_SIGMASK)) do_signal(regs); + + if (thread_info_flags & _TIF_HRTICK_RESCHED) + hrtick_resched(); clear_thread_flag(TIF_IRET); } 
diff --git a/arch/x86/kernel/signal_64.c b/arch/x86/kernel/signal_64.c index ab086b0357fc..38d806467c0f 100644 --- a/arch/x86/kernel/signal_64.c +++ b/arch/x86/kernel/signal_64.c @@ -480,6 +480,9 @@ do_notify_resume(struct pt_regs *regs, void *unused, __u32 thread_info_flags) /* deal with pending signal delivery */ if (thread_info_flags & (_TIF_SIGPENDING|_TIF_RESTORE_SIGMASK)) do_signal(regs); + + if (thread_info_flags & _TIF_HRTICK_RESCHED) + hrtick_resched(); } void signal_fault(struct pt_regs *regs, void __user *frame, char *where) diff --git a/include/asm-x86/thread_info_32.h b/include/asm-x86/thread_info_32.h index 22a8cbcd35e2..ef58fd2a6eb0 100644 --- a/include/asm-x86/thread_info_32.h +++ b/include/asm-x86/thread_info_32.h @@ -132,6 +132,7 @@ static inline struct thread_info *current_thread_info(void) #define TIF_SYSCALL_AUDIT 6 /* syscall auditing active */ #define TIF_SECCOMP 7 /* secure computing */ #define TIF_RESTORE_SIGMASK 8 /* restore signal mask in do_signal() */ +#define TIF_HRTICK_RESCHED 9 /* reprogram hrtick timer */ #define TIF_MEMDIE 16 #define TIF_DEBUG 17 /* uses debug registers */ #define TIF_IO_BITMAP 18 /* uses I/O bitmap */ @@ -147,6 +148,7 @@ static inline struct thread_info *current_thread_info(void) #define _TIF_SYSCALL_AUDIT (1<base->get_time(); } +static inline int hrtimer_is_hres_active(struct hrtimer *timer) +{ + return timer->base->cpu_base->hres_active; +} + /* * The resolution of the clocks. The resolution value is returned in * the clock_getres() system call to give application programmers an @@ -248,6 +253,10 @@ static inline ktime_t hrtimer_cb_get_time(struct hrtimer *timer) return timer->base->softirq_time; } +static inline int hrtimer_is_hres_active(struct hrtimer *timer) +{ + return 0; +} #endif extern ktime_t ktime_get(void); diff --git a/include/linux/sched.h b/include/linux/sched.h index 7907845c2348..43e0339d65fc 100644 --- a/include/linux/sched.h +++ b/include/linux/sched.h @@ -257,6 +257,7 @@ extern void trap_init(void); extern void account_process_tick(struct task_struct *task, int user); extern void update_process_times(int user); extern void scheduler_tick(void); +extern void hrtick_resched(void); extern void sched_show_task(struct task_struct *p); @@ -849,7 +850,7 @@ struct sched_class { #endif void (*set_curr_task) (struct rq *rq); - void (*task_tick) (struct rq *rq, struct task_struct *p); + void (*task_tick) (struct rq *rq, struct task_struct *p, int queued); void (*task_new) (struct rq *rq, struct task_struct *p); void (*set_cpus_allowed)(struct task_struct *p, cpumask_t *newmask); diff --git a/kernel/Kconfig.hz b/kernel/Kconfig.hz index 4af15802ccd4..526128a2e622 100644 --- a/kernel/Kconfig.hz +++ b/kernel/Kconfig.hz @@ -54,3 +54,5 @@ config HZ default 300 if HZ_300 default 1000 if HZ_1000 +config SCHED_HRTICK + def_bool HIGH_RES_TIMERS && X86 diff --git a/kernel/sched.c b/kernel/sched.c index 6ee37602a6d8..17f93d3eda91 100644 --- a/kernel/sched.c +++ b/kernel/sched.c @@ -65,6 +65,7 @@ #include #include #include +#include #include #include @@ -451,6 +452,12 @@ struct rq { struct list_head migration_queue; #endif +#ifdef CONFIG_SCHED_HRTICK + unsigned long hrtick_flags; + ktime_t hrtick_expire; + struct hrtimer hrtick_timer; +#endif + #ifdef CONFIG_SCHEDSTATS /* latency stats */ struct sched_info rq_sched_info; @@ -572,6 +579,8 @@ enum { SCHED_FEAT_START_DEBIT = 4, SCHED_FEAT_TREE_AVG = 8, SCHED_FEAT_APPROX_AVG = 16, + SCHED_FEAT_HRTICK = 32, + SCHED_FEAT_DOUBLE_TICK = 64, }; const_debug unsigned int sysctl_sched_features = @@ 
-579,7 +588,9 @@ const_debug unsigned int sysctl_sched_features = SCHED_FEAT_WAKEUP_PREEMPT * 1 | SCHED_FEAT_START_DEBIT * 1 | SCHED_FEAT_TREE_AVG * 0 | - SCHED_FEAT_APPROX_AVG * 0; + SCHED_FEAT_APPROX_AVG * 0 | + SCHED_FEAT_HRTICK * 1 | + SCHED_FEAT_DOUBLE_TICK * 0; #define sched_feat(x) (sysctl_sched_features & SCHED_FEAT_##x) @@ -796,6 +807,173 @@ void sched_clock_idle_wakeup_event(u64 delta_ns) } EXPORT_SYMBOL_GPL(sched_clock_idle_wakeup_event); +static void __resched_task(struct task_struct *p, int tif_bit); + +static inline void resched_task(struct task_struct *p) +{ + __resched_task(p, TIF_NEED_RESCHED); +} + +#ifdef CONFIG_SCHED_HRTICK +/* + * Use HR-timers to deliver accurate preemption points. + * + * Its all a bit involved since we cannot program an hrt while holding the + * rq->lock. So what we do is store a state in in rq->hrtick_* and ask for a + * reschedule event. + * + * When we get rescheduled we reprogram the hrtick_timer outside of the + * rq->lock. + */ +static inline void resched_hrt(struct task_struct *p) +{ + __resched_task(p, TIF_HRTICK_RESCHED); +} + +static inline void resched_rq(struct rq *rq) +{ + unsigned long flags; + + spin_lock_irqsave(&rq->lock, flags); + resched_task(rq->curr); + spin_unlock_irqrestore(&rq->lock, flags); +} + +enum { + HRTICK_SET, /* re-programm hrtick_timer */ + HRTICK_RESET, /* not a new slice */ +}; + +/* + * Use hrtick when: + * - enabled by features + * - hrtimer is actually high res + */ +static inline int hrtick_enabled(struct rq *rq) +{ + if (!sched_feat(HRTICK)) + return 0; + return hrtimer_is_hres_active(&rq->hrtick_timer); +} + +/* + * Called to set the hrtick timer state. + * + * called with rq->lock held and irqs disabled + */ +static void hrtick_start(struct rq *rq, u64 delay, int reset) +{ + assert_spin_locked(&rq->lock); + + /* + * preempt at: now + delay + */ + rq->hrtick_expire = + ktime_add_ns(rq->hrtick_timer.base->get_time(), delay); + /* + * indicate we need to program the timer + */ + __set_bit(HRTICK_SET, &rq->hrtick_flags); + if (reset) + __set_bit(HRTICK_RESET, &rq->hrtick_flags); + + /* + * New slices are called from the schedule path and don't need a + * forced reschedule. + */ + if (reset) + resched_hrt(rq->curr); +} + +static void hrtick_clear(struct rq *rq) +{ + if (hrtimer_active(&rq->hrtick_timer)) + hrtimer_cancel(&rq->hrtick_timer); +} + +/* + * Update the timer from the possible pending state. + */ +static void hrtick_set(struct rq *rq) +{ + ktime_t time; + int set, reset; + unsigned long flags; + + WARN_ON_ONCE(cpu_of(rq) != smp_processor_id()); + + spin_lock_irqsave(&rq->lock, flags); + set = __test_and_clear_bit(HRTICK_SET, &rq->hrtick_flags); + reset = __test_and_clear_bit(HRTICK_RESET, &rq->hrtick_flags); + time = rq->hrtick_expire; + clear_thread_flag(TIF_HRTICK_RESCHED); + spin_unlock_irqrestore(&rq->lock, flags); + + if (set) { + hrtimer_start(&rq->hrtick_timer, time, HRTIMER_MODE_ABS); + if (reset && !hrtimer_active(&rq->hrtick_timer)) + resched_rq(rq); + } else + hrtick_clear(rq); +} + +/* + * High-resolution timer tick. + * Runs from hardirq context with interrupts disabled. 
+ */ +static enum hrtimer_restart hrtick(struct hrtimer *timer) +{ + struct rq *rq = container_of(timer, struct rq, hrtick_timer); + + WARN_ON_ONCE(cpu_of(rq) != smp_processor_id()); + + spin_lock(&rq->lock); + __update_rq_clock(rq); + rq->curr->sched_class->task_tick(rq, rq->curr, 1); + spin_unlock(&rq->lock); + + return HRTIMER_NORESTART; +} + +static inline void init_rq_hrtick(struct rq *rq) +{ + rq->hrtick_flags = 0; + hrtimer_init(&rq->hrtick_timer, CLOCK_MONOTONIC, HRTIMER_MODE_REL); + rq->hrtick_timer.function = hrtick; + rq->hrtick_timer.cb_mode = HRTIMER_CB_IRQSAFE_NO_SOFTIRQ; +} + +void hrtick_resched(void) +{ + struct rq *rq; + unsigned long flags; + + if (!test_thread_flag(TIF_HRTICK_RESCHED)) + return; + + local_irq_save(flags); + rq = cpu_rq(smp_processor_id()); + hrtick_set(rq); + local_irq_restore(flags); +} +#else +static inline void hrtick_clear(struct rq *rq) +{ +} + +static inline void hrtick_set(struct rq *rq) +{ +} + +static inline void init_rq_hrtick(struct rq *rq) +{ +} + +void hrtick_resched(void) +{ +} +#endif + /* * resched_task - mark a task 'to be rescheduled now'. * @@ -809,16 +987,16 @@ EXPORT_SYMBOL_GPL(sched_clock_idle_wakeup_event); #define tsk_is_polling(t) test_tsk_thread_flag(t, TIF_POLLING_NRFLAG) #endif -static void resched_task(struct task_struct *p) +static void __resched_task(struct task_struct *p, int tif_bit) { int cpu; assert_spin_locked(&task_rq(p)->lock); - if (unlikely(test_tsk_thread_flag(p, TIF_NEED_RESCHED))) + if (unlikely(test_tsk_thread_flag(p, tif_bit))) return; - set_tsk_thread_flag(p, TIF_NEED_RESCHED); + set_tsk_thread_flag(p, tif_bit); cpu = task_cpu(p); if (cpu == smp_processor_id()) @@ -841,10 +1019,10 @@ static void resched_cpu(int cpu) spin_unlock_irqrestore(&rq->lock, flags); } #else -static inline void resched_task(struct task_struct *p) +static void __resched_task(struct task_struct *p, int tif_bit) { assert_spin_locked(&task_rq(p)->lock); - set_tsk_need_resched(p); + set_tsk_thread_flag(p, tif_bit); } #endif @@ -3497,7 +3675,7 @@ void scheduler_tick(void) rq->tick_timestamp = rq->clock; update_cpu_load(rq); if (curr != rq->idle) /* FIXME: needed? */ - curr->sched_class->task_tick(rq, curr); + curr->sched_class->task_tick(rq, curr, 0); spin_unlock(&rq->lock); #ifdef CONFIG_SMP @@ -3643,6 +3821,8 @@ need_resched_nonpreemptible: schedule_debug(prev); + hrtick_clear(rq); + /* * Do the rq-clock update outside the rq lock: */ @@ -3680,14 +3860,20 @@ need_resched_nonpreemptible: ++*switch_count; context_switch(rq, prev, next); /* unlocks the rq */ + /* + * the context switch might have flipped the stack from under + * us, hence refresh the local variables. 
+ */ + cpu = smp_processor_id(); + rq = cpu_rq(cpu); } else spin_unlock_irq(&rq->lock); - if (unlikely(reacquire_kernel_lock(current) < 0)) { - cpu = smp_processor_id(); - rq = cpu_rq(cpu); + hrtick_set(rq); + + if (unlikely(reacquire_kernel_lock(current) < 0)) goto need_resched_nonpreemptible; - } + preempt_enable_no_resched(); if (unlikely(test_thread_flag(TIF_NEED_RESCHED))) goto need_resched; @@ -6913,6 +7099,8 @@ void __init sched_init(void) rq->rt.overloaded = 0; rq_attach_root(rq, &def_root_domain); #endif + init_rq_hrtick(rq); + atomic_set(&rq->nr_iowait, 0); array = &rq->rt.active; diff --git a/kernel/sched_fair.c b/kernel/sched_fair.c index dfa18d55561d..3dab1ff83c4f 100644 --- a/kernel/sched_fair.c +++ b/kernel/sched_fair.c @@ -642,13 +642,29 @@ static void put_prev_entity(struct cfs_rq *cfs_rq, struct sched_entity *prev) cfs_rq->curr = NULL; } -static void entity_tick(struct cfs_rq *cfs_rq, struct sched_entity *curr) +static void +entity_tick(struct cfs_rq *cfs_rq, struct sched_entity *curr, int queued) { /* * Update run-time statistics of the 'current'. */ update_curr(cfs_rq); +#ifdef CONFIG_SCHED_HRTICK + /* + * queued ticks are scheduled to match the slice, so don't bother + * validating it and just reschedule. + */ + if (queued) + return resched_task(rq_of(cfs_rq)->curr); + /* + * don't let the period tick interfere with the hrtick preemption + */ + if (!sched_feat(DOUBLE_TICK) && + hrtimer_active(&rq_of(cfs_rq)->hrtick_timer)) + return; +#endif + if (cfs_rq->nr_running > 1 || !sched_feat(WAKEUP_PREEMPT)) check_preempt_tick(cfs_rq, curr); } @@ -754,6 +770,43 @@ static inline struct sched_entity *parent_entity(struct sched_entity *se) #endif /* CONFIG_FAIR_GROUP_SCHED */ +#ifdef CONFIG_SCHED_HRTICK +static void hrtick_start_fair(struct rq *rq, struct task_struct *p) +{ + int requeue = rq->curr == p; + struct sched_entity *se = &p->se; + struct cfs_rq *cfs_rq = cfs_rq_of(se); + + WARN_ON(task_rq(p) != rq); + + if (hrtick_enabled(rq) && cfs_rq->nr_running > 1) { + u64 slice = sched_slice(cfs_rq, se); + u64 ran = se->sum_exec_runtime - se->prev_sum_exec_runtime; + s64 delta = slice - ran; + + if (delta < 0) { + if (rq->curr == p) + resched_task(p); + return; + } + + /* + * Don't schedule slices shorter than 10000ns, that just + * doesn't make sense. Rely on vruntime for fairness. + */ + if (!requeue) + delta = max(10000LL, delta); + + hrtick_start(rq, delta, requeue); + } +} +#else +static inline void +hrtick_start_fair(struct rq *rq, struct task_struct *p) +{ +} +#endif + /* * The enqueue_task method is called before nr_running is * increased. 
Here we update the fair scheduling stats and @@ -782,6 +835,8 @@ static void enqueue_task_fair(struct rq *rq, struct task_struct *p, int wakeup) */ if (incload) inc_cpu_load(rq, topse->load.weight); + + hrtick_start_fair(rq, rq->curr); } /* @@ -814,6 +869,8 @@ static void dequeue_task_fair(struct rq *rq, struct task_struct *p, int sleep) */ if (decload) dec_cpu_load(rq, topse->load.weight); + + hrtick_start_fair(rq, rq->curr); } /* @@ -1049,6 +1106,7 @@ static void check_preempt_wakeup(struct rq *rq, struct task_struct *p) static struct task_struct *pick_next_task_fair(struct rq *rq) { + struct task_struct *p; struct cfs_rq *cfs_rq = &rq->cfs; struct sched_entity *se; @@ -1060,7 +1118,10 @@ static struct task_struct *pick_next_task_fair(struct rq *rq) cfs_rq = group_cfs_rq(se); } while (cfs_rq); - return task_of(se); + p = task_of(se); + hrtick_start_fair(rq, p); + + return p; } /* @@ -1235,14 +1296,14 @@ move_one_task_fair(struct rq *this_rq, int this_cpu, struct rq *busiest, /* * scheduler tick hitting a task of our scheduling class: */ -static void task_tick_fair(struct rq *rq, struct task_struct *curr) +static void task_tick_fair(struct rq *rq, struct task_struct *curr, int queued) { struct cfs_rq *cfs_rq; struct sched_entity *se = &curr->se; for_each_sched_entity(se) { cfs_rq = cfs_rq_of(se); - entity_tick(cfs_rq, se); + entity_tick(cfs_rq, se, queued); } } diff --git a/kernel/sched_idletask.c b/kernel/sched_idletask.c index ef7a2661fa10..2bcafa375633 100644 --- a/kernel/sched_idletask.c +++ b/kernel/sched_idletask.c @@ -61,7 +61,7 @@ move_one_task_idle(struct rq *this_rq, int this_cpu, struct rq *busiest, } #endif -static void task_tick_idle(struct rq *rq, struct task_struct *curr) +static void task_tick_idle(struct rq *rq, struct task_struct *curr, int queued) { } diff --git a/kernel/sched_rt.c b/kernel/sched_rt.c index f350f7b15158..83fbbcb8019e 100644 --- a/kernel/sched_rt.c +++ b/kernel/sched_rt.c @@ -863,7 +863,7 @@ static void watchdog(struct rq *rq, struct task_struct *p) } } -static void task_tick_rt(struct rq *rq, struct task_struct *p) +static void task_tick_rt(struct rq *rq, struct task_struct *p, int queued) { update_curr_rt(rq); -- cgit v1.2.3 From fa85ae2418e6843953107cd6a06f645752829bc0 Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Fri, 25 Jan 2008 21:08:29 +0100 Subject: sched: rt time limit Very simple time limit on the realtime scheduling classes. Allow the rq's realtime class to consume sched_rt_ratio of every sched_rt_period slice. If the class exceeds this quota the fair class will preempt the realtime class. 
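In nanoseconds the quota introduced below works out to period * ratio >> 16 per sched_rt_period. A small standalone sketch of that fixed-point arithmetic, reusing the constants from this patch (the helper name and example values are illustrative only, not kernel code):

/*
 * Budget computation used by sched_rt_ratio_exceeded(): the RT class
 * may run for (period * rt_ratio) >> SCHED_RT_FRAC_SHIFT nanoseconds
 * of every period.  Userspace illustration only.
 */
#include <stdio.h>
#include <stdint.h>

#define NSEC_PER_MSEC           1000000ULL
#define SCHED_RT_FRAC_SHIFT     16
#define SCHED_RT_FRAC           (1UL << SCHED_RT_FRAC_SHIFT)

static uint64_t rt_budget_ns(unsigned int period_ms, unsigned int rt_ratio)
{
        uint64_t period = (uint64_t)period_ms * NSEC_PER_MSEC;

        return (period * rt_ratio) >> SCHED_RT_FRAC_SHIFT;
}

int main(void)
{
        /* defaults from this patch: 1000ms period, 100% ratio */
        printf("budget = %llu ns\n",
               (unsigned long long)rt_budget_ns(1000, SCHED_RT_FRAC));
        /* 95% ratio (62259/65536), the default a later patch in this series switches to */
        printf("budget = %llu ns\n",
               (unsigned long long)rt_budget_ns(1000, 62259));
        return 0;
}

With the 100% default the throttle never triggers (sched_rt_ratio_exceeded() returns early when the ratio equals SCHED_RT_FRAC); lowering sched_rt_ratio is what actually lets the fair class preempt.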
Signed-off-by: Peter Zijlstra Signed-off-by: Ingo Molnar --- include/linux/sched.h | 2 ++ kernel/sched.c | 70 ++++++++++++++++++++++++++++++++++++--------------- kernel/sched_rt.c | 53 ++++++++++++++++++++++++++++++++++++++ kernel/sysctl.c | 18 ++++++++++++- 4 files changed, 122 insertions(+), 21 deletions(-) (limited to 'kernel') diff --git a/include/linux/sched.h b/include/linux/sched.h index 43e0339d65fc..d5ea144df836 100644 --- a/include/linux/sched.h +++ b/include/linux/sched.h @@ -1490,6 +1490,8 @@ extern unsigned int sysctl_sched_child_runs_first; extern unsigned int sysctl_sched_features; extern unsigned int sysctl_sched_migration_cost; extern unsigned int sysctl_sched_nr_migrate; +extern unsigned int sysctl_sched_rt_period; +extern unsigned int sysctl_sched_rt_ratio; #if defined(CONFIG_FAIR_GROUP_SCHED) && defined(CONFIG_SMP) extern unsigned int sysctl_sched_min_bal_int_shares; extern unsigned int sysctl_sched_max_bal_int_shares; diff --git a/kernel/sched.c b/kernel/sched.c index 17f93d3eda91..e9a7beee9b79 100644 --- a/kernel/sched.c +++ b/kernel/sched.c @@ -342,13 +342,14 @@ struct cfs_rq { /* Real-Time classes' related field in a runqueue: */ struct rt_rq { struct rt_prio_array active; - int rt_load_balance_idx; - struct list_head *rt_load_balance_head, *rt_load_balance_curr; unsigned long rt_nr_running; +#ifdef CONFIG_SMP unsigned long rt_nr_migratory; - /* highest queued rt task prio */ - int highest_prio; + int highest_prio; /* highest queued rt task prio */ int overloaded; +#endif + u64 rt_time; + u64 rt_throttled; }; #ifdef CONFIG_SMP @@ -415,6 +416,7 @@ struct rq { struct list_head leaf_cfs_rq_list; #endif struct rt_rq rt; + u64 rt_period_expire; /* * This is part of a global counter where only the total sum @@ -600,6 +602,21 @@ const_debug unsigned int sysctl_sched_features = */ const_debug unsigned int sysctl_sched_nr_migrate = 32; +/* + * period over which we measure -rt task cpu usage in ms. + * default: 1s + */ +const_debug unsigned int sysctl_sched_rt_period = 1000; + +#define SCHED_RT_FRAC_SHIFT 16 +#define SCHED_RT_FRAC (1UL << SCHED_RT_FRAC_SHIFT) + +/* + * ratio of time -rt tasks may consume. + * default: 100% + */ +const_debug unsigned int sysctl_sched_rt_ratio = SCHED_RT_FRAC; + /* * For kernel-internal use: high-speed (but slightly incorrect) per-cpu * clock constructed from sched_clock(): @@ -3674,8 +3691,8 @@ void scheduler_tick(void) rq->clock = next_tick; rq->tick_timestamp = rq->clock; update_cpu_load(rq); - if (curr != rq->idle) /* FIXME: needed? 
*/ - curr->sched_class->task_tick(rq, curr, 0); + curr->sched_class->task_tick(rq, curr, 0); + update_sched_rt_period(rq); spin_unlock(&rq->lock); #ifdef CONFIG_SMP @@ -7041,6 +7058,29 @@ static void init_cfs_rq(struct cfs_rq *cfs_rq, struct rq *rq) cfs_rq->min_vruntime = (u64)(-(1LL << 20)); } +static void init_rt_rq(struct rt_rq *rt_rq, struct rq *rq) +{ + struct rt_prio_array *array; + int i; + + array = &rt_rq->active; + for (i = 0; i < MAX_RT_PRIO; i++) { + INIT_LIST_HEAD(array->queue + i); + __clear_bit(i, array->bitmap); + } + /* delimiter for bitsearch: */ + __set_bit(MAX_RT_PRIO, array->bitmap); + +#ifdef CONFIG_SMP + rt_rq->rt_nr_migratory = 0; + rt_rq->highest_prio = MAX_RT_PRIO; + rt_rq->overloaded = 0; +#endif + + rt_rq->rt_time = 0; + rt_rq->rt_throttled = 0; +} + void __init sched_init(void) { int highest_cpu = 0; @@ -7051,7 +7091,6 @@ void __init sched_init(void) #endif for_each_possible_cpu(i) { - struct rt_prio_array *array; struct rq *rq; rq = cpu_rq(i); @@ -7083,6 +7122,8 @@ void __init sched_init(void) } init_task_group.shares = init_task_group_load; #endif + init_rt_rq(&rq->rt, rq); + rq->rt_period_expire = 0; for (j = 0; j < CPU_LOAD_IDX_MAX; j++) rq->cpu_load[j] = 0; @@ -7095,22 +7136,11 @@ void __init sched_init(void) rq->cpu = i; rq->migration_thread = NULL; INIT_LIST_HEAD(&rq->migration_queue); - rq->rt.highest_prio = MAX_RT_PRIO; - rq->rt.overloaded = 0; rq_attach_root(rq, &def_root_domain); #endif init_rq_hrtick(rq); - atomic_set(&rq->nr_iowait, 0); - - array = &rq->rt.active; - for (j = 0; j < MAX_RT_PRIO; j++) { - INIT_LIST_HEAD(array->queue + j); - __clear_bit(j, array->bitmap); - } highest_cpu = i; - /* delimiter for bitsearch: */ - __set_bit(MAX_RT_PRIO, array->bitmap); } set_load_weight(&init_task); @@ -7282,7 +7312,7 @@ void set_curr_task(int cpu, struct task_struct *p) #ifdef CONFIG_SMP /* * distribute shares of all task groups among their schedulable entities, - * to reflect load distrbution across cpus. + * to reflect load distribution across cpus. */ static int rebalance_shares(struct sched_domain *sd, int this_cpu) { @@ -7349,7 +7379,7 @@ static int rebalance_shares(struct sched_domain *sd, int this_cpu) * sysctl_sched_max_bal_int_shares represents the maximum interval between * consecutive calls to rebalance_shares() in the same sched domain. * - * These settings allows for the appropriate tradeoff between accuracy of + * These settings allows for the appropriate trade-off between accuracy of * fairness and the associated overhead. 
* */ diff --git a/kernel/sched_rt.c b/kernel/sched_rt.c index 83fbbcb8019e..fd10d965aa06 100644 --- a/kernel/sched_rt.c +++ b/kernel/sched_rt.c @@ -45,6 +45,50 @@ static void update_rt_migration(struct rq *rq) } #endif /* CONFIG_SMP */ +static int sched_rt_ratio_exceeded(struct rq *rq, struct rt_rq *rt_rq) +{ + u64 period, ratio; + + if (sysctl_sched_rt_ratio == SCHED_RT_FRAC) + return 0; + + if (rt_rq->rt_throttled) + return 1; + + period = (u64)sysctl_sched_rt_period * NSEC_PER_MSEC; + ratio = (period * sysctl_sched_rt_ratio) >> SCHED_RT_FRAC_SHIFT; + + if (rt_rq->rt_time > ratio) { + rt_rq->rt_throttled = rq->clock + period - rt_rq->rt_time; + return 1; + } + + return 0; +} + +static void update_sched_rt_period(struct rq *rq) +{ + while (rq->clock > rq->rt_period_expire) { + u64 period, ratio; + + period = (u64)sysctl_sched_rt_period * NSEC_PER_MSEC; + ratio = (period * sysctl_sched_rt_ratio) >> SCHED_RT_FRAC_SHIFT; + + rq->rt.rt_time -= min(rq->rt.rt_time, ratio); + rq->rt_period_expire += period; + } + + /* + * When the rt throttle is expired, let them rip. + * (XXX: use hrtick when available) + */ + if (rq->rt.rt_throttled && rq->clock > rq->rt.rt_throttled) { + rq->rt.rt_throttled = 0; + if (!sched_rt_ratio_exceeded(rq, &rq->rt)) + resched_task(rq->curr); + } +} + /* * Update the current task's runtime statistics. Skip current tasks that * are not in our scheduling class. @@ -66,6 +110,11 @@ static void update_curr_rt(struct rq *rq) curr->se.sum_exec_runtime += delta_exec; curr->se.exec_start = rq->clock; cpuacct_charge(curr, delta_exec); + + rq->rt.rt_time += delta_exec; + update_sched_rt_period(rq); + if (sched_rt_ratio_exceeded(rq, &rq->rt)) + resched_task(curr); } static inline void inc_rt_tasks(struct task_struct *p, struct rq *rq) @@ -208,8 +257,12 @@ static struct task_struct *pick_next_task_rt(struct rq *rq) struct rt_prio_array *array = &rq->rt.active; struct task_struct *next; struct list_head *queue; + struct rt_rq *rt_rq = &rq->rt; int idx; + if (sched_rt_ratio_exceeded(rq, rt_rq)) + return NULL; + idx = sched_find_first_bit(array->bitmap); if (idx >= MAX_RT_PRIO) return NULL; diff --git a/kernel/sysctl.c b/kernel/sysctl.c index 96f31c1bc4f0..3afbd25f43eb 100644 --- a/kernel/sysctl.c +++ b/kernel/sysctl.c @@ -306,7 +306,23 @@ static struct ctl_table kern_table[] = { .procname = "sched_nr_migrate", .data = &sysctl_sched_nr_migrate, .maxlen = sizeof(unsigned int), - .mode = 644, + .mode = 0644, + .proc_handler = &proc_dointvec, + }, + { + .ctl_name = CTL_UNNUMBERED, + .procname = "sched_rt_period_ms", + .data = &sysctl_sched_rt_period, + .maxlen = sizeof(unsigned int), + .mode = 0644, + .proc_handler = &proc_dointvec, + }, + { + .ctl_name = CTL_UNNUMBERED, + .procname = "sched_rt_ratio", + .data = &sysctl_sched_rt_ratio, + .maxlen = sizeof(unsigned int), + .mode = 0644, .proc_handler = &proc_dointvec, }, #if defined(CONFIG_FAIR_GROUP_SCHED) && defined(CONFIG_SMP) -- cgit v1.2.3 From 6f505b16425a51270058e4a93441fe64de3dd435 Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Fri, 25 Jan 2008 21:08:30 +0100 Subject: sched: rt group scheduling Extend group scheduling to also cover the realtime classes. It uses the time limiting introduced by the previous patch to allow multiple realtime groups. The hard time limit is required to keep behaviour deterministic. The algorithms used make the realtime scheduler O(tg), linear scaling wrt the number of task groups. This is the worst case behaviour I can't seem to get out of, the avg. 
case of the algorithms can be improved, I focused on correctness and worst case. [ akpm@linux-foundation.org: move side-effects out of BUG_ON(). ] Signed-off-by: Peter Zijlstra Signed-off-by: Ingo Molnar --- include/linux/init_task.h | 5 +- include/linux/sched.h | 10 +- kernel/fork.c | 2 +- kernel/sched.c | 283 +++++++++++++++++++--------- kernel/sched_rt.c | 455 ++++++++++++++++++++++++++++++++++------------ 5 files changed, 549 insertions(+), 206 deletions(-) (limited to 'kernel') diff --git a/include/linux/init_task.h b/include/linux/init_task.h index ee65d87bedb7..796019b22b6f 100644 --- a/include/linux/init_task.h +++ b/include/linux/init_task.h @@ -130,12 +130,13 @@ extern struct group_info init_groups; .normal_prio = MAX_PRIO-20, \ .policy = SCHED_NORMAL, \ .cpus_allowed = CPU_MASK_ALL, \ - .nr_cpus_allowed = NR_CPUS, \ .mm = NULL, \ .active_mm = &init_mm, \ .rt = { \ .run_list = LIST_HEAD_INIT(tsk.rt.run_list), \ - .time_slice = HZ, }, \ + .time_slice = HZ, \ + .nr_cpus_allowed = NR_CPUS, \ + }, \ .ioprio = 0, \ .tasks = LIST_HEAD_INIT(tsk.tasks), \ .ptrace_children= LIST_HEAD_INIT(tsk.ptrace_children), \ diff --git a/include/linux/sched.h b/include/linux/sched.h index d5ea144df836..04eecbf0241e 100644 --- a/include/linux/sched.h +++ b/include/linux/sched.h @@ -934,6 +934,15 @@ struct sched_rt_entity { struct list_head run_list; unsigned int time_slice; unsigned long timeout; + int nr_cpus_allowed; + +#ifdef CONFIG_FAIR_GROUP_SCHED + struct sched_rt_entity *parent; + /* rq on which this entity is (to be) queued: */ + struct rt_rq *rt_rq; + /* rq "owned" by this entity/group: */ + struct rt_rq *my_q; +#endif }; struct task_struct { @@ -978,7 +987,6 @@ struct task_struct { unsigned int policy; cpumask_t cpus_allowed; - int nr_cpus_allowed; #ifdef CONFIG_PREEMPT_RCU int rcu_read_lock_nesting; diff --git a/kernel/fork.c b/kernel/fork.c index 9f8ef32cbc7a..0c969f4fade0 100644 --- a/kernel/fork.c +++ b/kernel/fork.c @@ -1246,7 +1246,7 @@ static struct task_struct *copy_process(unsigned long clone_flags, * parent's CPU). This avoids alot of nasty races. */ p->cpus_allowed = current->cpus_allowed; - p->nr_cpus_allowed = current->nr_cpus_allowed; + p->rt.nr_cpus_allowed = current->rt.nr_cpus_allowed; if (unlikely(!cpu_isset(task_cpu(p), p->cpus_allowed) || !cpu_online(task_cpu(p)))) set_task_cpu(p, smp_processor_id()); diff --git a/kernel/sched.c b/kernel/sched.c index e9a7beee9b79..5ea2c533b432 100644 --- a/kernel/sched.c +++ b/kernel/sched.c @@ -161,6 +161,8 @@ struct rt_prio_array { struct cfs_rq; +static LIST_HEAD(task_groups); + /* task group related information */ struct task_group { #ifdef CONFIG_FAIR_CGROUP_SCHED @@ -171,6 +173,11 @@ struct task_group { /* runqueue "owned" by this group on each cpu */ struct cfs_rq **cfs_rq; + struct sched_rt_entity **rt_se; + struct rt_rq **rt_rq; + + unsigned int rt_ratio; + /* * shares assigned to a task group governs how much of cpu bandwidth * is allocated to the group. 
The more shares a group has, the more is @@ -208,6 +215,7 @@ struct task_group { unsigned long shares; struct rcu_head rcu; + struct list_head list; }; /* Default task group's sched entity on each cpu */ @@ -215,9 +223,15 @@ static DEFINE_PER_CPU(struct sched_entity, init_sched_entity); /* Default task group's cfs_rq on each cpu */ static DEFINE_PER_CPU(struct cfs_rq, init_cfs_rq) ____cacheline_aligned_in_smp; +static DEFINE_PER_CPU(struct sched_rt_entity, init_sched_rt_entity); +static DEFINE_PER_CPU(struct rt_rq, init_rt_rq) ____cacheline_aligned_in_smp; + static struct sched_entity *init_sched_entity_p[NR_CPUS]; static struct cfs_rq *init_cfs_rq_p[NR_CPUS]; +static struct sched_rt_entity *init_sched_rt_entity_p[NR_CPUS]; +static struct rt_rq *init_rt_rq_p[NR_CPUS]; + /* task_group_mutex serializes add/remove of task groups and also changes to * a task group's cpu shares. */ @@ -240,6 +254,9 @@ static void set_se_shares(struct sched_entity *se, unsigned long shares); struct task_group init_task_group = { .se = init_sched_entity_p, .cfs_rq = init_cfs_rq_p, + + .rt_se = init_sched_rt_entity_p, + .rt_rq = init_rt_rq_p, }; #ifdef CONFIG_FAIR_USER_SCHED @@ -269,10 +286,13 @@ static inline struct task_group *task_group(struct task_struct *p) } /* Change a task's cfs_rq and parent entity if it moves across CPUs/groups */ -static inline void set_task_cfs_rq(struct task_struct *p, unsigned int cpu) +static inline void set_task_rq(struct task_struct *p, unsigned int cpu) { p->se.cfs_rq = task_group(p)->cfs_rq[cpu]; p->se.parent = task_group(p)->se[cpu]; + + p->rt.rt_rq = task_group(p)->rt_rq[cpu]; + p->rt.parent = task_group(p)->rt_se[cpu]; } static inline void lock_task_group_list(void) @@ -297,7 +317,7 @@ static inline void unlock_doms_cur(void) #else -static inline void set_task_cfs_rq(struct task_struct *p, unsigned int cpu) { } +static inline void set_task_rq(struct task_struct *p, unsigned int cpu) { } static inline void lock_task_group_list(void) { } static inline void unlock_task_group_list(void) { } static inline void lock_doms_cur(void) { } @@ -343,13 +363,22 @@ struct cfs_rq { struct rt_rq { struct rt_prio_array active; unsigned long rt_nr_running; +#if defined CONFIG_SMP || defined CONFIG_FAIR_GROUP_SCHED + int highest_prio; /* highest queued rt task prio */ +#endif #ifdef CONFIG_SMP unsigned long rt_nr_migratory; - int highest_prio; /* highest queued rt task prio */ int overloaded; #endif + int rt_throttled; u64 rt_time; - u64 rt_throttled; + +#ifdef CONFIG_FAIR_GROUP_SCHED + struct rq *rq; + struct list_head leaf_rt_rq_list; + struct task_group *tg; + struct sched_rt_entity *rt_se; +#endif }; #ifdef CONFIG_SMP @@ -411,12 +440,14 @@ struct rq { u64 nr_switches; struct cfs_rq cfs; + struct rt_rq rt; + u64 rt_period_expire; + #ifdef CONFIG_FAIR_GROUP_SCHED /* list of leaf cfs_rq on this cpu: */ struct list_head leaf_cfs_rq_list; + struct list_head leaf_rt_rq_list; #endif - struct rt_rq rt; - u64 rt_period_expire; /* * This is part of a global counter where only the total sum @@ -613,9 +644,9 @@ const_debug unsigned int sysctl_sched_rt_period = 1000; /* * ratio of time -rt tasks may consume. 
- * default: 100% + * default: 95% */ -const_debug unsigned int sysctl_sched_rt_ratio = SCHED_RT_FRAC; +const_debug unsigned int sysctl_sched_rt_ratio = 62259; /* * For kernel-internal use: high-speed (but slightly incorrect) per-cpu @@ -1337,7 +1368,7 @@ unsigned long weighted_cpuload(const int cpu) static inline void __set_task_cpu(struct task_struct *p, unsigned int cpu) { - set_task_cfs_rq(p, cpu); + set_task_rq(p, cpu); #ifdef CONFIG_SMP /* * After ->cpu is set up to a new value, task_rq_lock(p, ...) can be @@ -5281,7 +5312,7 @@ int set_cpus_allowed(struct task_struct *p, cpumask_t new_mask) p->sched_class->set_cpus_allowed(p, &new_mask); else { p->cpus_allowed = new_mask; - p->nr_cpus_allowed = cpus_weight(new_mask); + p->rt.nr_cpus_allowed = cpus_weight(new_mask); } /* Can the task run on the task's current CPU? If so, we're done */ @@ -7079,8 +7110,50 @@ static void init_rt_rq(struct rt_rq *rt_rq, struct rq *rq) rt_rq->rt_time = 0; rt_rq->rt_throttled = 0; + +#ifdef CONFIG_FAIR_GROUP_SCHED + rt_rq->rq = rq; +#endif } +#ifdef CONFIG_FAIR_GROUP_SCHED +static void init_tg_cfs_entry(struct rq *rq, struct task_group *tg, + struct cfs_rq *cfs_rq, struct sched_entity *se, + int cpu, int add) +{ + tg->cfs_rq[cpu] = cfs_rq; + init_cfs_rq(cfs_rq, rq); + cfs_rq->tg = tg; + if (add) + list_add(&cfs_rq->leaf_cfs_rq_list, &rq->leaf_cfs_rq_list); + + tg->se[cpu] = se; + se->cfs_rq = &rq->cfs; + se->my_q = cfs_rq; + se->load.weight = tg->shares; + se->load.inv_weight = div64_64(1ULL<<32, se->load.weight); + se->parent = NULL; +} + +static void init_tg_rt_entry(struct rq *rq, struct task_group *tg, + struct rt_rq *rt_rq, struct sched_rt_entity *rt_se, + int cpu, int add) +{ + tg->rt_rq[cpu] = rt_rq; + init_rt_rq(rt_rq, rq); + rt_rq->tg = tg; + rt_rq->rt_se = rt_se; + if (add) + list_add(&rt_rq->leaf_rt_rq_list, &rq->leaf_rt_rq_list); + + tg->rt_se[cpu] = rt_se; + rt_se->rt_rq = &rq->rt; + rt_se->my_q = rt_rq; + rt_se->parent = NULL; + INIT_LIST_HEAD(&rt_se->run_list); +} +#endif + void __init sched_init(void) { int highest_cpu = 0; @@ -7090,6 +7163,10 @@ void __init sched_init(void) init_defrootdomain(); #endif +#ifdef CONFIG_FAIR_GROUP_SCHED + list_add(&init_task_group.list, &task_groups); +#endif + for_each_possible_cpu(i) { struct rq *rq; @@ -7099,30 +7176,20 @@ void __init sched_init(void) rq->nr_running = 0; rq->clock = 1; init_cfs_rq(&rq->cfs, rq); + init_rt_rq(&rq->rt, rq); #ifdef CONFIG_FAIR_GROUP_SCHED - INIT_LIST_HEAD(&rq->leaf_cfs_rq_list); - { - struct cfs_rq *cfs_rq = &per_cpu(init_cfs_rq, i); - struct sched_entity *se = - &per_cpu(init_sched_entity, i); - - init_cfs_rq_p[i] = cfs_rq; - init_cfs_rq(cfs_rq, rq); - cfs_rq->tg = &init_task_group; - list_add(&cfs_rq->leaf_cfs_rq_list, - &rq->leaf_cfs_rq_list); - - init_sched_entity_p[i] = se; - se->cfs_rq = &rq->cfs; - se->my_q = cfs_rq; - se->load.weight = init_task_group_load; - se->load.inv_weight = - div64_64(1ULL<<32, init_task_group_load); - se->parent = NULL; - } init_task_group.shares = init_task_group_load; + INIT_LIST_HEAD(&rq->leaf_cfs_rq_list); + init_tg_cfs_entry(rq, &init_task_group, + &per_cpu(init_cfs_rq, i), + &per_cpu(init_sched_entity, i), i, 1); + + init_task_group.rt_ratio = sysctl_sched_rt_ratio; /* XXX */ + INIT_LIST_HEAD(&rq->leaf_rt_rq_list); + init_tg_rt_entry(rq, &init_task_group, + &per_cpu(init_rt_rq, i), + &per_cpu(init_sched_rt_entity, i), i, 1); #endif - init_rt_rq(&rq->rt, rq); rq->rt_period_expire = 0; for (j = 0; j < CPU_LOAD_IDX_MAX; j++) @@ -7460,12 +7527,36 @@ static int load_balance_monitor(void 
*unused) } #endif /* CONFIG_SMP */ +static void free_sched_group(struct task_group *tg) +{ + int i; + + for_each_possible_cpu(i) { + if (tg->cfs_rq) + kfree(tg->cfs_rq[i]); + if (tg->se) + kfree(tg->se[i]); + if (tg->rt_rq) + kfree(tg->rt_rq[i]); + if (tg->rt_se) + kfree(tg->rt_se[i]); + } + + kfree(tg->cfs_rq); + kfree(tg->se); + kfree(tg->rt_rq); + kfree(tg->rt_se); + kfree(tg); +} + /* allocate runqueue etc for a new task group */ struct task_group *sched_create_group(void) { struct task_group *tg; struct cfs_rq *cfs_rq; struct sched_entity *se; + struct rt_rq *rt_rq; + struct sched_rt_entity *rt_se; struct rq *rq; int i; @@ -7479,100 +7570,89 @@ struct task_group *sched_create_group(void) tg->se = kzalloc(sizeof(se) * NR_CPUS, GFP_KERNEL); if (!tg->se) goto err; + tg->rt_rq = kzalloc(sizeof(rt_rq) * NR_CPUS, GFP_KERNEL); + if (!tg->rt_rq) + goto err; + tg->rt_se = kzalloc(sizeof(rt_se) * NR_CPUS, GFP_KERNEL); + if (!tg->rt_se) + goto err; + + tg->shares = NICE_0_LOAD; + tg->rt_ratio = 0; /* XXX */ for_each_possible_cpu(i) { rq = cpu_rq(i); - cfs_rq = kmalloc_node(sizeof(struct cfs_rq), GFP_KERNEL, - cpu_to_node(i)); + cfs_rq = kmalloc_node(sizeof(struct cfs_rq), + GFP_KERNEL|__GFP_ZERO, cpu_to_node(i)); if (!cfs_rq) goto err; - se = kmalloc_node(sizeof(struct sched_entity), GFP_KERNEL, - cpu_to_node(i)); + se = kmalloc_node(sizeof(struct sched_entity), + GFP_KERNEL|__GFP_ZERO, cpu_to_node(i)); if (!se) goto err; - memset(cfs_rq, 0, sizeof(struct cfs_rq)); - memset(se, 0, sizeof(struct sched_entity)); + rt_rq = kmalloc_node(sizeof(struct rt_rq), + GFP_KERNEL|__GFP_ZERO, cpu_to_node(i)); + if (!rt_rq) + goto err; - tg->cfs_rq[i] = cfs_rq; - init_cfs_rq(cfs_rq, rq); - cfs_rq->tg = tg; + rt_se = kmalloc_node(sizeof(struct sched_rt_entity), + GFP_KERNEL|__GFP_ZERO, cpu_to_node(i)); + if (!rt_se) + goto err; - tg->se[i] = se; - se->cfs_rq = &rq->cfs; - se->my_q = cfs_rq; - se->load.weight = NICE_0_LOAD; - se->load.inv_weight = div64_64(1ULL<<32, NICE_0_LOAD); - se->parent = NULL; + init_tg_cfs_entry(rq, tg, cfs_rq, se, i, 0); + init_tg_rt_entry(rq, tg, rt_rq, rt_se, i, 0); } - tg->shares = NICE_0_LOAD; - lock_task_group_list(); for_each_possible_cpu(i) { rq = cpu_rq(i); cfs_rq = tg->cfs_rq[i]; list_add_rcu(&cfs_rq->leaf_cfs_rq_list, &rq->leaf_cfs_rq_list); + rt_rq = tg->rt_rq[i]; + list_add_rcu(&rt_rq->leaf_rt_rq_list, &rq->leaf_rt_rq_list); } + list_add_rcu(&tg->list, &task_groups); unlock_task_group_list(); return tg; err: - for_each_possible_cpu(i) { - if (tg->cfs_rq) - kfree(tg->cfs_rq[i]); - if (tg->se) - kfree(tg->se[i]); - } - kfree(tg->cfs_rq); - kfree(tg->se); - kfree(tg); - + free_sched_group(tg); return ERR_PTR(-ENOMEM); } /* rcu callback to free various structures associated with a task group */ -static void free_sched_group(struct rcu_head *rhp) +static void free_sched_group_rcu(struct rcu_head *rhp) { - struct task_group *tg = container_of(rhp, struct task_group, rcu); - struct cfs_rq *cfs_rq; - struct sched_entity *se; - int i; - /* now it should be safe to free those cfs_rqs */ - for_each_possible_cpu(i) { - cfs_rq = tg->cfs_rq[i]; - kfree(cfs_rq); - - se = tg->se[i]; - kfree(se); - } - - kfree(tg->cfs_rq); - kfree(tg->se); - kfree(tg); + free_sched_group(container_of(rhp, struct task_group, rcu)); } /* Destroy runqueue etc associated with a task group */ void sched_destroy_group(struct task_group *tg) { struct cfs_rq *cfs_rq = NULL; + struct rt_rq *rt_rq = NULL; int i; lock_task_group_list(); for_each_possible_cpu(i) { cfs_rq = tg->cfs_rq[i]; 
list_del_rcu(&cfs_rq->leaf_cfs_rq_list); + rt_rq = tg->rt_rq[i]; + list_del_rcu(&rt_rq->leaf_rt_rq_list); } + list_del_rcu(&tg->list); unlock_task_group_list(); BUG_ON(!cfs_rq); /* wait for possible concurrent references to cfs_rqs complete */ - call_rcu(&tg->rcu, free_sched_group); + call_rcu(&tg->rcu, free_sched_group_rcu); } /* change task's runqueue when it moves between groups. @@ -7588,11 +7668,6 @@ void sched_move_task(struct task_struct *tsk) rq = task_rq_lock(tsk, &flags); - if (tsk->sched_class != &fair_sched_class) { - set_task_cfs_rq(tsk, task_cpu(tsk)); - goto done; - } - update_rq_clock(rq); running = task_current(rq, tsk); @@ -7604,7 +7679,7 @@ void sched_move_task(struct task_struct *tsk) tsk->sched_class->put_prev_task(rq, tsk); } - set_task_cfs_rq(tsk, task_cpu(tsk)); + set_task_rq(tsk, task_cpu(tsk)); if (on_rq) { if (unlikely(running)) @@ -7612,7 +7687,6 @@ void sched_move_task(struct task_struct *tsk) enqueue_task(rq, tsk, 0); } -done: task_rq_unlock(rq, &flags); } @@ -7697,6 +7771,31 @@ unsigned long sched_group_shares(struct task_group *tg) return tg->shares; } +/* + * Ensure the total rt_ratio <= sysctl_sched_rt_ratio + */ +int sched_group_set_rt_ratio(struct task_group *tg, unsigned long rt_ratio) +{ + struct task_group *tgi; + unsigned long total = 0; + + rcu_read_lock(); + list_for_each_entry_rcu(tgi, &task_groups, list) + total += tgi->rt_ratio; + rcu_read_unlock(); + + if (total + rt_ratio - tg->rt_ratio > sysctl_sched_rt_ratio) + return -EINVAL; + + tg->rt_ratio = rt_ratio; + return 0; +} + +unsigned long sched_group_rt_ratio(struct task_group *tg) +{ + return tg->rt_ratio; +} + #endif /* CONFIG_FAIR_GROUP_SCHED */ #ifdef CONFIG_FAIR_CGROUP_SCHED @@ -7772,12 +7871,30 @@ static u64 cpu_shares_read_uint(struct cgroup *cgrp, struct cftype *cft) return (u64) tg->shares; } +static int cpu_rt_ratio_write_uint(struct cgroup *cgrp, struct cftype *cftype, + u64 rt_ratio_val) +{ + return sched_group_set_rt_ratio(cgroup_tg(cgrp), rt_ratio_val); +} + +static u64 cpu_rt_ratio_read_uint(struct cgroup *cgrp, struct cftype *cft) +{ + struct task_group *tg = cgroup_tg(cgrp); + + return (u64) tg->rt_ratio; +} + static struct cftype cpu_files[] = { { .name = "shares", .read_uint = cpu_shares_read_uint, .write_uint = cpu_shares_write_uint, }, + { + .name = "rt_ratio", + .read_uint = cpu_rt_ratio_read_uint, + .write_uint = cpu_rt_ratio_write_uint, + }, }; static int cpu_cgroup_populate(struct cgroup_subsys *ss, struct cgroup *cont) diff --git a/kernel/sched_rt.c b/kernel/sched_rt.c index fd10d965aa06..1178257613ad 100644 --- a/kernel/sched_rt.c +++ b/kernel/sched_rt.c @@ -45,47 +45,167 @@ static void update_rt_migration(struct rq *rq) } #endif /* CONFIG_SMP */ -static int sched_rt_ratio_exceeded(struct rq *rq, struct rt_rq *rt_rq) +static inline struct task_struct *rt_task_of(struct sched_rt_entity *rt_se) { + return container_of(rt_se, struct task_struct, rt); +} + +static inline int on_rt_rq(struct sched_rt_entity *rt_se) +{ + return !list_empty(&rt_se->run_list); +} + +#ifdef CONFIG_FAIR_GROUP_SCHED + +static inline unsigned int sched_rt_ratio(struct rt_rq *rt_rq) +{ + if (!rt_rq->tg) + return SCHED_RT_FRAC; + + return rt_rq->tg->rt_ratio; +} + +#define for_each_leaf_rt_rq(rt_rq, rq) \ + list_for_each_entry(rt_rq, &rq->leaf_rt_rq_list, leaf_rt_rq_list) + +static inline struct rq *rq_of_rt_rq(struct rt_rq *rt_rq) +{ + return rt_rq->rq; +} + +static inline struct rt_rq *rt_rq_of_se(struct sched_rt_entity *rt_se) +{ + return rt_se->rt_rq; +} + +#define 
for_each_sched_rt_entity(rt_se) \ + for (; rt_se; rt_se = rt_se->parent) + +static inline struct rt_rq *group_rt_rq(struct sched_rt_entity *rt_se) +{ + return rt_se->my_q; +} + +static void enqueue_rt_entity(struct sched_rt_entity *rt_se); +static void dequeue_rt_entity(struct sched_rt_entity *rt_se); + +static void sched_rt_ratio_enqueue(struct rt_rq *rt_rq) +{ + struct sched_rt_entity *rt_se = rt_rq->rt_se; + + if (rt_se && !on_rt_rq(rt_se) && rt_rq->rt_nr_running) { + enqueue_rt_entity(rt_se); + resched_task(rq_of_rt_rq(rt_rq)->curr); + } +} + +static void sched_rt_ratio_dequeue(struct rt_rq *rt_rq) +{ + struct sched_rt_entity *rt_se = rt_rq->rt_se; + + if (rt_se && on_rt_rq(rt_se)) + dequeue_rt_entity(rt_se); +} + +#else + +static inline unsigned int sched_rt_ratio(struct rt_rq *rt_rq) +{ + return sysctl_sched_rt_ratio; +} + +#define for_each_leaf_rt_rq(rt_rq, rq) \ + for (rt_rq = &rq->rt; rt_rq; rt_rq = NULL) + +static inline struct rq *rq_of_rt_rq(struct rt_rq *rt_rq) +{ + return container_of(rt_rq, struct rq, rt); +} + +static inline struct rt_rq *rt_rq_of_se(struct sched_rt_entity *rt_se) +{ + struct task_struct *p = rt_task_of(rt_se); + struct rq *rq = task_rq(p); + + return &rq->rt; +} + +#define for_each_sched_rt_entity(rt_se) \ + for (; rt_se; rt_se = NULL) + +static inline struct rt_rq *group_rt_rq(struct sched_rt_entity *rt_se) +{ + return NULL; +} + +static inline void sched_rt_ratio_enqueue(struct rt_rq *rt_rq) +{ +} + +static inline void sched_rt_ratio_dequeue(struct rt_rq *rt_rq) +{ +} + +#endif + +static inline int rt_se_prio(struct sched_rt_entity *rt_se) +{ +#ifdef CONFIG_FAIR_GROUP_SCHED + struct rt_rq *rt_rq = group_rt_rq(rt_se); + + if (rt_rq) + return rt_rq->highest_prio; +#endif + + return rt_task_of(rt_se)->prio; +} + +static int sched_rt_ratio_exceeded(struct rt_rq *rt_rq) +{ + unsigned int rt_ratio = sched_rt_ratio(rt_rq); u64 period, ratio; - if (sysctl_sched_rt_ratio == SCHED_RT_FRAC) + if (rt_ratio == SCHED_RT_FRAC) return 0; if (rt_rq->rt_throttled) return 1; period = (u64)sysctl_sched_rt_period * NSEC_PER_MSEC; - ratio = (period * sysctl_sched_rt_ratio) >> SCHED_RT_FRAC_SHIFT; + ratio = (period * rt_ratio) >> SCHED_RT_FRAC_SHIFT; if (rt_rq->rt_time > ratio) { - rt_rq->rt_throttled = rq->clock + period - rt_rq->rt_time; + rt_rq->rt_throttled = 1; + sched_rt_ratio_dequeue(rt_rq); return 1; } return 0; } +static void __update_sched_rt_period(struct rt_rq *rt_rq, u64 period) +{ + unsigned long rt_ratio = sched_rt_ratio(rt_rq); + u64 ratio = (period * rt_ratio) >> SCHED_RT_FRAC_SHIFT; + + rt_rq->rt_time -= min(rt_rq->rt_time, ratio); + if (rt_rq->rt_throttled) { + rt_rq->rt_throttled = 0; + sched_rt_ratio_enqueue(rt_rq); + } +} + static void update_sched_rt_period(struct rq *rq) { - while (rq->clock > rq->rt_period_expire) { - u64 period, ratio; + struct rt_rq *rt_rq; + u64 period; + while (rq->clock > rq->rt_period_expire) { period = (u64)sysctl_sched_rt_period * NSEC_PER_MSEC; - ratio = (period * sysctl_sched_rt_ratio) >> SCHED_RT_FRAC_SHIFT; - - rq->rt.rt_time -= min(rq->rt.rt_time, ratio); rq->rt_period_expire += period; - } - /* - * When the rt throttle is expired, let them rip. 
- * (XXX: use hrtick when available) - */ - if (rq->rt.rt_throttled && rq->clock > rq->rt.rt_throttled) { - rq->rt.rt_throttled = 0; - if (!sched_rt_ratio_exceeded(rq, &rq->rt)) - resched_task(rq->curr); + for_each_leaf_rt_rq(rt_rq, rq) + __update_sched_rt_period(rt_rq, period); } } @@ -96,6 +216,8 @@ static void update_sched_rt_period(struct rq *rq) static void update_curr_rt(struct rq *rq) { struct task_struct *curr = rq->curr; + struct sched_rt_entity *rt_se = &curr->rt; + struct rt_rq *rt_rq = rt_rq_of_se(rt_se); u64 delta_exec; if (!task_has_rt_policy(curr)) @@ -111,95 +233,184 @@ static void update_curr_rt(struct rq *rq) curr->se.exec_start = rq->clock; cpuacct_charge(curr, delta_exec); - rq->rt.rt_time += delta_exec; - update_sched_rt_period(rq); - if (sched_rt_ratio_exceeded(rq, &rq->rt)) + rt_rq->rt_time += delta_exec; + /* + * might make it a tad more accurate: + * + * update_sched_rt_period(rq); + */ + if (sched_rt_ratio_exceeded(rt_rq)) resched_task(curr); } -static inline void inc_rt_tasks(struct task_struct *p, struct rq *rq) +static inline +void inc_rt_tasks(struct sched_rt_entity *rt_se, struct rt_rq *rt_rq) { - WARN_ON(!rt_task(p)); - rq->rt.rt_nr_running++; + WARN_ON(!rt_prio(rt_se_prio(rt_se))); + rt_rq->rt_nr_running++; +#if defined CONFIG_SMP || defined CONFIG_FAIR_GROUP_SCHED + if (rt_se_prio(rt_se) < rt_rq->highest_prio) + rt_rq->highest_prio = rt_se_prio(rt_se); +#endif #ifdef CONFIG_SMP - if (p->prio < rq->rt.highest_prio) - rq->rt.highest_prio = p->prio; - if (p->nr_cpus_allowed > 1) + if (rt_se->nr_cpus_allowed > 1) { + struct rq *rq = rq_of_rt_rq(rt_rq); rq->rt.rt_nr_migratory++; + } - update_rt_migration(rq); -#endif /* CONFIG_SMP */ + update_rt_migration(rq_of_rt_rq(rt_rq)); +#endif } -static inline void dec_rt_tasks(struct task_struct *p, struct rq *rq) +static inline +void dec_rt_tasks(struct sched_rt_entity *rt_se, struct rt_rq *rt_rq) { - WARN_ON(!rt_task(p)); - WARN_ON(!rq->rt.rt_nr_running); - rq->rt.rt_nr_running--; -#ifdef CONFIG_SMP - if (rq->rt.rt_nr_running) { + WARN_ON(!rt_prio(rt_se_prio(rt_se))); + WARN_ON(!rt_rq->rt_nr_running); + rt_rq->rt_nr_running--; +#if defined CONFIG_SMP || defined CONFIG_FAIR_GROUP_SCHED + if (rt_rq->rt_nr_running) { struct rt_prio_array *array; - WARN_ON(p->prio < rq->rt.highest_prio); - if (p->prio == rq->rt.highest_prio) { + WARN_ON(rt_se_prio(rt_se) < rt_rq->highest_prio); + if (rt_se_prio(rt_se) == rt_rq->highest_prio) { /* recalculate */ - array = &rq->rt.active; - rq->rt.highest_prio = + array = &rt_rq->active; + rt_rq->highest_prio = sched_find_first_bit(array->bitmap); } /* otherwise leave rq->highest prio alone */ } else - rq->rt.highest_prio = MAX_RT_PRIO; - if (p->nr_cpus_allowed > 1) + rt_rq->highest_prio = MAX_RT_PRIO; +#endif +#ifdef CONFIG_SMP + if (rt_se->nr_cpus_allowed > 1) { + struct rq *rq = rq_of_rt_rq(rt_rq); rq->rt.rt_nr_migratory--; + } - update_rt_migration(rq); + update_rt_migration(rq_of_rt_rq(rt_rq)); #endif /* CONFIG_SMP */ } -static void enqueue_task_rt(struct rq *rq, struct task_struct *p, int wakeup) +static void enqueue_rt_entity(struct sched_rt_entity *rt_se) { - struct rt_prio_array *array = &rq->rt.active; + struct rt_rq *rt_rq = rt_rq_of_se(rt_se); + struct rt_prio_array *array = &rt_rq->active; + struct rt_rq *group_rq = group_rt_rq(rt_se); - list_add_tail(&p->rt.run_list, array->queue + p->prio); - __set_bit(p->prio, array->bitmap); - inc_cpu_load(rq, p->se.load.weight); + if (group_rq && group_rq->rt_throttled) + return; - inc_rt_tasks(p, rq); + list_add_tail(&rt_se->run_list, 
array->queue + rt_se_prio(rt_se)); + __set_bit(rt_se_prio(rt_se), array->bitmap); - if (wakeup) - p->rt.timeout = 0; + inc_rt_tasks(rt_se, rt_rq); +} + +static void dequeue_rt_entity(struct sched_rt_entity *rt_se) +{ + struct rt_rq *rt_rq = rt_rq_of_se(rt_se); + struct rt_prio_array *array = &rt_rq->active; + + list_del_init(&rt_se->run_list); + if (list_empty(array->queue + rt_se_prio(rt_se))) + __clear_bit(rt_se_prio(rt_se), array->bitmap); + + dec_rt_tasks(rt_se, rt_rq); +} + +/* + * Because the prio of an upper entry depends on the lower + * entries, we must remove entries top - down. + * + * XXX: O(1/2 h^2) because we can only walk up, not down the chain. + * doesn't matter much for now, as h=2 for GROUP_SCHED. + */ +static void dequeue_rt_stack(struct task_struct *p) +{ + struct sched_rt_entity *rt_se, *top_se; + + /* + * dequeue all, top - down. + */ + do { + rt_se = &p->rt; + top_se = NULL; + for_each_sched_rt_entity(rt_se) { + if (on_rt_rq(rt_se)) + top_se = rt_se; + } + if (top_se) + dequeue_rt_entity(top_se); + } while (top_se); } /* * Adding/removing a task to/from a priority array: */ +static void enqueue_task_rt(struct rq *rq, struct task_struct *p, int wakeup) +{ + struct sched_rt_entity *rt_se = &p->rt; + + if (wakeup) + rt_se->timeout = 0; + + dequeue_rt_stack(p); + + /* + * enqueue everybody, bottom - up. + */ + for_each_sched_rt_entity(rt_se) + enqueue_rt_entity(rt_se); + + inc_cpu_load(rq, p->se.load.weight); +} + static void dequeue_task_rt(struct rq *rq, struct task_struct *p, int sleep) { - struct rt_prio_array *array = &rq->rt.active; + struct sched_rt_entity *rt_se = &p->rt; + struct rt_rq *rt_rq; update_curr_rt(rq); - list_del(&p->rt.run_list); - if (list_empty(array->queue + p->prio)) - __clear_bit(p->prio, array->bitmap); - dec_cpu_load(rq, p->se.load.weight); + dequeue_rt_stack(p); + + /* + * re-enqueue all non-empty rt_rq entities. + */ + for_each_sched_rt_entity(rt_se) { + rt_rq = group_rt_rq(rt_se); + if (rt_rq && rt_rq->rt_nr_running) + enqueue_rt_entity(rt_se); + } - dec_rt_tasks(p, rq); + dec_cpu_load(rq, p->se.load.weight); } /* * Put task to the end of the run list without the overhead of dequeue * followed by enqueue. */ +static +void requeue_rt_entity(struct rt_rq *rt_rq, struct sched_rt_entity *rt_se) +{ + struct rt_prio_array *array = &rt_rq->active; + + list_move_tail(&rt_se->run_list, array->queue + rt_se_prio(rt_se)); +} + static void requeue_task_rt(struct rq *rq, struct task_struct *p) { - struct rt_prio_array *array = &rq->rt.active; + struct sched_rt_entity *rt_se = &p->rt; + struct rt_rq *rt_rq; - list_move_tail(&p->rt.run_list, array->queue + p->prio); + for_each_sched_rt_entity(rt_se) { + rt_rq = rt_rq_of_se(rt_se); + requeue_rt_entity(rt_rq, rt_se); + } } -static void -yield_task_rt(struct rq *rq) +static void yield_task_rt(struct rq *rq) { requeue_task_rt(rq, rq->curr); } @@ -229,7 +440,7 @@ static int select_task_rq_rt(struct task_struct *p, int sync) * cold cache anyway. */ if (unlikely(rt_task(rq->curr)) && - (p->nr_cpus_allowed > 1)) { + (p->rt.nr_cpus_allowed > 1)) { int cpu = find_lowest_rq(p); return (cpu == -1) ? 
task_cpu(p) : cpu; @@ -252,27 +463,51 @@ static void check_preempt_curr_rt(struct rq *rq, struct task_struct *p) resched_task(rq->curr); } -static struct task_struct *pick_next_task_rt(struct rq *rq) +static struct sched_rt_entity *pick_next_rt_entity(struct rq *rq, + struct rt_rq *rt_rq) { - struct rt_prio_array *array = &rq->rt.active; - struct task_struct *next; + struct rt_prio_array *array = &rt_rq->active; + struct sched_rt_entity *next = NULL; struct list_head *queue; - struct rt_rq *rt_rq = &rq->rt; int idx; - if (sched_rt_ratio_exceeded(rq, rt_rq)) - return NULL; + if (sched_rt_ratio_exceeded(rt_rq)) + goto out; idx = sched_find_first_bit(array->bitmap); - if (idx >= MAX_RT_PRIO) - return NULL; + BUG_ON(idx >= MAX_RT_PRIO); queue = array->queue + idx; - next = list_entry(queue->next, struct task_struct, rt.run_list); + next = list_entry(queue->next, struct sched_rt_entity, run_list); + out: + return next; +} - next->se.exec_start = rq->clock; +static struct task_struct *pick_next_task_rt(struct rq *rq) +{ + struct sched_rt_entity *rt_se; + struct task_struct *p; + struct rt_rq *rt_rq; - return next; + retry: + rt_rq = &rq->rt; + + if (unlikely(!rt_rq->rt_nr_running)) + return NULL; + + if (sched_rt_ratio_exceeded(rt_rq)) + return NULL; + + do { + rt_se = pick_next_rt_entity(rq, rt_rq); + if (unlikely(!rt_se)) + goto retry; + rt_rq = group_rt_rq(rt_se); + } while (rt_rq); + + p = rt_task_of(rt_se); + p->se.exec_start = rq->clock; + return p; } static void put_prev_task_rt(struct rq *rq, struct task_struct *p) @@ -282,6 +517,7 @@ static void put_prev_task_rt(struct rq *rq, struct task_struct *p) } #ifdef CONFIG_SMP + /* Only try algorithms three times */ #define RT_MAX_TRIES 3 @@ -292,7 +528,7 @@ static int pick_rt_task(struct rq *rq, struct task_struct *p, int cpu) { if (!task_running(rq, p) && (cpu < 0 || cpu_isset(cpu, p->cpus_allowed)) && - (p->nr_cpus_allowed > 1)) + (p->rt.nr_cpus_allowed > 1)) return 1; return 0; } @@ -300,52 +536,33 @@ static int pick_rt_task(struct rq *rq, struct task_struct *p, int cpu) /* Return the second highest RT task, NULL otherwise */ static struct task_struct *pick_next_highest_task_rt(struct rq *rq, int cpu) { - struct rt_prio_array *array = &rq->rt.active; - struct task_struct *next; - struct list_head *queue; + struct task_struct *next = NULL; + struct sched_rt_entity *rt_se; + struct rt_prio_array *array; + struct rt_rq *rt_rq; int idx; - if (likely(rq->rt.rt_nr_running < 2)) - return NULL; - - idx = sched_find_first_bit(array->bitmap); - if (unlikely(idx >= MAX_RT_PRIO)) { - WARN_ON(1); /* rt_nr_running is bad */ - return NULL; - } - - queue = array->queue + idx; - BUG_ON(list_empty(queue)); - - next = list_entry(queue->next, struct task_struct, rt.run_list); - if (unlikely(pick_rt_task(rq, next, cpu))) - goto out; - - if (queue->next->next != queue) { - /* same prio task */ - next = list_entry(queue->next->next, struct task_struct, - rt.run_list); - if (pick_rt_task(rq, next, cpu)) - goto out; - } - - retry: - /* slower, but more flexible */ - idx = find_next_bit(array->bitmap, MAX_RT_PRIO, idx+1); - if (unlikely(idx >= MAX_RT_PRIO)) - return NULL; - - queue = array->queue + idx; - BUG_ON(list_empty(queue)); - - list_for_each_entry(next, queue, rt.run_list) { - if (pick_rt_task(rq, next, cpu)) - goto out; + for_each_leaf_rt_rq(rt_rq, rq) { + array = &rt_rq->active; + idx = sched_find_first_bit(array->bitmap); + next_idx: + if (idx >= MAX_RT_PRIO) + continue; + if (next && next->prio < idx) + continue; + list_for_each_entry(rt_se, array->queue 
+ idx, run_list) { + struct task_struct *p = rt_task_of(rt_se); + if (pick_rt_task(rq, p, cpu)) { + next = p; + break; + } + } + if (!next) { + idx = find_next_bit(array->bitmap, MAX_RT_PRIO, idx+1); + goto next_idx; + } } - goto retry; - - out: return next; } @@ -774,12 +991,12 @@ static void set_cpus_allowed_rt(struct task_struct *p, cpumask_t *new_mask) * Update the migration status of the RQ if we have an RT task * which is running AND changing its weight value. */ - if (p->se.on_rq && (weight != p->nr_cpus_allowed)) { + if (p->se.on_rq && (weight != p->rt.nr_cpus_allowed)) { struct rq *rq = task_rq(p); - if ((p->nr_cpus_allowed <= 1) && (weight > 1)) { + if ((p->rt.nr_cpus_allowed <= 1) && (weight > 1)) { rq->rt.rt_nr_migratory++; - } else if ((p->nr_cpus_allowed > 1) && (weight <= 1)) { + } else if ((p->rt.nr_cpus_allowed > 1) && (weight <= 1)) { BUG_ON(!rq->rt.rt_nr_migratory); rq->rt.rt_nr_migratory--; } @@ -788,7 +1005,7 @@ static void set_cpus_allowed_rt(struct task_struct *p, cpumask_t *new_mask) } p->cpus_allowed = *new_mask; - p->nr_cpus_allowed = weight; + p->rt.nr_cpus_allowed = weight; } /* Assumes rq->lock is held */ -- cgit v1.2.3 From 614ee1f61f667b02165c1ae0c1357048dc6d94a0 Mon Sep 17 00:00:00 2001 From: Mike Galbraith Date: Fri, 25 Jan 2008 21:08:30 +0100 Subject: sched: pull_rt_task() cleanup "goto out" is an odd way to spell "skip". Signed-off-by: Mike Galbraith Signed-off-by: Ingo Molnar --- kernel/sched_rt.c | 10 ++++------ 1 file changed, 4 insertions(+), 6 deletions(-) (limited to 'kernel') diff --git a/kernel/sched_rt.c b/kernel/sched_rt.c index 1178257613ad..1144bf55669d 100644 --- a/kernel/sched_rt.c +++ b/kernel/sched_rt.c @@ -877,10 +877,8 @@ static int pull_rt_task(struct rq *this_rq) /* * Are there still pullable RT tasks? */ - if (src_rq->rt.rt_nr_running <= 1) { - spin_unlock(&src_rq->lock); - continue; - } + if (src_rq->rt.rt_nr_running <= 1) + goto skip; p = pick_next_highest_task_rt(src_rq, this_cpu); @@ -904,7 +902,7 @@ static int pull_rt_task(struct rq *this_rq) */ if (p->prio < src_rq->curr->prio || (next && next->prio < src_rq->curr->prio)) - goto out; + goto skip; ret = 1; @@ -924,7 +922,7 @@ static int pull_rt_task(struct rq *this_rq) next = p; } - out: + skip: spin_unlock(&src_rq->lock); } -- cgit v1.2.3 From 48d5e258216f1c7713633439beb98a38c7290649 Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Fri, 25 Jan 2008 21:08:31 +0100 Subject: sched: rt throttling vs no_hz We need to teach no_hz about the rt throttling because its tick driven. Signed-off-by: Peter Zijlstra Signed-off-by: Ingo Molnar --- include/linux/sched.h | 2 ++ kernel/sched.c | 23 ++++++++++++++++++++++- kernel/sched_rt.c | 30 ++++++++++++++++-------------- kernel/time/tick-sched.c | 5 +++++ 4 files changed, 45 insertions(+), 15 deletions(-) (limited to 'kernel') diff --git a/include/linux/sched.h b/include/linux/sched.h index 04eecbf0241e..acadcab89ef9 100644 --- a/include/linux/sched.h +++ b/include/linux/sched.h @@ -230,6 +230,8 @@ static inline int select_nohz_load_balancer(int cpu) } #endif +extern unsigned long rt_needs_cpu(int cpu); + /* * Only dump TASK_* tasks. 
(0 for all tasks) */ diff --git a/kernel/sched.c b/kernel/sched.c index 5ea2c533b432..22712b2e058a 100644 --- a/kernel/sched.c +++ b/kernel/sched.c @@ -442,6 +442,7 @@ struct rq { struct cfs_rq cfs; struct rt_rq rt; u64 rt_period_expire; + int rt_throttled; #ifdef CONFIG_FAIR_GROUP_SCHED /* list of leaf cfs_rq on this cpu: */ @@ -594,6 +595,23 @@ static void update_rq_clock(struct rq *rq) #define task_rq(p) cpu_rq(task_cpu(p)) #define cpu_curr(cpu) (cpu_rq(cpu)->curr) +unsigned long rt_needs_cpu(int cpu) +{ + struct rq *rq = cpu_rq(cpu); + u64 delta; + + if (!rq->rt_throttled) + return 0; + + if (rq->clock > rq->rt_period_expire) + return 1; + + delta = rq->rt_period_expire - rq->clock; + do_div(delta, NSEC_PER_SEC / HZ); + + return (unsigned long)delta; +} + /* * Tunables that become constants when CONFIG_SCHED_DEBUG is off: */ @@ -7102,9 +7120,11 @@ static void init_rt_rq(struct rt_rq *rt_rq, struct rq *rq) /* delimiter for bitsearch: */ __set_bit(MAX_RT_PRIO, array->bitmap); +#if defined CONFIG_SMP || defined CONFIG_FAIR_GROUP_SCHED + rt_rq->highest_prio = MAX_RT_PRIO; +#endif #ifdef CONFIG_SMP rt_rq->rt_nr_migratory = 0; - rt_rq->highest_prio = MAX_RT_PRIO; rt_rq->overloaded = 0; #endif @@ -7191,6 +7211,7 @@ void __init sched_init(void) &per_cpu(init_sched_rt_entity, i), i, 1); #endif rq->rt_period_expire = 0; + rq->rt_throttled = 0; for (j = 0; j < CPU_LOAD_IDX_MAX; j++) rq->cpu_load[j] = 0; diff --git a/kernel/sched_rt.c b/kernel/sched_rt.c index 1144bf55669d..8bfdb3f8a52d 100644 --- a/kernel/sched_rt.c +++ b/kernel/sched_rt.c @@ -175,7 +175,11 @@ static int sched_rt_ratio_exceeded(struct rt_rq *rt_rq) ratio = (period * rt_ratio) >> SCHED_RT_FRAC_SHIFT; if (rt_rq->rt_time > ratio) { + struct rq *rq = rq_of_rt_rq(rt_rq); + + rq->rt_throttled = 1; rt_rq->rt_throttled = 1; + sched_rt_ratio_dequeue(rt_rq); return 1; } @@ -183,18 +187,6 @@ static int sched_rt_ratio_exceeded(struct rt_rq *rt_rq) return 0; } -static void __update_sched_rt_period(struct rt_rq *rt_rq, u64 period) -{ - unsigned long rt_ratio = sched_rt_ratio(rt_rq); - u64 ratio = (period * rt_ratio) >> SCHED_RT_FRAC_SHIFT; - - rt_rq->rt_time -= min(rt_rq->rt_time, ratio); - if (rt_rq->rt_throttled) { - rt_rq->rt_throttled = 0; - sched_rt_ratio_enqueue(rt_rq); - } -} - static void update_sched_rt_period(struct rq *rq) { struct rt_rq *rt_rq; @@ -204,8 +196,18 @@ static void update_sched_rt_period(struct rq *rq) period = (u64)sysctl_sched_rt_period * NSEC_PER_MSEC; rq->rt_period_expire += period; - for_each_leaf_rt_rq(rt_rq, rq) - __update_sched_rt_period(rt_rq, period); + for_each_leaf_rt_rq(rt_rq, rq) { + unsigned long rt_ratio = sched_rt_ratio(rt_rq); + u64 ratio = (period * rt_ratio) >> SCHED_RT_FRAC_SHIFT; + + rt_rq->rt_time -= min(rt_rq->rt_time, ratio); + if (rt_rq->rt_throttled) { + rt_rq->rt_throttled = 0; + sched_rt_ratio_enqueue(rt_rq); + } + } + + rq->rt_throttled = 0; } } diff --git a/kernel/time/tick-sched.c b/kernel/time/tick-sched.c index cb89fa8db110..5f9fb645b725 100644 --- a/kernel/time/tick-sched.c +++ b/kernel/time/tick-sched.c @@ -153,6 +153,7 @@ void tick_nohz_update_jiffies(void) void tick_nohz_stop_sched_tick(void) { unsigned long seq, last_jiffies, next_jiffies, delta_jiffies, flags; + unsigned long rt_jiffies; struct tick_sched *ts; ktime_t last_update, expires, now, delta; struct clock_event_device *dev = __get_cpu_var(tick_cpu_device).evtdev; @@ -216,6 +217,10 @@ void tick_nohz_stop_sched_tick(void) next_jiffies = get_next_timer_interrupt(last_jiffies); delta_jiffies = next_jiffies - last_jiffies; + 
rt_jiffies = rt_needs_cpu(cpu); + if (rt_jiffies && rt_jiffies < delta_jiffies) + delta_jiffies = rt_jiffies; + if (rcu_needs_cpu(cpu)) delta_jiffies = 1; /* -- cgit v1.2.3 From 2d44ae4d7135b9aee26439b3523b43473381bc5f Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Fri, 25 Jan 2008 21:08:31 +0100 Subject: hrtimer: clean up cpu->base locking tricks In order to more easily allow for the scheduler to use timers, clean up the locking a bit. Signed-off-by: Peter Zijlstra Signed-off-by: Ingo Molnar --- kernel/hrtimer.c | 20 +++++++++++++++++++- kernel/time/tick-sched.c | 8 -------- 2 files changed, 19 insertions(+), 9 deletions(-) (limited to 'kernel') diff --git a/kernel/hrtimer.c b/kernel/hrtimer.c index f994bb8065e6..9f850ca032b6 100644 --- a/kernel/hrtimer.c +++ b/kernel/hrtimer.c @@ -1063,7 +1063,9 @@ void hrtimer_interrupt(struct clock_event_device *dev) basenow = ktime_add(now, base->offset); while ((node = base->first)) { + enum hrtimer_restart (*fn)(struct hrtimer *); struct hrtimer *timer; + int restart; timer = rb_entry(node, struct hrtimer, node); @@ -1091,13 +1093,29 @@ void hrtimer_interrupt(struct clock_event_device *dev) HRTIMER_STATE_CALLBACK, 0); timer_stats_account_hrtimer(timer); + fn = timer->function; + if (timer->cb_mode == HRTIMER_CB_IRQSAFE_NO_SOFTIRQ) { + /* + * Used for scheduler timers, avoid lock + * inversion with rq->lock and tasklist_lock. + * + * These timers are required to deal with + * enqueue expiry themselves and are not + * allowed to migrate. + */ + spin_unlock(&cpu_base->lock); + restart = fn(timer); + spin_lock(&cpu_base->lock); + } else + restart = fn(timer); + /* * Note: We clear the CALLBACK bit after * enqueue_hrtimer to avoid reprogramming of * the event hardware. This happens at the end * of this function anyway. */ - if (timer->function(timer) != HRTIMER_NORESTART) { + if (restart != HRTIMER_NORESTART) { BUG_ON(timer->state != HRTIMER_STATE_CALLBACK); enqueue_hrtimer(timer, base, 0); } diff --git a/kernel/time/tick-sched.c b/kernel/time/tick-sched.c index 5f9fb645b725..1a21b6fdb674 100644 --- a/kernel/time/tick-sched.c +++ b/kernel/time/tick-sched.c @@ -514,7 +514,6 @@ static enum hrtimer_restart tick_sched_timer(struct hrtimer *timer) { struct tick_sched *ts = container_of(timer, struct tick_sched, sched_timer); - struct hrtimer_cpu_base *base = timer->base->cpu_base; struct pt_regs *regs = get_irq_regs(); ktime_t now = ktime_get(); int cpu = smp_processor_id(); @@ -552,15 +551,8 @@ static enum hrtimer_restart tick_sched_timer(struct hrtimer *timer) touch_softlockup_watchdog(); ts->idle_jiffies++; } - /* - * update_process_times() might take tasklist_lock, hence - * drop the base lock. sched-tick hrtimers are per-CPU and - * never accessible by userspace APIs, so this is safe to do. - */ - spin_unlock(&base->lock); update_process_times(user_mode(regs)); profile_tick(CPU_PROFILING); - spin_lock(&base->lock); } /* Do not restart, when we are in the idle loop */ -- cgit v1.2.3 From d3d74453c34f8fd87674a8cf5b8a327c68f22e99 Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Fri, 25 Jan 2008 21:08:31 +0100 Subject: hrtimer: fixup the HRTIMER_CB_IRQSAFE_NO_SOFTIRQ fallback Currently all highres=off timers are run from softirq context, but HRTIMER_CB_IRQSAFE_NO_SOFTIRQ timers expect to run from irq context. Fix this up by splitting it similar to the highres=on case. 
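Both hrtimer changes in this area hinge on running HRTIMER_CB_IRQSAFE_NO_SOFTIRQ callbacks with cpu_base->lock dropped because, as the in-tree comment puts it, those callbacks take rq->lock (and update_process_times() may take tasklist_lock). A rough userspace model of that drop-the-outer-lock pattern, with purely illustrative names and locks (not kernel code):

/*
 * Sketch of the HRTIMER_CB_IRQSAFE_NO_SOFTIRQ branch of
 * hrtimer_interrupt() and the new __run_hrtimer(): take the base lock
 * to dequeue, drop it before calling the timer function (which grabs
 * its own locks), then retake it to handle a possible restart.
 */
#include <pthread.h>
#include <stdio.h>

static pthread_mutex_t base_lock = PTHREAD_MUTEX_INITIALIZER;
static pthread_mutex_t rq_lock = PTHREAD_MUTEX_INITIALIZER;

static int timer_fn(void)
{
        pthread_mutex_lock(&rq_lock);   /* scheduler work needs its own lock */
        puts("callback ran under rq_lock, base_lock not held");
        pthread_mutex_unlock(&rq_lock);
        return 0;                       /* 0 ~ HRTIMER_NORESTART */
}

static void run_expired_timer(void)
{
        int restart;

        pthread_mutex_lock(&base_lock);
        /* ... remove the expired timer from the rbtree ... */
        pthread_mutex_unlock(&base_lock);       /* avoid nesting under base_lock */

        restart = timer_fn();

        pthread_mutex_lock(&base_lock);
        if (restart) {
                /* ... re-enqueue the timer ... */
        }
        pthread_mutex_unlock(&base_lock);
}

int main(void)
{
        run_expired_timer();
        return 0;
}

This is also why such timers "are required to deal with enqueue expiry themselves and are not allowed to migrate", as the comment added by these patches states.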
Signed-off-by: Peter Zijlstra Signed-off-by: Ingo Molnar --- include/linux/hrtimer.h | 5 +- kernel/hrtimer.c | 270 +++++++++++++++++++++++++----------------------- kernel/timer.c | 3 +- 3 files changed, 143 insertions(+), 135 deletions(-) (limited to 'kernel') diff --git a/include/linux/hrtimer.h b/include/linux/hrtimer.h index ecc8e2685e2b..49067f14fac1 100644 --- a/include/linux/hrtimer.h +++ b/include/linux/hrtimer.h @@ -115,10 +115,8 @@ struct hrtimer { enum hrtimer_restart (*function)(struct hrtimer *); struct hrtimer_clock_base *base; unsigned long state; -#ifdef CONFIG_HIGH_RES_TIMERS enum hrtimer_cb_mode cb_mode; struct list_head cb_entry; -#endif #ifdef CONFIG_TIMER_STATS void *start_site; char start_comm[16]; @@ -194,10 +192,10 @@ struct hrtimer_cpu_base { spinlock_t lock; struct lock_class_key lock_key; struct hrtimer_clock_base clock_base[HRTIMER_MAX_CLOCK_BASES]; + struct list_head cb_pending; #ifdef CONFIG_HIGH_RES_TIMERS ktime_t expires_next; int hres_active; - struct list_head cb_pending; unsigned long nr_events; #endif }; @@ -319,6 +317,7 @@ extern void hrtimer_init_sleeper(struct hrtimer_sleeper *sl, /* Soft interrupt function to run the hrtimer queues: */ extern void hrtimer_run_queues(void); +extern void hrtimer_run_pending(void); /* Bootup initialization: */ extern void __init hrtimers_init(void); diff --git a/kernel/hrtimer.c b/kernel/hrtimer.c index 9f850ca032b6..061ae28a36a0 100644 --- a/kernel/hrtimer.c +++ b/kernel/hrtimer.c @@ -325,6 +325,22 @@ unsigned long ktime_divns(const ktime_t kt, s64 div) } #endif /* BITS_PER_LONG >= 64 */ +/* + * Check, whether the timer is on the callback pending list + */ +static inline int hrtimer_cb_pending(const struct hrtimer *timer) +{ + return timer->state & HRTIMER_STATE_PENDING; +} + +/* + * Remove a timer from the callback pending list + */ +static inline void hrtimer_remove_cb_pending(struct hrtimer *timer) +{ + list_del_init(&timer->cb_entry); +} + /* High resolution timer related functions */ #ifdef CONFIG_HIGH_RES_TIMERS @@ -493,22 +509,6 @@ void hres_timers_resume(void) retrigger_next_event(NULL); } -/* - * Check, whether the timer is on the callback pending list - */ -static inline int hrtimer_cb_pending(const struct hrtimer *timer) -{ - return timer->state & HRTIMER_STATE_PENDING; -} - -/* - * Remove a timer from the callback pending list - */ -static inline void hrtimer_remove_cb_pending(struct hrtimer *timer) -{ - list_del_init(&timer->cb_entry); -} - /* * Initialize the high resolution related parts of cpu_base */ @@ -516,7 +516,6 @@ static inline void hrtimer_init_hres(struct hrtimer_cpu_base *base) { base->expires_next.tv64 = KTIME_MAX; base->hres_active = 0; - INIT_LIST_HEAD(&base->cb_pending); } /* @@ -524,7 +523,6 @@ static inline void hrtimer_init_hres(struct hrtimer_cpu_base *base) */ static inline void hrtimer_init_timer_hres(struct hrtimer *timer) { - INIT_LIST_HEAD(&timer->cb_entry); } /* @@ -618,10 +616,13 @@ static inline int hrtimer_enqueue_reprogram(struct hrtimer *timer, { return 0; } -static inline int hrtimer_cb_pending(struct hrtimer *timer) { return 0; } -static inline void hrtimer_remove_cb_pending(struct hrtimer *timer) { } static inline void hrtimer_init_hres(struct hrtimer_cpu_base *base) { } static inline void hrtimer_init_timer_hres(struct hrtimer *timer) { } +static inline int hrtimer_reprogram(struct hrtimer *timer, + struct hrtimer_clock_base *base) +{ + return 0; +} #endif /* CONFIG_HIGH_RES_TIMERS */ @@ -1001,6 +1002,7 @@ void hrtimer_init(struct hrtimer *timer, clockid_t clock_id, 
clock_id = CLOCK_MONOTONIC; timer->base = &cpu_base->clock_base[clock_id]; + INIT_LIST_HEAD(&timer->cb_entry); hrtimer_init_timer_hres(timer); #ifdef CONFIG_TIMER_STATS @@ -1030,6 +1032,85 @@ int hrtimer_get_res(const clockid_t which_clock, struct timespec *tp) } EXPORT_SYMBOL_GPL(hrtimer_get_res); +static void run_hrtimer_pending(struct hrtimer_cpu_base *cpu_base) +{ + spin_lock_irq(&cpu_base->lock); + + while (!list_empty(&cpu_base->cb_pending)) { + enum hrtimer_restart (*fn)(struct hrtimer *); + struct hrtimer *timer; + int restart; + + timer = list_entry(cpu_base->cb_pending.next, + struct hrtimer, cb_entry); + + timer_stats_account_hrtimer(timer); + + fn = timer->function; + __remove_hrtimer(timer, timer->base, HRTIMER_STATE_CALLBACK, 0); + spin_unlock_irq(&cpu_base->lock); + + restart = fn(timer); + + spin_lock_irq(&cpu_base->lock); + + timer->state &= ~HRTIMER_STATE_CALLBACK; + if (restart == HRTIMER_RESTART) { + BUG_ON(hrtimer_active(timer)); + /* + * Enqueue the timer, allow reprogramming of the event + * device + */ + enqueue_hrtimer(timer, timer->base, 1); + } else if (hrtimer_active(timer)) { + /* + * If the timer was rearmed on another CPU, reprogram + * the event device. + */ + if (timer->base->first == &timer->node) + hrtimer_reprogram(timer, timer->base); + } + } + spin_unlock_irq(&cpu_base->lock); +} + +static void __run_hrtimer(struct hrtimer *timer) +{ + struct hrtimer_clock_base *base = timer->base; + struct hrtimer_cpu_base *cpu_base = base->cpu_base; + enum hrtimer_restart (*fn)(struct hrtimer *); + int restart; + + __remove_hrtimer(timer, base, HRTIMER_STATE_CALLBACK, 0); + timer_stats_account_hrtimer(timer); + + fn = timer->function; + if (timer->cb_mode == HRTIMER_CB_IRQSAFE_NO_SOFTIRQ) { + /* + * Used for scheduler timers, avoid lock inversion with + * rq->lock and tasklist_lock. + * + * These timers are required to deal with enqueue expiry + * themselves and are not allowed to migrate. + */ + spin_unlock(&cpu_base->lock); + restart = fn(timer); + spin_lock(&cpu_base->lock); + } else + restart = fn(timer); + + /* + * Note: We clear the CALLBACK bit after enqueue_hrtimer to avoid + * reprogramming of the event hardware. This happens at the end of this + * function anyway. + */ + if (restart != HRTIMER_NORESTART) { + BUG_ON(timer->state != HRTIMER_STATE_CALLBACK); + enqueue_hrtimer(timer, base, 0); + } + timer->state &= ~HRTIMER_STATE_CALLBACK; +} + #ifdef CONFIG_HIGH_RES_TIMERS /* @@ -1063,9 +1144,7 @@ void hrtimer_interrupt(struct clock_event_device *dev) basenow = ktime_add(now, base->offset); while ((node = base->first)) { - enum hrtimer_restart (*fn)(struct hrtimer *); struct hrtimer *timer; - int restart; timer = rb_entry(node, struct hrtimer, node); @@ -1089,37 +1168,7 @@ void hrtimer_interrupt(struct clock_event_device *dev) continue; } - __remove_hrtimer(timer, base, - HRTIMER_STATE_CALLBACK, 0); - timer_stats_account_hrtimer(timer); - - fn = timer->function; - if (timer->cb_mode == HRTIMER_CB_IRQSAFE_NO_SOFTIRQ) { - /* - * Used for scheduler timers, avoid lock - * inversion with rq->lock and tasklist_lock. - * - * These timers are required to deal with - * enqueue expiry themselves and are not - * allowed to migrate. - */ - spin_unlock(&cpu_base->lock); - restart = fn(timer); - spin_lock(&cpu_base->lock); - } else - restart = fn(timer); - - /* - * Note: We clear the CALLBACK bit after - * enqueue_hrtimer to avoid reprogramming of - * the event hardware. This happens at the end - * of this function anyway. 
- */ - if (restart != HRTIMER_NORESTART) { - BUG_ON(timer->state != HRTIMER_STATE_CALLBACK); - enqueue_hrtimer(timer, base, 0); - } - timer->state &= ~HRTIMER_STATE_CALLBACK; + __run_hrtimer(timer); } spin_unlock(&cpu_base->lock); base++; @@ -1140,52 +1189,41 @@ void hrtimer_interrupt(struct clock_event_device *dev) static void run_hrtimer_softirq(struct softirq_action *h) { - struct hrtimer_cpu_base *cpu_base = &__get_cpu_var(hrtimer_bases); - - spin_lock_irq(&cpu_base->lock); - - while (!list_empty(&cpu_base->cb_pending)) { - enum hrtimer_restart (*fn)(struct hrtimer *); - struct hrtimer *timer; - int restart; - - timer = list_entry(cpu_base->cb_pending.next, - struct hrtimer, cb_entry); + run_hrtimer_pending(&__get_cpu_var(hrtimer_bases)); +} - timer_stats_account_hrtimer(timer); +#endif /* CONFIG_HIGH_RES_TIMERS */ - fn = timer->function; - __remove_hrtimer(timer, timer->base, HRTIMER_STATE_CALLBACK, 0); - spin_unlock_irq(&cpu_base->lock); +/* + * Called from timer softirq every jiffy, expire hrtimers: + * + * For HRT its the fall back code to run the softirq in the timer + * softirq context in case the hrtimer initialization failed or has + * not been done yet. + */ +void hrtimer_run_pending(void) +{ + struct hrtimer_cpu_base *cpu_base = &__get_cpu_var(hrtimer_bases); - restart = fn(timer); + if (hrtimer_hres_active()) + return; - spin_lock_irq(&cpu_base->lock); + /* + * This _is_ ugly: We have to check in the softirq context, + * whether we can switch to highres and / or nohz mode. The + * clocksource switch happens in the timer interrupt with + * xtime_lock held. Notification from there only sets the + * check bit in the tick_oneshot code, otherwise we might + * deadlock vs. xtime_lock. + */ + if (tick_check_oneshot_change(!hrtimer_is_hres_enabled())) + hrtimer_switch_to_hres(); - timer->state &= ~HRTIMER_STATE_CALLBACK; - if (restart == HRTIMER_RESTART) { - BUG_ON(hrtimer_active(timer)); - /* - * Enqueue the timer, allow reprogramming of the event - * device - */ - enqueue_hrtimer(timer, timer->base, 1); - } else if (hrtimer_active(timer)) { - /* - * If the timer was rearmed on another CPU, reprogram - * the event device. 
- */ - if (timer->base->first == &timer->node) - hrtimer_reprogram(timer, timer->base); - } - } - spin_unlock_irq(&cpu_base->lock); + run_hrtimer_pending(cpu_base); } -#endif /* CONFIG_HIGH_RES_TIMERS */ - /* - * Expire the per base hrtimer-queue: + * Called from hardirq context every jiffy */ static inline void run_hrtimer_queue(struct hrtimer_cpu_base *cpu_base, int index) @@ -1199,46 +1237,27 @@ static inline void run_hrtimer_queue(struct hrtimer_cpu_base *cpu_base, if (base->get_softirq_time) base->softirq_time = base->get_softirq_time(); - spin_lock_irq(&cpu_base->lock); + spin_lock(&cpu_base->lock); while ((node = base->first)) { struct hrtimer *timer; - enum hrtimer_restart (*fn)(struct hrtimer *); - int restart; timer = rb_entry(node, struct hrtimer, node); if (base->softirq_time.tv64 <= timer->expires.tv64) break; -#ifdef CONFIG_HIGH_RES_TIMERS - WARN_ON_ONCE(timer->cb_mode == HRTIMER_CB_IRQSAFE_NO_SOFTIRQ); -#endif - timer_stats_account_hrtimer(timer); - - fn = timer->function; - __remove_hrtimer(timer, base, HRTIMER_STATE_CALLBACK, 0); - spin_unlock_irq(&cpu_base->lock); - - restart = fn(timer); - - spin_lock_irq(&cpu_base->lock); - - timer->state &= ~HRTIMER_STATE_CALLBACK; - if (restart != HRTIMER_NORESTART) { - BUG_ON(hrtimer_active(timer)); - enqueue_hrtimer(timer, base, 0); + if (timer->cb_mode == HRTIMER_CB_SOFTIRQ) { + __remove_hrtimer(timer, base, HRTIMER_STATE_PENDING, 0); + list_add_tail(&timer->cb_entry, + &base->cpu_base->cb_pending); + continue; } + + __run_hrtimer(timer); } - spin_unlock_irq(&cpu_base->lock); + spin_unlock(&cpu_base->lock); } -/* - * Called from timer softirq every jiffy, expire hrtimers: - * - * For HRT its the fall back code to run the softirq in the timer - * softirq context in case the hrtimer initialization failed or has - * not been done yet. - */ void hrtimer_run_queues(void) { struct hrtimer_cpu_base *cpu_base = &__get_cpu_var(hrtimer_bases); @@ -1247,18 +1266,6 @@ void hrtimer_run_queues(void) if (hrtimer_hres_active()) return; - /* - * This _is_ ugly: We have to check in the softirq context, - * whether we can switch to highres and / or nohz mode. The - * clocksource switch happens in the timer interrupt with - * xtime_lock held. Notification from there only sets the - * check bit in the tick_oneshot code, otherwise we might - * deadlock vs. xtime_lock. 
- */ - if (tick_check_oneshot_change(!hrtimer_is_hres_enabled())) - if (hrtimer_switch_to_hres()) - return; - hrtimer_get_softirq_time(cpu_base); for (i = 0; i < HRTIMER_MAX_CLOCK_BASES; i++) @@ -1407,6 +1414,7 @@ static void __cpuinit init_hrtimers_cpu(int cpu) for (i = 0; i < HRTIMER_MAX_CLOCK_BASES; i++) cpu_base->clock_base[i].cpu_base = cpu_base; + INIT_LIST_HEAD(&cpu_base->cb_pending); hrtimer_init_hres(cpu_base); } diff --git a/kernel/timer.c b/kernel/timer.c index 2a00c22203f3..f739dfb539ce 100644 --- a/kernel/timer.c +++ b/kernel/timer.c @@ -896,7 +896,7 @@ static void run_timer_softirq(struct softirq_action *h) { tvec_base_t *base = __get_cpu_var(tvec_bases); - hrtimer_run_queues(); + hrtimer_run_pending(); if (time_after_eq(jiffies, base->timer_jiffies)) __run_timers(base); @@ -907,6 +907,7 @@ static void run_timer_softirq(struct softirq_action *h) */ void run_local_timers(void) { + hrtimer_run_queues(); raise_softirq(TIMER_SOFTIRQ); softlockup_tick(); } -- cgit v1.2.3 From 37bb6cb4097e29ffee970065b74499cbf10603a3 Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Fri, 25 Jan 2008 21:08:32 +0100 Subject: hrtimer: unlock hrtimer_wakeup hrtimer_wakeup creates a base->lock rq->lock lock dependancy. Avoid this by switching to HRTIMER_CB_IRQSAFE_NO_SOFTIRQ which doesn't hold base->lock. This fully untangles hrtimer locks from the scheduler locks, and allows hrtimer usage in the scheduler proper. Signed-off-by: Peter Zijlstra Signed-off-by: Ingo Molnar --- kernel/hrtimer.c | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/hrtimer.c b/kernel/hrtimer.c index 061ae28a36a0..bd5d6b5060bc 100644 --- a/kernel/hrtimer.c +++ b/kernel/hrtimer.c @@ -1293,7 +1293,7 @@ void hrtimer_init_sleeper(struct hrtimer_sleeper *sl, struct task_struct *task) sl->timer.function = hrtimer_wakeup; sl->task = task; #ifdef CONFIG_HIGH_RES_TIMERS - sl->timer.cb_mode = HRTIMER_CB_IRQSAFE_NO_RESTART; + sl->timer.cb_mode = HRTIMER_CB_IRQSAFE_NO_SOFTIRQ; #endif } @@ -1304,6 +1304,8 @@ static int __sched do_nanosleep(struct hrtimer_sleeper *t, enum hrtimer_mode mod do { set_current_state(TASK_INTERRUPTIBLE); hrtimer_start(&t->timer, t->timer.expires, mode); + if (!hrtimer_active(&t->timer)) + t->task = NULL; if (likely(t->task)) schedule(); -- cgit v1.2.3 From 1020387f5f3b52929b387103cf976321981f8e26 Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Fri, 25 Jan 2008 21:08:32 +0100 Subject: sched: rt-group: reduce rescheduling Only reschedule if the new group has a higher prio task. Signed-off-by: Peter Zijlstra Signed-off-by: Ingo Molnar --- kernel/sched_rt.c | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/sched_rt.c b/kernel/sched_rt.c index 8bfdb3f8a52d..f1f215db3bd0 100644 --- a/kernel/sched_rt.c +++ b/kernel/sched_rt.c @@ -94,8 +94,11 @@ static void sched_rt_ratio_enqueue(struct rt_rq *rt_rq) struct sched_rt_entity *rt_se = rt_rq->rt_se; if (rt_se && !on_rt_rq(rt_se) && rt_rq->rt_nr_running) { + struct task_struct *curr = rq_of_rt_rq(rt_rq)->curr; + enqueue_rt_entity(rt_se); - resched_task(rq_of_rt_rq(rt_rq)->curr); + if (rt_rq->highest_prio < curr->prio) + resched_task(curr); } } -- cgit v1.2.3 From 5a52dd50091b6a6e710a1293db741028f8cc5aac Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Fri, 25 Jan 2008 21:08:32 +0100 Subject: sched: rt-watchdog: fix .rlim_max = RLIM_INFINITY Remove the curious logic to set it_sched_expires in the future. It useless because rt.timeout wouldn't be incremented anyway. 
Explicity check for RLIM_INFINITY as a test programm that had a 1s soft limit and a inf hard limit would SIGKILL at 1s. This is because RLIM_INFINITY+d-1 is d-2. Signed-off-by: Peter Zijlsta CC: Michal Schmidt Signed-off-by: Ingo Molnar --- kernel/posix-cpu-timers.c | 3 ++- kernel/sched_rt.c | 8 +------- 2 files changed, 3 insertions(+), 8 deletions(-) (limited to 'kernel') diff --git a/kernel/posix-cpu-timers.c b/kernel/posix-cpu-timers.c index 2c076b36c4f6..0b7c82ac467e 100644 --- a/kernel/posix-cpu-timers.c +++ b/kernel/posix-cpu-timers.c @@ -1020,7 +1020,8 @@ static void check_thread_timers(struct task_struct *tsk, unsigned long hard = sig->rlim[RLIMIT_RTTIME].rlim_max; unsigned long *soft = &sig->rlim[RLIMIT_RTTIME].rlim_cur; - if (tsk->rt.timeout > DIV_ROUND_UP(hard, USEC_PER_SEC/HZ)) { + if (hard != RLIM_INFINITY && + tsk->rt.timeout > DIV_ROUND_UP(hard, USEC_PER_SEC/HZ)) { /* * At the hard limit, we just die. * No need to calculate anything else now. diff --git a/kernel/sched_rt.c b/kernel/sched_rt.c index f1f215db3bd0..2dac5ebb8bcb 100644 --- a/kernel/sched_rt.c +++ b/kernel/sched_rt.c @@ -1125,13 +1125,7 @@ static void watchdog(struct rq *rq, struct task_struct *p) p->rt.timeout++; next = DIV_ROUND_UP(min(soft, hard), USEC_PER_SEC/HZ); - if (next > p->rt.timeout) { - u64 next_time = p->se.sum_exec_runtime; - - next_time += next * (NSEC_PER_SEC/HZ); - if (p->it_sched_expires > next_time) - p->it_sched_expires = next_time; - } else + if (p->rt.timeout > next) p->it_sched_expires = p->se.sum_exec_runtime; } } -- cgit v1.2.3 From 21aa9280b9f4e9e68d3fa8990df6c9d7fd71f994 Mon Sep 17 00:00:00 2001 From: Arjan van de Ven Date: Fri, 25 Jan 2008 21:08:33 +0100 Subject: debug: show being-loaded/being-unloaded indicator for modules It's rather common that an oops/WARN_ON/BUG happens during the load or unload of a module. Unfortunatly, it's not always easy to see directly which module is being loaded/unloaded from the oops itself. Worse, it's not even always possible to ask the bug reporter, since there are so many components (udev etc) that auto-load modules that there's a good chance that even the reporter doesn't know which module this is. This patch extends the existing "show if it's tainting" print code, which is used as part of printing the modules in the oops/BUG/WARN_ON to include a "+" for "being loaded" and a "-" for "being unloaded". As a result this extension, the "taint_flags()" function gets renamed to "module_flags()" (and takes a module struct as argument, not a taint flags int). Signed-off-by: Arjan van de Ven Signed-off-by: Ingo Molnar --- kernel/module.c | 21 +++++++++++++++------ 1 file changed, 15 insertions(+), 6 deletions(-) (limited to 'kernel') diff --git a/kernel/module.c b/kernel/module.c index dcb8a2cbf75e..5bbf225ca07a 100644 --- a/kernel/module.c +++ b/kernel/module.c @@ -2357,21 +2357,30 @@ static void m_stop(struct seq_file *m, void *p) mutex_unlock(&module_mutex); } -static char *taint_flags(unsigned int taints, char *buf) +static char *module_flags(struct module *mod, char *buf) { int bx = 0; - if (taints) { + if (mod->taints || + mod->state == MODULE_STATE_GOING || + mod->state == MODULE_STATE_COMING) { buf[bx++] = '('; - if (taints & TAINT_PROPRIETARY_MODULE) + if (mod->taints & TAINT_PROPRIETARY_MODULE) buf[bx++] = 'P'; - if (taints & TAINT_FORCED_MODULE) + if (mod->taints & TAINT_FORCED_MODULE) buf[bx++] = 'F'; /* * TAINT_FORCED_RMMOD: could be added. * TAINT_UNSAFE_SMP, TAINT_MACHINE_CHECK, TAINT_BAD_PAGE don't * apply to modules. 
*/ + + /* Show a - for module-is-being-unloaded */ + if (mod->state == MODULE_STATE_GOING) + buf[bx++] = '-'; + /* Show a + for module-is-being-loaded */ + if (mod->state == MODULE_STATE_COMING) + buf[bx++] = '+'; buf[bx++] = ')'; } buf[bx] = '\0'; @@ -2398,7 +2407,7 @@ static int m_show(struct seq_file *m, void *p) /* Taints info */ if (mod->taints) - seq_printf(m, " %s", taint_flags(mod->taints, buf)); + seq_printf(m, " %s", module_flags(mod, buf)); seq_printf(m, "\n"); return 0; @@ -2493,7 +2502,7 @@ void print_modules(void) printk("Modules linked in:"); list_for_each_entry(mod, &modules, list) - printk(" %s%s", mod->name, taint_flags(mod->taints, buf)); + printk(" %s%s", mod->name, module_flags(mod, buf)); printk("\n"); } -- cgit v1.2.3 From e14af7eeb47ea96c52741c5e5fa010d33daf6973 Mon Sep 17 00:00:00 2001 From: Arjan van de Ven Date: Fri, 25 Jan 2008 21:08:33 +0100 Subject: debug: track and print last unloaded module in the oops trace Based on a suggestion from Andi: In various cases, the unload of a module may leave some bad state around that causes a kernel crash AFTER a module is unloaded; and it's then hard to find which module caused that. This patch tracks the last unloaded module, and prints this as part of the module list in the oops trace. Right now, only the last 1 module is tracked; I expect that this is enough for the vast majority of cases where this information matters; if it turns out that tracking more is important, we can always extend it to that. [ mingo@elte.hu: build fix ] Signed-off-by: Arjan van de Ven Signed-off-by: Ingo Molnar --- kernel/module.c | 6 ++++++ 1 file changed, 6 insertions(+) (limited to 'kernel') diff --git a/kernel/module.c b/kernel/module.c index 5bbf225ca07a..1bb4c5e0d56e 100644 --- a/kernel/module.c +++ b/kernel/module.c @@ -496,6 +496,8 @@ static struct module_attribute modinfo_##field = { \ MODINFO_ATTR(version); MODINFO_ATTR(srcversion); +static char last_unloaded_module[MODULE_NAME_LEN+1]; + #ifdef CONFIG_MODULE_UNLOAD /* Init the unload section of the module. */ static void module_unload_init(struct module *mod) @@ -719,6 +721,8 @@ sys_delete_module(const char __user *name_user, unsigned int flags) mod->exit(); mutex_lock(&module_mutex); } + /* Store the name of the last unloaded module for diagnostic purposes */ + sprintf(last_unloaded_module, mod->name); free_module(mod); out: @@ -2503,6 +2507,8 @@ void print_modules(void) printk("Modules linked in:"); list_for_each_entry(mod, &modules, list) printk(" %s%s", mod->name, module_flags(mod, buf)); + if (last_unloaded_module[0]) + printk(" [last unloaded: %s]", last_unloaded_module); printk("\n"); } -- cgit v1.2.3 From 58b8a73ab8becfcaea84abc2a06038281efa4c8a Mon Sep 17 00:00:00 2001 From: Ingo Molnar Date: Fri, 25 Jan 2008 21:08:33 +0100 Subject: sched: make PREEMPT_BKL the default make PREEMPT_BKL the default. precursor to removal of the !PREEMPT_BKL code. Signed-off-by: Ingo Molnar --- kernel/Kconfig.preempt | 9 +-------- 1 file changed, 1 insertion(+), 8 deletions(-) (limited to 'kernel') diff --git a/kernel/Kconfig.preempt b/kernel/Kconfig.preempt index 61fa116efcde..4420ef427f83 100644 --- a/kernel/Kconfig.preempt +++ b/kernel/Kconfig.preempt @@ -53,15 +53,8 @@ config PREEMPT endchoice config PREEMPT_BKL - bool "Preempt The Big Kernel Lock" + def_bool y depends on SMP || PREEMPT - default y - help - This option reduces the latency of the kernel by making the - big kernel lock preemptible. - - Say Y here if you are building a kernel for a desktop system. - Say N if you are unsure. 
config RCU_TRACE bool "Enable tracing for RCU - currently stats in debugfs" -- cgit v1.2.3 From 6478d8800b75253b2a934ddcb734e13ade023ad0 Mon Sep 17 00:00:00 2001 From: Ingo Molnar Date: Fri, 25 Jan 2008 21:08:33 +0100 Subject: sched: remove the !PREEMPT_BKL code remove the !PREEMPT_BKL code. this removes 160 lines of legacy code. Signed-off-by: Ingo Molnar --- include/linux/hardirq.h | 6 +-- include/linux/smp_lock.h | 14 +----- kernel/Kconfig.preempt | 4 -- kernel/sched.c | 19 ++------ lib/kernel_lock.c | 123 ----------------------------------------------- 5 files changed, 5 insertions(+), 161 deletions(-) (limited to 'kernel') diff --git a/include/linux/hardirq.h b/include/linux/hardirq.h index 8d302298a161..2961ec788046 100644 --- a/include/linux/hardirq.h +++ b/include/linux/hardirq.h @@ -72,11 +72,7 @@ #define in_softirq() (softirq_count()) #define in_interrupt() (irq_count()) -#if defined(CONFIG_PREEMPT) && !defined(CONFIG_PREEMPT_BKL) -# define in_atomic() ((preempt_count() & ~PREEMPT_ACTIVE) != kernel_locked()) -#else -# define in_atomic() ((preempt_count() & ~PREEMPT_ACTIVE) != 0) -#endif +#define in_atomic() ((preempt_count() & ~PREEMPT_ACTIVE) != 0) #ifdef CONFIG_PREEMPT # define PREEMPT_CHECK_OFFSET 1 diff --git a/include/linux/smp_lock.h b/include/linux/smp_lock.h index 58962c51dee1..aab3a4cff4e1 100644 --- a/include/linux/smp_lock.h +++ b/include/linux/smp_lock.h @@ -17,22 +17,10 @@ extern void __lockfunc __release_kernel_lock(void); __release_kernel_lock(); \ } while (0) -/* - * Non-SMP kernels will never block on the kernel lock, - * so we are better off returning a constant zero from - * reacquire_kernel_lock() so that the compiler can see - * it at compile-time. - */ -#if defined(CONFIG_SMP) && !defined(CONFIG_PREEMPT_BKL) -# define return_value_on_smp return -#else -# define return_value_on_smp -#endif - static inline int reacquire_kernel_lock(struct task_struct *task) { if (unlikely(task->lock_depth >= 0)) - return_value_on_smp __reacquire_kernel_lock(); + return __reacquire_kernel_lock(); return 0; } diff --git a/kernel/Kconfig.preempt b/kernel/Kconfig.preempt index 4420ef427f83..0669b70fa6a3 100644 --- a/kernel/Kconfig.preempt +++ b/kernel/Kconfig.preempt @@ -52,10 +52,6 @@ config PREEMPT endchoice -config PREEMPT_BKL - def_bool y - depends on SMP || PREEMPT - config RCU_TRACE bool "Enable tracing for RCU - currently stats in debugfs" select DEBUG_FS diff --git a/kernel/sched.c b/kernel/sched.c index 22712b2e058a..629614ad0358 100644 --- a/kernel/sched.c +++ b/kernel/sched.c @@ -3955,10 +3955,9 @@ EXPORT_SYMBOL(schedule); asmlinkage void __sched preempt_schedule(void) { struct thread_info *ti = current_thread_info(); -#ifdef CONFIG_PREEMPT_BKL struct task_struct *task = current; int saved_lock_depth; -#endif + /* * If there is a non-zero preempt_count or interrupts are disabled, * we do not want to preempt the current task. Just return.. 
@@ -3974,14 +3973,10 @@ asmlinkage void __sched preempt_schedule(void) * clear ->lock_depth so that schedule() doesnt * auto-release the semaphore: */ -#ifdef CONFIG_PREEMPT_BKL saved_lock_depth = task->lock_depth; task->lock_depth = -1; -#endif schedule(); -#ifdef CONFIG_PREEMPT_BKL task->lock_depth = saved_lock_depth; -#endif sub_preempt_count(PREEMPT_ACTIVE); /* @@ -4002,10 +3997,9 @@ EXPORT_SYMBOL(preempt_schedule); asmlinkage void __sched preempt_schedule_irq(void) { struct thread_info *ti = current_thread_info(); -#ifdef CONFIG_PREEMPT_BKL struct task_struct *task = current; int saved_lock_depth; -#endif + /* Catch callers which need to be fixed */ BUG_ON(ti->preempt_count || !irqs_disabled()); @@ -4017,16 +4011,12 @@ asmlinkage void __sched preempt_schedule_irq(void) * clear ->lock_depth so that schedule() doesnt * auto-release the semaphore: */ -#ifdef CONFIG_PREEMPT_BKL saved_lock_depth = task->lock_depth; task->lock_depth = -1; -#endif local_irq_enable(); schedule(); local_irq_disable(); -#ifdef CONFIG_PREEMPT_BKL task->lock_depth = saved_lock_depth; -#endif sub_preempt_count(PREEMPT_ACTIVE); /* @@ -5241,11 +5231,8 @@ void __cpuinit init_idle(struct task_struct *idle, int cpu) spin_unlock_irqrestore(&rq->lock, flags); /* Set the preempt count _outside_ the spinlocks! */ -#if defined(CONFIG_PREEMPT) && !defined(CONFIG_PREEMPT_BKL) - task_thread_info(idle)->preempt_count = (idle->lock_depth >= 0); -#else task_thread_info(idle)->preempt_count = 0; -#endif + /* * The idle tasks have their own, simple scheduling class: */ diff --git a/lib/kernel_lock.c b/lib/kernel_lock.c index f73e2f8c308f..812dbf00844b 100644 --- a/lib/kernel_lock.c +++ b/lib/kernel_lock.c @@ -9,7 +9,6 @@ #include #include -#ifdef CONFIG_PREEMPT_BKL /* * The 'big kernel semaphore' * @@ -86,128 +85,6 @@ void __lockfunc unlock_kernel(void) up(&kernel_sem); } -#else - -/* - * The 'big kernel lock' - * - * This spinlock is taken and released recursively by lock_kernel() - * and unlock_kernel(). It is transparently dropped and reacquired - * over schedule(). It is used to protect legacy code that hasn't - * been migrated to a proper locking design yet. - * - * Don't use in new code. - */ -static __cacheline_aligned_in_smp DEFINE_SPINLOCK(kernel_flag); - - -/* - * Acquire/release the underlying lock from the scheduler. - * - * This is called with preemption disabled, and should - * return an error value if it cannot get the lock and - * TIF_NEED_RESCHED gets set. - * - * If it successfully gets the lock, it should increment - * the preemption count like any spinlock does. - * - * (This works on UP too - _raw_spin_trylock will never - * return false in that case) - */ -int __lockfunc __reacquire_kernel_lock(void) -{ - while (!_raw_spin_trylock(&kernel_flag)) { - if (test_thread_flag(TIF_NEED_RESCHED)) - return -EAGAIN; - cpu_relax(); - } - preempt_disable(); - return 0; -} - -void __lockfunc __release_kernel_lock(void) -{ - _raw_spin_unlock(&kernel_flag); - preempt_enable_no_resched(); -} - -/* - * These are the BKL spinlocks - we try to be polite about preemption. - * If SMP is not on (ie UP preemption), this all goes away because the - * _raw_spin_trylock() will always succeed. - */ -#ifdef CONFIG_PREEMPT -static inline void __lock_kernel(void) -{ - preempt_disable(); - if (unlikely(!_raw_spin_trylock(&kernel_flag))) { - /* - * If preemption was disabled even before this - * was called, there's nothing we can be polite - * about - just spin. 
- */ - if (preempt_count() > 1) { - _raw_spin_lock(&kernel_flag); - return; - } - - /* - * Otherwise, let's wait for the kernel lock - * with preemption enabled.. - */ - do { - preempt_enable(); - while (spin_is_locked(&kernel_flag)) - cpu_relax(); - preempt_disable(); - } while (!_raw_spin_trylock(&kernel_flag)); - } -} - -#else - -/* - * Non-preemption case - just get the spinlock - */ -static inline void __lock_kernel(void) -{ - _raw_spin_lock(&kernel_flag); -} -#endif - -static inline void __unlock_kernel(void) -{ - /* - * the BKL is not covered by lockdep, so we open-code the - * unlocking sequence (and thus avoid the dep-chain ops): - */ - _raw_spin_unlock(&kernel_flag); - preempt_enable(); -} - -/* - * Getting the big kernel lock. - * - * This cannot happen asynchronously, so we only need to - * worry about other CPU's. - */ -void __lockfunc lock_kernel(void) -{ - int depth = current->lock_depth+1; - if (likely(!depth)) - __lock_kernel(); - current->lock_depth = depth; -} - -void __lockfunc unlock_kernel(void) -{ - BUG_ON(current->lock_depth < 0); - if (likely(--current->lock_depth < 0)) - __unlock_kernel(); -} - -#endif - EXPORT_SYMBOL(lock_kernel); EXPORT_SYMBOL(unlock_kernel); -- cgit v1.2.3 From 1ad82fd547c716f96e544b477e0bdbfa2d647529 Mon Sep 17 00:00:00 2001 From: Paolo Ciarrocchi Date: Fri, 25 Jan 2008 21:08:33 +0100 Subject: debug: clean up kernel/profile.c Before: total: 25 errors, 13 warnings, 602 lines checked After: total: 0 errors, 2 warnings, 601 lines checked No code changed: kernel/profile.o: text data bss dec hex filename 3048 236 24 3308 cec profile.o.before 3048 236 24 3308 cec profile.o.after md5: 2501d64748a4d350dffb11203e2a5182 profile.o.before.asm 2501d64748a4d350dffb11203e2a5182 profile.o.after.asm Signed-off-by: Paolo Ciarrocchi Signed-off-by: Ingo Molnar --- kernel/profile.c | 99 ++++++++++++++++++++++++++++---------------------------- 1 file changed, 49 insertions(+), 50 deletions(-) (limited to 'kernel') diff --git a/kernel/profile.c b/kernel/profile.c index 5e95330e5120..e64c2da11c0f 100644 --- a/kernel/profile.c +++ b/kernel/profile.c @@ -52,7 +52,7 @@ static DEFINE_PER_CPU(int, cpu_profile_flip); static DEFINE_MUTEX(profile_flip_mutex); #endif /* CONFIG_SMP */ -static int __init profile_setup(char * str) +static int __init profile_setup(char *str) { static char __initdata schedstr[] = "schedule"; static char __initdata sleepstr[] = "sleep"; @@ -104,28 +104,28 @@ __setup("profile=", profile_setup); void __init profile_init(void) { - if (!prof_on) + if (!prof_on) return; - + /* only text is profiled */ prof_len = (_etext - _stext) >> prof_shift; prof_buffer = alloc_bootmem(prof_len*sizeof(atomic_t)); } /* Profile event notifications */ - + #ifdef CONFIG_PROFILING - + static BLOCKING_NOTIFIER_HEAD(task_exit_notifier); static ATOMIC_NOTIFIER_HEAD(task_free_notifier); static BLOCKING_NOTIFIER_HEAD(munmap_notifier); - -void profile_task_exit(struct task_struct * task) + +void profile_task_exit(struct task_struct *task) { blocking_notifier_call_chain(&task_exit_notifier, 0, task); } - -int profile_handoff_task(struct task_struct * task) + +int profile_handoff_task(struct task_struct *task) { int ret; ret = atomic_notifier_call_chain(&task_free_notifier, 0, task); @@ -137,52 +137,55 @@ void profile_munmap(unsigned long addr) blocking_notifier_call_chain(&munmap_notifier, 0, (void *)addr); } -int task_handoff_register(struct notifier_block * n) +int task_handoff_register(struct notifier_block *n) { return atomic_notifier_chain_register(&task_free_notifier, n); 
} +EXPORT_SYMBOL_GPL(task_handoff_register); -int task_handoff_unregister(struct notifier_block * n) +int task_handoff_unregister(struct notifier_block *n) { return atomic_notifier_chain_unregister(&task_free_notifier, n); } +EXPORT_SYMBOL_GPL(task_handoff_unregister); -int profile_event_register(enum profile_type type, struct notifier_block * n) +int profile_event_register(enum profile_type type, struct notifier_block *n) { int err = -EINVAL; - + switch (type) { - case PROFILE_TASK_EXIT: - err = blocking_notifier_chain_register( - &task_exit_notifier, n); - break; - case PROFILE_MUNMAP: - err = blocking_notifier_chain_register( - &munmap_notifier, n); - break; + case PROFILE_TASK_EXIT: + err = blocking_notifier_chain_register( + &task_exit_notifier, n); + break; + case PROFILE_MUNMAP: + err = blocking_notifier_chain_register( + &munmap_notifier, n); + break; } - + return err; } +EXPORT_SYMBOL_GPL(profile_event_register); - -int profile_event_unregister(enum profile_type type, struct notifier_block * n) +int profile_event_unregister(enum profile_type type, struct notifier_block *n) { int err = -EINVAL; - + switch (type) { - case PROFILE_TASK_EXIT: - err = blocking_notifier_chain_unregister( - &task_exit_notifier, n); - break; - case PROFILE_MUNMAP: - err = blocking_notifier_chain_unregister( - &munmap_notifier, n); - break; + case PROFILE_TASK_EXIT: + err = blocking_notifier_chain_unregister( + &task_exit_notifier, n); + break; + case PROFILE_MUNMAP: + err = blocking_notifier_chain_unregister( + &munmap_notifier, n); + break; } return err; } +EXPORT_SYMBOL_GPL(profile_event_unregister); int register_timer_hook(int (*hook)(struct pt_regs *)) { @@ -191,6 +194,7 @@ int register_timer_hook(int (*hook)(struct pt_regs *)) timer_hook = hook; return 0; } +EXPORT_SYMBOL_GPL(register_timer_hook); void unregister_timer_hook(int (*hook)(struct pt_regs *)) { @@ -199,13 +203,7 @@ void unregister_timer_hook(int (*hook)(struct pt_regs *)) /* make sure all CPUs see the NULL hook */ synchronize_sched(); /* Allow ongoing interrupts to complete. 
*/ } - -EXPORT_SYMBOL_GPL(register_timer_hook); EXPORT_SYMBOL_GPL(unregister_timer_hook); -EXPORT_SYMBOL_GPL(task_handoff_register); -EXPORT_SYMBOL_GPL(task_handoff_unregister); -EXPORT_SYMBOL_GPL(profile_event_register); -EXPORT_SYMBOL_GPL(profile_event_unregister); #endif /* CONFIG_PROFILING */ @@ -366,7 +364,7 @@ static int __devinit profile_cpu_callback(struct notifier_block *info, per_cpu(cpu_profile_hits, cpu)[0] = page_address(page); } break; - out_free: +out_free: page = virt_to_page(per_cpu(cpu_profile_hits, cpu)[1]); per_cpu(cpu_profile_hits, cpu)[1] = NULL; __free_page(page); @@ -409,7 +407,6 @@ void profile_hits(int type, void *__pc, unsigned int nr_hits) atomic_add(nr_hits, &prof_buffer[min(pc, prof_len - 1)]); } #endif /* !CONFIG_SMP */ - EXPORT_SYMBOL_GPL(profile_hits); void profile_tick(int type) @@ -427,7 +424,7 @@ void profile_tick(int type) #include #include -static int prof_cpu_mask_read_proc (char *page, char **start, off_t off, +static int prof_cpu_mask_read_proc(char *page, char **start, off_t off, int count, int *eof, void *data) { int len = cpumask_scnprintf(page, count, *(cpumask_t *)data); @@ -437,8 +434,8 @@ static int prof_cpu_mask_read_proc (char *page, char **start, off_t off, return len; } -static int prof_cpu_mask_write_proc (struct file *file, const char __user *buffer, - unsigned long count, void *data) +static int prof_cpu_mask_write_proc(struct file *file, + const char __user *buffer, unsigned long count, void *data) { cpumask_t *mask = (cpumask_t *)data; unsigned long full_count = count, err; @@ -457,7 +454,8 @@ void create_prof_cpu_mask(struct proc_dir_entry *root_irq_dir) struct proc_dir_entry *entry; /* create /proc/irq/prof_cpu_mask */ - if (!(entry = create_proc_entry("prof_cpu_mask", 0600, root_irq_dir))) + entry = create_proc_entry("prof_cpu_mask", 0600, root_irq_dir); + if (!entry) return; entry->data = (void *)&prof_cpu_mask; entry->read_proc = prof_cpu_mask_read_proc; @@ -475,7 +473,7 @@ read_profile(struct file *file, char __user *buf, size_t count, loff_t *ppos) { unsigned long p = *ppos; ssize_t read; - char * pnt; + char *pnt; unsigned int sample_step = 1 << prof_shift; profile_flip_buffers(); @@ -486,12 +484,12 @@ read_profile(struct file *file, char __user *buf, size_t count, loff_t *ppos) read = 0; while (p < sizeof(unsigned int) && count > 0) { - if (put_user(*((char *)(&sample_step)+p),buf)) + if (put_user(*((char *)(&sample_step)+p), buf)) return -EFAULT; buf++; p++; count--; read++; } pnt = (char *)prof_buffer + p - sizeof(atomic_t); - if (copy_to_user(buf,(void *)pnt,count)) + if (copy_to_user(buf, (void *)pnt, count)) return -EFAULT; read += count; *ppos += read; @@ -508,7 +506,7 @@ static ssize_t write_profile(struct file *file, const char __user *buf, size_t count, loff_t *ppos) { #ifdef CONFIG_SMP - extern int setup_profiling_timer (unsigned int multiplier); + extern int setup_profiling_timer(unsigned int multiplier); if (count == sizeof(int)) { unsigned int multiplier; @@ -591,7 +589,8 @@ static int __init create_proc_profile(void) return 0; if (create_hash_tables()) return -1; - if (!(entry = create_proc_entry("profile", S_IWUSR | S_IRUGO, NULL))) + entry = create_proc_entry("profile", S_IWUSR | S_IRUGO, NULL); + if (!entry) return 0; entry->proc_fops = &proc_profile_operations; entry->size = (1+prof_len) * sizeof(atomic_t); -- cgit v1.2.3 From 4f05b98d54b140ed3c5851d5d5156e9918c6305d Mon Sep 17 00:00:00 2001 From: Michal Schmidt Date: Fri, 25 Jan 2008 21:08:33 +0100 Subject: sched: fix, always create kernel threads with 
normal priority Ensure that the kernel threads are created with the usual nice level and affinity even if kthreadd's properties were changed from the default by root. Signed-off-by: Michal Schmidt Signed-off-by: Ingo Molnar --- kernel/kthread.c | 12 +++++++++++- 1 file changed, 11 insertions(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/kthread.c b/kernel/kthread.c index dcfe724300eb..0ac887882f90 100644 --- a/kernel/kthread.c +++ b/kernel/kthread.c @@ -15,6 +15,8 @@ #include #include +#define KTHREAD_NICE_LEVEL (-5) + static DEFINE_SPINLOCK(kthread_create_lock); static LIST_HEAD(kthread_create_list); struct task_struct *kthreadd_task; @@ -94,10 +96,18 @@ static void create_kthread(struct kthread_create_info *create) if (pid < 0) { create->result = ERR_PTR(pid); } else { + struct sched_param param = { .sched_priority = 0 }; wait_for_completion(&create->started); read_lock(&tasklist_lock); create->result = find_task_by_pid(pid); read_unlock(&tasklist_lock); + /* + * root may have changed our (kthreadd's) priority or CPU mask. + * The kernel thread should not inherit these properties. + */ + sched_setscheduler(create->result, SCHED_NORMAL, ¶m); + set_user_nice(create->result, KTHREAD_NICE_LEVEL); + set_cpus_allowed(create->result, CPU_MASK_ALL); } complete(&create->done); } @@ -221,7 +231,7 @@ int kthreadd(void *unused) /* Setup a clean context for our children to inherit. */ set_task_comm(tsk, "kthreadd"); ignore_signals(tsk); - set_user_nice(tsk, -5); + set_user_nice(tsk, KTHREAD_NICE_LEVEL); set_cpus_allowed(tsk, CPU_MASK_ALL); current->flags |= PF_NOFREEZE; -- cgit v1.2.3 From 782daeee3d596282bfee4cd9e976c86be0e194a8 Mon Sep 17 00:00:00 2001 From: Guillaume Chazarain Date: Fri, 25 Jan 2008 21:08:33 +0100 Subject: sched: fix rq->clock warps on frequency changes sched: fix rq->clock warps on frequency changes Fix 2bacec8c318ca0418c0ee9ac662ee44207765dd4 (sched: touch softlockup watchdog after idling) that reintroduced warps on frequency changes. touch_softlockup_watchdog() calls __update_rq_clock that checks rq->clock for warps, so call it after adjusting rq->clock. Signed-off-by: Guillaume Chazarain Signed-off-by: Ingo Molnar --- kernel/sched.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/sched.c b/kernel/sched.c index 629614ad0358..3995d1679858 100644 --- a/kernel/sched.c +++ b/kernel/sched.c @@ -858,7 +858,6 @@ void sched_clock_idle_wakeup_event(u64 delta_ns) struct rq *rq = cpu_rq(smp_processor_id()); u64 now = sched_clock(); - touch_softlockup_watchdog(); rq->idle_clock += delta_ns; /* * Override the previous timestamp and ignore all @@ -870,6 +869,7 @@ void sched_clock_idle_wakeup_event(u64 delta_ns) rq->prev_clock_raw = now; rq->clock += delta_ns; spin_unlock(&rq->lock); + touch_softlockup_watchdog(); } EXPORT_SYMBOL_GPL(sched_clock_idle_wakeup_event); -- cgit v1.2.3 From cc203d2422004498909c4886d1b94a2e388d973e Mon Sep 17 00:00:00 2001 From: Guillaume Chazarain Date: Fri, 25 Jan 2008 21:08:34 +0100 Subject: sched: monitor clock underflows in /proc/sched_debug We monitor clock overflows, let's also monitor clock underflows. 
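For a concrete picture of what is being counted, the following is a minimal userspace sketch (invented names, not kernel code) of the scheduler_tick() clamp in the diff below: if the raw clock advanced by less than one tick since the previous tick timestamp, the clock is pushed forward to next_tick and the event is recorded as an underflow.

#include <stdint.h>
#include <stdio.h>

#define TICK_NSEC 1000000ULL	/* pretend HZ=1000 for this sketch */

/* Invented stand-in for the per-runqueue clock bookkeeping. */
struct toy_rq {
	uint64_t clock;			/* monotonic time, in ns */
	uint64_t tick_timestamp;	/* rq->clock as of the previous tick */
	unsigned int clock_underflows;	/* times the clock had to be pushed forward */
};

static void toy_scheduler_tick(struct toy_rq *rq, uint64_t raw_ns)
{
	uint64_t next_tick = rq->tick_timestamp + TICK_NSEC;

	rq->clock = raw_ns;
	if (rq->clock < next_tick) {	/* advanced by less than one full tick */
		rq->clock = next_tick;
		rq->clock_underflows++;
	}
	rq->tick_timestamp = rq->clock;
}

int main(void)
{
	struct toy_rq rq = { 0, 0, 0 };

	toy_scheduler_tick(&rq, 1000000);	/* exactly one tick elapsed: fine */
	toy_scheduler_tick(&rq, 1500000);	/* only half a tick elapsed: underflow */
	printf("underflows seen: %u\n", rq.clock_underflows);
	return 0;
}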
Signed-off-by: Guillaume Chazarain Signed-off-by: Ingo Molnar --- kernel/sched.c | 6 ++++-- kernel/sched_debug.c | 1 + 2 files changed, 5 insertions(+), 2 deletions(-) (limited to 'kernel') diff --git a/kernel/sched.c b/kernel/sched.c index 3995d1679858..4d3a5a700866 100644 --- a/kernel/sched.c +++ b/kernel/sched.c @@ -465,7 +465,7 @@ struct rq { u64 clock, prev_clock_raw; s64 clock_max_delta; - unsigned int clock_warps, clock_overflows; + unsigned int clock_warps, clock_overflows, clock_underflows; u64 idle_clock; unsigned int clock_deep_idle_events; u64 tick_timestamp; @@ -3736,8 +3736,10 @@ void scheduler_tick(void) /* * Let rq->clock advance by at least TICK_NSEC: */ - if (unlikely(rq->clock < next_tick)) + if (unlikely(rq->clock < next_tick)) { rq->clock = next_tick; + rq->clock_underflows++; + } rq->tick_timestamp = rq->clock; update_cpu_load(rq); curr->sched_class->task_tick(rq, curr, 0); diff --git a/kernel/sched_debug.c b/kernel/sched_debug.c index 80fbbfc04290..9e5de098d471 100644 --- a/kernel/sched_debug.c +++ b/kernel/sched_debug.c @@ -179,6 +179,7 @@ static void print_cpu(struct seq_file *m, int cpu) PN(prev_clock_raw); P(clock_warps); P(clock_overflows); + P(clock_underflows); P(clock_deep_idle_events); PN(clock_max_delta); P(cpu_load[0]); -- cgit v1.2.3 From 326587b840785c60f5dc18557235a23bafefd620 Mon Sep 17 00:00:00 2001 From: Dmitry Adamushko Date: Fri, 25 Jan 2008 21:08:34 +0100 Subject: sched: fix goto retry in pick_next_task_rt() looking at it one more time: (1) it looks to me that there is no need to call sched_rt_ratio_exceeded() from pick_next_rt_entity() - [ for CONFIG_FAIR_GROUP_SCHED ] queues with rt_rq->rt_throttled are not within this 'tree-like hierarchy' (or whatever we should call it :-) - there is also no need to re-check 'rt_rq->rt_time > ratio' at this point as 'rt_rq->rt_time' couldn't have been increased since the last call to update_curr_rt() (which obviously calls sched_rt_ratio_esceeded()) well, it might be that 'ratio' for this rt_rq has been re-configured (and the period over which this rt_rq was active has not yet been finished)... but I don't think we should really take this into account. (2) now pick_next_rt_entity() must never return NULL, so let's change pick_next_task_rt() accordingly. 
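Since the argument hinges on the priority-array lookup never missing, here is a minimal toy version in plain C (invented names, not taken from the kernel): once the caller has verified that at least one task is queued, finding the first set bit in the bitmap and taking the head of that queue cannot fail, which is why a BUG_ON() replaces the retry.

#include <assert.h>
#include <stdio.h>
#include <strings.h>	/* ffs() */

#define TOY_NPRIO 32	/* toy range; the kernel uses MAX_RT_PRIO priorities */

struct toy_rt_queue {
	unsigned int bitmap;	/* bit p set => a task is queued at priority p */
	int head[TOY_NPRIO];	/* head task id per priority, -1 if none */
};

/* Lower bit index == higher priority, as with the kernel's prio arrays. */
static int toy_pick_next(const struct toy_rt_queue *q)
{
	int idx;

	assert(q->bitmap != 0);		/* caller already checked nr_running */
	idx = ffs(q->bitmap) - 1;	/* first set bit: cannot miss */
	return q->head[idx];
}

int main(void)
{
	struct toy_rt_queue q = { 0, { 0 } };
	int i;

	for (i = 0; i < TOY_NPRIO; i++)
		q.head[i] = -1;
	q.bitmap |= 1u << 3;		/* a single task, id 42, at priority 3 */
	q.head[3] = 42;
	printf("next task id: %d\n", toy_pick_next(&q));
	return 0;
}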
Signed-off-by: Dmitry Adamushko Signed-off-by: Ingo Molnar --- kernel/sched_rt.c | 9 ++------- 1 file changed, 2 insertions(+), 7 deletions(-) (limited to 'kernel') diff --git a/kernel/sched_rt.c b/kernel/sched_rt.c index 2dac5ebb8bcb..274b40d7bef2 100644 --- a/kernel/sched_rt.c +++ b/kernel/sched_rt.c @@ -476,15 +476,12 @@ static struct sched_rt_entity *pick_next_rt_entity(struct rq *rq, struct list_head *queue; int idx; - if (sched_rt_ratio_exceeded(rt_rq)) - goto out; - idx = sched_find_first_bit(array->bitmap); BUG_ON(idx >= MAX_RT_PRIO); queue = array->queue + idx; next = list_entry(queue->next, struct sched_rt_entity, run_list); - out: + return next; } @@ -494,7 +491,6 @@ static struct task_struct *pick_next_task_rt(struct rq *rq) struct task_struct *p; struct rt_rq *rt_rq; - retry: rt_rq = &rq->rt; if (unlikely(!rt_rq->rt_nr_running)) @@ -505,8 +501,7 @@ static struct task_struct *pick_next_task_rt(struct rq *rq) do { rt_se = pick_next_rt_entity(rq, rt_rq); - if (unlikely(!rt_se)) - goto retry; + BUG_ON(!rt_se); rt_rq = group_rt_rq(rt_se); } while (rt_rq); -- cgit v1.2.3 From 9745512ce79de686df354dc70a8d1a74d801892d Mon Sep 17 00:00:00 2001 From: Arjan van de Ven Date: Fri, 25 Jan 2008 21:08:34 +0100 Subject: sched: latencytop support LatencyTOP kernel infrastructure; it measures latencies in the scheduler and tracks it system wide and per process. Signed-off-by: Arjan van de Ven Signed-off-by: Ingo Molnar --- arch/x86/kernel/stacktrace.c | 27 +++++ fs/proc/base.c | 78 ++++++++++++++ include/linux/latencytop.h | 44 ++++++++ include/linux/sched.h | 5 + include/linux/stacktrace.h | 3 + kernel/Makefile | 1 + kernel/fork.c | 1 + kernel/latencytop.c | 239 +++++++++++++++++++++++++++++++++++++++++++ kernel/sched_fair.c | 8 +- kernel/sysctl.c | 10 ++ lib/Kconfig.debug | 14 +++ 11 files changed, 429 insertions(+), 1 deletion(-) create mode 100644 include/linux/latencytop.h create mode 100644 kernel/latencytop.c (limited to 'kernel') diff --git a/arch/x86/kernel/stacktrace.c b/arch/x86/kernel/stacktrace.c index 6fa6cf036c70..55771fd7e545 100644 --- a/arch/x86/kernel/stacktrace.c +++ b/arch/x86/kernel/stacktrace.c @@ -33,6 +33,19 @@ static void save_stack_address(void *data, unsigned long addr) trace->entries[trace->nr_entries++] = addr; } +static void save_stack_address_nosched(void *data, unsigned long addr) +{ + struct stack_trace *trace = (struct stack_trace *)data; + if (in_sched_functions(addr)) + return; + if (trace->skip > 0) { + trace->skip--; + return; + } + if (trace->nr_entries < trace->max_entries) + trace->entries[trace->nr_entries++] = addr; +} + static const struct stacktrace_ops save_stack_ops = { .warning = save_stack_warning, .warning_symbol = save_stack_warning_symbol, @@ -40,6 +53,13 @@ static const struct stacktrace_ops save_stack_ops = { .address = save_stack_address, }; +static const struct stacktrace_ops save_stack_ops_nosched = { + .warning = save_stack_warning, + .warning_symbol = save_stack_warning_symbol, + .stack = save_stack_stack, + .address = save_stack_address_nosched, +}; + /* * Save stack-backtrace addresses into a stack_trace buffer. 
*/ @@ -50,3 +70,10 @@ void save_stack_trace(struct stack_trace *trace) trace->entries[trace->nr_entries++] = ULONG_MAX; } EXPORT_SYMBOL(save_stack_trace); + +void save_stack_trace_tsk(struct task_struct *tsk, struct stack_trace *trace) +{ + dump_trace(tsk, NULL, NULL, &save_stack_ops_nosched, trace); + if (trace->nr_entries < trace->max_entries) + trace->entries[trace->nr_entries++] = ULONG_MAX; +} diff --git a/fs/proc/base.c b/fs/proc/base.c index 7411bfb0b7cc..91fa8e6ce8ad 100644 --- a/fs/proc/base.c +++ b/fs/proc/base.c @@ -310,6 +310,77 @@ static int proc_pid_schedstat(struct task_struct *task, char *buffer) } #endif +#ifdef CONFIG_LATENCYTOP +static int lstats_show_proc(struct seq_file *m, void *v) +{ + int i; + struct task_struct *task = m->private; + seq_puts(m, "Latency Top version : v0.1\n"); + + for (i = 0; i < 32; i++) { + if (task->latency_record[i].backtrace[0]) { + int q; + seq_printf(m, "%i %li %li ", + task->latency_record[i].count, + task->latency_record[i].time, + task->latency_record[i].max); + for (q = 0; q < LT_BACKTRACEDEPTH; q++) { + char sym[KSYM_NAME_LEN]; + char *c; + if (!task->latency_record[i].backtrace[q]) + break; + if (task->latency_record[i].backtrace[q] == ULONG_MAX) + break; + sprint_symbol(sym, task->latency_record[i].backtrace[q]); + c = strchr(sym, '+'); + if (c) + *c = 0; + seq_printf(m, "%s ", sym); + } + seq_printf(m, "\n"); + } + + } + return 0; +} + +static int lstats_open(struct inode *inode, struct file *file) +{ + int ret; + struct seq_file *m; + struct task_struct *task = get_proc_task(inode); + + ret = single_open(file, lstats_show_proc, NULL); + if (!ret) { + m = file->private_data; + m->private = task; + } + return ret; +} + +static ssize_t lstats_write(struct file *file, const char __user *buf, + size_t count, loff_t *offs) +{ + struct seq_file *m; + struct task_struct *task; + + m = file->private_data; + task = m->private; + clear_all_latency_tracing(task); + + return count; +} + +static const struct file_operations proc_lstats_operations = { + .open = lstats_open, + .read = seq_read, + .write = lstats_write, + .llseek = seq_lseek, + .release = single_release, +}; + +#endif + /* The badness from the OOM killer */ unsigned long badness(struct task_struct *p, unsigned long uptime); static int proc_oom_score(struct task_struct *task, char *buffer) @@ -1020,6 +1091,7 @@ static const struct file_operations proc_fault_inject_operations = { }; #endif + #ifdef CONFIG_SCHED_DEBUG /* * Print out various scheduling related per-task fields: @@ -2230,6 +2302,9 @@ static const struct pid_entry tgid_base_stuff[] = { #ifdef CONFIG_SCHEDSTATS INF("schedstat", S_IRUGO, pid_schedstat), #endif +#ifdef CONFIG_LATENCYTOP + REG("latency", S_IRUGO, lstats), +#endif #ifdef CONFIG_PROC_PID_CPUSET REG("cpuset", S_IRUGO, cpuset), #endif @@ -2555,6 +2630,9 @@ static const struct pid_entry tid_base_stuff[] = { #ifdef CONFIG_SCHEDSTATS INF("schedstat", S_IRUGO, pid_schedstat), #endif +#ifdef CONFIG_LATENCYTOP + REG("latency", S_IRUGO, lstats), +#endif #ifdef CONFIG_PROC_PID_CPUSET REG("cpuset", S_IRUGO, cpuset), #endif diff --git a/include/linux/latencytop.h b/include/linux/latencytop.h new file mode 100644 index 000000000000..901c2d6377a8 --- /dev/null +++ b/include/linux/latencytop.h @@ -0,0 +1,44 @@ +/* + * latencytop.h: Infrastructure for displaying latency + * + * (C) Copyright 2008 Intel Corporation + * Author: Arjan van de Ven + * + */ + +#ifndef _INCLUDE_GUARD_LATENCYTOP_H_ +#define _INCLUDE_GUARD_LATENCYTOP_H_ + +#ifdef CONFIG_LATENCYTOP + +#define 
LT_SAVECOUNT 32 +#define LT_BACKTRACEDEPTH 12 + +struct latency_record { + unsigned long backtrace[LT_BACKTRACEDEPTH]; + unsigned int count; + unsigned long time; + unsigned long max; +}; + + +struct task_struct; + +void account_scheduler_latency(struct task_struct *task, int usecs, int inter); + +void clear_all_latency_tracing(struct task_struct *p); + +#else + +static inline void +account_scheduler_latency(struct task_struct *task, int usecs, int inter) +{ +} + +static inline void clear_all_latency_tracing(struct task_struct *p) +{ +} + +#endif + +#endif diff --git a/include/linux/sched.h b/include/linux/sched.h index acadcab89ef9..dfc76e172f3f 100644 --- a/include/linux/sched.h +++ b/include/linux/sched.h @@ -88,6 +88,7 @@ struct sched_param { #include #include #include +#include #include @@ -1220,6 +1221,10 @@ struct task_struct { int make_it_fail; #endif struct prop_local_single dirties; +#ifdef CONFIG_LATENCYTOP + int latency_record_count; + struct latency_record latency_record[LT_SAVECOUNT]; +#endif }; /* diff --git a/include/linux/stacktrace.h b/include/linux/stacktrace.h index e7fa657d0c49..5da9794b2d78 100644 --- a/include/linux/stacktrace.h +++ b/include/linux/stacktrace.h @@ -9,10 +9,13 @@ struct stack_trace { }; extern void save_stack_trace(struct stack_trace *trace); +extern void save_stack_trace_tsk(struct task_struct *tsk, + struct stack_trace *trace); extern void print_stack_trace(struct stack_trace *trace, int spaces); #else # define save_stack_trace(trace) do { } while (0) +# define save_stack_trace_tsk(tsk, trace) do { } while (0) # define print_stack_trace(trace, spaces) do { } while (0) #endif diff --git a/kernel/Makefile b/kernel/Makefile index 68755cd9a7e4..390d42146267 100644 --- a/kernel/Makefile +++ b/kernel/Makefile @@ -62,6 +62,7 @@ obj-$(CONFIG_SYSCTL) += utsname_sysctl.o obj-$(CONFIG_TASK_DELAY_ACCT) += delayacct.o obj-$(CONFIG_TASKSTATS) += taskstats.o tsacct.o obj-$(CONFIG_MARKERS) += marker.o +obj-$(CONFIG_LATENCYTOP) += latencytop.o ifneq ($(CONFIG_SCHED_NO_NO_OMIT_FRAME_POINTER),y) # According to Alan Modra , the -fno-omit-frame-pointer is diff --git a/kernel/fork.c b/kernel/fork.c index 0c969f4fade0..39d22b3357de 100644 --- a/kernel/fork.c +++ b/kernel/fork.c @@ -1205,6 +1205,7 @@ static struct task_struct *copy_process(unsigned long clone_flags, #ifdef TIF_SYSCALL_EMU clear_tsk_thread_flag(p, TIF_SYSCALL_EMU); #endif + clear_all_latency_tracing(p); /* Our parent execution domain becomes current domain These must match for thread signalling to apply */ diff --git a/kernel/latencytop.c b/kernel/latencytop.c new file mode 100644 index 000000000000..b4e3c85abe74 --- /dev/null +++ b/kernel/latencytop.c @@ -0,0 +1,239 @@ +/* + * latencytop.c: Latency display infrastructure + * + * (C) Copyright 2008 Intel Corporation + * Author: Arjan van de Ven + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * as published by the Free Software Foundation; version 2 + * of the License. 
+ */ +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +static DEFINE_SPINLOCK(latency_lock); + +#define MAXLR 128 +static struct latency_record latency_record[MAXLR]; + +int latencytop_enabled; + +void clear_all_latency_tracing(struct task_struct *p) +{ + unsigned long flags; + + if (!latencytop_enabled) + return; + + spin_lock_irqsave(&latency_lock, flags); + memset(&p->latency_record, 0, sizeof(p->latency_record)); + p->latency_record_count = 0; + spin_unlock_irqrestore(&latency_lock, flags); +} + +static void clear_global_latency_tracing(void) +{ + unsigned long flags; + + spin_lock_irqsave(&latency_lock, flags); + memset(&latency_record, 0, sizeof(latency_record)); + spin_unlock_irqrestore(&latency_lock, flags); +} + +static void __sched +account_global_scheduler_latency(struct task_struct *tsk, struct latency_record *lat) +{ + int firstnonnull = MAXLR + 1; + int i; + + if (!latencytop_enabled) + return; + + /* skip kernel threads for now */ + if (!tsk->mm) + return; + + for (i = 0; i < MAXLR; i++) { + int q; + int same = 1; + /* Nothing stored: */ + if (!latency_record[i].backtrace[0]) { + if (firstnonnull > i) + firstnonnull = i; + continue; + } + for (q = 0 ; q < LT_BACKTRACEDEPTH ; q++) { + if (latency_record[i].backtrace[q] != + lat->backtrace[q]) + same = 0; + if (same && lat->backtrace[q] == 0) + break; + if (same && lat->backtrace[q] == ULONG_MAX) + break; + } + if (same) { + latency_record[i].count++; + latency_record[i].time += lat->time; + if (lat->time > latency_record[i].max) + latency_record[i].max = lat->time; + return; + } + } + + i = firstnonnull; + if (i >= MAXLR - 1) + return; + + /* Allocted a new one: */ + memcpy(&latency_record[i], lat, sizeof(struct latency_record)); +} + +static inline void store_stacktrace(struct task_struct *tsk, struct latency_record *lat) +{ + struct stack_trace trace; + + memset(&trace, 0, sizeof(trace)); + trace.max_entries = LT_BACKTRACEDEPTH; + trace.entries = &lat->backtrace[0]; + trace.skip = 0; + save_stack_trace_tsk(tsk, &trace); +} + +void __sched +account_scheduler_latency(struct task_struct *tsk, int usecs, int inter) +{ + unsigned long flags; + int i, q; + struct latency_record lat; + + if (!latencytop_enabled) + return; + + /* Long interruptible waits are generally user requested... 
*/ + if (inter && usecs > 5000) + return; + + memset(&lat, 0, sizeof(lat)); + lat.count = 1; + lat.time = usecs; + lat.max = usecs; + store_stacktrace(tsk, &lat); + + spin_lock_irqsave(&latency_lock, flags); + + account_global_scheduler_latency(tsk, &lat); + + /* + * short term hack; if we're > 32 we stop; future we recycle: + */ + tsk->latency_record_count++; + if (tsk->latency_record_count >= LT_SAVECOUNT) + goto out_unlock; + + for (i = 0; i < LT_SAVECOUNT ; i++) { + struct latency_record *mylat; + int same = 1; + mylat = &tsk->latency_record[i]; + for (q = 0 ; q < LT_BACKTRACEDEPTH ; q++) { + if (mylat->backtrace[q] != + lat.backtrace[q]) + same = 0; + if (same && lat.backtrace[q] == 0) + break; + if (same && lat.backtrace[q] == ULONG_MAX) + break; + } + if (same) { + mylat->count++; + mylat->time += lat.time; + if (lat.time > mylat->max) + mylat->max = lat.time; + goto out_unlock; + } + } + + /* Allocated a new one: */ + i = tsk->latency_record_count; + memcpy(&tsk->latency_record[i], &lat, sizeof(struct latency_record)); + +out_unlock: + spin_unlock_irqrestore(&latency_lock, flags); +} + +static int lstats_show(struct seq_file *m, void *v) +{ + int i; + + seq_puts(m, "Latency Top version : v0.1\n"); + + for (i = 0; i < MAXLR; i++) { + if (latency_record[i].backtrace[0]) { + int q; + seq_printf(m, "%i %li %li ", + latency_record[i].count, + latency_record[i].time, + latency_record[i].max); + for (q = 0; q < LT_BACKTRACEDEPTH; q++) { + char sym[KSYM_NAME_LEN]; + char *c; + if (!latency_record[i].backtrace[q]) + break; + if (latency_record[i].backtrace[q] == ULONG_MAX) + break; + sprint_symbol(sym, latency_record[i].backtrace[q]); + c = strchr(sym, '+'); + if (c) + *c = 0; + seq_printf(m, "%s ", sym); + } + seq_printf(m, "\n"); + } + } + return 0; +} + +static ssize_t +lstats_write(struct file *file, const char __user *buf, size_t count, + loff_t *offs) +{ + clear_global_latency_tracing(); + + return count; +} + +static int lstats_open(struct inode *inode, struct file *filp) +{ + return single_open(filp, lstats_show, NULL); +} + +static struct file_operations lstats_fops = { + .open = lstats_open, + .read = seq_read, + .write = lstats_write, + .llseek = seq_lseek, + .release = single_release, +}; + +static int __init init_lstats_procfs(void) +{ + struct proc_dir_entry *pe; + + pe = create_proc_entry("latency_stats", 0644, NULL); + if (!pe) + return -ENOMEM; + + pe->proc_fops = &lstats_fops; + + return 0; +} +__initcall(init_lstats_procfs); diff --git a/kernel/sched_fair.c b/kernel/sched_fair.c index 3dab1ff83c4f..1b3b40ad7c54 100644 --- a/kernel/sched_fair.c +++ b/kernel/sched_fair.c @@ -20,6 +20,8 @@ * Copyright (C) 2007 Red Hat, Inc., Peter Zijlstra */ +#include + /* * Targeted preemption latency for CPU-bound tasks: * (default: 20ms * (1 + ilog(ncpus)), units: nanoseconds) @@ -434,6 +436,7 @@ static void enqueue_sleeper(struct cfs_rq *cfs_rq, struct sched_entity *se) #ifdef CONFIG_SCHEDSTATS if (se->sleep_start) { u64 delta = rq_of(cfs_rq)->clock - se->sleep_start; + struct task_struct *tsk = task_of(se); if ((s64)delta < 0) delta = 0; @@ -443,9 +446,12 @@ static void enqueue_sleeper(struct cfs_rq *cfs_rq, struct sched_entity *se) se->sleep_start = 0; se->sum_sleep_runtime += delta; + + account_scheduler_latency(tsk, delta >> 10, 1); } if (se->block_start) { u64 delta = rq_of(cfs_rq)->clock - se->block_start; + struct task_struct *tsk = task_of(se); if ((s64)delta < 0) delta = 0; @@ -462,11 +468,11 @@ static void enqueue_sleeper(struct cfs_rq *cfs_rq, struct sched_entity *se) * time 
that the task spent sleeping: */ if (unlikely(prof_on == SLEEP_PROFILING)) { - struct task_struct *tsk = task_of(se); profile_hits(SLEEP_PROFILING, (void *)get_wchan(tsk), delta >> 20); } + account_scheduler_latency(tsk, delta >> 10, 0); } #endif } diff --git a/kernel/sysctl.c b/kernel/sysctl.c index 3afbd25f43eb..5418ef61e16e 100644 --- a/kernel/sysctl.c +++ b/kernel/sysctl.c @@ -81,6 +81,7 @@ extern int compat_log; extern int maps_protect; extern int sysctl_stat_interval; extern int audit_argv_kb; +extern int latencytop_enabled; /* Constants used for minimum and maximum */ #ifdef CONFIG_DETECT_SOFTLOCKUP @@ -416,6 +417,15 @@ static struct ctl_table kern_table[] = { .proc_handler = &proc_dointvec_taint, }, #endif +#ifdef CONFIG_LATENCYTOP + { + .procname = "latencytop", + .data = &latencytop_enabled, + .maxlen = sizeof(int), + .mode = 0644, + .proc_handler = &proc_dointvec, + }, +#endif #ifdef CONFIG_SECURITY_CAPABILITIES { .procname = "cap-bound", diff --git a/lib/Kconfig.debug b/lib/Kconfig.debug index a60109307d32..14fb355e3caa 100644 --- a/lib/Kconfig.debug +++ b/lib/Kconfig.debug @@ -517,4 +517,18 @@ config FAULT_INJECTION_STACKTRACE_FILTER help Provide stacktrace filter for fault-injection capabilities +config LATENCYTOP + bool "Latency measuring infrastructure" + select FRAME_POINTER if !MIPS + select KALLSYMS + select KALLSYMS_ALL + select STACKTRACE + select SCHEDSTATS + select SCHED_DEBUG + depends on X86 || X86_64 + help + Enable this option if you want to use the LatencyTOP tool + to find out which userspace is blocking on what kernel operations. + + source "samples/Kconfig" -- cgit v1.2.3 From 90739081ef8d5495d50abba9c5d333be9acd872a Mon Sep 17 00:00:00 2001 From: Ingo Molnar Date: Fri, 25 Jan 2008 21:08:34 +0100 Subject: softlockup: fix signedness fix softlockup tunables signedness. mark tunables read-mostly. 
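For illustration only, here is the pattern this fix enforces, sketched outside the patch: an unsigned long tunable has to be paired with an unsigned-long-sized .maxlen and the proc_doulongvec_minmax handler, otherwise the handler touches the wrong width on 64-bit and will happily accept negative input for an unsigned value. The tunable and table names below are made up, not from the kernel.

static unsigned long __read_mostly example_thresh = 60;

static struct ctl_table example_table[] = {
	{
		.ctl_name	= CTL_UNNUMBERED,
		.procname	= "example_thresh",
		.data		= &example_thresh,
		.maxlen		= sizeof(unsigned long),	/* must match the variable's type */
		.mode		= 0644,
		.proc_handler	= &proc_doulongvec_minmax,	/* not proc_dointvec */
	},
	{}
};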
Signed-off-by: Ingo Molnar --- include/linux/sched.h | 4 ++-- kernel/softlockup.c | 10 +++++----- kernel/sysctl.c | 16 ++++++++-------- 3 files changed, 15 insertions(+), 15 deletions(-) (limited to 'kernel') diff --git a/include/linux/sched.h b/include/linux/sched.h index dfc76e172f3f..53534f90a96a 100644 --- a/include/linux/sched.h +++ b/include/linux/sched.h @@ -269,10 +269,10 @@ extern void softlockup_tick(void); extern void spawn_softlockup_task(void); extern void touch_softlockup_watchdog(void); extern void touch_all_softlockup_watchdogs(void); -extern int softlockup_thresh; +extern unsigned long softlockup_thresh; extern unsigned long sysctl_hung_task_check_count; extern unsigned long sysctl_hung_task_timeout_secs; -extern long sysctl_hung_task_warnings; +extern unsigned long sysctl_hung_task_warnings; #else static inline void softlockup_tick(void) { diff --git a/kernel/softlockup.c b/kernel/softlockup.c index 02f0ad534441..c1d76552446e 100644 --- a/kernel/softlockup.c +++ b/kernel/softlockup.c @@ -24,8 +24,8 @@ static DEFINE_PER_CPU(unsigned long, touch_timestamp); static DEFINE_PER_CPU(unsigned long, print_timestamp); static DEFINE_PER_CPU(struct task_struct *, watchdog_task); -static int did_panic; -int softlockup_thresh = 60; +static int __read_mostly did_panic; +unsigned long __read_mostly softlockup_thresh = 60; static int softlock_panic(struct notifier_block *this, unsigned long event, void *ptr) @@ -121,14 +121,14 @@ void softlockup_tick(void) /* * Have a reasonable limit on the number of tasks checked: */ -unsigned long sysctl_hung_task_check_count = 1024; +unsigned long __read_mostly sysctl_hung_task_check_count = 1024; /* * Zero means infinite timeout - no checking done: */ -unsigned long sysctl_hung_task_timeout_secs = 120; +unsigned long __read_mostly sysctl_hung_task_timeout_secs = 120; -long sysctl_hung_task_warnings = 10; +unsigned long __read_mostly sysctl_hung_task_warnings = 10; /* * Only do the hung-tasks check on one CPU: diff --git a/kernel/sysctl.c b/kernel/sysctl.c index 5418ef61e16e..8e96558cb8f3 100644 --- a/kernel/sysctl.c +++ b/kernel/sysctl.c @@ -772,9 +772,9 @@ static struct ctl_table kern_table[] = { .ctl_name = CTL_UNNUMBERED, .procname = "softlockup_thresh", .data = &softlockup_thresh, - .maxlen = sizeof(int), + .maxlen = sizeof(unsigned long), .mode = 0644, - .proc_handler = &proc_dointvec_minmax, + .proc_handler = &proc_doulongvec_minmax, .strategy = &sysctl_intvec, .extra1 = &one, .extra2 = &sixty, @@ -783,27 +783,27 @@ static struct ctl_table kern_table[] = { .ctl_name = CTL_UNNUMBERED, .procname = "hung_task_check_count", .data = &sysctl_hung_task_check_count, - .maxlen = sizeof(int), + .maxlen = sizeof(unsigned long), .mode = 0644, - .proc_handler = &proc_dointvec_minmax, + .proc_handler = &proc_doulongvec_minmax, .strategy = &sysctl_intvec, }, { .ctl_name = CTL_UNNUMBERED, .procname = "hung_task_timeout_secs", .data = &sysctl_hung_task_timeout_secs, - .maxlen = sizeof(int), + .maxlen = sizeof(unsigned long), .mode = 0644, - .proc_handler = &proc_dointvec_minmax, + .proc_handler = &proc_doulongvec_minmax, .strategy = &sysctl_intvec, }, { .ctl_name = CTL_UNNUMBERED, .procname = "hung_task_warnings", .data = &sysctl_hung_task_warnings, - .maxlen = sizeof(int), + .maxlen = sizeof(unsigned long), .mode = 0644, - .proc_handler = &proc_dointvec_minmax, + .proc_handler = &proc_doulongvec_minmax, .strategy = &sysctl_intvec, }, #endif -- cgit v1.2.3 From 19ef9309273d26cb005cb23e6a370353dca91099 Mon Sep 17 00:00:00 2001 From: Ingo Molnar Date: Fri, 25 
Jan 2008 21:08:34 +0100 Subject: printk: use ktime_get() printk timestamps: use ktime_get(). Some platforms have a functioning clocksource function only after they are done with early bootup, so delay this until out of SYSTEM_BOOTING state. it's also inherently safe now, as any bugs in this area will be caught by the printk recursion checks. Signed-off-by: Ingo Molnar --- kernel/printk.c | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/printk.c b/kernel/printk.c index 3b7c968d0ef9..423a8c765a57 100644 --- a/kernel/printk.c +++ b/kernel/printk.c @@ -702,7 +702,9 @@ asmlinkage int vprintk(const char *fmt, va_list args) loglev_char = default_message_loglevel + '0'; } - t = cpu_clock(printk_cpu); + t = 0; + if (system_state != SYSTEM_BOOTING) + t = ktime_to_ns(ktime_get()); nanosec_rem = do_div(t, 1000000000); tlen = sprintf(tbuf, "<%c>[%5lu.%06lu] ", -- cgit v1.2.3 From 5fb5e6de55860a99c2d8fe7e0c8222d5c53d8464 Mon Sep 17 00:00:00 2001 From: Nick Piggin Date: Fri, 25 Jan 2008 21:08:34 +0100 Subject: sched: print backtrace of running tasks too The attached patch is something really simple that can sometimes help in getting more info out of a hung system. Signed-off-by: Ingo Molnar --- kernel/sched.c | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) (limited to 'kernel') diff --git a/kernel/sched.c b/kernel/sched.c index 4d3a5a700866..524285e46fa7 100644 --- a/kernel/sched.c +++ b/kernel/sched.c @@ -5161,8 +5161,7 @@ void sched_show_task(struct task_struct *p) printk(KERN_CONT "%5lu %5d %6d\n", free, task_pid_nr(p), task_pid_nr(p->real_parent)); - if (state != TASK_RUNNING) - show_stack(p, NULL); + show_stack(p, NULL); } void show_state_filter(unsigned long state_filter) -- cgit v1.2.3 From 5973e5b954848c63855a357ad4ff39882e3904f7 Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Fri, 25 Jan 2008 21:08:34 +0100 Subject: sched: fix: don't take a mutex from interrupt context print_cfs_stats is callable from interrupt context (sysrq), hence it should not take mutexes. Change it to use RCU since the task group data is RCU freed anyway. Signed-off-by: Peter Zijlstra Signed-off-by: Ingo Molnar --- kernel/sched_fair.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'kernel') diff --git a/kernel/sched_fair.c b/kernel/sched_fair.c index 1b3b40ad7c54..45ff4e9411e0 100644 --- a/kernel/sched_fair.c +++ b/kernel/sched_fair.c @@ -1434,9 +1434,9 @@ static void print_cfs_stats(struct seq_file *m, int cpu) #ifdef CONFIG_FAIR_GROUP_SCHED print_cfs_rq(m, cpu, &cpu_rq(cpu)->cfs); #endif - lock_task_group_list(); + rcu_read_lock(); for_each_leaf_cfs_rq(cpu_rq(cpu), cfs_rq) print_cfs_rq(m, cpu, cfs_rq); - unlock_task_group_list(); + rcu_read_unlock(); } #endif -- cgit v1.2.3 From 6d082592b62689fb91578d0338d04a9f50991990 Mon Sep 17 00:00:00 2001 From: Arjan van de Ven Date: Fri, 25 Jan 2008 21:08:35 +0100 Subject: sched: keep total / count stats in addition to the max for Right now, the linux kernel (with scheduler statistics enabled) keeps track of the maximum time a process is waiting to be scheduled. While the maximum is a very useful metric, tracking average and total is equally useful (at least for latencytop) to figure out the accumulated effect of scheduler delays. The accumulated effect is important to judge the performance impact of scheduler tuning/behavior. 
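To make the motivation concrete: once both the accumulated wait time and the number of waits are kept, the mean scheduler delay is simply their ratio, which the maximum alone cannot provide. A minimal userspace sketch of that arithmetic, with made-up sample values (only the sum/count/max relationship comes from the patch):

#include <stdio.h>

int main(void)
{
	/* hypothetical per-task schedstat readings */
	unsigned long long wait_sum_ns = 48000000ULL;	/* total time spent waiting, ns */
	unsigned long long wait_count  = 1200;		/* number of waits */
	unsigned long long wait_max_ns = 900000ULL;	/* worst single wait, ns */

	if (wait_count)
		printf("avg %.1f us, max %.1f us over %llu waits\n",
		       wait_sum_ns / 1e3 / wait_count,
		       wait_max_ns / 1e3, wait_count);
	return 0;
}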
Signed-off-by: Arjan van de Ven Signed-off-by: Ingo Molnar --- include/linux/sched.h | 2 ++ kernel/sched_debug.c | 4 ++++ kernel/sched_fair.c | 3 +++ 3 files changed, 9 insertions(+) (limited to 'kernel') diff --git a/include/linux/sched.h b/include/linux/sched.h index 734f6d8f6ed5..df5b24ee80b3 100644 --- a/include/linux/sched.h +++ b/include/linux/sched.h @@ -895,6 +895,8 @@ struct sched_entity { #ifdef CONFIG_SCHEDSTATS u64 wait_start; u64 wait_max; + u64 wait_count; + u64 wait_sum; u64 sleep_start; u64 sleep_max; diff --git a/kernel/sched_debug.c b/kernel/sched_debug.c index 9e5de098d471..4b5e24cf2f4a 100644 --- a/kernel/sched_debug.c +++ b/kernel/sched_debug.c @@ -300,6 +300,8 @@ void proc_sched_show_task(struct task_struct *p, struct seq_file *m) PN(se.exec_max); PN(se.slice_max); PN(se.wait_max); + PN(se.wait_sum); + P(se.wait_count); P(sched_info.bkl_count); P(se.nr_migrations); P(se.nr_migrations_cold); @@ -367,6 +369,8 @@ void proc_sched_set_task(struct task_struct *p) { #ifdef CONFIG_SCHEDSTATS p->se.wait_max = 0; + p->se.wait_sum = 0; + p->se.wait_count = 0; p->se.sleep_max = 0; p->se.sum_sleep_runtime = 0; p->se.block_max = 0; diff --git a/kernel/sched_fair.c b/kernel/sched_fair.c index 45ff4e9411e0..72e25c7a3a18 100644 --- a/kernel/sched_fair.c +++ b/kernel/sched_fair.c @@ -385,6 +385,9 @@ update_stats_wait_end(struct cfs_rq *cfs_rq, struct sched_entity *se) { schedstat_set(se->wait_max, max(se->wait_max, rq_of(cfs_rq)->clock - se->wait_start)); + schedstat_set(se->wait_count, se->wait_count + 1); + schedstat_set(se->wait_sum, se->wait_sum + + rq_of(cfs_rq)->clock - se->wait_start); schedstat_set(se->wait_start, 0); } -- cgit v1.2.3 From 81ef16e763bb899053e06f6050603a305456a085 Mon Sep 17 00:00:00 2001 From: Heiko Carstens Date: Sat, 26 Jan 2008 14:11:06 +0100 Subject: [S390] Remove appldata include from sysctl_check.c Forgot to remove this when removing the appldata binary sysctls. Signed-off-by: Heiko Carstens Signed-off-by: Martin Schwidefsky --- kernel/sysctl_check.c | 1 - 1 file changed, 1 deletion(-) (limited to 'kernel') diff --git a/kernel/sysctl_check.c b/kernel/sysctl_check.c index a68425a5cc1d..d8a5558a47b4 100644 --- a/kernel/sysctl_check.c +++ b/kernel/sysctl_check.c @@ -1,6 +1,5 @@ #include #include -#include "../arch/s390/appldata/appldata.h" #include "../fs/xfs/linux-2.6/xfs_sysctl.h" #include #include -- cgit v1.2.3 From 326e96b92306b7af24a3608ec01156cba17a3fc1 Mon Sep 17 00:00:00 2001 From: Ingo Molnar Date: Sun, 27 Jan 2008 08:03:54 +0100 Subject: printk: revert ktime_get() timestamps revert 19ef9309273d26cb005cb23e6a370353dca91099. Kevin Winchester reported a lockup during X startup an bisected it to this commit. 
Reported-by: Kevin Winchester Signed-off-by: Ingo Molnar --- kernel/printk.c | 4 +--- 1 file changed, 1 insertion(+), 3 deletions(-) (limited to 'kernel') diff --git a/kernel/printk.c b/kernel/printk.c index 423a8c765a57..3b7c968d0ef9 100644 --- a/kernel/printk.c +++ b/kernel/printk.c @@ -702,9 +702,7 @@ asmlinkage int vprintk(const char *fmt, va_list args) loglev_char = default_message_loglevel + '0'; } - t = 0; - if (system_state != SYSTEM_BOOTING) - t = ktime_to_ns(ktime_get()); + t = cpu_clock(printk_cpu); nanosec_rem = do_div(t, 1000000000); tlen = sprintf(tbuf, "<%c>[%5lu.%06lu] ", -- cgit v1.2.3 From fd0928df98b9578be8a786ac0cb78a47a5e17a20 Mon Sep 17 00:00:00 2001 From: Jens Axboe Date: Thu, 24 Jan 2008 08:52:45 +0100 Subject: ioprio: move io priority from task_struct to io_context This is where it belongs and then it doesn't take up space for a process that doesn't do IO. Signed-off-by: Jens Axboe --- block/cfq-iosched.c | 34 ++++++++++---------- block/ll_rw_blk.c | 30 ++++++++++++------ fs/ioprio.c | 29 ++++++++++++----- include/linux/blkdev.h | 81 ++++------------------------------------------- include/linux/init_task.h | 1 - include/linux/iocontext.h | 79 +++++++++++++++++++++++++++++++++++++++++++++ include/linux/ioprio.h | 13 ++++---- include/linux/sched.h | 1 - kernel/fork.c | 32 ++++++++++++++++--- 9 files changed, 178 insertions(+), 122 deletions(-) create mode 100644 include/linux/iocontext.h (limited to 'kernel') diff --git a/block/cfq-iosched.c b/block/cfq-iosched.c index 13553e015d72..533af75329e6 100644 --- a/block/cfq-iosched.c +++ b/block/cfq-iosched.c @@ -199,7 +199,7 @@ CFQ_CFQQ_FNS(sync); static void cfq_dispatch_insert(struct request_queue *, struct request *); static struct cfq_queue *cfq_get_queue(struct cfq_data *, int, - struct task_struct *, gfp_t); + struct io_context *, gfp_t); static struct cfq_io_context *cfq_cic_rb_lookup(struct cfq_data *, struct io_context *); @@ -1273,7 +1273,7 @@ cfq_alloc_io_context(struct cfq_data *cfqd, gfp_t gfp_mask) return cic; } -static void cfq_init_prio_data(struct cfq_queue *cfqq) +static void cfq_init_prio_data(struct cfq_queue *cfqq, struct io_context *ioc) { struct task_struct *tsk = current; int ioprio_class; @@ -1281,7 +1281,7 @@ static void cfq_init_prio_data(struct cfq_queue *cfqq) if (!cfq_cfqq_prio_changed(cfqq)) return; - ioprio_class = IOPRIO_PRIO_CLASS(tsk->ioprio); + ioprio_class = IOPRIO_PRIO_CLASS(ioc->ioprio); switch (ioprio_class) { default: printk(KERN_ERR "cfq: bad prio %x\n", ioprio_class); @@ -1293,11 +1293,11 @@ static void cfq_init_prio_data(struct cfq_queue *cfqq) cfqq->ioprio_class = IOPRIO_CLASS_BE; break; case IOPRIO_CLASS_RT: - cfqq->ioprio = task_ioprio(tsk); + cfqq->ioprio = task_ioprio(ioc); cfqq->ioprio_class = IOPRIO_CLASS_RT; break; case IOPRIO_CLASS_BE: - cfqq->ioprio = task_ioprio(tsk); + cfqq->ioprio = task_ioprio(ioc); cfqq->ioprio_class = IOPRIO_CLASS_BE; break; case IOPRIO_CLASS_IDLE: @@ -1330,8 +1330,7 @@ static inline void changed_ioprio(struct cfq_io_context *cic) cfqq = cic->cfqq[ASYNC]; if (cfqq) { struct cfq_queue *new_cfqq; - new_cfqq = cfq_get_queue(cfqd, ASYNC, cic->ioc->task, - GFP_ATOMIC); + new_cfqq = cfq_get_queue(cfqd, ASYNC, cic->ioc, GFP_ATOMIC); if (new_cfqq) { cic->cfqq[ASYNC] = new_cfqq; cfq_put_queue(cfqq); @@ -1363,13 +1362,13 @@ static void cfq_ioc_set_ioprio(struct io_context *ioc) static struct cfq_queue * cfq_find_alloc_queue(struct cfq_data *cfqd, int is_sync, - struct task_struct *tsk, gfp_t gfp_mask) + struct io_context *ioc, gfp_t gfp_mask) { struct 
cfq_queue *cfqq, *new_cfqq = NULL; struct cfq_io_context *cic; retry: - cic = cfq_cic_rb_lookup(cfqd, tsk->io_context); + cic = cfq_cic_rb_lookup(cfqd, ioc); /* cic always exists here */ cfqq = cic_to_cfqq(cic, is_sync); @@ -1412,7 +1411,7 @@ retry: cfq_mark_cfqq_prio_changed(cfqq); cfq_mark_cfqq_queue_new(cfqq); - cfq_init_prio_data(cfqq); + cfq_init_prio_data(cfqq, ioc); } if (new_cfqq) @@ -1439,11 +1438,11 @@ cfq_async_queue_prio(struct cfq_data *cfqd, int ioprio_class, int ioprio) } static struct cfq_queue * -cfq_get_queue(struct cfq_data *cfqd, int is_sync, struct task_struct *tsk, +cfq_get_queue(struct cfq_data *cfqd, int is_sync, struct io_context *ioc, gfp_t gfp_mask) { - const int ioprio = task_ioprio(tsk); - const int ioprio_class = task_ioprio_class(tsk); + const int ioprio = task_ioprio(ioc); + const int ioprio_class = task_ioprio_class(ioc); struct cfq_queue **async_cfqq = NULL; struct cfq_queue *cfqq = NULL; @@ -1453,7 +1452,7 @@ cfq_get_queue(struct cfq_data *cfqd, int is_sync, struct task_struct *tsk, } if (!cfqq) { - cfqq = cfq_find_alloc_queue(cfqd, is_sync, tsk, gfp_mask); + cfqq = cfq_find_alloc_queue(cfqd, is_sync, ioc, gfp_mask); if (!cfqq) return NULL; } @@ -1793,7 +1792,7 @@ static void cfq_insert_request(struct request_queue *q, struct request *rq) struct cfq_data *cfqd = q->elevator->elevator_data; struct cfq_queue *cfqq = RQ_CFQQ(rq); - cfq_init_prio_data(cfqq); + cfq_init_prio_data(cfqq, RQ_CIC(rq)->ioc); cfq_add_rq_rb(rq); @@ -1900,7 +1899,7 @@ static int cfq_may_queue(struct request_queue *q, int rw) cfqq = cic_to_cfqq(cic, rw & REQ_RW_SYNC); if (cfqq) { - cfq_init_prio_data(cfqq); + cfq_init_prio_data(cfqq, cic->ioc); cfq_prio_boost(cfqq); return __cfq_may_queue(cfqq); @@ -1938,7 +1937,6 @@ static int cfq_set_request(struct request_queue *q, struct request *rq, gfp_t gfp_mask) { struct cfq_data *cfqd = q->elevator->elevator_data; - struct task_struct *tsk = current; struct cfq_io_context *cic; const int rw = rq_data_dir(rq); const int is_sync = rq_is_sync(rq); @@ -1956,7 +1954,7 @@ cfq_set_request(struct request_queue *q, struct request *rq, gfp_t gfp_mask) cfqq = cic_to_cfqq(cic, is_sync); if (!cfqq) { - cfqq = cfq_get_queue(cfqd, is_sync, tsk, gfp_mask); + cfqq = cfq_get_queue(cfqd, is_sync, cic->ioc, gfp_mask); if (!cfqq) goto queue_fail; diff --git a/block/ll_rw_blk.c b/block/ll_rw_blk.c index 3d0422f48453..b9bb02e845cd 100644 --- a/block/ll_rw_blk.c +++ b/block/ll_rw_blk.c @@ -3904,6 +3904,26 @@ void exit_io_context(void) put_io_context(ioc); } +struct io_context *alloc_io_context(gfp_t gfp_flags, int node) +{ + struct io_context *ret; + + ret = kmem_cache_alloc_node(iocontext_cachep, gfp_flags, node); + if (ret) { + atomic_set(&ret->refcount, 1); + ret->task = current; + ret->ioprio_changed = 0; + ret->ioprio = 0; + ret->last_waited = jiffies; /* doesn't matter... */ + ret->nr_batch_requests = 0; /* because this is 0 */ + ret->aic = NULL; + ret->cic_root.rb_node = NULL; + ret->ioc_data = NULL; + } + + return ret; +} + /* * If the current task has no IO context then create one and initialise it. * Otherwise, return its existing IO context. @@ -3921,16 +3941,8 @@ static struct io_context *current_io_context(gfp_t gfp_flags, int node) if (likely(ret)) return ret; - ret = kmem_cache_alloc_node(iocontext_cachep, gfp_flags, node); + ret = alloc_io_context(gfp_flags, node); if (ret) { - atomic_set(&ret->refcount, 1); - ret->task = current; - ret->ioprio_changed = 0; - ret->last_waited = jiffies; /* doesn't matter... 
*/ - ret->nr_batch_requests = 0; /* because this is 0 */ - ret->aic = NULL; - ret->cic_root.rb_node = NULL; - ret->ioc_data = NULL; /* make sure set_task_ioprio() sees the settings above */ smp_wmb(); tsk->io_context = ret; diff --git a/fs/ioprio.c b/fs/ioprio.c index e4e01bc7f338..a7600401ecf7 100644 --- a/fs/ioprio.c +++ b/fs/ioprio.c @@ -41,18 +41,29 @@ static int set_task_ioprio(struct task_struct *task, int ioprio) return err; task_lock(task); + do { + ioc = task->io_context; + /* see wmb() in current_io_context() */ + smp_read_barrier_depends(); + if (ioc) + break; - task->ioprio = ioprio; - - ioc = task->io_context; - /* see wmb() in current_io_context() */ - smp_read_barrier_depends(); + ioc = alloc_io_context(GFP_ATOMIC, -1); + if (!ioc) { + err = -ENOMEM; + break; + } + task->io_context = ioc; + ioc->task = task; + } while (1); - if (ioc) + if (!err) { + ioc->ioprio = ioprio; ioc->ioprio_changed = 1; + } task_unlock(task); - return 0; + return err; } asmlinkage long sys_ioprio_set(int which, int who, int ioprio) @@ -148,7 +159,9 @@ static int get_task_ioprio(struct task_struct *p) ret = security_task_getioprio(p); if (ret) goto out; - ret = p->ioprio; + ret = IOPRIO_PRIO_VALUE(IOPRIO_CLASS_NONE, IOPRIO_NORM); + if (p->io_context) + ret = p->io_context->ioprio; out: return ret; } diff --git a/include/linux/blkdev.h b/include/linux/blkdev.h index 49b7a4c31a6d..510a18ba1ec5 100644 --- a/include/linux/blkdev.h +++ b/include/linux/blkdev.h @@ -34,83 +34,10 @@ struct sg_io_hdr; #define BLKDEV_MIN_RQ 4 #define BLKDEV_MAX_RQ 128 /* Default maximum */ -/* - * This is the per-process anticipatory I/O scheduler state. - */ -struct as_io_context { - spinlock_t lock; - - void (*dtor)(struct as_io_context *aic); /* destructor */ - void (*exit)(struct as_io_context *aic); /* called on task exit */ - - unsigned long state; - atomic_t nr_queued; /* queued reads & sync writes */ - atomic_t nr_dispatched; /* number of requests gone to the drivers */ - - /* IO History tracking */ - /* Thinktime */ - unsigned long last_end_request; - unsigned long ttime_total; - unsigned long ttime_samples; - unsigned long ttime_mean; - /* Layout pattern */ - unsigned int seek_samples; - sector_t last_request_pos; - u64 seek_total; - sector_t seek_mean; -}; - -struct cfq_queue; -struct cfq_io_context { - struct rb_node rb_node; - void *key; - - struct cfq_queue *cfqq[2]; - - struct io_context *ioc; - - unsigned long last_end_request; - sector_t last_request_pos; - - unsigned long ttime_total; - unsigned long ttime_samples; - unsigned long ttime_mean; - - unsigned int seek_samples; - u64 seek_total; - sector_t seek_mean; - - struct list_head queue_list; - - void (*dtor)(struct io_context *); /* destructor */ - void (*exit)(struct io_context *); /* called on task exit */ -}; - -/* - * This is the per-process I/O subsystem state. It is refcounted and - * kmalloc'ed. Currently all fields are modified in process io context - * (apart from the atomic refcount), so require no locking. 
- */ -struct io_context { - atomic_t refcount; - struct task_struct *task; - - unsigned int ioprio_changed; - - /* - * For request batching - */ - unsigned long last_waited; /* Time last woken after wait for request */ - int nr_batch_requests; /* Number of requests left in the batch */ - - struct as_io_context *aic; - struct rb_root cic_root; - void *ioc_data; -}; - void put_io_context(struct io_context *ioc); void exit_io_context(void); struct io_context *get_io_context(gfp_t gfp_flags, int node); +struct io_context *alloc_io_context(gfp_t gfp_flags, int node); void copy_io_context(struct io_context **pdst, struct io_context **psrc); void swap_io_context(struct io_context **ioc1, struct io_context **ioc2); @@ -894,6 +821,12 @@ static inline void exit_io_context(void) { } +static inline int put_io_context(struct io_context *ioc) +{ + return 1; +} + + #endif /* CONFIG_BLOCK */ #endif diff --git a/include/linux/init_task.h b/include/linux/init_task.h index 796019b22b6f..e6b3f7080679 100644 --- a/include/linux/init_task.h +++ b/include/linux/init_task.h @@ -137,7 +137,6 @@ extern struct group_info init_groups; .time_slice = HZ, \ .nr_cpus_allowed = NR_CPUS, \ }, \ - .ioprio = 0, \ .tasks = LIST_HEAD_INIT(tsk.tasks), \ .ptrace_children= LIST_HEAD_INIT(tsk.ptrace_children), \ .ptrace_list = LIST_HEAD_INIT(tsk.ptrace_list), \ diff --git a/include/linux/iocontext.h b/include/linux/iocontext.h new file mode 100644 index 000000000000..186807ea62e2 --- /dev/null +++ b/include/linux/iocontext.h @@ -0,0 +1,79 @@ +#ifndef IOCONTEXT_H +#define IOCONTEXT_H + +/* + * This is the per-process anticipatory I/O scheduler state. + */ +struct as_io_context { + spinlock_t lock; + + void (*dtor)(struct as_io_context *aic); /* destructor */ + void (*exit)(struct as_io_context *aic); /* called on task exit */ + + unsigned long state; + atomic_t nr_queued; /* queued reads & sync writes */ + atomic_t nr_dispatched; /* number of requests gone to the drivers */ + + /* IO History tracking */ + /* Thinktime */ + unsigned long last_end_request; + unsigned long ttime_total; + unsigned long ttime_samples; + unsigned long ttime_mean; + /* Layout pattern */ + unsigned int seek_samples; + sector_t last_request_pos; + u64 seek_total; + sector_t seek_mean; +}; + +struct cfq_queue; +struct cfq_io_context { + struct rb_node rb_node; + void *key; + + struct cfq_queue *cfqq[2]; + + struct io_context *ioc; + + unsigned long last_end_request; + sector_t last_request_pos; + + unsigned long ttime_total; + unsigned long ttime_samples; + unsigned long ttime_mean; + + unsigned int seek_samples; + u64 seek_total; + sector_t seek_mean; + + struct list_head queue_list; + + void (*dtor)(struct io_context *); /* destructor */ + void (*exit)(struct io_context *); /* called on task exit */ +}; + +/* + * This is the per-process I/O subsystem state. It is refcounted and + * kmalloc'ed. Currently all fields are modified in process io context + * (apart from the atomic refcount), so require no locking. 
+ */ +struct io_context { + atomic_t refcount; + struct task_struct *task; + + unsigned short ioprio; + unsigned short ioprio_changed; + + /* + * For request batching + */ + unsigned long last_waited; /* Time last woken after wait for request */ + int nr_batch_requests; /* Number of requests left in the batch */ + + struct as_io_context *aic; + struct rb_root cic_root; + void *ioc_data; +}; + +#endif diff --git a/include/linux/ioprio.h b/include/linux/ioprio.h index baf29387cab4..2a3bb1bb7433 100644 --- a/include/linux/ioprio.h +++ b/include/linux/ioprio.h @@ -2,6 +2,7 @@ #define IOPRIO_H #include +#include /* * Gives us 8 prio classes with 13-bits of data for each class @@ -45,18 +46,18 @@ enum { * the cpu scheduler nice value to an io priority */ #define IOPRIO_NORM (4) -static inline int task_ioprio(struct task_struct *task) +static inline int task_ioprio(struct io_context *ioc) { - if (ioprio_valid(task->ioprio)) - return IOPRIO_PRIO_DATA(task->ioprio); + if (ioprio_valid(ioc->ioprio)) + return IOPRIO_PRIO_DATA(ioc->ioprio); return IOPRIO_NORM; } -static inline int task_ioprio_class(struct task_struct *task) +static inline int task_ioprio_class(struct io_context *ioc) { - if (ioprio_valid(task->ioprio)) - return IOPRIO_PRIO_CLASS(task->ioprio); + if (ioprio_valid(ioc->ioprio)) + return IOPRIO_PRIO_CLASS(ioc->ioprio); return IOPRIO_CLASS_BE; } diff --git a/include/linux/sched.h b/include/linux/sched.h index df5b24ee80b3..80837e7d527e 100644 --- a/include/linux/sched.h +++ b/include/linux/sched.h @@ -975,7 +975,6 @@ struct task_struct { struct hlist_head preempt_notifiers; #endif - unsigned short ioprio; /* * fpu_counter contains the number of consecutive context switches * that the FPU is used. If this is over a threshold, the lazy fpu diff --git a/kernel/fork.c b/kernel/fork.c index 39d22b3357de..2a86c9dff744 100644 --- a/kernel/fork.c +++ b/kernel/fork.c @@ -51,6 +51,7 @@ #include #include #include +#include #include #include @@ -791,6 +792,26 @@ out: return error; } +static int copy_io(struct task_struct *tsk) +{ +#ifdef CONFIG_BLOCK + struct io_context *ioc = current->io_context; + + if (!ioc) + return 0; + + if (ioprio_valid(ioc->ioprio)) { + tsk->io_context = alloc_io_context(GFP_KERNEL, -1); + if (unlikely(!tsk->io_context)) + return -ENOMEM; + + tsk->io_context->task = tsk; + tsk->io_context->ioprio = ioc->ioprio; + } +#endif + return 0; +} + /* * Helper to unshare the files of the current task. * We don't want to expose copy_files internals to @@ -1156,15 +1177,17 @@ static struct task_struct *copy_process(unsigned long clone_flags, goto bad_fork_cleanup_mm; if ((retval = copy_namespaces(clone_flags, p))) goto bad_fork_cleanup_keys; + if ((retval = copy_io(p))) + goto bad_fork_cleanup_namespaces; retval = copy_thread(0, clone_flags, stack_start, stack_size, p, regs); if (retval) - goto bad_fork_cleanup_namespaces; + goto bad_fork_cleanup_io; if (pid != &init_struct_pid) { retval = -ENOMEM; pid = alloc_pid(task_active_pid_ns(p)); if (!pid) - goto bad_fork_cleanup_namespaces; + goto bad_fork_cleanup_io; if (clone_flags & CLONE_NEWPID) { retval = pid_ns_prepare_proc(task_active_pid_ns(p)); @@ -1234,9 +1257,6 @@ static struct task_struct *copy_process(unsigned long clone_flags, /* Need tasklist lock for parent etc handling! */ write_lock_irq(&tasklist_lock); - /* for sys_ioprio_set(IOPRIO_WHO_PGRP) */ - p->ioprio = current->ioprio; - /* * The task hasn't been attached yet, so its cpus_allowed mask will * not be changed, nor will its assigned CPU. 
@@ -1328,6 +1348,8 @@ static struct task_struct *copy_process(unsigned long clone_flags, bad_fork_free_pid: if (pid != &init_struct_pid) free_pid(pid); +bad_fork_cleanup_io: + put_io_context(p->io_context); bad_fork_cleanup_namespaces: exit_task_namespaces(p); bad_fork_cleanup_keys: -- cgit v1.2.3 From d38ecf935fcb10264a6bc190855d9595165e6eeb Mon Sep 17 00:00:00 2001 From: Jens Axboe Date: Thu, 24 Jan 2008 08:53:35 +0100 Subject: io context sharing: preliminary support Detach task state from ioc, instead keep track of how many processes are accessing the ioc. Signed-off-by: Jens Axboe --- block/ll_rw_blk.c | 46 ++++++++++++++++++++++++++++++---------------- fs/ioprio.c | 1 - include/linux/blkdev.h | 2 +- include/linux/iocontext.h | 22 ++++++++++++++++++---- kernel/fork.c | 1 - 5 files changed, 49 insertions(+), 23 deletions(-) (limited to 'kernel') diff --git a/block/ll_rw_blk.c b/block/ll_rw_blk.c index b9bb02e845cd..d4550ecae443 100644 --- a/block/ll_rw_blk.c +++ b/block/ll_rw_blk.c @@ -3854,12 +3854,13 @@ int __init blk_dev_init(void) } /* - * IO Context helper functions + * IO Context helper functions. put_io_context() returns 1 if there are no + * more users of this io context, 0 otherwise. */ -void put_io_context(struct io_context *ioc) +int put_io_context(struct io_context *ioc) { if (ioc == NULL) - return; + return 1; BUG_ON(atomic_read(&ioc->refcount) == 0); @@ -3878,7 +3879,9 @@ void put_io_context(struct io_context *ioc) rcu_read_unlock(); kmem_cache_free(iocontext_cachep, ioc); + return 1; } + return 0; } EXPORT_SYMBOL(put_io_context); @@ -3893,15 +3896,17 @@ void exit_io_context(void) current->io_context = NULL; task_unlock(current); - ioc->task = NULL; - if (ioc->aic && ioc->aic->exit) - ioc->aic->exit(ioc->aic); - if (ioc->cic_root.rb_node != NULL) { - cic = rb_entry(rb_first(&ioc->cic_root), struct cfq_io_context, rb_node); - cic->exit(ioc); - } + if (atomic_dec_and_test(&ioc->nr_tasks)) { + if (ioc->aic && ioc->aic->exit) + ioc->aic->exit(ioc->aic); + if (ioc->cic_root.rb_node != NULL) { + cic = rb_entry(rb_first(&ioc->cic_root), + struct cfq_io_context, rb_node); + cic->exit(ioc); + } - put_io_context(ioc); + put_io_context(ioc); + } } struct io_context *alloc_io_context(gfp_t gfp_flags, int node) @@ -3911,7 +3916,8 @@ struct io_context *alloc_io_context(gfp_t gfp_flags, int node) ret = kmem_cache_alloc_node(iocontext_cachep, gfp_flags, node); if (ret) { atomic_set(&ret->refcount, 1); - ret->task = current; + atomic_set(&ret->nr_tasks, 1); + spin_lock_init(&ret->lock); ret->ioprio_changed = 0; ret->ioprio = 0; ret->last_waited = jiffies; /* doesn't matter... */ @@ -3959,10 +3965,18 @@ static struct io_context *current_io_context(gfp_t gfp_flags, int node) */ struct io_context *get_io_context(gfp_t gfp_flags, int node) { - struct io_context *ret; - ret = current_io_context(gfp_flags, node); - if (likely(ret)) - atomic_inc(&ret->refcount); + struct io_context *ret = NULL; + + /* + * Check for unlikely race with exiting task. ioc ref count is + * zero when ioc is being detached. 
+ */ + do { + ret = current_io_context(gfp_flags, node); + if (unlikely(!ret)) + break; + } while (!atomic_inc_not_zero(&ret->refcount)); + return ret; } EXPORT_SYMBOL(get_io_context); diff --git a/fs/ioprio.c b/fs/ioprio.c index a7600401ecf7..06b5d97c5fdd 100644 --- a/fs/ioprio.c +++ b/fs/ioprio.c @@ -54,7 +54,6 @@ static int set_task_ioprio(struct task_struct *task, int ioprio) break; } task->io_context = ioc; - ioc->task = task; } while (1); if (!err) { diff --git a/include/linux/blkdev.h b/include/linux/blkdev.h index 510a18ba1ec5..2483a05231c7 100644 --- a/include/linux/blkdev.h +++ b/include/linux/blkdev.h @@ -34,7 +34,7 @@ struct sg_io_hdr; #define BLKDEV_MIN_RQ 4 #define BLKDEV_MAX_RQ 128 /* Default maximum */ -void put_io_context(struct io_context *ioc); +int put_io_context(struct io_context *ioc); void exit_io_context(void); struct io_context *get_io_context(gfp_t gfp_flags, int node); struct io_context *alloc_io_context(gfp_t gfp_flags, int node); diff --git a/include/linux/iocontext.h b/include/linux/iocontext.h index 186807ea62e2..cd44d458124a 100644 --- a/include/linux/iocontext.h +++ b/include/linux/iocontext.h @@ -54,13 +54,15 @@ struct cfq_io_context { }; /* - * This is the per-process I/O subsystem state. It is refcounted and - * kmalloc'ed. Currently all fields are modified in process io context - * (apart from the atomic refcount), so require no locking. + * I/O subsystem state of the associated processes. It is refcounted + * and kmalloc'ed. These could be shared between processes. */ struct io_context { atomic_t refcount; - struct task_struct *task; + atomic_t nr_tasks; + + /* all the fields below are protected by this lock */ + spinlock_t lock; unsigned short ioprio; unsigned short ioprio_changed; @@ -76,4 +78,16 @@ struct io_context { void *ioc_data; }; +static inline struct io_context *ioc_task_link(struct io_context *ioc) +{ + /* + * if ref count is zero, don't allow sharing (ioc is going away, it's + * a race). + */ + if (ioc && atomic_inc_not_zero(&ioc->refcount)) + return ioc; + + return NULL; +} + #endif diff --git a/kernel/fork.c b/kernel/fork.c index 2a86c9dff744..1987c57abb08 100644 --- a/kernel/fork.c +++ b/kernel/fork.c @@ -805,7 +805,6 @@ static int copy_io(struct task_struct *tsk) if (unlikely(!tsk->io_context)) return -ENOMEM; - tsk->io_context->task = tsk; tsk->io_context->ioprio = ioc->ioprio; } #endif -- cgit v1.2.3 From fadad878cc0640cc9cd5569998bf54b693f7b38b Mon Sep 17 00:00:00 2001 From: Jens Axboe Date: Thu, 24 Jan 2008 08:54:47 +0100 Subject: kernel: add CLONE_IO to specifically request sharing of IO contexts syslets (or other threads/processes that want io context sharing) can set this to enforce sharing of io context. 
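As a usage illustration (not part of the patch), a process that wants to share its parent's IO context would pass the new flag to clone(). Note that copy_io() only shares an io_context that already exists, i.e. the parent must have done IO or set an ioprio beforehand. The flag value and the fallback define below mirror the patch; everything else is an assumed example.

#define _GNU_SOURCE
#include <sched.h>
#include <signal.h>
#include <stdio.h>
#include <stdlib.h>
#include <sys/wait.h>

#ifndef CLONE_IO
#define CLONE_IO 0x80000000	/* value introduced by this patch */
#endif

static int worker(void *arg)
{
	/* IO submitted here is accounted to the shared io_context,
	 * so the IO scheduler treats parent and child as one stream */
	return 0;
}

int main(void)
{
	const size_t stack_size = 64 * 1024;
	char *stack = malloc(stack_size);
	pid_t pid;

	if (!stack)
		return 1;
	/* the stack grows down on most architectures: pass the top */
	pid = clone(worker, stack + stack_size, CLONE_IO | SIGCHLD, NULL);
	if (pid < 0) {
		perror("clone");
		return 1;
	}
	waitpid(pid, NULL, 0);
	free(stack);
	return 0;
}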
Signed-off-by: Jens Axboe --- include/linux/sched.h | 1 + kernel/fork.c | 14 ++++++++++---- 2 files changed, 11 insertions(+), 4 deletions(-) (limited to 'kernel') diff --git a/include/linux/sched.h b/include/linux/sched.h index 80837e7d527e..2d0546e884ea 100644 --- a/include/linux/sched.h +++ b/include/linux/sched.h @@ -27,6 +27,7 @@ #define CLONE_NEWUSER 0x10000000 /* New user namespace */ #define CLONE_NEWPID 0x20000000 /* New pid namespace */ #define CLONE_NEWNET 0x40000000 /* New network namespace */ +#define CLONE_IO 0x80000000 /* Clone io context */ /* * Scheduling policies diff --git a/kernel/fork.c b/kernel/fork.c index 1987c57abb08..314f5101d2b0 100644 --- a/kernel/fork.c +++ b/kernel/fork.c @@ -792,15 +792,21 @@ out: return error; } -static int copy_io(struct task_struct *tsk) +static int copy_io(unsigned long clone_flags, struct task_struct *tsk) { #ifdef CONFIG_BLOCK struct io_context *ioc = current->io_context; if (!ioc) return 0; - - if (ioprio_valid(ioc->ioprio)) { + /* + * Share io context with parent, if CLONE_IO is set + */ + if (clone_flags & CLONE_IO) { + tsk->io_context = ioc_task_link(ioc); + if (unlikely(!tsk->io_context)) + return -ENOMEM; + } else if (ioprio_valid(ioc->ioprio)) { tsk->io_context = alloc_io_context(GFP_KERNEL, -1); if (unlikely(!tsk->io_context)) return -ENOMEM; @@ -1176,7 +1182,7 @@ static struct task_struct *copy_process(unsigned long clone_flags, goto bad_fork_cleanup_mm; if ((retval = copy_namespaces(clone_flags, p))) goto bad_fork_cleanup_keys; - if ((retval = copy_io(p))) + if ((retval = copy_io(clone_flags, p))) goto bad_fork_cleanup_namespaces; retval = copy_thread(0, clone_flags, stack_start, stack_size, p, regs); if (retval) -- cgit v1.2.3 From 29e796fd4de54b8f5bc30d897611210ece4fd0f2 Mon Sep 17 00:00:00 2001 From: "Eric W. Biederman" Date: Fri, 30 Nov 2007 23:50:18 +1100 Subject: sysctl: Add register_sysctl_paths function There are a number of modules that register a sysctl table somewhere deeply nested in the sysctl hierarchy, such as fs/nfs, fs/xfs, dev/cdrom, etc. They all specify several dummy ctl_tables for the path name. This patch implements register_sysctl_path that takes an additional path name, and makes up dummy sysctl nodes for each component. This patch was originally written by Olaf Kirch and brought to my attention and reworked some by Olaf Hering. I have changed a few additional things so the bugs are mine. After converting all of the easy callers Olaf Hering observed allyesconfig ARCH=i386, the patch reduces the final binary size by 9369 bytes. .text +897 .data -7008 text data bss dec hex filename 26959310 4045899 4718592 35723801 2211a19 ../vmlinux-vanilla 26960207 4038891 4718592 35717690 221023a ../O-allyesconfig/vmlinux So this change is both a space savings and a code simplification. CC: Olaf Kirch CC: Olaf Hering Signed-off-by: Eric W. Biederman Cc: Serge Hallyn Cc: Daniel Lezcano Cc: Cedric Le Goater Cc: Pavel Emelyanov Signed-off-by: Andrew Morton Signed-off-by: Herbert Xu Signed-off-by: David S. 
Miller --- include/linux/sysctl.h | 8 +++++ kernel/sysctl.c | 89 +++++++++++++++++++++++++++++++++++++++++--------- 2 files changed, 82 insertions(+), 15 deletions(-) (limited to 'kernel') diff --git a/include/linux/sysctl.h b/include/linux/sysctl.h index 4f5047df8a9e..3b6e2c9fbb2e 100644 --- a/include/linux/sysctl.h +++ b/include/linux/sysctl.h @@ -1059,7 +1059,15 @@ struct ctl_table_header struct completion *unregistering; }; +/* struct ctl_path describes where in the hierarchy a table is added */ +struct ctl_path { + const char *procname; + int ctl_name; +}; + struct ctl_table_header *register_sysctl_table(struct ctl_table * table); +struct ctl_table_header *register_sysctl_paths(const struct ctl_path *path, + struct ctl_table *table); void unregister_sysctl_table(struct ctl_table_header * table); int sysctl_check_table(struct ctl_table *table); diff --git a/kernel/sysctl.c b/kernel/sysctl.c index 8e96558cb8f3..f580542333eb 100644 --- a/kernel/sysctl.c +++ b/kernel/sysctl.c @@ -1561,11 +1561,12 @@ static __init int sysctl_init(void) core_initcall(sysctl_init); /** - * register_sysctl_table - register a sysctl hierarchy + * register_sysctl_paths - register a sysctl hierarchy + * @path: The path to the directory the sysctl table is in. * @table: the top-level table structure * * Register a sysctl table hierarchy. @table should be a filled in ctl_table - * array. An entry with a ctl_name of 0 terminates the table. + * array. A completely 0 filled entry terminates the table. * * The members of the &struct ctl_table structure are used as follows: * @@ -1628,25 +1629,76 @@ core_initcall(sysctl_init); * This routine returns %NULL on a failure to register, and a pointer * to the table header on success. */ -struct ctl_table_header *register_sysctl_table(struct ctl_table * table) +struct ctl_table_header *register_sysctl_paths(const struct ctl_path *path, + struct ctl_table *table) { - struct ctl_table_header *tmp; - tmp = kmalloc(sizeof(struct ctl_table_header), GFP_KERNEL); - if (!tmp) + struct ctl_table_header *header; + struct ctl_table *new, **prevp; + unsigned int n, npath; + + /* Count the path components */ + for (npath = 0; path[npath].ctl_name || path[npath].procname; ++npath) + ; + + /* + * For each path component, allocate a 2-element ctl_table array. + * The first array element will be filled with the sysctl entry + * for this, the second will be the sentinel (ctl_name == 0). + * + * We allocate everything in one go so that we don't have to + * worry about freeing additional memory in unregister_sysctl_table. 
+ */ + header = kzalloc(sizeof(struct ctl_table_header) + + (2 * npath * sizeof(struct ctl_table)), GFP_KERNEL); + if (!header) return NULL; - tmp->ctl_table = table; - INIT_LIST_HEAD(&tmp->ctl_entry); - tmp->used = 0; - tmp->unregistering = NULL; - sysctl_set_parent(NULL, table); - if (sysctl_check_table(tmp->ctl_table)) { - kfree(tmp); + + new = (struct ctl_table *) (header + 1); + + /* Now connect the dots */ + prevp = &header->ctl_table; + for (n = 0; n < npath; ++n, ++path) { + /* Copy the procname */ + new->procname = path->procname; + new->ctl_name = path->ctl_name; + new->mode = 0555; + + *prevp = new; + prevp = &new->child; + + new += 2; + } + *prevp = table; + + INIT_LIST_HEAD(&header->ctl_entry); + header->used = 0; + header->unregistering = NULL; + sysctl_set_parent(NULL, header->ctl_table); + if (sysctl_check_table(header->ctl_table)) { + kfree(header); return NULL; } spin_lock(&sysctl_lock); - list_add_tail(&tmp->ctl_entry, &root_table_header.ctl_entry); + list_add_tail(&header->ctl_entry, &root_table_header.ctl_entry); spin_unlock(&sysctl_lock); - return tmp; + + return header; +} + +/** + * register_sysctl_table - register a sysctl table hierarchy + * @table: the top-level table structure + * + * Register a sysctl table hierarchy. @table should be a filled in ctl_table + * array. A completely 0 filled entry terminates the table. + * + * See register_sysctl_paths for more details. + */ +struct ctl_table_header *register_sysctl_table(struct ctl_table *table) +{ + static const struct ctl_path null_path[] = { {} }; + + return register_sysctl_paths(null_path, table); } /** @@ -1675,6 +1727,12 @@ struct ctl_table_header *register_sysctl_table(struct ctl_table * table) return NULL; } +struct ctl_table_header *register_sysctl_paths(const struct ctl_path *path, + struct ctl_table *table) +{ + return NULL; +} + void unregister_sysctl_table(struct ctl_table_header * table) { } @@ -2733,6 +2791,7 @@ EXPORT_SYMBOL(proc_dostring); EXPORT_SYMBOL(proc_doulongvec_minmax); EXPORT_SYMBOL(proc_doulongvec_ms_jiffies_minmax); EXPORT_SYMBOL(register_sysctl_table); +EXPORT_SYMBOL(register_sysctl_paths); EXPORT_SYMBOL(sysctl_intvec); EXPORT_SYMBOL(sysctl_jiffies); EXPORT_SYMBOL(sysctl_ms_jiffies); -- cgit v1.2.3 From 23eb06de7d2d333a0f7ebba2da663e00c9c9483e Mon Sep 17 00:00:00 2001 From: "Eric W. Biederman" Date: Fri, 30 Nov 2007 23:52:10 +1100 Subject: sysctl: Remember the ctl_table we passed to register_sysctl_paths By doing this we allow users of register_sysctl_paths that build and dynamically allocate their ctl_table to be simpler. This allows them to just remember the ctl_table_header returned from register_sysctl_paths from which they can now find the ctl_table array they need to free. Signed-off-by: Eric W. Biederman Cc: Serge Hallyn Cc: Daniel Lezcano Cc: Cedric Le Goater Cc: Pavel Emelyanov Signed-off-by: Andrew Morton Signed-off-by: Herbert Xu Signed-off-by: David S. 
Miller --- include/linux/sysctl.h | 1 + kernel/sysctl.c | 1 + 2 files changed, 2 insertions(+) (limited to 'kernel') diff --git a/include/linux/sysctl.h b/include/linux/sysctl.h index 3b6e2c9fbb2e..77de3bfd8744 100644 --- a/include/linux/sysctl.h +++ b/include/linux/sysctl.h @@ -1057,6 +1057,7 @@ struct ctl_table_header struct list_head ctl_entry; int used; struct completion *unregistering; + struct ctl_table *ctl_table_arg; }; /* struct ctl_path describes where in the hierarchy a table is added */ diff --git a/kernel/sysctl.c b/kernel/sysctl.c index f580542333eb..89b7d95ecf51 100644 --- a/kernel/sysctl.c +++ b/kernel/sysctl.c @@ -1669,6 +1669,7 @@ struct ctl_table_header *register_sysctl_paths(const struct ctl_path *path, new += 2; } *prevp = table; + header->ctl_table_arg = table; INIT_LIST_HEAD(&header->ctl_entry); header->used = 0; -- cgit v1.2.3 From e51b6ba077791f2f8c876022b37419be7a2ceec3 Mon Sep 17 00:00:00 2001 From: "Eric W. Biederman" Date: Fri, 30 Nov 2007 23:54:00 +1100 Subject: sysctl: Infrastructure for per namespace sysctls This patch implements the basic infrastructure for per namespace sysctls. A list of lists of sysctl headers is added, allowing each namespace to have it's own list of sysctl headers. Each list of sysctl headers has a lookup function to find the first sysctl header in the list, allowing the lists to have a per namespace instance. register_sysct_root is added to tell sysctl.c about additional lists of sysctl_headers. As all of the users are expected to be in kernel no unregister function is provided. sysctl_head_next is updated to walk through the list of lists. __register_sysctl_paths is added to add a new sysctl table on a non-default sysctl list. The only intrusive part of this patch is propagating the information to decided which list of sysctls to use for sysctl_check_table. Signed-off-by: Eric W. Biederman Cc: Serge Hallyn Cc: Daniel Lezcano Cc: Cedric Le Goater Cc: Pavel Emelyanov Signed-off-by: Andrew Morton Signed-off-by: Herbert Xu Signed-off-by: David S. Miller --- include/linux/sysctl.h | 17 ++++++++- kernel/sysctl.c | 93 ++++++++++++++++++++++++++++++++++++++++++++------ kernel/sysctl_check.c | 25 ++++++++------ 3 files changed, 112 insertions(+), 23 deletions(-) (limited to 'kernel') diff --git a/include/linux/sysctl.h b/include/linux/sysctl.h index 77de3bfd8744..89faebfe48b8 100644 --- a/include/linux/sysctl.h +++ b/include/linux/sysctl.h @@ -945,7 +945,10 @@ enum /* For the /proc/sys support */ struct ctl_table; +struct nsproxy; extern struct ctl_table_header *sysctl_head_next(struct ctl_table_header *prev); +extern struct ctl_table_header *__sysctl_head_next(struct nsproxy *namespaces, + struct ctl_table_header *prev); extern void sysctl_head_finish(struct ctl_table_header *prev); extern int sysctl_perm(struct ctl_table *table, int op); @@ -1049,6 +1052,13 @@ struct ctl_table void *extra2; }; +struct ctl_table_root { + struct list_head root_list; + struct list_head header_list; + struct list_head *(*lookup)(struct ctl_table_root *root, + struct nsproxy *namespaces); +}; + /* struct ctl_table_header is used to maintain dynamic lists of struct ctl_table trees. 
*/ struct ctl_table_header @@ -1058,6 +1068,7 @@ struct ctl_table_header int used; struct completion *unregistering; struct ctl_table *ctl_table_arg; + struct ctl_table_root *root; }; /* struct ctl_path describes where in the hierarchy a table is added */ @@ -1066,12 +1077,16 @@ struct ctl_path { int ctl_name; }; +void register_sysctl_root(struct ctl_table_root *root); +struct ctl_table_header *__register_sysctl_paths( + struct ctl_table_root *root, struct nsproxy *namespaces, + const struct ctl_path *path, struct ctl_table *table); struct ctl_table_header *register_sysctl_table(struct ctl_table * table); struct ctl_table_header *register_sysctl_paths(const struct ctl_path *path, struct ctl_table *table); void unregister_sysctl_table(struct ctl_table_header * table); -int sysctl_check_table(struct ctl_table *table); +int sysctl_check_table(struct nsproxy *namespaces, struct ctl_table *table); #else /* __KERNEL__ */ diff --git a/kernel/sysctl.c b/kernel/sysctl.c index 89b7d95ecf51..45e76f209dcb 100644 --- a/kernel/sysctl.c +++ b/kernel/sysctl.c @@ -157,8 +157,16 @@ static int proc_dointvec_taint(struct ctl_table *table, int write, struct file * #endif static struct ctl_table root_table[]; -static struct ctl_table_header root_table_header = - { root_table, LIST_HEAD_INIT(root_table_header.ctl_entry) }; +static struct ctl_table_root sysctl_table_root; +static struct ctl_table_header root_table_header = { + .ctl_table = root_table, + .ctl_entry = LIST_HEAD_INIT(sysctl_table_root.header_list), + .root = &sysctl_table_root, +}; +static struct ctl_table_root sysctl_table_root = { + .root_list = LIST_HEAD_INIT(sysctl_table_root.root_list), + .header_list = LIST_HEAD_INIT(root_table_header.ctl_entry), +}; static struct ctl_table kern_table[]; static struct ctl_table vm_table[]; @@ -1371,12 +1379,27 @@ void sysctl_head_finish(struct ctl_table_header *head) spin_unlock(&sysctl_lock); } -struct ctl_table_header *sysctl_head_next(struct ctl_table_header *prev) +static struct list_head * +lookup_header_list(struct ctl_table_root *root, struct nsproxy *namespaces) { + struct list_head *header_list; + header_list = &root->header_list; + if (root->lookup) + header_list = root->lookup(root, namespaces); + return header_list; +} + +struct ctl_table_header *__sysctl_head_next(struct nsproxy *namespaces, + struct ctl_table_header *prev) +{ + struct ctl_table_root *root; + struct list_head *header_list; struct ctl_table_header *head; struct list_head *tmp; + spin_lock(&sysctl_lock); if (prev) { + head = prev; tmp = &prev->ctl_entry; unuse_table(prev); goto next; @@ -1390,14 +1413,38 @@ struct ctl_table_header *sysctl_head_next(struct ctl_table_header *prev) spin_unlock(&sysctl_lock); return head; next: + root = head->root; tmp = tmp->next; - if (tmp == &root_table_header.ctl_entry) - break; + header_list = lookup_header_list(root, namespaces); + if (tmp != header_list) + continue; + + do { + root = list_entry(root->root_list.next, + struct ctl_table_root, root_list); + if (root == &sysctl_table_root) + goto out; + header_list = lookup_header_list(root, namespaces); + } while (list_empty(header_list)); + tmp = header_list->next; } +out: spin_unlock(&sysctl_lock); return NULL; } +struct ctl_table_header *sysctl_head_next(struct ctl_table_header *prev) +{ + return __sysctl_head_next(current->nsproxy, prev); +} + +void register_sysctl_root(struct ctl_table_root *root) +{ + spin_lock(&sysctl_lock); + list_add_tail(&root->root_list, &sysctl_table_root.root_list); + spin_unlock(&sysctl_lock); +} + #ifdef 
CONFIG_SYSCTL_SYSCALL int do_sysctl(int __user *name, int nlen, void __user *oldval, size_t __user *oldlenp, void __user *newval, size_t newlen) @@ -1554,14 +1601,16 @@ static __init int sysctl_init(void) { int err; sysctl_set_parent(NULL, root_table); - err = sysctl_check_table(root_table); + err = sysctl_check_table(current->nsproxy, root_table); return 0; } core_initcall(sysctl_init); /** - * register_sysctl_paths - register a sysctl hierarchy + * __register_sysctl_paths - register a sysctl hierarchy + * @root: List of sysctl headers to register on + * @namespaces: Data to compute which lists of sysctl entries are visible * @path: The path to the directory the sysctl table is in. * @table: the top-level table structure * @@ -1629,9 +1678,12 @@ core_initcall(sysctl_init); * This routine returns %NULL on a failure to register, and a pointer * to the table header on success. */ -struct ctl_table_header *register_sysctl_paths(const struct ctl_path *path, - struct ctl_table *table) +struct ctl_table_header *__register_sysctl_paths( + struct ctl_table_root *root, + struct nsproxy *namespaces, + const struct ctl_path *path, struct ctl_table *table) { + struct list_head *header_list; struct ctl_table_header *header; struct ctl_table *new, **prevp; unsigned int n, npath; @@ -1674,18 +1726,37 @@ struct ctl_table_header *register_sysctl_paths(const struct ctl_path *path, INIT_LIST_HEAD(&header->ctl_entry); header->used = 0; header->unregistering = NULL; + header->root = root; sysctl_set_parent(NULL, header->ctl_table); - if (sysctl_check_table(header->ctl_table)) { + if (sysctl_check_table(namespaces, header->ctl_table)) { kfree(header); return NULL; } spin_lock(&sysctl_lock); - list_add_tail(&header->ctl_entry, &root_table_header.ctl_entry); + header_list = lookup_header_list(root, namespaces); + list_add_tail(&header->ctl_entry, header_list); spin_unlock(&sysctl_lock); return header; } +/** + * register_sysctl_table_path - register a sysctl table hierarchy + * @path: The path to the directory the sysctl table is in. + * @table: the top-level table structure + * + * Register a sysctl table hierarchy. @table should be a filled in ctl_table + * array. A completely 0 filled entry terminates the table. + * + * See __register_sysctl_paths for more details. 
+ */ +struct ctl_table_header *register_sysctl_paths(const struct ctl_path *path, + struct ctl_table *table) +{ + return __register_sysctl_paths(&sysctl_table_root, current->nsproxy, + path, table); +} + /** * register_sysctl_table - register a sysctl table hierarchy * @table: the top-level table structure diff --git a/kernel/sysctl_check.c b/kernel/sysctl_check.c index d8a5558a47b4..c3206fa50048 100644 --- a/kernel/sysctl_check.c +++ b/kernel/sysctl_check.c @@ -1342,7 +1342,8 @@ static void sysctl_repair_table(struct ctl_table *table) } } -static struct ctl_table *sysctl_check_lookup(struct ctl_table *table) +static struct ctl_table *sysctl_check_lookup(struct nsproxy *namespaces, + struct ctl_table *table) { struct ctl_table_header *head; struct ctl_table *ref, *test; @@ -1350,8 +1351,8 @@ static struct ctl_table *sysctl_check_lookup(struct ctl_table *table) depth = sysctl_depth(table); - for (head = sysctl_head_next(NULL); head; - head = sysctl_head_next(head)) { + for (head = __sysctl_head_next(namespaces, NULL); head; + head = __sysctl_head_next(namespaces, head)) { cur_depth = depth; ref = head->ctl_table; repeat: @@ -1396,13 +1397,14 @@ static void set_fail(const char **fail, struct ctl_table *table, const char *str *fail = str; } -static int sysctl_check_dir(struct ctl_table *table) +static int sysctl_check_dir(struct nsproxy *namespaces, + struct ctl_table *table) { struct ctl_table *ref; int error; error = 0; - ref = sysctl_check_lookup(table); + ref = sysctl_check_lookup(namespaces, table); if (ref) { int match = 0; if ((!table->procname && !ref->procname) || @@ -1427,11 +1429,12 @@ static int sysctl_check_dir(struct ctl_table *table) return error; } -static void sysctl_check_leaf(struct ctl_table *table, const char **fail) +static void sysctl_check_leaf(struct nsproxy *namespaces, + struct ctl_table *table, const char **fail) { struct ctl_table *ref; - ref = sysctl_check_lookup(table); + ref = sysctl_check_lookup(namespaces, table); if (ref && (ref != table)) set_fail(fail, table, "Sysctl already exists"); } @@ -1455,7 +1458,7 @@ static void sysctl_check_bin_path(struct ctl_table *table, const char **fail) } } -int sysctl_check_table(struct ctl_table *table) +int sysctl_check_table(struct nsproxy *namespaces, struct ctl_table *table) { int error = 0; for (; table->ctl_name || table->procname; table++) { @@ -1485,7 +1488,7 @@ int sysctl_check_table(struct ctl_table *table) set_fail(&fail, table, "Directory with extra1"); if (table->extra2) set_fail(&fail, table, "Directory with extra2"); - if (sysctl_check_dir(table)) + if (sysctl_check_dir(namespaces, table)) set_fail(&fail, table, "Inconsistent directory names"); } else { if ((table->strategy == sysctl_data) || @@ -1534,7 +1537,7 @@ int sysctl_check_table(struct ctl_table *table) if (!table->procname && table->proc_handler) set_fail(&fail, table, "proc_handler without procname"); #endif - sysctl_check_leaf(table, &fail); + sysctl_check_leaf(namespaces, table, &fail); } sysctl_check_bin_path(table, &fail); if (fail) { @@ -1542,7 +1545,7 @@ int sysctl_check_table(struct ctl_table *table) error = -EINVAL; } if (table->child) - error |= sysctl_check_table(table->child); + error |= sysctl_check_table(namespaces, table->child); } return error; } -- cgit v1.2.3 From 08913681e484f3f0db949dd0809012e089846216 Mon Sep 17 00:00:00 2001 From: Pavel Emelyanov Date: Wed, 5 Dec 2007 01:42:49 -0800 Subject: [NET]: Remove the empty net_table I have removed all the entries from this table (core_table, ipv4_table and tr_table), so now we can 
safely drop it. Signed-off-by: Pavel Emelyanov Signed-off-by: David S. Miller --- include/linux/net.h | 1 - kernel/sysctl.c | 8 -------- net/sysctl_net.c | 4 ---- 3 files changed, 13 deletions(-) (limited to 'kernel') diff --git a/include/linux/net.h b/include/linux/net.h index f95f12c5840c..c414d90e647b 100644 --- a/include/linux/net.h +++ b/include/linux/net.h @@ -337,7 +337,6 @@ static const struct proto_ops name##_ops = { \ #ifdef CONFIG_SYSCTL #include -extern ctl_table net_table[]; extern int net_msg_cost; extern int net_msg_burst; #endif diff --git a/kernel/sysctl.c b/kernel/sysctl.c index 45e76f209dcb..4bc8e48434a7 100644 --- a/kernel/sysctl.c +++ b/kernel/sysctl.c @@ -200,14 +200,6 @@ static struct ctl_table root_table[] = { .mode = 0555, .child = vm_table, }, -#ifdef CONFIG_NET - { - .ctl_name = CTL_NET, - .procname = "net", - .mode = 0555, - .child = net_table, - }, -#endif { .ctl_name = CTL_FS, .procname = "fs", diff --git a/net/sysctl_net.c b/net/sysctl_net.c index 16ad14b5d572..665e856675a4 100644 --- a/net/sysctl_net.c +++ b/net/sysctl_net.c @@ -30,10 +30,6 @@ #include #endif -struct ctl_table net_table[] = { - { 0 }, -}; - static struct list_head * net_ctl_header_lookup(struct ctl_table_root *root, struct nsproxy *namespaces) { -- cgit v1.2.3 From a2da4052f1df6bc77749f84496fe731ab8b458f7 Mon Sep 17 00:00:00 2001 From: Rusty Russell Date: Tue, 29 Jan 2008 17:13:17 -0500 Subject: module: Don't report discarded init pages as kernel text. Current code could cause a bug in symbol_put_addr() if an arch used kmalloc module text: we might think the symbol belongs to the core kernel. The downside is that this might make backtraces through (discarded) init functions harder to read on some archs, but we already have that issue for modules and noone has complained. Signed-off-by: Rusty Russell --- kernel/extable.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/extable.c b/kernel/extable.c index 7fe262855317..a26cb2e17023 100644 --- a/kernel/extable.c +++ b/kernel/extable.c @@ -46,7 +46,8 @@ int core_kernel_text(unsigned long addr) addr <= (unsigned long)_etext) return 1; - if (addr >= (unsigned long)_sinittext && + if (system_state == SYSTEM_BOOTING && + addr >= (unsigned long)_sinittext && addr <= (unsigned long)_einittext) return 1; return 0; -- cgit v1.2.3 From c9a3ba55bb5da03fc7d707709a7fe078fe1aa0a0 Mon Sep 17 00:00:00 2001 From: Rusty Russell Date: Tue, 29 Jan 2008 17:13:18 -0500 Subject: module: wait for dependent modules doing init. There have been reports of modules failing to load because the modules they depend on are still loading. This changes the modules to wait for a reasonable length of time in that case. We time out eventually, because there can be module loops or broken modules. Signed-off-by: Rusty Russell --- kernel/module.c | 28 +++++++++++++++++++++++----- 1 file changed, 23 insertions(+), 5 deletions(-) (limited to 'kernel') diff --git a/kernel/module.c b/kernel/module.c index 1bb4c5e0d56e..17314691d3cc 100644 --- a/kernel/module.c +++ b/kernel/module.c @@ -65,6 +65,9 @@ static DEFINE_MUTEX(module_mutex); static LIST_HEAD(modules); +/* Waiting for a module to finish initializing? 
*/ +static DECLARE_WAIT_QUEUE_HEAD(module_wq); + static BLOCKING_NOTIFIER_HEAD(module_notify_list); int register_module_notifier(struct notifier_block * nb) @@ -84,8 +87,11 @@ EXPORT_SYMBOL(unregister_module_notifier); static inline int strong_try_module_get(struct module *mod) { if (mod && mod->state == MODULE_STATE_COMING) + return -EBUSY; + if (try_module_get(mod)) return 0; - return try_module_get(mod); + else + return -ENOENT; } static inline void add_taint_module(struct module *mod, unsigned flag) @@ -539,11 +545,21 @@ static int already_uses(struct module *a, struct module *b) static int use_module(struct module *a, struct module *b) { struct module_use *use; - int no_warn; + int no_warn, err; if (b == NULL || already_uses(a, b)) return 1; - if (!strong_try_module_get(b)) + /* If we're interrupted or time out, we fail. */ + if (wait_event_interruptible_timeout( + module_wq, (err = strong_try_module_get(b)) != -EBUSY, + 30 * HZ) <= 0) { + printk("%s: gave up waiting for init of module %s.\n", + a->name, b->name); + return 0; + } + + /* If strong_try_module_get() returned a different error, we fail. */ + if (err) return 0; DEBUGP("Allocating new usage for %s.\n", a->name); @@ -816,7 +832,7 @@ static inline void module_unload_free(struct module *mod) static inline int use_module(struct module *a, struct module *b) { - return strong_try_module_get(b); + return strong_try_module_get(b) == 0; } static inline void module_unload_init(struct module *mod) @@ -1326,7 +1342,7 @@ void *__symbol_get(const char *symbol) preempt_disable(); value = __find_symbol(symbol, &owner, &crc, 1); - if (value && !strong_try_module_get(owner)) + if (value && strong_try_module_get(owner) != 0) value = 0; preempt_enable(); @@ -2132,6 +2148,7 @@ sys_init_module(void __user *umod, mutex_lock(&module_mutex); free_module(mod); mutex_unlock(&module_mutex); + wake_up(&module_wq); return ret; } @@ -2146,6 +2163,7 @@ sys_init_module(void __user *umod, mod->init_size = 0; mod->init_text_size = 0; mutex_unlock(&module_mutex); + wake_up(&module_wq); return 0; } -- cgit v1.2.3 From efa5345e39d01deef349c120f55ac6b6eabe7457 Mon Sep 17 00:00:00 2001 From: Rusty Russell Date: Tue, 29 Jan 2008 17:13:20 -0500 Subject: module: Fix gratuitous sprintf in module.c Andrew sent an older version of this patch: we shouldn't use sprintf to copy a string. Signed-off-by: Rusty Russell --- kernel/module.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/module.c b/kernel/module.c index 17314691d3cc..276abd7b7ff7 100644 --- a/kernel/module.c +++ b/kernel/module.c @@ -738,7 +738,7 @@ sys_delete_module(const char __user *name_user, unsigned int flags) mutex_lock(&module_mutex); } /* Store the name of the last unloaded module for diagnostic purposes */ - sprintf(last_unloaded_module, mod->name); + strlcpy(last_unloaded_module, mod->name, sizeof(last_unloaded_module)); free_module(mod); out: -- cgit v1.2.3 From bb9d3d56e792d2619cc0903df4ac01d86ac1261d Mon Sep 17 00:00:00 2001 From: Rusty Russell Date: Tue, 29 Jan 2008 17:13:21 -0500 Subject: module: better OOPS and lockdep coverage for loading modules If we put the module in the linked list *before* calling into to, we get the module name and functions in the OOPS (is_module_address can find the module). It also helps lockdep in a similar way. 
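Illustrative sketch only (not part of this patch): the reason the early list insertion pays off is that attributing a faulting address to a module is essentially a walk of that same list, roughly:

        /* must run with preemption disabled or under module_mutex */
        static struct module *sketch_module_for_addr(unsigned long addr)
        {
                struct module *mod;

                list_for_each_entry(mod, &modules, list) {
                        unsigned long core = (unsigned long)mod->module_core;

                        if (addr >= core && addr < core + mod->core_size)
                                return mod;
                }
                return NULL;
        }

The real helpers (is_module_address() and friends) live in kernel/module.c; the sketch only shows why a module that is not yet on the list is invisible to them.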
Acked-and-tested-by: Joern Engel Tested-by: Erez Zadok Signed-off-by: Rusty Russell --- kernel/module.c | 38 ++++++++++++++++++++------------------ 1 file changed, 20 insertions(+), 18 deletions(-) (limited to 'kernel') diff --git a/kernel/module.c b/kernel/module.c index 276abd7b7ff7..12067ff34d01 100644 --- a/kernel/module.c +++ b/kernel/module.c @@ -1293,6 +1293,17 @@ static void mod_kobject_remove(struct module *mod) kobject_put(&mod->mkobj.kobj); } +/* + * link the module with the whole machine is stopped with interrupts off + * - this defends against kallsyms not taking locks + */ +static int __link_module(void *_mod) +{ + struct module *mod = _mod; + list_add(&mod->list, &modules); + return 0; +} + /* * unlink the module with the whole machine is stopped with interrupts off * - this defends against kallsyms not taking locks @@ -2035,6 +2046,11 @@ static struct module *load_module(void __user *umod, printk(KERN_WARNING "%s: Ignoring obsolete parameters\n", mod->name); + /* Now sew it into the lists so we can get lockdep and oops + * info during argument parsing. Noone should access us, since + * strong_try_module_get() will fail. */ + stop_machine_run(__link_module, mod, NR_CPUS); + /* Size of section 0 is 0, so this works well if no params */ err = parse_args(mod->name, mod->args, (struct kernel_param *) @@ -2043,7 +2059,7 @@ static struct module *load_module(void __user *umod, / sizeof(struct kernel_param), NULL); if (err < 0) - goto arch_cleanup; + goto unlink; err = mod_sysfs_setup(mod, (struct kernel_param *) @@ -2051,7 +2067,7 @@ static struct module *load_module(void __user *umod, sechdrs[setupindex].sh_size / sizeof(struct kernel_param)); if (err < 0) - goto arch_cleanup; + goto unlink; add_sect_attrs(mod, hdr->e_shnum, secstrings, sechdrs); add_notes_attrs(mod, hdr->e_shnum, secstrings, sechdrs); @@ -2066,7 +2082,8 @@ static struct module *load_module(void __user *umod, /* Done! */ return mod; - arch_cleanup: + unlink: + stop_machine_run(__unlink_module, mod, NR_CPUS); module_arch_cleanup(mod); cleanup: kobject_del(&mod->mkobj.kobj); @@ -2091,17 +2108,6 @@ static struct module *load_module(void __user *umod, goto free_hdr; } -/* - * link the module with the whole machine is stopped with interrupts off - * - this defends against kallsyms not taking locks - */ -static int __link_module(void *_mod) -{ - struct module *mod = _mod; - list_add(&mod->list, &modules); - return 0; -} - /* This is where the real work happens */ asmlinkage long sys_init_module(void __user *umod, @@ -2126,10 +2132,6 @@ sys_init_module(void __user *umod, return PTR_ERR(mod); } - /* Now sew it into the lists. They won't access us, since - strong_try_module_get() will fail. */ - stop_machine_run(__link_module, mod, NR_CPUS); - /* Drop lock so they can recurse */ mutex_unlock(&module_mutex); -- cgit v1.2.3 From 6dd06c9fbe025f542bce4cdb91790c0f91962722 Mon Sep 17 00:00:00 2001 From: Rusty Russell Date: Tue, 29 Jan 2008 17:13:22 -0500 Subject: module: make module_address_lookup safe module_address_lookup releases preemption then returns a pointer into the module space. The only user (kallsyms) copies the result, so just do that under the preempt disable. 
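A minimal sketch of the resulting calling convention (hypothetical caller, shown for illustration only; "addr" stands for whatever address is being resolved):

        char namebuf[KSYM_NAME_LEN];
        char *modname;
        unsigned long size, offset;
        char *name;

        /* namebuf is owned by the caller; the returned pointer points
         * into it, never into module memory that could be unloaded. */
        name = module_address_lookup(addr, &size, &offset, &modname, namebuf);
        if (name)
                printk(KERN_DEBUG "resolved %s [%s]\n", name, modname);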
Signed-off-by: Rusty Russell --- include/linux/module.h | 22 +++++++++++++--------- kernel/kallsyms.c | 11 ++++------- kernel/module.c | 22 +++++++++++++--------- 3 files changed, 30 insertions(+), 25 deletions(-) (limited to 'kernel') diff --git a/include/linux/module.h b/include/linux/module.h index c97bdb7eb957..aedc06be1de8 100644 --- a/include/linux/module.h +++ b/include/linux/module.h @@ -446,11 +446,14 @@ static inline void __module_get(struct module *module) __mod ? __mod->name : "kernel"; \ }) -/* For kallsyms to ask for address resolution. NULL means not found. */ -const char *module_address_lookup(unsigned long addr, - unsigned long *symbolsize, - unsigned long *offset, - char **modname); +/* For kallsyms to ask for address resolution. namebuf should be at + * least KSYM_NAME_LEN long: a pointer to namebuf is returned if + * found, otherwise NULL. */ +char *module_address_lookup(unsigned long addr, + unsigned long *symbolsize, + unsigned long *offset, + char **modname, + char *namebuf); int lookup_module_symbol_name(unsigned long addr, char *symname); int lookup_module_symbol_attrs(unsigned long addr, unsigned long *size, unsigned long *offset, char *modname, char *name); @@ -516,10 +519,11 @@ static inline void module_put(struct module *module) #define module_name(mod) "kernel" /* For kallsyms to ask for address resolution. NULL means not found. */ -static inline const char *module_address_lookup(unsigned long addr, - unsigned long *symbolsize, - unsigned long *offset, - char **modname) +static inline char *module_address_lookup(unsigned long addr, + unsigned long *symbolsize, + unsigned long *offset, + char **modname, + char *namebuf) { return NULL; } diff --git a/kernel/kallsyms.c b/kernel/kallsyms.c index 2fc25810509e..7dadc71ce516 100644 --- a/kernel/kallsyms.c +++ b/kernel/kallsyms.c @@ -233,10 +233,11 @@ static unsigned long get_symbol_pos(unsigned long addr, int kallsyms_lookup_size_offset(unsigned long addr, unsigned long *symbolsize, unsigned long *offset) { + char namebuf[KSYM_NAME_LEN]; if (is_ksym_addr(addr)) return !!get_symbol_pos(addr, symbolsize, offset); - return !!module_address_lookup(addr, symbolsize, offset, NULL); + return !!module_address_lookup(addr, symbolsize, offset, NULL, namebuf); } /* @@ -251,8 +252,6 @@ const char *kallsyms_lookup(unsigned long addr, unsigned long *offset, char **modname, char *namebuf) { - const char *msym; - namebuf[KSYM_NAME_LEN - 1] = 0; namebuf[0] = 0; @@ -268,10 +267,8 @@ const char *kallsyms_lookup(unsigned long addr, } /* see if it's in a module */ - msym = module_address_lookup(addr, symbolsize, offset, modname); - if (msym) - return strncpy(namebuf, msym, KSYM_NAME_LEN - 1); - + return module_address_lookup(addr, symbolsize, offset, modname, + namebuf); return NULL; } diff --git a/kernel/module.c b/kernel/module.c index 12067ff34d01..e814cd7da634 100644 --- a/kernel/module.c +++ b/kernel/module.c @@ -2230,14 +2230,13 @@ static const char *get_ksymbol(struct module *mod, return mod->strtab + mod->symtab[best].st_name; } -/* For kallsyms to ask for address resolution. NULL means not found. - We don't lock, as this is used for oops resolution and races are a - lesser concern. */ -/* FIXME: Risky: returns a pointer into a module w/o lock */ -const char *module_address_lookup(unsigned long addr, - unsigned long *size, - unsigned long *offset, - char **modname) +/* For kallsyms to ask for address resolution. NULL means not found. Careful + * not to lock to avoid deadlock on oopses, simply disable preemption. 
*/ +char *module_address_lookup(unsigned long addr, + unsigned long *size, + unsigned long *offset, + char **modname, + char *namebuf) { struct module *mod; const char *ret = NULL; @@ -2252,8 +2251,13 @@ const char *module_address_lookup(unsigned long addr, break; } } + /* Make a copy in here where it's safe */ + if (ret) { + strncpy(namebuf, ret, KSYM_NAME_LEN - 1); + ret = namebuf; + } preempt_enable(); - return ret; + return (char *)ret; } int lookup_module_symbol_name(unsigned long addr, char *symname) -- cgit v1.2.3 From 8686c99875f3849047660a5b6d02438443f22ce7 Mon Sep 17 00:00:00 2001 From: Denis Cheng Date: Mon, 21 Jan 2008 17:08:25 +0800 Subject: module: fix the module name length in param_sysfs_builtin the original code use KOBJ_NAME_LEN for built-in module name length, that's defined to 20 in linux/kobject.h, but this is not enough appearntly, many module names are longer than this; #define KOBJ_NAME_LEN 20 another macro is MODULE_NAME_LEN defined in linux/module.h, I think this is enough for module names: #define MODULE_NAME_LEN (64 - sizeof(unsigned long)) Signed-off-by: Denis Cheng Signed-off-by: Rusty Russell --- kernel/params.c | 8 +++----- 1 file changed, 3 insertions(+), 5 deletions(-) (limited to 'kernel') diff --git a/kernel/params.c b/kernel/params.c index 67f65ee7211d..42fe5e6126c0 100644 --- a/kernel/params.c +++ b/kernel/params.c @@ -376,8 +376,6 @@ int param_get_string(char *buffer, struct kernel_param *kp) extern struct kernel_param __start___param[], __stop___param[]; -#define MAX_KBUILD_MODNAME KOBJ_NAME_LEN - struct param_attribute { struct module_attribute mattr; @@ -587,7 +585,7 @@ static void __init param_sysfs_builtin(void) { struct kernel_param *kp, *kp_begin = NULL; unsigned int i, name_len, count = 0; - char modname[MAX_KBUILD_MODNAME + 1] = ""; + char modname[MODULE_NAME_LEN + 1] = ""; for (i=0; i < __stop___param - __start___param; i++) { char *dot; @@ -595,12 +593,12 @@ static void __init param_sysfs_builtin(void) kp = &__start___param[i]; max_name_len = - min_t(size_t, MAX_KBUILD_MODNAME, strlen(kp->name)); + min_t(size_t, MODULE_NAME_LEN, strlen(kp->name)); dot = memchr(kp->name, '.', max_name_len); if (!dot) { DEBUGP("couldn't find period in first %d characters " - "of %s\n", MAX_KBUILD_MODNAME, kp->name); + "of %s\n", MODULE_NAME_LEN, kp->name); continue; } name_len = dot - kp->name; -- cgit v1.2.3 From 0aa5bd52d0c49ca56d24584c646e6544ccbb3dc9 Mon Sep 17 00:00:00 2001 From: Jon Masters Date: Mon, 21 Jan 2008 20:43:41 +0000 Subject: module: add module taint on ndiswrapper The struct module taints member is supposed to store per-module taint data. The kernel knows about certain specific external modules that will taint the kernel, such as ndiswrapper. Use of ndiswrapper possibly should set the per-module taint in addition to the global kernel taint flag, unless we're arguing not because wrapper module itself is not what actually causes the kernel to be tainted as such? 
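For context, the per-module helper used here does roughly the following (sketch of the kernel/module.c helper as of this era; the exact taint-flag representation has varied between kernel versions):

        static inline void add_taint_module(struct module *mod, unsigned flag)
        {
                add_taint(flag);        /* global taint, as before */
                mod->taints |= flag;    /* and record it on the module itself */
        }

so the taint can be attributed to the specific module rather than only to the kernel as a whole.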
Signed-off-by: Jon Masters Signed-off-by: Rusty Russell --- kernel/module.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/module.c b/kernel/module.c index e814cd7da634..af3f81a94745 100644 --- a/kernel/module.c +++ b/kernel/module.c @@ -1916,7 +1916,7 @@ static struct module *load_module(void __user *umod, set_license(mod, get_modinfo(sechdrs, infoindex, "license")); if (strcmp(mod->name, "ndiswrapper") == 0) - add_taint(TAINT_PROPRIETARY_MODULE); + add_taint_module(mod, TAINT_PROPRIETARY_MODULE); if (strcmp(mod->name, "driverloader") == 0) add_taint_module(mod, TAINT_PROPRIETARY_MODULE); -- cgit v1.2.3 From 6494a93d55fad586238cc1940e846c6d03e1aaf6 Mon Sep 17 00:00:00 2001 From: Greg Kroah-Hartman Date: Sun, 27 Jan 2008 15:38:40 -0800 Subject: Module: check to see if we have a built in module with the same name When trying to load a module with the same name as a built-in one, a scary kobject backtrace comes up. Prevent that from checking for this condition and warning the user as to what exactly is going on. Cc: Rusty Russell Cc: Linus Torvalds Cc: Andrew Morton Signed-off-by: Greg Kroah-Hartman Signed-off-by: Rusty Russell --- kernel/module.c | 10 ++++++++++ 1 file changed, 10 insertions(+) (limited to 'kernel') diff --git a/kernel/module.c b/kernel/module.c index af3f81a94745..f6a4e721fd49 100644 --- a/kernel/module.c +++ b/kernel/module.c @@ -1230,6 +1230,7 @@ void module_remove_modinfo_attrs(struct module *mod) int mod_sysfs_init(struct module *mod) { int err; + struct kobject *kobj; if (!module_sysfs_initialized) { printk(KERN_ERR "%s: module sysfs not initialized\n", @@ -1237,6 +1238,15 @@ int mod_sysfs_init(struct module *mod) err = -EINVAL; goto out; } + + kobj = kset_find_obj(module_kset, mod->name); + if (kobj) { + printk(KERN_ERR "%s: module is already loaded\n", mod->name); + kobject_put(kobj); + err = -EINVAL; + goto out; + } + mod->mkobj.mod = mod; memset(&mod->mkobj.kobj, 0, sizeof(mod->mkobj.kobj)); -- cgit v1.2.3 From a6fa8e5a6172a5a5bc06ed04f34e50b36c978127 Mon Sep 17 00:00:00 2001 From: Pavel Machek Date: Wed, 30 Jan 2008 13:30:00 +0100 Subject: time: clean hungarian notation from timers Clean up hungarian notation from timer code. 
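The pattern being removed is the typedef'd, hungarian-style naming; kernel coding style prefers bare struct tags. Schematically (taken from the diff below):

        /* old: */
        typedef struct tvec_s {
                struct list_head vec[TVN_SIZE];
        } tvec_t;

        /* new: */
        struct tvec {
                struct list_head vec[TVN_SIZE];
        };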
Signed-off-by: Pavel Machek Cc: john stultz Signed-off-by: Andrew Morton Signed-off-by: Ingo Molnar Signed-off-by: Thomas Gleixner --- include/linux/timer.h | 6 ++-- kernel/timer.c | 80 +++++++++++++++++++++++++-------------------------- 2 files changed, 42 insertions(+), 44 deletions(-) (limited to 'kernel') diff --git a/include/linux/timer.h b/include/linux/timer.h index 78cf899b4409..de0e71359ede 100644 --- a/include/linux/timer.h +++ b/include/linux/timer.h @@ -5,7 +5,7 @@ #include #include -struct tvec_t_base_s; +struct tvec_base; struct timer_list { struct list_head entry; @@ -14,7 +14,7 @@ struct timer_list { void (*function)(unsigned long); unsigned long data; - struct tvec_t_base_s *base; + struct tvec_base *base; #ifdef CONFIG_TIMER_STATS void *start_site; char start_comm[16]; @@ -22,7 +22,7 @@ struct timer_list { #endif }; -extern struct tvec_t_base_s boot_tvec_bases; +extern struct tvec_base boot_tvec_bases; #define TIMER_INITIALIZER(_function, _expires, _data) { \ .function = (_function), \ diff --git a/kernel/timer.c b/kernel/timer.c index f739dfb539ce..aadfbc8367f5 100644 --- a/kernel/timer.c +++ b/kernel/timer.c @@ -58,59 +58,57 @@ EXPORT_SYMBOL(jiffies_64); #define TVN_MASK (TVN_SIZE - 1) #define TVR_MASK (TVR_SIZE - 1) -typedef struct tvec_s { +struct tvec { struct list_head vec[TVN_SIZE]; -} tvec_t; +}; -typedef struct tvec_root_s { +struct tvec_root { struct list_head vec[TVR_SIZE]; -} tvec_root_t; +}; -struct tvec_t_base_s { +struct tvec_base { spinlock_t lock; struct timer_list *running_timer; unsigned long timer_jiffies; - tvec_root_t tv1; - tvec_t tv2; - tvec_t tv3; - tvec_t tv4; - tvec_t tv5; + struct tvec_root tv1; + struct tvec tv2; + struct tvec tv3; + struct tvec tv4; + struct tvec tv5; } ____cacheline_aligned; -typedef struct tvec_t_base_s tvec_base_t; - -tvec_base_t boot_tvec_bases; +struct tvec_base boot_tvec_bases; EXPORT_SYMBOL(boot_tvec_bases); -static DEFINE_PER_CPU(tvec_base_t *, tvec_bases) = &boot_tvec_bases; +static DEFINE_PER_CPU(struct tvec_base *, tvec_bases) = &boot_tvec_bases; /* - * Note that all tvec_bases is 2 byte aligned and lower bit of + * Note that all tvec_bases are 2 byte aligned and lower bit of * base in timer_list is guaranteed to be zero. 
Use the LSB for * the new flag to indicate whether the timer is deferrable */ #define TBASE_DEFERRABLE_FLAG (0x1) /* Functions below help us manage 'deferrable' flag */ -static inline unsigned int tbase_get_deferrable(tvec_base_t *base) +static inline unsigned int tbase_get_deferrable(struct tvec_base *base) { return ((unsigned int)(unsigned long)base & TBASE_DEFERRABLE_FLAG); } -static inline tvec_base_t *tbase_get_base(tvec_base_t *base) +static inline struct tvec_base *tbase_get_base(struct tvec_base *base) { - return ((tvec_base_t *)((unsigned long)base & ~TBASE_DEFERRABLE_FLAG)); + return ((struct tvec_base *)((unsigned long)base & ~TBASE_DEFERRABLE_FLAG)); } static inline void timer_set_deferrable(struct timer_list *timer) { - timer->base = ((tvec_base_t *)((unsigned long)(timer->base) | + timer->base = ((struct tvec_base *)((unsigned long)(timer->base) | TBASE_DEFERRABLE_FLAG)); } static inline void -timer_set_base(struct timer_list *timer, tvec_base_t *new_base) +timer_set_base(struct timer_list *timer, struct tvec_base *new_base) { - timer->base = (tvec_base_t *)((unsigned long)(new_base) | + timer->base = (struct tvec_base *)((unsigned long)(new_base) | tbase_get_deferrable(timer->base)); } @@ -246,7 +244,7 @@ unsigned long round_jiffies_relative(unsigned long j) EXPORT_SYMBOL_GPL(round_jiffies_relative); -static inline void set_running_timer(tvec_base_t *base, +static inline void set_running_timer(struct tvec_base *base, struct timer_list *timer) { #ifdef CONFIG_SMP @@ -254,7 +252,7 @@ static inline void set_running_timer(tvec_base_t *base, #endif } -static void internal_add_timer(tvec_base_t *base, struct timer_list *timer) +static void internal_add_timer(struct tvec_base *base, struct timer_list *timer) { unsigned long expires = timer->expires; unsigned long idx = expires - base->timer_jiffies; @@ -371,14 +369,14 @@ static inline void detach_timer(struct timer_list *timer, * possible to set timer->base = NULL and drop the lock: the timer remains * locked. 
*/ -static tvec_base_t *lock_timer_base(struct timer_list *timer, +static struct tvec_base *lock_timer_base(struct timer_list *timer, unsigned long *flags) __acquires(timer->base->lock) { - tvec_base_t *base; + struct tvec_base *base; for (;;) { - tvec_base_t *prelock_base = timer->base; + struct tvec_base *prelock_base = timer->base; base = tbase_get_base(prelock_base); if (likely(base != NULL)) { spin_lock_irqsave(&base->lock, *flags); @@ -393,7 +391,7 @@ static tvec_base_t *lock_timer_base(struct timer_list *timer, int __mod_timer(struct timer_list *timer, unsigned long expires) { - tvec_base_t *base, *new_base; + struct tvec_base *base, *new_base; unsigned long flags; int ret = 0; @@ -445,7 +443,7 @@ EXPORT_SYMBOL(__mod_timer); */ void add_timer_on(struct timer_list *timer, int cpu) { - tvec_base_t *base = per_cpu(tvec_bases, cpu); + struct tvec_base *base = per_cpu(tvec_bases, cpu); unsigned long flags; timer_stats_timer_set_start_info(timer); @@ -508,7 +506,7 @@ EXPORT_SYMBOL(mod_timer); */ int del_timer(struct timer_list *timer) { - tvec_base_t *base; + struct tvec_base *base; unsigned long flags; int ret = 0; @@ -539,7 +537,7 @@ EXPORT_SYMBOL(del_timer); */ int try_to_del_timer_sync(struct timer_list *timer) { - tvec_base_t *base; + struct tvec_base *base; unsigned long flags; int ret = -1; @@ -591,7 +589,7 @@ int del_timer_sync(struct timer_list *timer) EXPORT_SYMBOL(del_timer_sync); #endif -static int cascade(tvec_base_t *base, tvec_t *tv, int index) +static int cascade(struct tvec_base *base, struct tvec *tv, int index) { /* cascade all the timers from tv up one level */ struct timer_list *timer, *tmp; @@ -620,7 +618,7 @@ static int cascade(tvec_base_t *base, tvec_t *tv, int index) * This function cascades all vectors and executes all expired timer * vectors. */ -static inline void __run_timers(tvec_base_t *base) +static inline void __run_timers(struct tvec_base *base) { struct timer_list *timer; @@ -678,13 +676,13 @@ static inline void __run_timers(tvec_base_t *base) * is used on S/390 to stop all activity when a cpus is idle. * This functions needs to be called disabled. */ -static unsigned long __next_timer_interrupt(tvec_base_t *base) +static unsigned long __next_timer_interrupt(struct tvec_base *base) { unsigned long timer_jiffies = base->timer_jiffies; unsigned long expires = timer_jiffies + NEXT_TIMER_MAX_DELTA; int index, slot, array, found = 0; struct timer_list *nte; - tvec_t *varray[4]; + struct tvec *varray[4]; /* Look for timer events in tv1. 
*/ index = slot = timer_jiffies & TVR_MASK; @@ -716,7 +714,7 @@ cascade: varray[3] = &base->tv5; for (array = 0; array < 4; array++) { - tvec_t *varp = varray[array]; + struct tvec *varp = varray[array]; index = slot = timer_jiffies & TVN_MASK; do { @@ -795,7 +793,7 @@ static unsigned long cmp_next_hrtimer_event(unsigned long now, */ unsigned long get_next_timer_interrupt(unsigned long now) { - tvec_base_t *base = __get_cpu_var(tvec_bases); + struct tvec_base *base = __get_cpu_var(tvec_bases); unsigned long expires; spin_lock(&base->lock); @@ -894,7 +892,7 @@ static inline void calc_load(unsigned long ticks) */ static void run_timer_softirq(struct softirq_action *h) { - tvec_base_t *base = __get_cpu_var(tvec_bases); + struct tvec_base *base = __get_cpu_var(tvec_bases); hrtimer_run_pending(); @@ -1223,7 +1221,7 @@ static struct lock_class_key base_lock_keys[NR_CPUS]; static int __cpuinit init_timers_cpu(int cpu) { int j; - tvec_base_t *base; + struct tvec_base *base; static char __cpuinitdata tvec_base_done[NR_CPUS]; if (!tvec_base_done[cpu]) { @@ -1278,7 +1276,7 @@ static int __cpuinit init_timers_cpu(int cpu) } #ifdef CONFIG_HOTPLUG_CPU -static void migrate_timer_list(tvec_base_t *new_base, struct list_head *head) +static void migrate_timer_list(struct tvec_base *new_base, struct list_head *head) { struct timer_list *timer; @@ -1292,8 +1290,8 @@ static void migrate_timer_list(tvec_base_t *new_base, struct list_head *head) static void __cpuinit migrate_timers(int cpu) { - tvec_base_t *old_base; - tvec_base_t *new_base; + struct tvec_base *old_base; + struct tvec_base *new_base; int i; BUG_ON(cpu_online(cpu)); -- cgit v1.2.3 From 4c9dc6412247abf4972080c51cd16a58c4009c19 Mon Sep 17 00:00:00 2001 From: Pavel Machek Date: Wed, 30 Jan 2008 13:30:00 +0100 Subject: time: timer cleanups Small cleanups to tick-related code. Wrong preempt count is followed by BUG(), so it is hardly KERN_WARNING. Signed-off-by: Pavel Machek Cc: john stultz Signed-off-by: Andrew Morton Signed-off-by: Ingo Molnar Signed-off-by: Thomas Gleixner --- kernel/time/tick-sched.c | 4 ++-- kernel/timer.c | 2 +- 2 files changed, 3 insertions(+), 3 deletions(-) (limited to 'kernel') diff --git a/kernel/time/tick-sched.c b/kernel/time/tick-sched.c index 1a21b6fdb674..d36ee2fd1a3b 100644 --- a/kernel/time/tick-sched.c +++ b/kernel/time/tick-sched.c @@ -296,7 +296,7 @@ void tick_nohz_stop_sched_tick(void) /* Check, if the timer was already in the past */ if (hrtimer_active(&ts->sched_timer)) goto out; - } else if(!tick_program_event(expires, 0)) + } else if (!tick_program_event(expires, 0)) goto out; /* * We are past the event already. So we crossed a @@ -507,7 +507,7 @@ static inline void tick_nohz_switch_to_nohz(void) { } */ #ifdef CONFIG_HIGH_RES_TIMERS /* - * We rearm the timer until we get disabled by the idle code + * We rearm the timer until we get disabled by the idle code. * Called with interrupts disabled and timer->base->cpu_base->lock held. 
*/ static enum hrtimer_restart tick_sched_timer(struct hrtimer *timer) diff --git a/kernel/timer.c b/kernel/timer.c index aadfbc8367f5..23f7ead78fae 100644 --- a/kernel/timer.c +++ b/kernel/timer.c @@ -655,7 +655,7 @@ static inline void __run_timers(struct tvec_base *base) int preempt_count = preempt_count(); fn(data); if (preempt_count != preempt_count()) { - printk(KERN_WARNING "huh, entered %p " + printk(KERN_ERR "huh, entered %p " "with preempt_count %08x, exited" " with %08x?\n", fn, preempt_count, -- cgit v1.2.3 From b10db7f0d2b589a7f88dc3026e150756cb437a28 Mon Sep 17 00:00:00 2001 From: Pavel Machek Date: Wed, 30 Jan 2008 13:30:00 +0100 Subject: time: more timer related cleanups I was confused by FSEC = 10^15 NSEC statement, plus small whitespace fixes. When there's copyright, there should be GPL. Signed-off-by: Pavel Machek Signed-off-by: Ingo Molnar Signed-off-by: Thomas Gleixner --- arch/x86/kernel/hpet.c | 3 ++- arch/x86/kernel/process_64.c | 2 +- kernel/softirq.c | 4 +++- kernel/time/tick-sched.c | 2 +- kernel/time/timer_stats.c | 2 +- 5 files changed, 8 insertions(+), 5 deletions(-) (limited to 'kernel') diff --git a/arch/x86/kernel/hpet.c b/arch/x86/kernel/hpet.c index 2f99ee206b95..9ec2ab793042 100644 --- a/arch/x86/kernel/hpet.c +++ b/arch/x86/kernel/hpet.c @@ -16,7 +16,8 @@ #define HPET_MASK CLOCKSOURCE_MASK(32) #define HPET_SHIFT 22 -/* FSEC = 10^-15 NSEC = 10^-9 */ +/* FSEC = 10^-15 + NSEC = 10^-9 */ #define FSEC_PER_NSEC 1000000 /* diff --git a/arch/x86/kernel/process_64.c b/arch/x86/kernel/process_64.c index ab79e1dfa023..c2db7ef93565 100644 --- a/arch/x86/kernel/process_64.c +++ b/arch/x86/kernel/process_64.c @@ -207,7 +207,7 @@ static inline void play_dead(void) * low exit latency (ie sit in a loop waiting for * somebody to say that they'd like to reschedule) */ -void cpu_idle (void) +void cpu_idle(void) { current_thread_info()->status |= TS_POLLING; /* endless idle loop with no priority at all */ diff --git a/kernel/softirq.c b/kernel/softirq.c index bd89bc4eb0b9..8fe1ff40102d 100644 --- a/kernel/softirq.c +++ b/kernel/softirq.c @@ -3,7 +3,9 @@ * * Copyright (C) 1992 Linus Torvalds * - * Rewritten. Old one was good in 2.2, but in 2.3 it was immoral. --ANK (990903) + * Distribute under GPLv2. + * + * Rewritten. Old one was good in 2.2, but in 2.3 it was immoral. --ANK (990903) */ #include diff --git a/kernel/time/tick-sched.c b/kernel/time/tick-sched.c index d36ee2fd1a3b..49e12f6a4bab 100644 --- a/kernel/time/tick-sched.c +++ b/kernel/time/tick-sched.c @@ -9,7 +9,7 @@ * * Started by: Thomas Gleixner and Ingo Molnar * - * For licencing details see kernel-base/COPYING + * Distribute under GPLv2. */ #include #include diff --git a/kernel/time/timer_stats.c b/kernel/time/timer_stats.c index c36bb7ed0301..417da8c5bc72 100644 --- a/kernel/time/timer_stats.c +++ b/kernel/time/timer_stats.c @@ -26,7 +26,7 @@ * the pid and cmdline from the owner process if applicable. 
* * Start/stop data collection: - * # echo 1[0] >/proc/timer_stats + * # echo [1|0] >/proc/timer_stats * * Display the information collected so far: * # cat /proc/timer_stats -- cgit v1.2.3 From 186e3cb8a465bac010ee3b020768d2fa2b505aef Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Wed, 30 Jan 2008 13:30:01 +0100 Subject: timer: clean up tick-broadcast.c clean up tick-broadcast.c Signed-off-by: Thomas Gleixner Signed-off-by: Ingo Molnar --- kernel/time/tick-broadcast.c | 7 ++----- kernel/time/tick-internal.h | 2 -- 2 files changed, 2 insertions(+), 7 deletions(-) (limited to 'kernel') diff --git a/kernel/time/tick-broadcast.c b/kernel/time/tick-broadcast.c index 5b86698faa0b..e1bd50cbbf5d 100644 --- a/kernel/time/tick-broadcast.c +++ b/kernel/time/tick-broadcast.c @@ -126,9 +126,9 @@ int tick_device_uses_broadcast(struct clock_event_device *dev, int cpu) /* * Broadcast the event to the cpus, which are set in the mask */ -int tick_do_broadcast(cpumask_t mask) +static void tick_do_broadcast(cpumask_t mask) { - int ret = 0, cpu = smp_processor_id(); + int cpu = smp_processor_id(); struct tick_device *td; /* @@ -138,7 +138,6 @@ int tick_do_broadcast(cpumask_t mask) cpu_clear(cpu, mask); td = &per_cpu(tick_cpu_device, cpu); td->evtdev->event_handler(td->evtdev); - ret = 1; } if (!cpus_empty(mask)) { @@ -151,9 +150,7 @@ int tick_do_broadcast(cpumask_t mask) cpu = first_cpu(mask); td = &per_cpu(tick_cpu_device, cpu); td->evtdev->broadcast(mask); - ret = 1; } - return ret; } /* diff --git a/kernel/time/tick-internal.h b/kernel/time/tick-internal.h index bb13f2724905..f13f2b7f4fd4 100644 --- a/kernel/time/tick-internal.h +++ b/kernel/time/tick-internal.h @@ -70,8 +70,6 @@ static inline int tick_resume_broadcast_oneshot(struct clock_event_device *bc) * Broadcasting support */ #ifdef CONFIG_GENERIC_CLOCKEVENTS_BROADCAST -extern int tick_do_broadcast(cpumask_t mask); - extern int tick_device_uses_broadcast(struct clock_event_device *dev, int cpu); extern int tick_check_broadcast_device(struct clock_event_device *dev); extern int tick_is_broadcast_device(struct clock_event_device *dev); -- cgit v1.2.3 From efd9ac8630e89b9ee7ce64008bd7783952374f37 Mon Sep 17 00:00:00 2001 From: Geert Uytterhoeven Date: Wed, 30 Jan 2008 13:30:01 +0100 Subject: time: fold __get_realtime_clock_ts() into getnstimeofday() - getnstimeofday() was just a wrapper around __get_realtime_clock_ts() - Replace calls to __get_realtime_clock_ts() by calls to getnstimeofday() - Fix bogus reference to get_realtime_clock_ts(), which never existed Signed-off-by: Geert Uytterhoeven Cc: john stultz Signed-off-by: Andrew Morton Signed-off-by: Ingo Molnar Signed-off-by: Thomas Gleixner --- kernel/time/timekeeping.c | 22 +++++----------------- 1 file changed, 5 insertions(+), 17 deletions(-) (limited to 'kernel') diff --git a/kernel/time/timekeeping.c b/kernel/time/timekeeping.c index ab46ae8c062b..77680195cf84 100644 --- a/kernel/time/timekeeping.c +++ b/kernel/time/timekeeping.c @@ -82,13 +82,12 @@ static inline s64 __get_nsec_offset(void) } /** - * __get_realtime_clock_ts - Returns the time of day in a timespec + * getnstimeofday - Returns the time of day in a timespec * @ts: pointer to the timespec to be set * - * Returns the time of day in a timespec. Used by - * do_gettimeofday() and get_realtime_clock_ts(). + * Returns the time of day in a timespec. 
*/ -static inline void __get_realtime_clock_ts(struct timespec *ts) +void getnstimeofday(struct timespec *ts) { unsigned long seq; s64 nsecs; @@ -104,30 +103,19 @@ static inline void __get_realtime_clock_ts(struct timespec *ts) timespec_add_ns(ts, nsecs); } -/** - * getnstimeofday - Returns the time of day in a timespec - * @ts: pointer to the timespec to be set - * - * Returns the time of day in a timespec. - */ -void getnstimeofday(struct timespec *ts) -{ - __get_realtime_clock_ts(ts); -} - EXPORT_SYMBOL(getnstimeofday); /** * do_gettimeofday - Returns the time of day in a timeval * @tv: pointer to the timeval to be set * - * NOTE: Users should be converted to using get_realtime_clock_ts() + * NOTE: Users should be converted to using getnstimeofday() */ void do_gettimeofday(struct timeval *tv) { struct timespec now; - __get_realtime_clock_ts(&now); + getnstimeofday(&now); tv->tv_sec = now.tv_sec; tv->tv_usec = now.tv_nsec/1000; } -- cgit v1.2.3 From 1077f5a917b7c630231037826b344b2f7f5b903f Mon Sep 17 00:00:00 2001 From: Parag Warudkar Date: Wed, 30 Jan 2008 13:30:01 +0100 Subject: clocksource.c: use init_timer_deferrable for clocksource_watchdog clocksource_watchdog can use a deferrable timer - reduces wakeups from idle per second. Signed-off-by: Parag Warudkar Cc: john stultz Signed-off-by: Andrew Morton Signed-off-by: Ingo Molnar Signed-off-by: Thomas Gleixner --- kernel/time/clocksource.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/time/clocksource.c b/kernel/time/clocksource.c index 8d6125ad2cf0..cabfa193efb3 100644 --- a/kernel/time/clocksource.c +++ b/kernel/time/clocksource.c @@ -175,7 +175,7 @@ static void clocksource_check_watchdog(struct clocksource *cs) if (watchdog) del_timer(&watchdog_timer); watchdog = cs; - init_timer(&watchdog_timer); + init_timer_deferrable(&watchdog_timer); watchdog_timer.function = clocksource_watchdog; /* Reset watchdog cycles */ -- cgit v1.2.3 From 1ada5cba6a0318f90e45b38557e7b5206a9cba38 Mon Sep 17 00:00:00 2001 From: Andi Kleen Date: Wed, 30 Jan 2008 13:30:02 +0100 Subject: clocksource: make clocksource watchdog cycle through online CPUs This way it checks if the clocks are synchronized between CPUs too. This might be able to detect slowly drifting TSCs which only go wrong over longer time. Signed-off-by: Andi Kleen Signed-off-by: Ingo Molnar Signed-off-by: Thomas Gleixner --- kernel/time/clocksource.c | 14 ++++++++++---- 1 file changed, 10 insertions(+), 4 deletions(-) (limited to 'kernel') diff --git a/kernel/time/clocksource.c b/kernel/time/clocksource.c index cabfa193efb3..edd5ef8e1765 100644 --- a/kernel/time/clocksource.c +++ b/kernel/time/clocksource.c @@ -142,8 +142,13 @@ static void clocksource_watchdog(unsigned long data) } if (!list_empty(&watchdog_list)) { - __mod_timer(&watchdog_timer, - watchdog_timer.expires + WATCHDOG_INTERVAL); + /* Cycle through CPUs to check if the CPUs stay synchronized to + * each other. 
*/ + int next_cpu = next_cpu(raw_smp_processor_id(), cpu_online_map); + if (next_cpu >= NR_CPUS) + next_cpu = first_cpu(cpu_online_map); + watchdog_timer.expires += WATCHDOG_INTERVAL; + add_timer_on(&watchdog_timer, next_cpu); } spin_unlock(&watchdog_lock); } @@ -165,7 +170,7 @@ static void clocksource_check_watchdog(struct clocksource *cs) if (!started && watchdog) { watchdog_last = watchdog->read(); watchdog_timer.expires = jiffies + WATCHDOG_INTERVAL; - add_timer(&watchdog_timer); + add_timer_on(&watchdog_timer, first_cpu(cpu_online_map)); } } else { if (cs->flags & CLOCK_SOURCE_IS_CONTINUOUS) @@ -186,7 +191,8 @@ static void clocksource_check_watchdog(struct clocksource *cs) watchdog_last = watchdog->read(); watchdog_timer.expires = jiffies + WATCHDOG_INTERVAL; - add_timer(&watchdog_timer); + add_timer_on(&watchdog_timer, + first_cpu(cpu_online_map)); } } } -- cgit v1.2.3 From 4713e22ce81eb8b3353e16435362eb3d0ec95640 Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Wed, 30 Jan 2008 13:30:02 +0100 Subject: clocksource: add unregister function to disable unusable clocksources On x86 the PIT might become an unusable clocksource. Add an unregister function to provide a possibilty to remove the PIT from the list of available clock sources. Signed-off-by: Thomas Gleixner Signed-off-by: Ingo Molnar --- arch/x86/kernel/i8253.c | 1 + include/linux/clocksource.h | 1 + kernel/time/clocksource.c | 15 +++++++++++++++ 3 files changed, 17 insertions(+) (limited to 'kernel') diff --git a/arch/x86/kernel/i8253.c b/arch/x86/kernel/i8253.c index 0f8f35458a8f..decc5d294d76 100644 --- a/arch/x86/kernel/i8253.c +++ b/arch/x86/kernel/i8253.c @@ -13,6 +13,7 @@ #include #include #include +#include DEFINE_SPINLOCK(i8253_lock); EXPORT_SYMBOL(i8253_lock); diff --git a/include/linux/clocksource.h b/include/linux/clocksource.h index 07b42153de24..85778a4b1209 100644 --- a/include/linux/clocksource.h +++ b/include/linux/clocksource.h @@ -215,6 +215,7 @@ static inline void clocksource_calculate_interval(struct clocksource *c, /* used to install a new clocksource */ extern int clocksource_register(struct clocksource*); +extern void clocksource_unregister(struct clocksource*); extern struct clocksource* clocksource_get_next(void); extern void clocksource_change_rating(struct clocksource *cs, int rating); extern void clocksource_resume(void); diff --git a/kernel/time/clocksource.c b/kernel/time/clocksource.c index edd5ef8e1765..6e9259a5d501 100644 --- a/kernel/time/clocksource.c +++ b/kernel/time/clocksource.c @@ -337,6 +337,21 @@ void clocksource_change_rating(struct clocksource *cs, int rating) spin_unlock_irqrestore(&clocksource_lock, flags); } +/** + * clocksource_unregister - remove a registered clocksource + */ +void clocksource_unregister(struct clocksource *cs) +{ + unsigned long flags; + + spin_lock_irqsave(&clocksource_lock, flags); + list_del(&cs->list); + if (clocksource_override == cs) + clocksource_override = NULL; + next_clocksource = select_clocksource(); + spin_unlock_irqrestore(&clocksource_lock, flags); +} + #ifdef CONFIG_SYSFS /** * sysfs_show_current_clocksources - sysfs interface for current clocksource -- cgit v1.2.3 From 45fe4fe19120a22f7339f5bb110447170c25fca9 Mon Sep 17 00:00:00 2001 From: Ingo Molnar Date: Wed, 30 Jan 2008 13:30:03 +0100 Subject: x86: make clockevents more robust detect zero event-device multiplicators - they then cause division-by-zero crashes if a clockevent has been initialized incorrectly. 
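For context, the conversion that a zero multiplicator would break is the delta-to-nanoseconds helper, which divides by it (sketch of the existing clockevent_delta2ns() logic visible in the hunk below):

        u64 clc = (u64)latch << evt->shift;

        do_div(clc, evt->mult);   /* divide-by-zero if mult was never set */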
Signed-off-by: Ingo Molnar Signed-off-by: Thomas Gleixner --- kernel/time/clockevents.c | 13 +++++++++++++ 1 file changed, 13 insertions(+) (limited to 'kernel') diff --git a/kernel/time/clockevents.c b/kernel/time/clockevents.c index 5fb139fef9fa..3e59fce6dd43 100644 --- a/kernel/time/clockevents.c +++ b/kernel/time/clockevents.c @@ -41,6 +41,11 @@ unsigned long clockevent_delta2ns(unsigned long latch, { u64 clc = ((u64) latch << evt->shift); + if (unlikely(!evt->mult)) { + evt->mult = 1; + WARN_ON(1); + } + do_div(clc, evt->mult); if (clc < 1000) clc = 1000; @@ -151,6 +156,14 @@ static void clockevents_notify_released(void) void clockevents_register_device(struct clock_event_device *dev) { BUG_ON(dev->mode != CLOCK_EVT_MODE_UNUSED); + /* + * A nsec2cyc multiplicator of 0 is invalid and we'd crash + * on it, so fix it up and emit a warning: + */ + if (unlikely(!dev->mult)) { + dev->mult = 1; + WARN_ON(1); + } spin_lock(&clockevents_lock); -- cgit v1.2.3 From bbe4d18ac2e058c56adb0cd71f49d9ed3216a405 Mon Sep 17 00:00:00 2001 From: john stultz Date: Wed, 30 Jan 2008 13:30:03 +0100 Subject: NTP: correct inconsistent ntp interval/tick_length usage I recently noticed on one of my boxes that when synched with an NTP server, the drift value reported for the system was ~283ppm. While in some cases, clock hardware can be that bad, it struck me as unusual as the system was using the acpi_pm clocksource, which is one of the more trustworthy and accurate clocksources on x86 hardware. I brought up another system and let it sync to the same NTP server, and I noticed a similar 280some ppm drift. In looking at the code, I found that the acpi_pm's constant frequency was being computed correctly at boot-up, however once the system was up, even without the ntp daemon running, the clocksource's frequency was being modified by the clocksource_adjust() function. Digging deeper, I realized that in the code that keeps track of how much the clocksource is skewing from the ntp desired time, we were using different lengths to establish how long an time interval was. The clocksource was being setup with the following interval: NTP_INTERVAL_LENGTH = NSEC_PER_SEC/NTP_INTERVAL_FREQ While the ntp code was using the tick_length_base value: tick_length_base ~= (tick_usec * NSEC_PER_USEC * USER_HZ) /NTP_INTERVAL_FREQ The subtle difference is: (tick_usec * NSEC_PER_USEC * USER_HZ) != NSEC_PER_SEC This difference in calculation was causing the clocksource correction code to apply a correction factor to the clocksource so the two intervals were the same, however this results in the actual frequency of the clocksource to be made incorrect. I believe this difference would affect all clocksources, although to differing degrees depending on the clocksource resolution. The issue was introduced when my HZ free ntp patch landed in 2.6.21-rc1, so my apologies for the mistake, and for not noticing it until now. The following patch, corrects the clocksource's initialization code so it uses the same interval length as the code in ntp.c. After applying this patch, the drift value for the same system went from ~283ppm to only 2.635ppm. I believe this patch to be good, however it does affect all arches and I've only tested on x86, so some caution is advised. I do think it would be a likely candidate for a stable 2.6.24.x release. Any thoughts or feedback would be appreciated. 
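Purely as an illustration of how the two interval definitions can diverge (numbers chosen for illustration, not taken from the report above): on a configuration with USER_HZ=1024, tick_usec rounds to an integer number of microseconds, so

        tick_usec = (1000000 + 1024/2) / 1024   = 977 usec
        977 usec * 1000 nsec/usec * 1024        = 1,000,448,000 nsec
        NSEC_PER_SEC                            = 1,000,000,000 nsec
        => mismatch                             ~ 448 ppm

and the clocksource adjustment code would then skew the clocksource frequency by that amount to make the two lengths agree.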
Signed-off-by: John Stultz Signed-off-by: Ingo Molnar Signed-off-by: Thomas Gleixner --- kernel/time/timekeeping.c | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) (limited to 'kernel') diff --git a/kernel/time/timekeeping.c b/kernel/time/timekeeping.c index 77680195cf84..092a2366b5a9 100644 --- a/kernel/time/timekeeping.c +++ b/kernel/time/timekeeping.c @@ -186,7 +186,8 @@ static void change_clocksource(void) clock->error = 0; clock->xtime_nsec = 0; - clocksource_calculate_interval(clock, NTP_INTERVAL_LENGTH); + clocksource_calculate_interval(clock, + (unsigned long)(current_tick_length()>>TICK_LENGTH_SHIFT)); tick_clock_notify(); @@ -243,7 +244,8 @@ void __init timekeeping_init(void) ntp_clear(); clock = clocksource_get_next(); - clocksource_calculate_interval(clock, NTP_INTERVAL_LENGTH); + clocksource_calculate_interval(clock, + (unsigned long)(current_tick_length()>>TICK_LENGTH_SHIFT)); clock->cycle_last = clocksource_read(clock); xtime.tv_sec = sec; -- cgit v1.2.3 From 6378ddb592158db4b42197f1bc8666228800e379 Mon Sep 17 00:00:00 2001 From: Venki Pallipadi Date: Wed, 30 Jan 2008 13:30:04 +0100 Subject: time: track accurate idle time with tick_sched.idle_sleeptime Current idle time in kstat is based on jiffies and is coarse grained. tick_sched.idle_sleeptime is making some attempt to keep track of idle time in a fine grained manner. But, it is not handling the time spent in interrupts fully. Make tick_sched.idle_sleeptime accurate with respect to time spent on handling interrupts and also add tick_sched.idle_lastupdate, which keeps track of last time when idle_sleeptime was updated. This statistics will be crucial for cpufreq-ondemand governor, which can shed some conservative gaurd band that is uses today while setting the frequency. The ondemand changes that uses the exact idle time is coming soon. 
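A minimal sketch of how a consumer such as the ondemand governor might use the new interface (hypothetical caller, shown only to illustrate the intended usage; "cpu" is whatever CPU is being sampled):

        u64 idle_us, last_update_us;
        static u64 prev_idle_us;
        u64 idle_in_window;

        idle_us = get_cpu_idle_time_us(cpu, &last_update_us);
        idle_in_window = idle_us - prev_idle_us;    /* usec idle since last sample */
        prev_idle_us = idle_us;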
Signed-off-by: Venkatesh Pallipadi Signed-off-by: Andrew Morton Signed-off-by: Ingo Molnar Signed-off-by: Thomas Gleixner --- include/linux/tick.h | 6 +++++ kernel/softirq.c | 7 ++++- kernel/time/tick-sched.c | 70 +++++++++++++++++++++++++++++++++--------------- 3 files changed, 60 insertions(+), 23 deletions(-) (limited to 'kernel') diff --git a/include/linux/tick.h b/include/linux/tick.h index f4a1395e05ff..0fadf95debe1 100644 --- a/include/linux/tick.h +++ b/include/linux/tick.h @@ -51,8 +51,10 @@ struct tick_sched { unsigned long idle_jiffies; unsigned long idle_calls; unsigned long idle_sleeps; + int idle_active; ktime_t idle_entrytime; ktime_t idle_sleeptime; + ktime_t idle_lastupdate; ktime_t sleep_length; unsigned long last_jiffies; unsigned long next_jiffies; @@ -103,6 +105,8 @@ extern void tick_nohz_stop_sched_tick(void); extern void tick_nohz_restart_sched_tick(void); extern void tick_nohz_update_jiffies(void); extern ktime_t tick_nohz_get_sleep_length(void); +extern void tick_nohz_stop_idle(int cpu); +extern u64 get_cpu_idle_time_us(int cpu, u64 *last_update_time); # else static inline void tick_nohz_stop_sched_tick(void) { } static inline void tick_nohz_restart_sched_tick(void) { } @@ -113,6 +117,8 @@ static inline ktime_t tick_nohz_get_sleep_length(void) return len; } +static inline void tick_nohz_stop_idle(int cpu) { } +static inline u64 get_cpu_idle_time_us(int cpu, u64 *unused) { return 0; } # endif /* !NO_HZ */ #endif diff --git a/kernel/softirq.c b/kernel/softirq.c index 8fe1ff40102d..d7837d45419e 100644 --- a/kernel/softirq.c +++ b/kernel/softirq.c @@ -280,9 +280,14 @@ asmlinkage void do_softirq(void) */ void irq_enter(void) { +#ifdef CONFIG_NO_HZ + int cpu = smp_processor_id(); + if (idle_cpu(cpu) && !in_interrupt()) + tick_nohz_stop_idle(cpu); +#endif __irq_enter(); #ifdef CONFIG_NO_HZ - if (idle_cpu(smp_processor_id())) + if (idle_cpu(cpu)) tick_nohz_update_jiffies(); #endif } diff --git a/kernel/time/tick-sched.c b/kernel/time/tick-sched.c index 49e12f6a4bab..63f24b550695 100644 --- a/kernel/time/tick-sched.c +++ b/kernel/time/tick-sched.c @@ -143,6 +143,44 @@ void tick_nohz_update_jiffies(void) local_irq_restore(flags); } +void tick_nohz_stop_idle(int cpu) +{ + struct tick_sched *ts = &per_cpu(tick_cpu_sched, cpu); + + if (ts->idle_active) { + ktime_t now, delta; + now = ktime_get(); + delta = ktime_sub(now, ts->idle_entrytime); + ts->idle_lastupdate = now; + ts->idle_sleeptime = ktime_add(ts->idle_sleeptime, delta); + ts->idle_active = 0; + } +} + +static ktime_t tick_nohz_start_idle(int cpu) +{ + struct tick_sched *ts = &per_cpu(tick_cpu_sched, cpu); + ktime_t now, delta; + + now = ktime_get(); + if (ts->idle_active) { + delta = ktime_sub(now, ts->idle_entrytime); + ts->idle_lastupdate = now; + ts->idle_sleeptime = ktime_add(ts->idle_sleeptime, delta); + } + ts->idle_entrytime = now; + ts->idle_active = 1; + return now; +} + +u64 get_cpu_idle_time_us(int cpu, u64 *last_update_time) +{ + struct tick_sched *ts = &per_cpu(tick_cpu_sched, cpu); + + *last_update_time = ktime_to_us(ts->idle_lastupdate); + return ktime_to_us(ts->idle_sleeptime); +} + /** * tick_nohz_stop_sched_tick - stop the idle tick from the idle task * @@ -155,13 +193,14 @@ void tick_nohz_stop_sched_tick(void) unsigned long seq, last_jiffies, next_jiffies, delta_jiffies, flags; unsigned long rt_jiffies; struct tick_sched *ts; - ktime_t last_update, expires, now, delta; + ktime_t last_update, expires, now; struct clock_event_device *dev = __get_cpu_var(tick_cpu_device).evtdev; int cpu; 
local_irq_save(flags); cpu = smp_processor_id(); + now = tick_nohz_start_idle(cpu); ts = &per_cpu(tick_cpu_sched, cpu); /* @@ -193,19 +232,7 @@ void tick_nohz_stop_sched_tick(void) } } - now = ktime_get(); - /* - * When called from irq_exit we need to account the idle sleep time - * correctly. - */ - if (ts->tick_stopped) { - delta = ktime_sub(now, ts->idle_entrytime); - ts->idle_sleeptime = ktime_add(ts->idle_sleeptime, delta); - } - - ts->idle_entrytime = now; ts->idle_calls++; - /* Read jiffies and the time when jiffies were updated last */ do { seq = read_seqbegin(&xtime_lock); @@ -337,23 +364,22 @@ void tick_nohz_restart_sched_tick(void) int cpu = smp_processor_id(); struct tick_sched *ts = &per_cpu(tick_cpu_sched, cpu); unsigned long ticks; - ktime_t now, delta; + ktime_t now; - if (!ts->tick_stopped) + local_irq_disable(); + tick_nohz_stop_idle(cpu); + + if (!ts->tick_stopped) { + local_irq_enable(); return; + } /* Update jiffies first */ - now = ktime_get(); - - local_irq_disable(); select_nohz_load_balancer(0); + now = ktime_get(); tick_do_update_jiffies64(now); cpu_clear(cpu, nohz_cpu_mask); - /* Account the idle time */ - delta = ktime_sub(now, ts->idle_entrytime); - ts->idle_sleeptime = ktime_add(ts->idle_sleeptime, delta); - /* * We stopped the tick in idle. Update process times would miss the * time we slept as update_process_times does only a 1 tick -- cgit v1.2.3 From 6e7c402590b75b6b45138792445ee0f0315a8473 Mon Sep 17 00:00:00 2001 From: Ingo Molnar Date: Wed, 30 Jan 2008 13:30:05 +0100 Subject: x86: various changes and cleanups to in_p/out_p delay details various changes to the in_p/out_p delay details: - add the io_delay=none method - make each method selectable from the kernel config - simplify the delay code a bit by getting rid of an indirect function call - add the /proc/sys/kernel/io_delay_type sysctl - change 'io_delay=standard|alternate' to io_delay=0x80 and io_delay=0xed - make the io delay config not depend on CONFIG_DEBUG_KERNEL Signed-off-by: Ingo Molnar Signed-off-by: Thomas Gleixner Tested-by: "David P. Reed" --- Documentation/kernel-parameters.txt | 12 ++-- arch/x86/Kconfig.debug | 79 ++++++++++++++++++++++++--- arch/x86/kernel/io_delay.c | 106 +++++++++++++++++------------------- include/asm-x86/io_32.h | 10 +--- include/asm-x86/io_64.h | 10 +--- kernel/sysctl.c | 9 +++ 6 files changed, 143 insertions(+), 83 deletions(-) (limited to 'kernel') diff --git a/Documentation/kernel-parameters.txt b/Documentation/kernel-parameters.txt index 9e6056058425..b427b7c0e5d0 100644 --- a/Documentation/kernel-parameters.txt +++ b/Documentation/kernel-parameters.txt @@ -795,12 +795,14 @@ and is between 256 and 4096 characters. It is defined in the file then look in the higher range. io_delay= [X86-32,X86-64] I/O delay method - standard - Standard port 0x80 delay - alternate - Alternate port 0xed delay + 0x80 + Standard port 0x80 based delay + 0xed + Alternate port 0xed based delay (needed on some systems) udelay - Simple two microsecond delay + Simple two microseconds delay + none + No delay io7= [HW] IO7 for Marvel based alpha systems See comment before marvel_specify_io7 in diff --git a/arch/x86/Kconfig.debug b/arch/x86/Kconfig.debug index 40aba670fb37..77eda46f97b8 100644 --- a/arch/x86/Kconfig.debug +++ b/arch/x86/Kconfig.debug @@ -112,13 +112,78 @@ config IOMMU_LEAK Add a simple leak tracer to the IOMMU code. This is useful when you are debugging a buggy device driver that leaks IOMMU mappings. 
-config UDELAY_IO_DELAY - bool "Delay I/O through udelay instead of outb" - depends on DEBUG_KERNEL +# +# IO delay types: +# + +config IO_DELAY_TYPE_0X80 + int + default "0" + +config IO_DELAY_TYPE_0XED + int + default "1" + +config IO_DELAY_TYPE_UDELAY + int + default "2" + +config IO_DELAY_TYPE_NONE + int + default "3" + +choice + prompt "IO delay type" + default IO_DELAY_0X80 + +config IO_DELAY_0X80 + bool "port 0x80 based port-IO delay [recommended]" + help + This is the traditional Linux IO delay used for in/out_p. + It is the most tested hence safest selection here. + +config IO_DELAY_0XED + bool "port 0xed based port-IO delay" + help + Use port 0xed as the IO delay. This frees up port 0x80 which is + often used as a hardware-debug port. + +config IO_DELAY_UDELAY + bool "udelay based port-IO delay" + help + Use udelay(2) as the IO delay method. This provides the delay + while not having any side-effect on the IO port space. + +config IO_DELAY_NONE + bool "no port-IO delay" help - Make inb_p/outb_p use udelay() based delays by default. Please note - that udelay() does not have the same bus-level side-effects that - the normal outb based delay does meaning this could cause drivers - to change behaviour and/or bugs to surface. + No port-IO delay. Will break on old boxes that require port-IO + delay for certain operations. Should work on most new machines. + +endchoice + +if IO_DELAY_0X80 +config DEFAULT_IO_DELAY_TYPE + int + default IO_DELAY_TYPE_0X80 +endif + +if IO_DELAY_0XED +config DEFAULT_IO_DELAY_TYPE + int + default IO_DELAY_TYPE_0XED +endif + +if IO_DELAY_UDELAY +config DEFAULT_IO_DELAY_TYPE + int + default IO_DELAY_TYPE_UDELAY +endif + +if IO_DELAY_NONE +config DEFAULT_IO_DELAY_TYPE + int + default IO_DELAY_TYPE_NONE +endif endmenu diff --git a/arch/x86/kernel/io_delay.c b/arch/x86/kernel/io_delay.c index 4d955e74b974..f052e34dc94c 100644 --- a/arch/x86/kernel/io_delay.c +++ b/arch/x86/kernel/io_delay.c @@ -1,5 +1,9 @@ /* * I/O delay strategies for inb_p/outb_p + * + * Allow for a DMI based override of port 0x80, needed for certain HP laptops + * and possibly other systems. Also allow for the gradual elimination of + * outb_p/inb_p API uses. */ #include #include @@ -8,98 +12,86 @@ #include #include -/* - * Allow for a DMI based override of port 0x80 needed for certain HP laptops - */ -#define IO_DELAY_PORT_STD 0x80 -#define IO_DELAY_PORT_ALT 0xed - -static void standard_io_delay(void) -{ - asm volatile ("outb %%al, %0" : : "N" (IO_DELAY_PORT_STD)); -} - -static void alternate_io_delay(void) -{ - asm volatile ("outb %%al, %0" : : "N" (IO_DELAY_PORT_ALT)); -} - -/* - * 2 usecs is an upper-bound for the outb delay but note that udelay doesn't - * have the bus-level side-effects that outb does - */ -#define IO_DELAY_USECS 2 - -/* - * High on a hill was a lonely goatherd - */ -static void udelay_io_delay(void) -{ - udelay(IO_DELAY_USECS); -} +int io_delay_type __read_mostly = CONFIG_DEFAULT_IO_DELAY_TYPE; +EXPORT_SYMBOL_GPL(io_delay_type); -#ifndef CONFIG_UDELAY_IO_DELAY -static void (*io_delay)(void) = standard_io_delay; -#else -static void (*io_delay)(void) = udelay_io_delay; -#endif +static int __initdata io_delay_override; /* * Paravirt wants native_io_delay to be a constant. 
*/ void native_io_delay(void) { - io_delay(); + switch (io_delay_type) { + default: + case CONFIG_IO_DELAY_TYPE_0X80: + asm volatile ("outb %al, $0x80"); + break; + case CONFIG_IO_DELAY_TYPE_0XED: + asm volatile ("outb %al, $0xed"); + break; + case CONFIG_IO_DELAY_TYPE_UDELAY: + /* + * 2 usecs is an upper-bound for the outb delay but + * note that udelay doesn't have the bus-level + * side-effects that outb does, nor does udelay() have + * precise timings during very early bootup (the delays + * are shorter until calibrated): + */ + udelay(2); + case CONFIG_IO_DELAY_TYPE_NONE: + break; + } } EXPORT_SYMBOL(native_io_delay); -#ifndef CONFIG_UDELAY_IO_DELAY -static int __init dmi_alternate_io_delay_port(const struct dmi_system_id *id) +static int __init dmi_io_delay_0xed_port(const struct dmi_system_id *id) { - printk(KERN_NOTICE "%s: using alternate I/O delay port\n", id->ident); - io_delay = alternate_io_delay; + if (io_delay_type == CONFIG_IO_DELAY_TYPE_0X80) { + printk(KERN_NOTICE "%s: using 0xed I/O delay port\n", + id->ident); + io_delay_type = CONFIG_IO_DELAY_TYPE_0XED; + } + return 0; } -static struct dmi_system_id __initdata alternate_io_delay_port_dmi_table[] = { +/* + * Quirk table for systems that misbehave (lock up, etc.) if port + * 0x80 is used: + */ +static struct dmi_system_id __initdata io_delay_0xed_port_dmi_table[] = { { - .callback = dmi_alternate_io_delay_port, + .callback = dmi_io_delay_0xed_port, .ident = "HP Pavilion dv9000z", .matches = { DMI_MATCH(DMI_BOARD_VENDOR, "Quanta"), DMI_MATCH(DMI_BOARD_NAME, "30B9") } }, - { - } + { } }; -static int __initdata io_delay_override; - void __init io_delay_init(void) { if (!io_delay_override) - dmi_check_system(alternate_io_delay_port_dmi_table); + dmi_check_system(io_delay_0xed_port_dmi_table); } -#endif static int __init io_delay_param(char *s) { - if (!s) - return -EINVAL; - - if (!strcmp(s, "standard")) - io_delay = standard_io_delay; - else if (!strcmp(s, "alternate")) - io_delay = alternate_io_delay; + if (!strcmp(s, "0x80")) + io_delay_type = CONFIG_IO_DELAY_TYPE_0X80; + else if (!strcmp(s, "0xed")) + io_delay_type = CONFIG_IO_DELAY_TYPE_0XED; else if (!strcmp(s, "udelay")) - io_delay = udelay_io_delay; + io_delay_type = CONFIG_IO_DELAY_TYPE_UDELAY; + else if (!strcmp(s, "none")) + io_delay_type = CONFIG_IO_DELAY_TYPE_NONE; else return -EINVAL; -#ifndef CONFIG_UDELAY_IO_DELAY io_delay_override = 1; -#endif return 0; } diff --git a/include/asm-x86/io_32.h b/include/asm-x86/io_32.h index a8d25c38b91c..2a04bd17eac5 100644 --- a/include/asm-x86/io_32.h +++ b/include/asm-x86/io_32.h @@ -250,15 +250,11 @@ static inline void flush_write_buffers(void) #endif /* __KERNEL__ */ -#ifndef CONFIG_UDELAY_IO_DELAY -extern void io_delay_init(void); -#else -static inline void io_delay_init(void) -{ -} -#endif extern void native_io_delay(void); +extern int io_delay_type; +extern void io_delay_init(void); + #if defined(CONFIG_PARAVIRT) #include #else diff --git a/include/asm-x86/io_64.h b/include/asm-x86/io_64.h index 5bebaf961692..dbcc03aa1c6a 100644 --- a/include/asm-x86/io_64.h +++ b/include/asm-x86/io_64.h @@ -35,15 +35,11 @@ * - Arnaldo Carvalho de Melo */ -#ifndef CONFIG_UDELAY_IO_DELAY -extern void io_delay_init(void); -#else -static inline void io_delay_init(void) -{ -} -#endif extern void native_io_delay(void); +extern int io_delay_type; +extern void io_delay_init(void); + static inline void slow_down_io(void) { native_io_delay(); diff --git a/kernel/sysctl.c b/kernel/sysctl.c index 4bc8e48434a7..357b68ba23ec 100644 --- 
a/kernel/sysctl.c +++ b/kernel/sysctl.c @@ -53,6 +53,7 @@ #ifdef CONFIG_X86 #include #include +#include #endif static int deprecated_sysctl_warning(struct __sysctl_args *args); @@ -727,6 +728,14 @@ static struct ctl_table kern_table[] = { .mode = 0644, .proc_handler = &proc_dointvec, }, + { + .ctl_name = CTL_UNNUMBERED, + .procname = "io_delay_type", + .data = &io_delay_type, + .maxlen = sizeof(int), + .mode = 0644, + .proc_handler = &proc_dointvec, + }, #endif #if defined(CONFIG_MMU) { -- cgit v1.2.3 From 36df29d7994180cf7f0ccc5d46495855d56d2721 Mon Sep 17 00:00:00 2001 From: Roland McGrath Date: Wed, 30 Jan 2008 13:30:51 +0100 Subject: ptrace: generic resume This makes ptrace_request handle all the ptrace requests that wake up the traced task. These do low-level ptrace implementation magic that is not arch-specific and should be kept out of arch code. The implementations on each arch usually do the same thing. The new generic code makes use of the arch_has_single_step macro and generic entry points to handle PTRACE_SINGLESTEP. Signed-off-by: Roland McGrath Signed-off-by: Ingo Molnar Signed-off-by: Thomas Gleixner --- kernel/ptrace.c | 61 +++++++++++++++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 61 insertions(+) (limited to 'kernel') diff --git a/kernel/ptrace.c b/kernel/ptrace.c index c719bb9d79ab..fad5f1e8511f 100644 --- a/kernel/ptrace.c +++ b/kernel/ptrace.c @@ -366,6 +366,50 @@ static int ptrace_setsiginfo(struct task_struct *child, siginfo_t __user * data) return error; } + +#ifdef PTRACE_SINGLESTEP +#define is_singlestep(request) ((request) == PTRACE_SINGLESTEP) +#else +#define is_singlestep(request) 0 +#endif + +#ifdef PTRACE_SYSEMU +#define is_sysemu_singlestep(request) ((request) == PTRACE_SYSEMU_SINGLESTEP) +#else +#define is_sysemu_singlestep(request) 0 +#endif + +static int ptrace_resume(struct task_struct *child, long request, long data) +{ + if (!valid_signal(data)) + return -EIO; + + if (request == PTRACE_SYSCALL) + set_tsk_thread_flag(child, TIF_SYSCALL_TRACE); + else + clear_tsk_thread_flag(child, TIF_SYSCALL_TRACE); + +#ifdef TIF_SYSCALL_EMU + if (request == PTRACE_SYSEMU || request == PTRACE_SYSEMU_SINGLESTEP) + set_tsk_thread_flag(child, TIF_SYSCALL_EMU); + else + clear_tsk_thread_flag(child, TIF_SYSCALL_EMU); +#endif + + if (is_singlestep(request) || is_sysemu_singlestep(request)) { + if (unlikely(!arch_has_single_step())) + return -EIO; + user_enable_single_step(child); + } + else + user_disable_single_step(child); + + child->exit_code = data; + wake_up_process(child); + + return 0; +} + int ptrace_request(struct task_struct *child, long request, long addr, long data) { @@ -390,6 +434,23 @@ int ptrace_request(struct task_struct *child, long request, case PTRACE_DETACH: /* detach a process that was attached. */ ret = ptrace_detach(child, data); break; + +#ifdef PTRACE_SINGLESTEP + case PTRACE_SINGLESTEP: +#endif +#ifdef PTRACE_SYSEMU + case PTRACE_SYSEMU: + case PTRACE_SYSEMU_SINGLESTEP: +#endif + case PTRACE_SYSCALL: + case PTRACE_CONT: + return ptrace_resume(child, request, data); + + case PTRACE_KILL: + if (child->exit_state) /* already dead */ + return 0; + return ptrace_resume(child, request, SIGKILL); + default: break; } -- cgit v1.2.3 From 5b88abbf770a0e1975c668743100f42934f385e8 Mon Sep 17 00:00:00 2001 From: Roland McGrath Date: Wed, 30 Jan 2008 13:30:53 +0100 Subject: ptrace: generic PTRACE_SINGLEBLOCK This makes ptrace_request handle PTRACE_SINGLEBLOCK along with PTRACE_CONT et al. 
The new generic code makes use of the arch_has_block_step macro and generic entry points on machines that define them. [ mingo@elte.hu: bugfix ] Signed-off-by: Roland McGrath Signed-off-by: Ingo Molnar Signed-off-by: Thomas Gleixner --- include/linux/ptrace.h | 2 +- kernel/ptrace.c | 15 ++++++++++++++- 2 files changed, 15 insertions(+), 2 deletions(-) (limited to 'kernel') diff --git a/include/linux/ptrace.h b/include/linux/ptrace.h index 1febc541dda5..515bff053de8 100644 --- a/include/linux/ptrace.h +++ b/include/linux/ptrace.h @@ -187,7 +187,7 @@ static inline void user_disable_single_step(struct task_struct *task) * supports step-until-branch for user mode. It can be a constant or it * can test a CPU feature bit. */ -#define arch_has_single_step() (0) +#define arch_has_block_step() (0) /** * user_enable_block_step - step until branch in user-mode task diff --git a/kernel/ptrace.c b/kernel/ptrace.c index fad5f1e8511f..973d727f5e84 100644 --- a/kernel/ptrace.c +++ b/kernel/ptrace.c @@ -373,6 +373,12 @@ static int ptrace_setsiginfo(struct task_struct *child, siginfo_t __user * data) #define is_singlestep(request) 0 #endif +#ifdef PTRACE_SINGLEBLOCK +#define is_singleblock(request) ((request) == PTRACE_SINGLEBLOCK) +#else +#define is_singleblock(request) 0 +#endif + #ifdef PTRACE_SYSEMU #define is_sysemu_singlestep(request) ((request) == PTRACE_SYSEMU_SINGLESTEP) #else @@ -396,7 +402,11 @@ static int ptrace_resume(struct task_struct *child, long request, long data) clear_tsk_thread_flag(child, TIF_SYSCALL_EMU); #endif - if (is_singlestep(request) || is_sysemu_singlestep(request)) { + if (is_singleblock(request)) { + if (unlikely(!arch_has_block_step())) + return -EIO; + user_enable_block_step(child); + } else if (is_singlestep(request) || is_sysemu_singlestep(request)) { if (unlikely(!arch_has_single_step())) return -EIO; user_enable_single_step(child); @@ -438,6 +448,9 @@ int ptrace_request(struct task_struct *child, long request, #ifdef PTRACE_SINGLESTEP case PTRACE_SINGLESTEP: #endif +#ifdef PTRACE_SINGLEBLOCK + case PTRACE_SINGLEBLOCK: +#endif #ifdef PTRACE_SYSEMU case PTRACE_SYSEMU: case PTRACE_SYSEMU_SINGLESTEP: -- cgit v1.2.3 From 65ea5b0349903585bfed9720fa06f5edb4f1cd25 Mon Sep 17 00:00:00 2001 From: "H. Peter Anvin" Date: Wed, 30 Jan 2008 13:30:56 +0100 Subject: x86: rename the struct pt_regs members for 32/64-bit consistency We have a lot of code which differs only by the naming of specific members of structures that contain registers. In order to enable additional unifications, this patch drops the e- or r- size prefix from the register names in struct pt_regs, and drops the x- prefixes for segment registers on the 32-bit side. This patch also performs the equivalent renames in some additional places that might be candidates for unification in the future. Signed-off-by: H. 
Peter Anvin Signed-off-by: Ingo Molnar Signed-off-by: Thomas Gleixner --- arch/x86/ia32/ia32_aout.c | 30 +++--- arch/x86/ia32/ia32_binfmt.c | 42 ++++---- arch/x86/ia32/ia32_signal.c | 100 +++++++++---------- arch/x86/ia32/ptrace32.c | 44 ++++----- arch/x86/ia32/sys_ia32.c | 6 +- arch/x86/kernel/acpi/wakeup_64.S | 32 +++--- arch/x86/kernel/asm-offsets_32.c | 32 +++--- arch/x86/kernel/asm-offsets_64.c | 18 ++-- arch/x86/kernel/cpu/common.c | 2 +- arch/x86/kernel/cpu/mcheck/mce_64.c | 14 +-- arch/x86/kernel/i8259_32.c | 2 +- arch/x86/kernel/io_apic_64.c | 2 +- arch/x86/kernel/ioport_32.c | 8 +- arch/x86/kernel/ioport_64.c | 6 +- arch/x86/kernel/irq_32.c | 20 ++-- arch/x86/kernel/irq_64.c | 10 +- arch/x86/kernel/kprobes_32.c | 92 ++++++++--------- arch/x86/kernel/kprobes_64.c | 69 ++++++------- arch/x86/kernel/process_32.c | 140 +++++++++++++------------- arch/x86/kernel/process_64.c | 44 ++++----- arch/x86/kernel/ptrace_32.c | 26 ++--- arch/x86/kernel/ptrace_64.c | 24 ++--- arch/x86/kernel/signal_32.c | 192 ++++++++++++++++++------------------ arch/x86/kernel/signal_64.c | 115 ++++++++++----------- arch/x86/kernel/smp_64.c | 2 +- arch/x86/kernel/smpboot_32.c | 10 +- arch/x86/kernel/step.c | 15 +-- arch/x86/kernel/suspend_asm_64.S | 32 +++--- arch/x86/kernel/time_32.c | 8 +- arch/x86/kernel/time_64.c | 4 +- arch/x86/kernel/traps_32.c | 114 ++++++++++----------- arch/x86/kernel/traps_64.c | 84 ++++++++-------- arch/x86/kernel/vm86_32.c | 94 +++++++++--------- arch/x86/kernel/vmi_32.c | 50 +++++----- arch/x86/kernel/vsyscall_64.c | 2 +- arch/x86/lguest/boot.c | 22 ++--- arch/x86/mach-voyager/voyager_smp.c | 12 +-- arch/x86/mm/extable_32.c | 6 +- arch/x86/mm/fault_32.c | 38 +++---- arch/x86/mm/fault_64.c | 22 ++--- arch/x86/oprofile/backtrace.c | 6 +- arch/x86/xen/enlighten.c | 18 ++-- arch/x86/xen/events.c | 2 +- include/asm-x86/compat.h | 2 +- include/asm-x86/elf.h | 66 ++++++------- include/asm-x86/kexec_32.h | 36 +++---- include/asm-x86/kexec_64.h | 20 ++-- include/asm-x86/kprobes_32.h | 2 +- include/asm-x86/kprobes_64.h | 2 +- include/asm-x86/mce.h | 4 +- include/asm-x86/processor_32.h | 20 ++-- include/asm-x86/processor_64.h | 8 +- include/asm-x86/ptrace.h | 80 ++++++++++++--- kernel/signal.c | 4 +- 54 files changed, 953 insertions(+), 902 deletions(-) (limited to 'kernel') diff --git a/arch/x86/ia32/ia32_aout.c b/arch/x86/ia32/ia32_aout.c index a764e4e95314..f1a0f83676dc 100644 --- a/arch/x86/ia32/ia32_aout.c +++ b/arch/x86/ia32/ia32_aout.c @@ -53,7 +53,7 @@ static void dump_thread32(struct pt_regs *regs, struct user32 *dump) /* changed the size calculations - should hopefully work better. 
lbt */ dump->magic = CMAGIC; dump->start_code = 0; - dump->start_stack = regs->rsp & ~(PAGE_SIZE - 1); + dump->start_stack = regs->sp & ~(PAGE_SIZE - 1); dump->u_tsize = ((unsigned long) current->mm->end_code) >> PAGE_SHIFT; dump->u_dsize = ((unsigned long) (current->mm->brk + (PAGE_SIZE-1))) >> PAGE_SHIFT; @@ -75,22 +75,22 @@ static void dump_thread32(struct pt_regs *regs, struct user32 *dump) dump->u_ssize = tmp >> PAGE_SHIFT; } - dump->regs.ebx = regs->rbx; - dump->regs.ecx = regs->rcx; - dump->regs.edx = regs->rdx; - dump->regs.esi = regs->rsi; - dump->regs.edi = regs->rdi; - dump->regs.ebp = regs->rbp; - dump->regs.eax = regs->rax; + dump->regs.bx = regs->bx; + dump->regs.cx = regs->cx; + dump->regs.dx = regs->dx; + dump->regs.si = regs->si; + dump->regs.di = regs->di; + dump->regs.bp = regs->bp; + dump->regs.ax = regs->ax; dump->regs.ds = current->thread.ds; dump->regs.es = current->thread.es; asm("movl %%fs,%0" : "=r" (fs)); dump->regs.fs = fs; asm("movl %%gs,%0" : "=r" (gs)); dump->regs.gs = gs; - dump->regs.orig_eax = regs->orig_rax; - dump->regs.eip = regs->rip; + dump->regs.orig_ax = regs->orig_ax; + dump->regs.ip = regs->ip; dump->regs.cs = regs->cs; - dump->regs.eflags = regs->eflags; - dump->regs.esp = regs->rsp; + dump->regs.flags = regs->flags; + dump->regs.sp = regs->sp; dump->regs.ss = regs->ss; #if 1 /* FIXME */ @@ -432,9 +432,9 @@ beyond_if: asm volatile("movl %0,%%fs" :: "r" (0)); \ asm volatile("movl %0,%%es; movl %0,%%ds": :"r" (__USER32_DS)); load_gs_index(0); - (regs)->rip = ex.a_entry; - (regs)->rsp = current->mm->start_stack; - (regs)->eflags = 0x200; + (regs)->ip = ex.a_entry; + (regs)->sp = current->mm->start_stack; + (regs)->flags = 0x200; (regs)->cs = __USER32_CS; (regs)->ss = __USER32_DS; regs->r8 = regs->r9 = regs->r10 = regs->r11 = diff --git a/arch/x86/ia32/ia32_binfmt.c b/arch/x86/ia32/ia32_binfmt.c index e32974c3dd3b..806135c3f975 100644 --- a/arch/x86/ia32/ia32_binfmt.c +++ b/arch/x86/ia32/ia32_binfmt.c @@ -73,22 +73,22 @@ struct file; /* Assumes current==process to be dumped */ #undef ELF_CORE_COPY_REGS #define ELF_CORE_COPY_REGS(pr_reg, regs) \ - pr_reg[0] = regs->rbx; \ - pr_reg[1] = regs->rcx; \ - pr_reg[2] = regs->rdx; \ - pr_reg[3] = regs->rsi; \ - pr_reg[4] = regs->rdi; \ - pr_reg[5] = regs->rbp; \ - pr_reg[6] = regs->rax; \ + pr_reg[0] = regs->bx; \ + pr_reg[1] = regs->cx; \ + pr_reg[2] = regs->dx; \ + pr_reg[3] = regs->si; \ + pr_reg[4] = regs->di; \ + pr_reg[5] = regs->bp; \ + pr_reg[6] = regs->ax; \ pr_reg[7] = _GET_SEG(ds); \ pr_reg[8] = _GET_SEG(es); \ pr_reg[9] = _GET_SEG(fs); \ pr_reg[10] = _GET_SEG(gs); \ - pr_reg[11] = regs->orig_rax; \ - pr_reg[12] = regs->rip; \ + pr_reg[11] = regs->orig_ax; \ + pr_reg[12] = regs->ip; \ pr_reg[13] = regs->cs; \ - pr_reg[14] = regs->eflags; \ - pr_reg[15] = regs->rsp; \ + pr_reg[14] = regs->flags; \ + pr_reg[15] = regs->sp; \ pr_reg[16] = regs->ss; @@ -205,9 +205,9 @@ do { \ asm volatile("movl %0,%%fs" :: "r" (0)); \ asm volatile("movl %0,%%es; movl %0,%%ds": :"r" (__USER32_DS)); \ load_gs_index(0); \ - (regs)->rip = (new_rip); \ - (regs)->rsp = (new_rsp); \ - (regs)->eflags = 0x200; \ + (regs)->ip = (new_rip); \ + (regs)->sp = (new_rsp); \ + (regs)->flags = X86_EFLAGS_IF; \ (regs)->cs = __USER32_CS; \ (regs)->ss = __USER32_DS; \ set_fs(USER_DS); \ @@ -233,13 +233,13 @@ extern int syscall32_setup_pages(struct linux_binprm *, int exstack); static void elf32_init(struct pt_regs *regs) { struct task_struct *me = current; - regs->rdi = 0; - regs->rsi = 0; - regs->rdx = 0; - regs->rcx = 0; - regs->rax = 0; 
- regs->rbx = 0; - regs->rbp = 0; + regs->di = 0; + regs->si = 0; + regs->dx = 0; + regs->cx = 0; + regs->ax = 0; + regs->bx = 0; + regs->bp = 0; regs->r8 = regs->r9 = regs->r10 = regs->r11 = regs->r12 = regs->r13 = regs->r14 = regs->r15 = 0; me->thread.fs = 0; diff --git a/arch/x86/ia32/ia32_signal.c b/arch/x86/ia32/ia32_signal.c index 39356a756b28..f2da443f8c7b 100644 --- a/arch/x86/ia32/ia32_signal.c +++ b/arch/x86/ia32/ia32_signal.c @@ -154,7 +154,7 @@ asmlinkage long sys32_sigaltstack(const stack_ia32_t __user *uss_ptr, } seg = get_fs(); set_fs(KERNEL_DS); - ret = do_sigaltstack(uss_ptr ? &uss : NULL, &uoss, regs->rsp); + ret = do_sigaltstack(uss_ptr ? &uss : NULL, &uoss, regs->sp); set_fs(seg); if (ret >= 0 && uoss_ptr) { if (!access_ok(VERIFY_WRITE, uoss_ptr, sizeof(stack_ia32_t)) || @@ -195,7 +195,7 @@ struct rt_sigframe #define COPY(x) { \ unsigned int reg; \ err |= __get_user(reg, &sc->e ##x); \ - regs->r ## x = reg; \ + regs->x = reg; \ } #define RELOAD_SEG(seg,mask) \ @@ -220,7 +220,7 @@ static int ia32_restore_sigcontext(struct pt_regs *regs, #if DEBUG_SIG printk(KERN_DEBUG "SIG restore_sigcontext: " "sc=%p err(%x) eip(%x) cs(%x) flg(%x)\n", - sc, sc->err, sc->eip, sc->cs, sc->eflags); + sc, sc->err, sc->ip, sc->cs, sc->flags); #endif /* @@ -249,9 +249,9 @@ static int ia32_restore_sigcontext(struct pt_regs *regs, regs->ss |= 3; err |= __get_user(tmpflags, &sc->eflags); - regs->eflags = (regs->eflags & ~0x40DD5) | (tmpflags & 0x40DD5); + regs->flags = (regs->flags & ~0x40DD5) | (tmpflags & 0x40DD5); /* disable syscall checks */ - regs->orig_rax = -1; + regs->orig_ax = -1; err |= __get_user(tmp, &sc->fpstate); buf = compat_ptr(tmp); @@ -279,9 +279,9 @@ badframe: asmlinkage long sys32_sigreturn(struct pt_regs *regs) { - struct sigframe __user *frame = (struct sigframe __user *)(regs->rsp-8); + struct sigframe __user *frame = (struct sigframe __user *)(regs->sp-8); sigset_t set; - unsigned int eax; + unsigned int ax; if (!access_ok(VERIFY_READ, frame, sizeof(*frame))) goto badframe; @@ -298,9 +298,9 @@ asmlinkage long sys32_sigreturn(struct pt_regs *regs) recalc_sigpending(); spin_unlock_irq(¤t->sighand->siglock); - if (ia32_restore_sigcontext(regs, &frame->sc, &eax)) + if (ia32_restore_sigcontext(regs, &frame->sc, &ax)) goto badframe; - return eax; + return ax; badframe: signal_fault(regs, frame, "32bit sigreturn"); @@ -311,10 +311,10 @@ asmlinkage long sys32_rt_sigreturn(struct pt_regs *regs) { struct rt_sigframe __user *frame; sigset_t set; - unsigned int eax; + unsigned int ax; struct pt_regs tregs; - frame = (struct rt_sigframe __user *)(regs->rsp - 4); + frame = (struct rt_sigframe __user *)(regs->sp - 4); if (!access_ok(VERIFY_READ, frame, sizeof(*frame))) goto badframe; @@ -327,14 +327,14 @@ asmlinkage long sys32_rt_sigreturn(struct pt_regs *regs) recalc_sigpending(); spin_unlock_irq(¤t->sighand->siglock); - if (ia32_restore_sigcontext(regs, &frame->uc.uc_mcontext, &eax)) + if (ia32_restore_sigcontext(regs, &frame->uc.uc_mcontext, &ax)) goto badframe; tregs = *regs; if (sys32_sigaltstack(&frame->uc.uc_stack, NULL, &tregs) == -EFAULT) goto badframe; - return eax; + return ax; badframe: signal_fault(regs, frame, "32bit rt sigreturn"); @@ -361,21 +361,21 @@ static int ia32_setup_sigcontext(struct sigcontext_ia32 __user *sc, __asm__("movl %%es,%0" : "=r"(tmp): "0"(tmp)); err |= __put_user(tmp, (unsigned int __user *)&sc->es); - err |= __put_user((u32)regs->rdi, &sc->edi); - err |= __put_user((u32)regs->rsi, &sc->esi); - err |= __put_user((u32)regs->rbp, &sc->ebp); - err |= 
__put_user((u32)regs->rsp, &sc->esp); - err |= __put_user((u32)regs->rbx, &sc->ebx); - err |= __put_user((u32)regs->rdx, &sc->edx); - err |= __put_user((u32)regs->rcx, &sc->ecx); - err |= __put_user((u32)regs->rax, &sc->eax); + err |= __put_user((u32)regs->di, &sc->edi); + err |= __put_user((u32)regs->si, &sc->esi); + err |= __put_user((u32)regs->bp, &sc->ebp); + err |= __put_user((u32)regs->sp, &sc->esp); + err |= __put_user((u32)regs->bx, &sc->ebx); + err |= __put_user((u32)regs->dx, &sc->edx); + err |= __put_user((u32)regs->cx, &sc->ecx); + err |= __put_user((u32)regs->ax, &sc->eax); err |= __put_user((u32)regs->cs, &sc->cs); err |= __put_user((u32)regs->ss, &sc->ss); err |= __put_user(current->thread.trap_no, &sc->trapno); err |= __put_user(current->thread.error_code, &sc->err); - err |= __put_user((u32)regs->rip, &sc->eip); - err |= __put_user((u32)regs->eflags, &sc->eflags); - err |= __put_user((u32)regs->rsp, &sc->esp_at_signal); + err |= __put_user((u32)regs->ip, &sc->eip); + err |= __put_user((u32)regs->flags, &sc->eflags); + err |= __put_user((u32)regs->sp, &sc->esp_at_signal); tmp = save_i387_ia32(current, fpstate, regs, 0); if (tmp < 0) @@ -400,28 +400,28 @@ static int ia32_setup_sigcontext(struct sigcontext_ia32 __user *sc, static void __user *get_sigframe(struct k_sigaction *ka, struct pt_regs *regs, size_t frame_size) { - unsigned long rsp; + unsigned long sp; /* Default to using normal stack */ - rsp = regs->rsp; + sp = regs->sp; /* This is the X/Open sanctioned signal stack switching. */ if (ka->sa.sa_flags & SA_ONSTACK) { - if (sas_ss_flags(rsp) == 0) - rsp = current->sas_ss_sp + current->sas_ss_size; + if (sas_ss_flags(sp) == 0) + sp = current->sas_ss_sp + current->sas_ss_size; } /* This is the legacy signal stack switching. */ else if ((regs->ss & 0xffff) != __USER_DS && !(ka->sa.sa_flags & SA_RESTORER) && ka->sa.sa_restorer) - rsp = (unsigned long) ka->sa.sa_restorer; + sp = (unsigned long) ka->sa.sa_restorer; - rsp -= frame_size; + sp -= frame_size; /* Align the stack pointer according to the i386 ABI, * i.e. so that on function entry ((sp + 4) & 15) == 0. 
*/ - rsp = ((rsp + 4) & -16ul) - 4; - return (void __user *) rsp; + sp = ((sp + 4) & -16ul) - 4; + return (void __user *) sp; } int ia32_setup_frame(int sig, struct k_sigaction *ka, @@ -486,13 +486,13 @@ int ia32_setup_frame(int sig, struct k_sigaction *ka, goto give_sigsegv; /* Set up registers for signal handler */ - regs->rsp = (unsigned long) frame; - regs->rip = (unsigned long) ka->sa.sa_handler; + regs->sp = (unsigned long) frame; + regs->ip = (unsigned long) ka->sa.sa_handler; /* Make -mregparm=3 work */ - regs->rax = sig; - regs->rdx = 0; - regs->rcx = 0; + regs->ax = sig; + regs->dx = 0; + regs->cx = 0; asm volatile("movl %0,%%ds" :: "r" (__USER32_DS)); asm volatile("movl %0,%%es" :: "r" (__USER32_DS)); @@ -501,13 +501,13 @@ int ia32_setup_frame(int sig, struct k_sigaction *ka, regs->ss = __USER32_DS; set_fs(USER_DS); - regs->eflags &= ~TF_MASK; + regs->flags &= ~TF_MASK; if (test_thread_flag(TIF_SINGLESTEP)) ptrace_notify(SIGTRAP); #if DEBUG_SIG printk(KERN_DEBUG "SIG deliver (%s:%d): sp=%p pc=%lx ra=%u\n", - current->comm, current->pid, frame, regs->rip, frame->pretcode); + current->comm, current->pid, frame, regs->ip, frame->pretcode); #endif return 0; @@ -556,7 +556,7 @@ int ia32_setup_rt_frame(int sig, struct k_sigaction *ka, siginfo_t *info, err |= __put_user(0, &frame->uc.uc_flags); err |= __put_user(0, &frame->uc.uc_link); err |= __put_user(current->sas_ss_sp, &frame->uc.uc_stack.ss_sp); - err |= __put_user(sas_ss_flags(regs->rsp), + err |= __put_user(sas_ss_flags(regs->sp), &frame->uc.uc_stack.ss_flags); err |= __put_user(current->sas_ss_size, &frame->uc.uc_stack.ss_size); err |= ia32_setup_sigcontext(&frame->uc.uc_mcontext, &frame->fpstate, @@ -581,18 +581,18 @@ int ia32_setup_rt_frame(int sig, struct k_sigaction *ka, siginfo_t *info, goto give_sigsegv; /* Set up registers for signal handler */ - regs->rsp = (unsigned long) frame; - regs->rip = (unsigned long) ka->sa.sa_handler; + regs->sp = (unsigned long) frame; + regs->ip = (unsigned long) ka->sa.sa_handler; /* Make -mregparm=3 work */ - regs->rax = sig; - regs->rdx = (unsigned long) &frame->info; - regs->rcx = (unsigned long) &frame->uc; + regs->ax = sig; + regs->dx = (unsigned long) &frame->info; + regs->cx = (unsigned long) &frame->uc; /* Make -mregparm=3 work */ - regs->rax = sig; - regs->rdx = (unsigned long) &frame->info; - regs->rcx = (unsigned long) &frame->uc; + regs->ax = sig; + regs->dx = (unsigned long) &frame->info; + regs->cx = (unsigned long) &frame->uc; asm volatile("movl %0,%%ds" :: "r" (__USER32_DS)); asm volatile("movl %0,%%es" :: "r" (__USER32_DS)); @@ -601,13 +601,13 @@ int ia32_setup_rt_frame(int sig, struct k_sigaction *ka, siginfo_t *info, regs->ss = __USER32_DS; set_fs(USER_DS); - regs->eflags &= ~TF_MASK; + regs->flags &= ~TF_MASK; if (test_thread_flag(TIF_SINGLESTEP)) ptrace_notify(SIGTRAP); #if DEBUG_SIG printk(KERN_DEBUG "SIG deliver (%s:%d): sp=%p pc=%lx ra=%u\n", - current->comm, current->pid, frame, regs->rip, frame->pretcode); + current->comm, current->pid, frame, regs->ip, frame->pretcode); #endif return 0; diff --git a/arch/x86/ia32/ptrace32.c b/arch/x86/ia32/ptrace32.c index 8c6fb9d8352b..1e382e3bd882 100644 --- a/arch/x86/ia32/ptrace32.c +++ b/arch/x86/ia32/ptrace32.c @@ -73,19 +73,19 @@ static int putreg32(struct task_struct *child, unsigned regno, u32 val) stack[offsetof(struct pt_regs, cs)/8] = val & 0xffff; break; - R32(ebx, rbx); - R32(ecx, rcx); - R32(edx, rdx); - R32(edi, rdi); - R32(esi, rsi); - R32(ebp, rbp); - R32(eax, rax); - R32(orig_eax, orig_rax); - R32(eip, rip); - 
R32(esp, rsp); + R32(ebx, bx); + R32(ecx, cx); + R32(edx, dx); + R32(edi, di); + R32(esi, si); + R32(ebp, bp); + R32(eax, ax); + R32(orig_eax, orig_ax); + R32(eip, ip); + R32(esp, sp); case offsetof(struct user32, regs.eflags): { - __u64 *flags = &stack[offsetof(struct pt_regs, eflags)/8]; + __u64 *flags = &stack[offsetof(struct pt_regs, flags)/8]; val &= FLAG_MASK; /* @@ -145,22 +145,22 @@ static int getreg32(struct task_struct *child, unsigned regno, u32 *val) R32(cs, cs); R32(ss, ss); - R32(ebx, rbx); - R32(ecx, rcx); - R32(edx, rdx); - R32(edi, rdi); - R32(esi, rsi); - R32(ebp, rbp); - R32(eax, rax); - R32(orig_eax, orig_rax); - R32(eip, rip); - R32(esp, rsp); + R32(ebx, bx); + R32(ecx, cx); + R32(edx, dx); + R32(edi, di); + R32(esi, si); + R32(ebp, bp); + R32(eax, ax); + R32(orig_eax, orig_ax); + R32(eip, ip); + R32(esp, sp); case offsetof(struct user32, regs.eflags): /* * If the debugger set TF, hide it from the readout. */ - *val = stack[offsetof(struct pt_regs, eflags)/8]; + *val = stack[offsetof(struct pt_regs, flags)/8]; if (test_tsk_thread_flag(child, TIF_FORCED_TF)) *val &= ~X86_EFLAGS_TF; break; diff --git a/arch/x86/ia32/sys_ia32.c b/arch/x86/ia32/sys_ia32.c index 58991abc5b59..abf71d26fc2a 100644 --- a/arch/x86/ia32/sys_ia32.c +++ b/arch/x86/ia32/sys_ia32.c @@ -816,11 +816,11 @@ asmlinkage long sys32_execve(char __user *name, compat_uptr_t __user *argv, asmlinkage long sys32_clone(unsigned int clone_flags, unsigned int newsp, struct pt_regs *regs) { - void __user *parent_tid = (void __user *)regs->rdx; - void __user *child_tid = (void __user *)regs->rdi; + void __user *parent_tid = (void __user *)regs->dx; + void __user *child_tid = (void __user *)regs->di; if (!newsp) - newsp = regs->rsp; + newsp = regs->sp; return do_fork(clone_flags, newsp, regs, 0, parent_tid, child_tid); } diff --git a/arch/x86/kernel/acpi/wakeup_64.S b/arch/x86/kernel/acpi/wakeup_64.S index 5ed3bc5c61d7..2e1b9e0d0767 100644 --- a/arch/x86/kernel/acpi/wakeup_64.S +++ b/arch/x86/kernel/acpi/wakeup_64.S @@ -344,13 +344,13 @@ do_suspend_lowlevel: call save_processor_state movq $saved_context, %rax - movq %rsp, pt_regs_rsp(%rax) - movq %rbp, pt_regs_rbp(%rax) - movq %rsi, pt_regs_rsi(%rax) - movq %rdi, pt_regs_rdi(%rax) - movq %rbx, pt_regs_rbx(%rax) - movq %rcx, pt_regs_rcx(%rax) - movq %rdx, pt_regs_rdx(%rax) + movq %rsp, pt_regs_sp(%rax) + movq %rbp, pt_regs_bp(%rax) + movq %rsi, pt_regs_si(%rax) + movq %rdi, pt_regs_di(%rax) + movq %rbx, pt_regs_bx(%rax) + movq %rcx, pt_regs_cx(%rax) + movq %rdx, pt_regs_dx(%rax) movq %r8, pt_regs_r8(%rax) movq %r9, pt_regs_r9(%rax) movq %r10, pt_regs_r10(%rax) @@ -360,7 +360,7 @@ do_suspend_lowlevel: movq %r14, pt_regs_r14(%rax) movq %r15, pt_regs_r15(%rax) pushfq - popq pt_regs_eflags(%rax) + popq pt_regs_flags(%rax) movq $.L97, saved_rip(%rip) @@ -391,15 +391,15 @@ do_suspend_lowlevel: movq %rbx, %cr2 movq saved_context_cr0(%rax), %rbx movq %rbx, %cr0 - pushq pt_regs_eflags(%rax) + pushq pt_regs_flags(%rax) popfq - movq pt_regs_rsp(%rax), %rsp - movq pt_regs_rbp(%rax), %rbp - movq pt_regs_rsi(%rax), %rsi - movq pt_regs_rdi(%rax), %rdi - movq pt_regs_rbx(%rax), %rbx - movq pt_regs_rcx(%rax), %rcx - movq pt_regs_rdx(%rax), %rdx + movq pt_regs_sp(%rax), %rsp + movq pt_regs_bp(%rax), %rbp + movq pt_regs_si(%rax), %rsi + movq pt_regs_di(%rax), %rdi + movq pt_regs_bx(%rax), %rbx + movq pt_regs_cx(%rax), %rcx + movq pt_regs_dx(%rax), %rdx movq pt_regs_r8(%rax), %r8 movq pt_regs_r9(%rax), %r9 movq pt_regs_r10(%rax), %r10 diff --git a/arch/x86/kernel/asm-offsets_32.c 
b/arch/x86/kernel/asm-offsets_32.c index fd7464d23339..a3a8be7618d1 100644 --- a/arch/x86/kernel/asm-offsets_32.c +++ b/arch/x86/kernel/asm-offsets_32.c @@ -75,22 +75,22 @@ void foo(void) OFFSET(GDS_pad, Xgt_desc_struct, pad); BLANK(); - OFFSET(PT_EBX, pt_regs, ebx); - OFFSET(PT_ECX, pt_regs, ecx); - OFFSET(PT_EDX, pt_regs, edx); - OFFSET(PT_ESI, pt_regs, esi); - OFFSET(PT_EDI, pt_regs, edi); - OFFSET(PT_EBP, pt_regs, ebp); - OFFSET(PT_EAX, pt_regs, eax); - OFFSET(PT_DS, pt_regs, xds); - OFFSET(PT_ES, pt_regs, xes); - OFFSET(PT_FS, pt_regs, xfs); - OFFSET(PT_ORIG_EAX, pt_regs, orig_eax); - OFFSET(PT_EIP, pt_regs, eip); - OFFSET(PT_CS, pt_regs, xcs); - OFFSET(PT_EFLAGS, pt_regs, eflags); - OFFSET(PT_OLDESP, pt_regs, esp); - OFFSET(PT_OLDSS, pt_regs, xss); + OFFSET(PT_EBX, pt_regs, bx); + OFFSET(PT_ECX, pt_regs, cx); + OFFSET(PT_EDX, pt_regs, dx); + OFFSET(PT_ESI, pt_regs, si); + OFFSET(PT_EDI, pt_regs, di); + OFFSET(PT_EBP, pt_regs, bp); + OFFSET(PT_EAX, pt_regs, ax); + OFFSET(PT_DS, pt_regs, ds); + OFFSET(PT_ES, pt_regs, es); + OFFSET(PT_FS, pt_regs, fs); + OFFSET(PT_ORIG_EAX, pt_regs, orig_ax); + OFFSET(PT_EIP, pt_regs, ip); + OFFSET(PT_CS, pt_regs, cs); + OFFSET(PT_EFLAGS, pt_regs, flags); + OFFSET(PT_OLDESP, pt_regs, sp); + OFFSET(PT_OLDSS, pt_regs, ss); BLANK(); OFFSET(EXEC_DOMAIN_handler, exec_domain, handler); diff --git a/arch/x86/kernel/asm-offsets_64.c b/arch/x86/kernel/asm-offsets_64.c index c27c646214f4..2e918ebf21d3 100644 --- a/arch/x86/kernel/asm-offsets_64.c +++ b/arch/x86/kernel/asm-offsets_64.c @@ -83,14 +83,14 @@ int main(void) DEFINE(pbe_next, offsetof(struct pbe, next)); BLANK(); #define ENTRY(entry) DEFINE(pt_regs_ ## entry, offsetof(struct pt_regs, entry)) - ENTRY(rbx); - ENTRY(rbx); - ENTRY(rcx); - ENTRY(rdx); - ENTRY(rsp); - ENTRY(rbp); - ENTRY(rsi); - ENTRY(rdi); + ENTRY(bx); + ENTRY(bx); + ENTRY(cx); + ENTRY(dx); + ENTRY(sp); + ENTRY(bp); + ENTRY(si); + ENTRY(di); ENTRY(r8); ENTRY(r9); ENTRY(r10); @@ -99,7 +99,7 @@ int main(void) ENTRY(r13); ENTRY(r14); ENTRY(r15); - ENTRY(eflags); + ENTRY(flags); BLANK(); #undef ENTRY #define ENTRY(entry) DEFINE(saved_context_ ## entry, offsetof(struct saved_context, entry)) diff --git a/arch/x86/kernel/cpu/common.c b/arch/x86/kernel/cpu/common.c index e2fcf2051bdb..5db2a163bf4b 100644 --- a/arch/x86/kernel/cpu/common.c +++ b/arch/x86/kernel/cpu/common.c @@ -634,7 +634,7 @@ void __init early_cpu_init(void) struct pt_regs * __devinit idle_regs(struct pt_regs *regs) { memset(regs, 0, sizeof(struct pt_regs)); - regs->xfs = __KERNEL_PERCPU; + regs->fs = __KERNEL_PERCPU; return regs; } diff --git a/arch/x86/kernel/cpu/mcheck/mce_64.c b/arch/x86/kernel/cpu/mcheck/mce_64.c index 3c7672c40cf4..0adad772d0da 100644 --- a/arch/x86/kernel/cpu/mcheck/mce_64.c +++ b/arch/x86/kernel/cpu/mcheck/mce_64.c @@ -110,12 +110,12 @@ static void print_mce(struct mce *m) KERN_EMERG "CPU %d: Machine Check Exception: %16Lx Bank %d: %016Lx\n", m->cpu, m->mcgstatus, m->bank, m->status); - if (m->rip) { + if (m->ip) { printk(KERN_EMERG "RIP%s %02x:<%016Lx> ", !(m->mcgstatus & MCG_STATUS_EIPV) ? " !INEXACT!" 
: "", - m->cs, m->rip); + m->cs, m->ip); if (m->cs == __KERNEL_CS) - print_symbol("{%s}", m->rip); + print_symbol("{%s}", m->ip); printk("\n"); } printk(KERN_EMERG "TSC %Lx ", m->tsc); @@ -156,16 +156,16 @@ static int mce_available(struct cpuinfo_x86 *c) static inline void mce_get_rip(struct mce *m, struct pt_regs *regs) { if (regs && (m->mcgstatus & MCG_STATUS_RIPV)) { - m->rip = regs->rip; + m->ip = regs->ip; m->cs = regs->cs; } else { - m->rip = 0; + m->ip = 0; m->cs = 0; } if (rip_msr) { /* Assume the RIP in the MSR is exact. Is this true? */ m->mcgstatus |= MCG_STATUS_EIPV; - rdmsrl(rip_msr, m->rip); + rdmsrl(rip_msr, m->ip); m->cs = 0; } } @@ -288,7 +288,7 @@ void do_machine_check(struct pt_regs * regs, long error_code) * instruction which caused the MCE. */ if (m.mcgstatus & MCG_STATUS_EIPV) - user_space = panicm.rip && (panicm.cs & 3); + user_space = panicm.ip && (panicm.cs & 3); /* * If we know that the error was in user space, send a diff --git a/arch/x86/kernel/i8259_32.c b/arch/x86/kernel/i8259_32.c index 3321ce669295..f201e7da1bbc 100644 --- a/arch/x86/kernel/i8259_32.c +++ b/arch/x86/kernel/i8259_32.c @@ -339,7 +339,7 @@ static irqreturn_t math_error_irq(int cpl, void *dev_id) outb(0,0xF0); if (ignore_fpu_irq || !boot_cpu_data.hard_math) return IRQ_NONE; - math_error((void __user *)get_irq_regs()->eip); + math_error((void __user *)get_irq_regs()->ip); return IRQ_HANDLED; } diff --git a/arch/x86/kernel/io_apic_64.c b/arch/x86/kernel/io_apic_64.c index 4ef85a3b3f9f..fa70005be5e8 100644 --- a/arch/x86/kernel/io_apic_64.c +++ b/arch/x86/kernel/io_apic_64.c @@ -1412,7 +1412,7 @@ static void irq_complete_move(unsigned int irq) if (likely(!cfg->move_in_progress)) return; - vector = ~get_irq_regs()->orig_rax; + vector = ~get_irq_regs()->orig_ax; me = smp_processor_id(); if ((vector == cfg->vector) && cpu_isset(me, cfg->domain)) { cpumask_t cleanup_mask; diff --git a/arch/x86/kernel/ioport_32.c b/arch/x86/kernel/ioport_32.c index c281ffa18259..9295e01ff49c 100644 --- a/arch/x86/kernel/ioport_32.c +++ b/arch/x86/kernel/ioport_32.c @@ -100,7 +100,7 @@ asmlinkage long sys_ioperm(unsigned long from, unsigned long num, int turn_on) * beyond the 0x3ff range: to get the full 65536 ports bitmapped * you'd need 8kB of bitmaps/process, which is a bit excessive. * - * Here we just change the eflags value on the stack: we allow + * Here we just change the flags value on the stack: we allow * only the super-user to do it. This depends on the stack-layout * on system-call entry - see also fork() and the signal handling * code. 
@@ -109,8 +109,8 @@ asmlinkage long sys_ioperm(unsigned long from, unsigned long num, int turn_on) asmlinkage long sys_iopl(unsigned long regsp) { volatile struct pt_regs *regs = (struct pt_regs *)®sp; - unsigned int level = regs->ebx; - unsigned int old = (regs->eflags >> 12) & 3; + unsigned int level = regs->bx; + unsigned int old = (regs->flags >> 12) & 3; struct thread_struct *t = ¤t->thread; if (level > 3) @@ -122,7 +122,7 @@ asmlinkage long sys_iopl(unsigned long regsp) } t->iopl = level << 12; - regs->eflags = (regs->eflags & ~X86_EFLAGS_IOPL) | t->iopl; + regs->flags = (regs->flags & ~X86_EFLAGS_IOPL) | t->iopl; set_iopl_mask(t->iopl); return 0; diff --git a/arch/x86/kernel/ioport_64.c b/arch/x86/kernel/ioport_64.c index 5f62fad64dab..ff7514b757e5 100644 --- a/arch/x86/kernel/ioport_64.c +++ b/arch/x86/kernel/ioport_64.c @@ -95,7 +95,7 @@ asmlinkage long sys_ioperm(unsigned long from, unsigned long num, int turn_on) * beyond the 0x3ff range: to get the full 65536 ports bitmapped * you'd need 8kB of bitmaps/process, which is a bit excessive. * - * Here we just change the eflags value on the stack: we allow + * Here we just change the flags value on the stack: we allow * only the super-user to do it. This depends on the stack-layout * on system-call entry - see also fork() and the signal handling * code. @@ -103,7 +103,7 @@ asmlinkage long sys_ioperm(unsigned long from, unsigned long num, int turn_on) asmlinkage long sys_iopl(unsigned int level, struct pt_regs *regs) { - unsigned int old = (regs->eflags >> 12) & 3; + unsigned int old = (regs->flags >> 12) & 3; if (level > 3) return -EINVAL; @@ -112,6 +112,6 @@ asmlinkage long sys_iopl(unsigned int level, struct pt_regs *regs) if (!capable(CAP_SYS_RAWIO)) return -EPERM; } - regs->eflags = (regs->eflags &~ X86_EFLAGS_IOPL) | (level << 12); + regs->flags = (regs->flags &~ X86_EFLAGS_IOPL) | (level << 12); return 0; } diff --git a/arch/x86/kernel/irq_32.c b/arch/x86/kernel/irq_32.c index d3fde94f7345..b49616bcc16b 100644 --- a/arch/x86/kernel/irq_32.c +++ b/arch/x86/kernel/irq_32.c @@ -70,7 +70,7 @@ fastcall unsigned int do_IRQ(struct pt_regs *regs) { struct pt_regs *old_regs; /* high bit used in ret_from_ code */ - int irq = ~regs->orig_eax; + int irq = ~regs->orig_ax; struct irq_desc *desc = irq_desc + irq; #ifdef CONFIG_4KSTACKS union irq_ctx *curctx, *irqctx; @@ -88,13 +88,13 @@ fastcall unsigned int do_IRQ(struct pt_regs *regs) #ifdef CONFIG_DEBUG_STACKOVERFLOW /* Debugging check for stack overflow: is there less than 1KB free? 
*/ { - long esp; + long sp; __asm__ __volatile__("andl %%esp,%0" : - "=r" (esp) : "0" (THREAD_SIZE - 1)); - if (unlikely(esp < (sizeof(struct thread_info) + STACK_WARN))) { + "=r" (sp) : "0" (THREAD_SIZE - 1)); + if (unlikely(sp < (sizeof(struct thread_info) + STACK_WARN))) { printk("do_IRQ: stack overflow: %ld\n", - esp - sizeof(struct thread_info)); + sp - sizeof(struct thread_info)); dump_stack(); } } @@ -112,7 +112,7 @@ fastcall unsigned int do_IRQ(struct pt_regs *regs) * current stack (which is the irq stack already after all) */ if (curctx != irqctx) { - int arg1, arg2, ebx; + int arg1, arg2, bx; /* build the stack frame on the IRQ stack */ isp = (u32*) ((char*)irqctx + sizeof(*irqctx)); @@ -128,10 +128,10 @@ fastcall unsigned int do_IRQ(struct pt_regs *regs) (curctx->tinfo.preempt_count & SOFTIRQ_MASK); asm volatile( - " xchgl %%ebx,%%esp \n" - " call *%%edi \n" - " movl %%ebx,%%esp \n" - : "=a" (arg1), "=d" (arg2), "=b" (ebx) + " xchgl %%ebx,%%esp \n" + " call *%%edi \n" + " movl %%ebx,%%esp \n" + : "=a" (arg1), "=d" (arg2), "=b" (bx) : "0" (irq), "1" (desc), "2" (isp), "D" (desc->handle_irq) : "memory", "cc" diff --git a/arch/x86/kernel/irq_64.c b/arch/x86/kernel/irq_64.c index 6c3a3b6e5cf4..3aac15466a91 100644 --- a/arch/x86/kernel/irq_64.c +++ b/arch/x86/kernel/irq_64.c @@ -53,11 +53,11 @@ static inline void stack_overflow_check(struct pt_regs *regs) u64 curbase = (u64)task_stack_page(current); static unsigned long warned = -60*HZ; - if (regs->rsp >= curbase && regs->rsp <= curbase + THREAD_SIZE && - regs->rsp < curbase + sizeof(struct thread_info) + 128 && + if (regs->sp >= curbase && regs->sp <= curbase + THREAD_SIZE && + regs->sp < curbase + sizeof(struct thread_info) + 128 && time_after(jiffies, warned + 60*HZ)) { - printk("do_IRQ: %s near stack overflow (cur:%Lx,rsp:%lx)\n", - current->comm, curbase, regs->rsp); + printk("do_IRQ: %s near stack overflow (cur:%Lx,sp:%lx)\n", + current->comm, curbase, regs->sp); show_stack(NULL,NULL); warned = jiffies; } @@ -162,7 +162,7 @@ asmlinkage unsigned int do_IRQ(struct pt_regs *regs) struct pt_regs *old_regs = set_irq_regs(regs); /* high bit used in ret_from_ code */ - unsigned vector = ~regs->orig_rax; + unsigned vector = ~regs->orig_ax; unsigned irq; exit_idle(); diff --git a/arch/x86/kernel/kprobes_32.c b/arch/x86/kernel/kprobes_32.c index bc4a68367cd0..d708cd4f956f 100644 --- a/arch/x86/kernel/kprobes_32.c +++ b/arch/x86/kernel/kprobes_32.c @@ -212,7 +212,7 @@ static void __kprobes set_current_kprobe(struct kprobe *p, struct pt_regs *regs, { __get_cpu_var(current_kprobe) = p; kcb->kprobe_saved_eflags = kcb->kprobe_old_eflags - = (regs->eflags & (TF_MASK | IF_MASK)); + = (regs->flags & (TF_MASK | IF_MASK)); if (is_IF_modifier(p->opcode)) kcb->kprobe_saved_eflags &= ~IF_MASK; } @@ -232,20 +232,20 @@ static __always_inline void restore_btf(void) static void __kprobes prepare_singlestep(struct kprobe *p, struct pt_regs *regs) { clear_btf(); - regs->eflags |= TF_MASK; - regs->eflags &= ~IF_MASK; + regs->flags |= TF_MASK; + regs->flags &= ~IF_MASK; /*single step inline if the instruction is an int3*/ if (p->opcode == BREAKPOINT_INSTRUCTION) - regs->eip = (unsigned long)p->addr; + regs->ip = (unsigned long)p->addr; else - regs->eip = (unsigned long)p->ainsn.insn; + regs->ip = (unsigned long)p->ainsn.insn; } /* Called with kretprobe_lock held */ void __kprobes arch_prepare_kretprobe(struct kretprobe_instance *ri, struct pt_regs *regs) { - unsigned long *sara = (unsigned long *)®s->esp; + unsigned long *sara = (unsigned long *)®s->sp; 
ri->ret_addr = (kprobe_opcode_t *) *sara; @@ -264,7 +264,7 @@ static int __kprobes kprobe_handler(struct pt_regs *regs) kprobe_opcode_t *addr; struct kprobe_ctlblk *kcb; - addr = (kprobe_opcode_t *)(regs->eip - sizeof(kprobe_opcode_t)); + addr = (kprobe_opcode_t *)(regs->ip - sizeof(kprobe_opcode_t)); /* * We don't want to be preempted for the entire @@ -279,8 +279,8 @@ static int __kprobes kprobe_handler(struct pt_regs *regs) if (p) { if (kcb->kprobe_status == KPROBE_HIT_SS && *p->ainsn.insn == BREAKPOINT_INSTRUCTION) { - regs->eflags &= ~TF_MASK; - regs->eflags |= kcb->kprobe_saved_eflags; + regs->flags &= ~TF_MASK; + regs->flags |= kcb->kprobe_saved_eflags; goto no_kprobe; } /* We have reentered the kprobe_handler(), since @@ -301,7 +301,7 @@ static int __kprobes kprobe_handler(struct pt_regs *regs) * another cpu right after we hit, no further * handling of this interrupt is appropriate */ - regs->eip -= sizeof(kprobe_opcode_t); + regs->ip -= sizeof(kprobe_opcode_t); ret = 1; goto no_kprobe; } @@ -325,7 +325,7 @@ static int __kprobes kprobe_handler(struct pt_regs *regs) * Back up over the (now missing) int3 and run * the original instruction. */ - regs->eip -= sizeof(kprobe_opcode_t); + regs->ip -= sizeof(kprobe_opcode_t); ret = 1; } /* Not one of ours: let kernel handle it */ @@ -344,7 +344,7 @@ ss_probe: if (p->ainsn.boostable == 1 && !p->post_handler){ /* Boost up -- we can execute copied instructions directly */ reset_current_kprobe(); - regs->eip = (unsigned long)p->ainsn.insn; + regs->ip = (unsigned long)p->ainsn.insn; preempt_enable_no_resched(); return 1; } @@ -368,7 +368,7 @@ no_kprobe: asm volatile ( ".global kretprobe_trampoline\n" "kretprobe_trampoline: \n" " pushf\n" - /* skip cs, eip, orig_eax */ + /* skip cs, ip, orig_ax */ " subl $12, %esp\n" " pushl %fs\n" " pushl %ds\n" @@ -382,10 +382,10 @@ no_kprobe: " pushl %ebx\n" " movl %esp, %eax\n" " call trampoline_handler\n" - /* move eflags to cs */ + /* move flags to cs */ " movl 52(%esp), %edx\n" " movl %edx, 48(%esp)\n" - /* save true return address on eflags */ + /* save true return address on flags */ " movl %eax, 52(%esp)\n" " popl %ebx\n" " popl %ecx\n" @@ -394,7 +394,7 @@ no_kprobe: " popl %edi\n" " popl %ebp\n" " popl %eax\n" - /* skip eip, orig_eax, es, ds, fs */ + /* skip ip, orig_ax, es, ds, fs */ " addl $20, %esp\n" " popf\n" " ret\n"); @@ -415,9 +415,9 @@ fastcall void *__kprobes trampoline_handler(struct pt_regs *regs) spin_lock_irqsave(&kretprobe_lock, flags); head = kretprobe_inst_table_head(current); /* fixup registers */ - regs->xcs = __KERNEL_CS | get_kernel_rpl(); - regs->eip = trampoline_address; - regs->orig_eax = 0xffffffff; + regs->cs = __KERNEL_CS | get_kernel_rpl(); + regs->ip = trampoline_address; + regs->orig_ax = 0xffffffff; /* * It is possible to have multiple instances associated with a given @@ -478,11 +478,11 @@ fastcall void *__kprobes trampoline_handler(struct pt_regs *regs) * interrupt. We have to fix up the stack as follows: * * 0) Except in the case of absolute or indirect jump or call instructions, - * the new eip is relative to the copied instruction. We need to make + * the new ip is relative to the copied instruction. We need to make * it relative to the original instruction. * * 1) If the single-stepped instruction was pushfl, then the TF and IF - * flags are set in the just-pushed eflags, and may need to be cleared. + * flags are set in the just-pushed flags, and may need to be cleared. 
* * 2) If the single-stepped instruction was a call, the return address * that is atop the stack is the address following the copied instruction. @@ -493,11 +493,11 @@ fastcall void *__kprobes trampoline_handler(struct pt_regs *regs) static void __kprobes resume_execution(struct kprobe *p, struct pt_regs *regs, struct kprobe_ctlblk *kcb) { - unsigned long *tos = (unsigned long *)®s->esp; + unsigned long *tos = (unsigned long *)®s->sp; unsigned long copy_eip = (unsigned long)p->ainsn.insn; unsigned long orig_eip = (unsigned long)p->addr; - regs->eflags &= ~TF_MASK; + regs->flags &= ~TF_MASK; switch (p->ainsn.insn[0]) { case 0x9c: /* pushfl */ *tos &= ~(TF_MASK | IF_MASK); @@ -508,8 +508,8 @@ static void __kprobes resume_execution(struct kprobe *p, case 0xca: case 0xcb: case 0xcf: - case 0xea: /* jmp absolute -- eip is correct */ - /* eip is already adjusted, no more changes required */ + case 0xea: /* jmp absolute -- ip is correct */ + /* ip is already adjusted, no more changes required */ p->ainsn.boostable = 1; goto no_change; case 0xe8: /* call relative - Fix return addr */ @@ -522,14 +522,14 @@ static void __kprobes resume_execution(struct kprobe *p, if ((p->ainsn.insn[1] & 0x30) == 0x10) { /* * call absolute, indirect - * Fix return addr; eip is correct. + * Fix return addr; ip is correct. * But this is not boostable */ *tos = orig_eip + (*tos - copy_eip); goto no_change; } else if (((p->ainsn.insn[1] & 0x31) == 0x20) || /* jmp near, absolute indirect */ ((p->ainsn.insn[1] & 0x31) == 0x21)) { /* jmp far, absolute indirect */ - /* eip is correct. And this is boostable */ + /* ip is correct. And this is boostable */ p->ainsn.boostable = 1; goto no_change; } @@ -538,21 +538,21 @@ static void __kprobes resume_execution(struct kprobe *p, } if (p->ainsn.boostable == 0) { - if ((regs->eip > copy_eip) && - (regs->eip - copy_eip) + 5 < MAX_INSN_SIZE) { + if ((regs->ip > copy_eip) && + (regs->ip - copy_eip) + 5 < MAX_INSN_SIZE) { /* * These instructions can be executed directly if it * jumps back to correct address. */ - set_jmp_op((void *)regs->eip, - (void *)orig_eip + (regs->eip - copy_eip)); + set_jmp_op((void *)regs->ip, + (void *)orig_eip + (regs->ip - copy_eip)); p->ainsn.boostable = 1; } else { p->ainsn.boostable = -1; } } - regs->eip = orig_eip + (regs->eip - copy_eip); + regs->ip = orig_eip + (regs->ip - copy_eip); no_change: restore_btf(); @@ -578,8 +578,8 @@ static int __kprobes post_kprobe_handler(struct pt_regs *regs) } resume_execution(cur, regs, kcb); - regs->eflags |= kcb->kprobe_saved_eflags; - trace_hardirqs_fixup_flags(regs->eflags); + regs->flags |= kcb->kprobe_saved_eflags; + trace_hardirqs_fixup_flags(regs->flags); /*Restore back the original saved kprobes variables and continue. */ if (kcb->kprobe_status == KPROBE_REENTER) { @@ -591,11 +591,11 @@ out: preempt_enable_no_resched(); /* - * if somebody else is singlestepping across a probe point, eflags + * if somebody else is singlestepping across a probe point, flags * will have TF set, in which case, continue the remaining processing * of do_debug, as if this is not a probe hit. */ - if (regs->eflags & TF_MASK) + if (regs->flags & TF_MASK) return 0; return 1; @@ -612,12 +612,12 @@ int __kprobes kprobe_fault_handler(struct pt_regs *regs, int trapnr) /* * We are here because the instruction being single * stepped caused a page fault. 
We reset the current - * kprobe and the eip points back to the probe address + * kprobe and the ip points back to the probe address * and allow the page fault handler to continue as a * normal page fault. */ - regs->eip = (unsigned long)cur->addr; - regs->eflags |= kcb->kprobe_old_eflags; + regs->ip = (unsigned long)cur->addr; + regs->flags |= kcb->kprobe_old_eflags; if (kcb->kprobe_status == KPROBE_REENTER) restore_previous_kprobe(kcb); else @@ -703,7 +703,7 @@ int __kprobes setjmp_pre_handler(struct kprobe *p, struct pt_regs *regs) struct kprobe_ctlblk *kcb = get_kprobe_ctlblk(); kcb->jprobe_saved_regs = *regs; - kcb->jprobe_saved_esp = ®s->esp; + kcb->jprobe_saved_esp = ®s->sp; addr = (unsigned long)(kcb->jprobe_saved_esp); /* @@ -715,9 +715,9 @@ int __kprobes setjmp_pre_handler(struct kprobe *p, struct pt_regs *regs) */ memcpy(kcb->jprobes_stack, (kprobe_opcode_t *)addr, MIN_STACK_SIZE(addr)); - regs->eflags &= ~IF_MASK; + regs->flags &= ~IF_MASK; trace_hardirqs_off(); - regs->eip = (unsigned long)(jp->entry); + regs->ip = (unsigned long)(jp->entry); return 1; } @@ -736,15 +736,15 @@ void __kprobes jprobe_return(void) int __kprobes longjmp_break_handler(struct kprobe *p, struct pt_regs *regs) { struct kprobe_ctlblk *kcb = get_kprobe_ctlblk(); - u8 *addr = (u8 *) (regs->eip - 1); + u8 *addr = (u8 *) (regs->ip - 1); unsigned long stack_addr = (unsigned long)(kcb->jprobe_saved_esp); struct jprobe *jp = container_of(p, struct jprobe, kp); if ((addr > (u8 *) jprobe_return) && (addr < (u8 *) jprobe_return_end)) { - if (®s->esp != kcb->jprobe_saved_esp) { + if (®s->sp != kcb->jprobe_saved_esp) { struct pt_regs *saved_regs = &kcb->jprobe_saved_regs; - printk("current esp %p does not match saved esp %p\n", - ®s->esp, kcb->jprobe_saved_esp); + printk("current sp %p does not match saved sp %p\n", + ®s->sp, kcb->jprobe_saved_esp); printk("Saved registers for jprobe %p\n", jp); show_registers(saved_regs); printk("Current registers\n"); diff --git a/arch/x86/kernel/kprobes_64.c b/arch/x86/kernel/kprobes_64.c index 10d66e323c7d..f6837cd3bed5 100644 --- a/arch/x86/kernel/kprobes_64.c +++ b/arch/x86/kernel/kprobes_64.c @@ -251,7 +251,7 @@ static void __kprobes set_current_kprobe(struct kprobe *p, struct pt_regs *regs, { __get_cpu_var(current_kprobe) = p; kcb->kprobe_saved_rflags = kcb->kprobe_old_rflags - = (regs->eflags & (TF_MASK | IF_MASK)); + = (regs->flags & (TF_MASK | IF_MASK)); if (is_IF_modifier(p->ainsn.insn)) kcb->kprobe_saved_rflags &= ~IF_MASK; } @@ -271,20 +271,20 @@ static __always_inline void restore_btf(void) static void __kprobes prepare_singlestep(struct kprobe *p, struct pt_regs *regs) { clear_btf(); - regs->eflags |= TF_MASK; - regs->eflags &= ~IF_MASK; + regs->flags |= TF_MASK; + regs->flags &= ~IF_MASK; /*single step inline if the instruction is an int3*/ if (p->opcode == BREAKPOINT_INSTRUCTION) - regs->rip = (unsigned long)p->addr; + regs->ip = (unsigned long)p->addr; else - regs->rip = (unsigned long)p->ainsn.insn; + regs->ip = (unsigned long)p->ainsn.insn; } /* Called with kretprobe_lock held */ void __kprobes arch_prepare_kretprobe(struct kretprobe_instance *ri, struct pt_regs *regs) { - unsigned long *sara = (unsigned long *)regs->rsp; + unsigned long *sara = (unsigned long *)regs->sp; ri->ret_addr = (kprobe_opcode_t *) *sara; /* Replace the return addr with trampoline addr */ @@ -295,7 +295,7 @@ int __kprobes kprobe_handler(struct pt_regs *regs) { struct kprobe *p; int ret = 0; - kprobe_opcode_t *addr = (kprobe_opcode_t *)(regs->rip - sizeof(kprobe_opcode_t)); + 
kprobe_opcode_t *addr = (kprobe_opcode_t *)(regs->ip - sizeof(kprobe_opcode_t)); struct kprobe_ctlblk *kcb; /* @@ -311,8 +311,8 @@ int __kprobes kprobe_handler(struct pt_regs *regs) if (p) { if (kcb->kprobe_status == KPROBE_HIT_SS && *p->ainsn.insn == BREAKPOINT_INSTRUCTION) { - regs->eflags &= ~TF_MASK; - regs->eflags |= kcb->kprobe_saved_rflags; + regs->flags &= ~TF_MASK; + regs->flags |= kcb->kprobe_saved_rflags; goto no_kprobe; } else if (kcb->kprobe_status == KPROBE_HIT_SSDONE) { /* TODO: Provide re-entrancy from @@ -321,7 +321,7 @@ int __kprobes kprobe_handler(struct pt_regs *regs) * the instruction of the new probe. */ arch_disarm_kprobe(p); - regs->rip = (unsigned long)p->addr; + regs->ip = (unsigned long)p->addr; reset_current_kprobe(); ret = 1; } else { @@ -345,7 +345,7 @@ int __kprobes kprobe_handler(struct pt_regs *regs) * another cpu right after we hit, no further * handling of this interrupt is appropriate */ - regs->rip = (unsigned long)addr; + regs->ip = (unsigned long)addr; ret = 1; goto no_kprobe; } @@ -369,7 +369,7 @@ int __kprobes kprobe_handler(struct pt_regs *regs) * Back up over the (now missing) int3 and run * the original instruction. */ - regs->rip = (unsigned long)addr; + regs->ip = (unsigned long)addr; ret = 1; } /* Not one of ours: let kernel handle it */ @@ -454,7 +454,7 @@ int __kprobes trampoline_probe_handler(struct kprobe *p, struct pt_regs *regs) } kretprobe_assert(ri, orig_ret_address, trampoline_address); - regs->rip = orig_ret_address; + regs->ip = orig_ret_address; reset_current_kprobe(); spin_unlock_irqrestore(&kretprobe_lock, flags); @@ -484,11 +484,11 @@ int __kprobes trampoline_probe_handler(struct kprobe *p, struct pt_regs *regs) * interrupt. We have to fix up the stack as follows: * * 0) Except in the case of absolute or indirect jump or call instructions, - * the new rip is relative to the copied instruction. We need to make + * the new ip is relative to the copied instruction. We need to make * it relative to the original instruction. * * 1) If the single-stepped instruction was pushfl, then the TF and IF - * flags are set in the just-pushed eflags, and may need to be cleared. + * flags are set in the just-pushed flags, and may need to be cleared. * * 2) If the single-stepped instruction was a call, the return address * that is atop the stack is the address following the copied instruction. 
@@ -497,7 +497,7 @@ int __kprobes trampoline_probe_handler(struct kprobe *p, struct pt_regs *regs) static void __kprobes resume_execution(struct kprobe *p, struct pt_regs *regs, struct kprobe_ctlblk *kcb) { - unsigned long *tos = (unsigned long *)regs->rsp; + unsigned long *tos = (unsigned long *)regs->sp; unsigned long copy_rip = (unsigned long)p->ainsn.insn; unsigned long orig_rip = (unsigned long)p->addr; kprobe_opcode_t *insn = p->ainsn.insn; @@ -506,7 +506,7 @@ static void __kprobes resume_execution(struct kprobe *p, if (*insn >= 0x40 && *insn <= 0x4f) insn++; - regs->eflags &= ~TF_MASK; + regs->flags &= ~TF_MASK; switch (*insn) { case 0x9c: /* pushfl */ *tos &= ~(TF_MASK | IF_MASK); @@ -538,7 +538,8 @@ static void __kprobes resume_execution(struct kprobe *p, break; } - regs->rip = orig_rip + (regs->rip - copy_rip); + regs->ip = orig_rip + (regs->ip - copy_rip); + no_change: restore_btf(); @@ -559,8 +560,8 @@ int __kprobes post_kprobe_handler(struct pt_regs *regs) } resume_execution(cur, regs, kcb); - regs->eflags |= kcb->kprobe_saved_rflags; - trace_hardirqs_fixup_flags(regs->eflags); + regs->flags |= kcb->kprobe_saved_rflags; + trace_hardirqs_fixup_flags(regs->flags); /* Restore the original saved kprobes variables and continue. */ if (kcb->kprobe_status == KPROBE_REENTER) { @@ -572,11 +573,11 @@ out: preempt_enable_no_resched(); /* - * if somebody else is singlestepping across a probe point, eflags + * if somebody else is singlestepping across a probe point, flags * will have TF set, in which case, continue the remaining processing * of do_debug, as if this is not a probe hit. */ - if (regs->eflags & TF_MASK) + if (regs->flags & TF_MASK) return 0; return 1; @@ -594,12 +595,12 @@ int __kprobes kprobe_fault_handler(struct pt_regs *regs, int trapnr) /* * We are here because the instruction being single * stepped caused a page fault. We reset the current - * kprobe and the rip points back to the probe address + * kprobe and the ip points back to the probe address * and allow the page fault handler to continue as a * normal page fault. */ - regs->rip = (unsigned long)cur->addr; - regs->eflags |= kcb->kprobe_old_rflags; + regs->ip = (unsigned long)cur->addr; + regs->flags |= kcb->kprobe_old_rflags; if (kcb->kprobe_status == KPROBE_REENTER) restore_previous_kprobe(kcb); else @@ -629,9 +630,9 @@ int __kprobes kprobe_fault_handler(struct pt_regs *regs, int trapnr) * In case the user-specified fault handler returned * zero, try to fix up. 
*/ - fixup = search_exception_tables(regs->rip); + fixup = search_exception_tables(regs->ip); if (fixup) { - regs->rip = fixup->fixup; + regs->ip = fixup->fixup; return 1; } @@ -688,7 +689,7 @@ int __kprobes setjmp_pre_handler(struct kprobe *p, struct pt_regs *regs) struct kprobe_ctlblk *kcb = get_kprobe_ctlblk(); kcb->jprobe_saved_regs = *regs; - kcb->jprobe_saved_rsp = (long *) regs->rsp; + kcb->jprobe_saved_rsp = (long *) regs->sp; addr = (unsigned long)(kcb->jprobe_saved_rsp); /* * As Linus pointed out, gcc assumes that the callee @@ -699,9 +700,9 @@ int __kprobes setjmp_pre_handler(struct kprobe *p, struct pt_regs *regs) */ memcpy(kcb->jprobes_stack, (kprobe_opcode_t *)addr, MIN_STACK_SIZE(addr)); - regs->eflags &= ~IF_MASK; + regs->flags &= ~IF_MASK; trace_hardirqs_off(); - regs->rip = (unsigned long)(jp->entry); + regs->ip = (unsigned long)(jp->entry); return 1; } @@ -720,15 +721,15 @@ void __kprobes jprobe_return(void) int __kprobes longjmp_break_handler(struct kprobe *p, struct pt_regs *regs) { struct kprobe_ctlblk *kcb = get_kprobe_ctlblk(); - u8 *addr = (u8 *) (regs->rip - 1); + u8 *addr = (u8 *) (regs->ip - 1); unsigned long stack_addr = (unsigned long)(kcb->jprobe_saved_rsp); struct jprobe *jp = container_of(p, struct jprobe, kp); if ((addr > (u8 *) jprobe_return) && (addr < (u8 *) jprobe_return_end)) { - if ((unsigned long *)regs->rsp != kcb->jprobe_saved_rsp) { + if ((unsigned long *)regs->sp != kcb->jprobe_saved_rsp) { struct pt_regs *saved_regs = &kcb->jprobe_saved_regs; - printk("current rsp %p does not match saved rsp %p\n", - (long *)regs->rsp, kcb->jprobe_saved_rsp); + printk("current sp %p does not match saved sp %p\n", + (long *)regs->sp, kcb->jprobe_saved_rsp); printk("Saved registers for jprobe %p\n", jp); show_registers(saved_regs); printk("Current registers\n"); diff --git a/arch/x86/kernel/process_32.c b/arch/x86/kernel/process_32.c index d5462f228daf..c9f28e02e86d 100644 --- a/arch/x86/kernel/process_32.c +++ b/arch/x86/kernel/process_32.c @@ -265,13 +265,13 @@ EXPORT_SYMBOL_GPL(cpu_idle_wait); * New with Core Duo processors, MWAIT can take some hints based on CPU * capability. 
*/ -void mwait_idle_with_hints(unsigned long eax, unsigned long ecx) +void mwait_idle_with_hints(unsigned long ax, unsigned long cx) { if (!need_resched()) { __monitor((void *)¤t_thread_info()->flags, 0, 0); smp_mb(); if (!need_resched()) - __mwait(eax, ecx); + __mwait(ax, cx); } } @@ -320,15 +320,15 @@ void __show_registers(struct pt_regs *regs, int all) { unsigned long cr0 = 0L, cr2 = 0L, cr3 = 0L, cr4 = 0L; unsigned long d0, d1, d2, d3, d6, d7; - unsigned long esp; + unsigned long sp; unsigned short ss, gs; if (user_mode_vm(regs)) { - esp = regs->esp; - ss = regs->xss & 0xffff; + sp = regs->sp; + ss = regs->ss & 0xffff; savesegment(gs, gs); } else { - esp = (unsigned long) (®s->esp); + sp = (unsigned long) (®s->sp); savesegment(ss, ss); savesegment(gs, gs); } @@ -341,17 +341,17 @@ void __show_registers(struct pt_regs *regs, int all) init_utsname()->version); printk("EIP: %04x:[<%08lx>] EFLAGS: %08lx CPU: %d\n", - 0xffff & regs->xcs, regs->eip, regs->eflags, + 0xffff & regs->cs, regs->ip, regs->flags, smp_processor_id()); - print_symbol("EIP is at %s\n", regs->eip); + print_symbol("EIP is at %s\n", regs->ip); printk("EAX: %08lx EBX: %08lx ECX: %08lx EDX: %08lx\n", - regs->eax, regs->ebx, regs->ecx, regs->edx); + regs->ax, regs->bx, regs->cx, regs->dx); printk("ESI: %08lx EDI: %08lx EBP: %08lx ESP: %08lx\n", - regs->esi, regs->edi, regs->ebp, esp); + regs->si, regs->di, regs->bp, sp); printk(" DS: %04x ES: %04x FS: %04x GS: %04x SS: %04x\n", - regs->xds & 0xffff, regs->xes & 0xffff, - regs->xfs & 0xffff, gs, ss); + regs->ds & 0xffff, regs->es & 0xffff, + regs->fs & 0xffff, gs, ss); if (!all) return; @@ -379,12 +379,12 @@ void __show_registers(struct pt_regs *regs, int all) void show_regs(struct pt_regs *regs) { __show_registers(regs, 1); - show_trace(NULL, regs, ®s->esp); + show_trace(NULL, regs, ®s->sp); } /* - * This gets run with %ebx containing the - * function to call, and %edx containing + * This gets run with %bx containing the + * function to call, and %dx containing * the "args". */ extern void kernel_thread_helper(void); @@ -398,16 +398,16 @@ int kernel_thread(int (*fn)(void *), void * arg, unsigned long flags) memset(®s, 0, sizeof(regs)); - regs.ebx = (unsigned long) fn; - regs.edx = (unsigned long) arg; + regs.bx = (unsigned long) fn; + regs.dx = (unsigned long) arg; - regs.xds = __USER_DS; - regs.xes = __USER_DS; - regs.xfs = __KERNEL_PERCPU; - regs.orig_eax = -1; - regs.eip = (unsigned long) kernel_thread_helper; - regs.xcs = __KERNEL_CS | get_kernel_rpl(); - regs.eflags = X86_EFLAGS_IF | X86_EFLAGS_SF | X86_EFLAGS_PF | 0x2; + regs.ds = __USER_DS; + regs.es = __USER_DS; + regs.fs = __KERNEL_PERCPU; + regs.orig_ax = -1; + regs.ip = (unsigned long) kernel_thread_helper; + regs.cs = __KERNEL_CS | get_kernel_rpl(); + regs.flags = X86_EFLAGS_IF | X86_EFLAGS_SF | X86_EFLAGS_PF | 0x2; /* Ok, create the new process.. 
*/ return do_fork(flags | CLONE_VM | CLONE_UNTRACED, 0, ®s, 0, NULL, NULL); @@ -470,7 +470,7 @@ void prepare_to_copy(struct task_struct *tsk) unlazy_fpu(tsk); } -int copy_thread(int nr, unsigned long clone_flags, unsigned long esp, +int copy_thread(int nr, unsigned long clone_flags, unsigned long sp, unsigned long unused, struct task_struct * p, struct pt_regs * regs) { @@ -480,8 +480,8 @@ int copy_thread(int nr, unsigned long clone_flags, unsigned long esp, childregs = task_pt_regs(p); *childregs = *regs; - childregs->eax = 0; - childregs->esp = esp; + childregs->ax = 0; + childregs->sp = sp; p->thread.esp = (unsigned long) childregs; p->thread.esp0 = (unsigned long) (childregs+1); @@ -508,7 +508,7 @@ int copy_thread(int nr, unsigned long clone_flags, unsigned long esp, */ if (clone_flags & CLONE_SETTLS) err = do_set_thread_area(p, -1, - (struct user_desc __user *)childregs->esi, 0); + (struct user_desc __user *)childregs->si, 0); if (err && p->thread.io_bitmap_ptr) { kfree(p->thread.io_bitmap_ptr); @@ -527,7 +527,7 @@ void dump_thread(struct pt_regs * regs, struct user * dump) /* changed the size calculations - should hopefully work better. lbt */ dump->magic = CMAGIC; dump->start_code = 0; - dump->start_stack = regs->esp & ~(PAGE_SIZE - 1); + dump->start_stack = regs->sp & ~(PAGE_SIZE - 1); dump->u_tsize = ((unsigned long) current->mm->end_code) >> PAGE_SHIFT; dump->u_dsize = ((unsigned long) (current->mm->brk + (PAGE_SIZE-1))) >> PAGE_SHIFT; dump->u_dsize -= dump->u_tsize; @@ -538,23 +538,23 @@ void dump_thread(struct pt_regs * regs, struct user * dump) if (dump->start_stack < TASK_SIZE) dump->u_ssize = ((unsigned long) (TASK_SIZE - dump->start_stack)) >> PAGE_SHIFT; - dump->regs.ebx = regs->ebx; - dump->regs.ecx = regs->ecx; - dump->regs.edx = regs->edx; - dump->regs.esi = regs->esi; - dump->regs.edi = regs->edi; - dump->regs.ebp = regs->ebp; - dump->regs.eax = regs->eax; - dump->regs.ds = regs->xds; - dump->regs.es = regs->xes; - dump->regs.fs = regs->xfs; + dump->regs.ebx = regs->bx; + dump->regs.ecx = regs->cx; + dump->regs.edx = regs->dx; + dump->regs.esi = regs->si; + dump->regs.edi = regs->di; + dump->regs.ebp = regs->bp; + dump->regs.eax = regs->ax; + dump->regs.ds = regs->ds; + dump->regs.es = regs->es; + dump->regs.fs = regs->fs; savesegment(gs,dump->regs.gs); - dump->regs.orig_eax = regs->orig_eax; - dump->regs.eip = regs->eip; - dump->regs.cs = regs->xcs; - dump->regs.eflags = regs->eflags; - dump->regs.esp = regs->esp; - dump->regs.ss = regs->xss; + dump->regs.orig_eax = regs->orig_ax; + dump->regs.eip = regs->ip; + dump->regs.cs = regs->cs; + dump->regs.eflags = regs->flags; + dump->regs.esp = regs->sp; + dump->regs.ss = regs->ss; dump->u_fpvalid = dump_fpu (regs, &dump->i387); } @@ -566,10 +566,10 @@ EXPORT_SYMBOL(dump_thread); int dump_task_regs(struct task_struct *tsk, elf_gregset_t *regs) { struct pt_regs ptregs = *task_pt_regs(tsk); - ptregs.xcs &= 0xffff; - ptregs.xds &= 0xffff; - ptregs.xes &= 0xffff; - ptregs.xss &= 0xffff; + ptregs.cs &= 0xffff; + ptregs.ds &= 0xffff; + ptregs.es &= 0xffff; + ptregs.ss &= 0xffff; elf_core_copy_regs(regs, &ptregs); @@ -684,7 +684,7 @@ __switch_to_xtra(struct task_struct *prev_p, struct task_struct *next_p, * More important, however, is the fact that this allows us much * more flexibility. * - * The return value (in %eax) will be the "prev" task after + * The return value (in %ax) will be the "prev" task after * the task-switch, and shows up in ret_from_fork in entry.S, * for example. 
*/ @@ -771,7 +771,7 @@ struct task_struct fastcall * __switch_to(struct task_struct *prev_p, struct tas asmlinkage int sys_fork(struct pt_regs regs) { - return do_fork(SIGCHLD, regs.esp, ®s, 0, NULL, NULL); + return do_fork(SIGCHLD, regs.sp, ®s, 0, NULL, NULL); } asmlinkage int sys_clone(struct pt_regs regs) @@ -780,12 +780,12 @@ asmlinkage int sys_clone(struct pt_regs regs) unsigned long newsp; int __user *parent_tidptr, *child_tidptr; - clone_flags = regs.ebx; - newsp = regs.ecx; - parent_tidptr = (int __user *)regs.edx; - child_tidptr = (int __user *)regs.edi; + clone_flags = regs.bx; + newsp = regs.cx; + parent_tidptr = (int __user *)regs.dx; + child_tidptr = (int __user *)regs.di; if (!newsp) - newsp = regs.esp; + newsp = regs.sp; return do_fork(clone_flags, newsp, ®s, 0, parent_tidptr, child_tidptr); } @@ -801,7 +801,7 @@ asmlinkage int sys_clone(struct pt_regs regs) */ asmlinkage int sys_vfork(struct pt_regs regs) { - return do_fork(CLONE_VFORK | CLONE_VM | SIGCHLD, regs.esp, ®s, 0, NULL, NULL); + return do_fork(CLONE_VFORK | CLONE_VM | SIGCHLD, regs.sp, ®s, 0, NULL, NULL); } /* @@ -812,13 +812,13 @@ asmlinkage int sys_execve(struct pt_regs regs) int error; char * filename; - filename = getname((char __user *) regs.ebx); + filename = getname((char __user *) regs.bx); error = PTR_ERR(filename); if (IS_ERR(filename)) goto out; error = do_execve(filename, - (char __user * __user *) regs.ecx, - (char __user * __user *) regs.edx, + (char __user * __user *) regs.cx, + (char __user * __user *) regs.dx, ®s); if (error == 0) { /* Make sure we don't return using sysenter.. */ @@ -834,24 +834,24 @@ out: unsigned long get_wchan(struct task_struct *p) { - unsigned long ebp, esp, eip; + unsigned long bp, sp, ip; unsigned long stack_page; int count = 0; if (!p || p == current || p->state == TASK_RUNNING) return 0; stack_page = (unsigned long)task_stack_page(p); - esp = p->thread.esp; - if (!stack_page || esp < stack_page || esp > top_esp+stack_page) + sp = p->thread.esp; + if (!stack_page || sp < stack_page || sp > top_esp+stack_page) return 0; - /* include/asm-i386/system.h:switch_to() pushes ebp last. */ - ebp = *(unsigned long *) esp; + /* include/asm-i386/system.h:switch_to() pushes bp last. */ + bp = *(unsigned long *) sp; do { - if (ebp < stack_page || ebp > top_ebp+stack_page) + if (bp < stack_page || bp > top_ebp+stack_page) return 0; - eip = *(unsigned long *) (ebp+4); - if (!in_sched_functions(eip)) - return eip; - ebp = *(unsigned long *) ebp; + ip = *(unsigned long *) (bp+4); + if (!in_sched_functions(ip)) + return ip; + bp = *(unsigned long *) bp; } while (count++ < 16); return 0; } diff --git a/arch/x86/kernel/process_64.c b/arch/x86/kernel/process_64.c index ae5eca17aa3c..efbb1a2eab97 100644 --- a/arch/x86/kernel/process_64.c +++ b/arch/x86/kernel/process_64.c @@ -257,13 +257,13 @@ void cpu_idle(void) * New with Core Duo processors, MWAIT can take some hints based on CPU * capability. 
*/ -void mwait_idle_with_hints(unsigned long eax, unsigned long ecx) +void mwait_idle_with_hints(unsigned long ax, unsigned long cx) { if (!need_resched()) { __monitor((void *)¤t_thread_info()->flags, 0, 0); smp_mb(); if (!need_resched()) - __mwait(eax, ecx); + __mwait(ax, cx); } } @@ -330,16 +330,16 @@ void __show_regs(struct pt_regs * regs) init_utsname()->release, (int)strcspn(init_utsname()->version, " "), init_utsname()->version); - printk("RIP: %04lx:[<%016lx>] ", regs->cs & 0xffff, regs->rip); - printk_address(regs->rip); - printk("RSP: %04lx:%016lx EFLAGS: %08lx\n", regs->ss, regs->rsp, - regs->eflags); + printk("RIP: %04lx:[<%016lx>] ", regs->cs & 0xffff, regs->ip); + printk_address(regs->ip); + printk("RSP: %04lx:%016lx EFLAGS: %08lx\n", regs->ss, regs->sp, + regs->flags); printk("RAX: %016lx RBX: %016lx RCX: %016lx\n", - regs->rax, regs->rbx, regs->rcx); + regs->ax, regs->bx, regs->cx); printk("RDX: %016lx RSI: %016lx RDI: %016lx\n", - regs->rdx, regs->rsi, regs->rdi); + regs->dx, regs->si, regs->di); printk("RBP: %016lx R08: %016lx R09: %016lx\n", - regs->rbp, regs->r8, regs->r9); + regs->bp, regs->r8, regs->r9); printk("R10: %016lx R11: %016lx R12: %016lx\n", regs->r10, regs->r11, regs->r12); printk("R13: %016lx R14: %016lx R15: %016lx\n", @@ -476,7 +476,7 @@ void prepare_to_copy(struct task_struct *tsk) unlazy_fpu(tsk); } -int copy_thread(int nr, unsigned long clone_flags, unsigned long rsp, +int copy_thread(int nr, unsigned long clone_flags, unsigned long sp, unsigned long unused, struct task_struct * p, struct pt_regs * regs) { @@ -488,10 +488,10 @@ int copy_thread(int nr, unsigned long clone_flags, unsigned long rsp, (THREAD_SIZE + task_stack_page(p))) - 1; *childregs = *regs; - childregs->rax = 0; - childregs->rsp = rsp; - if (rsp == ~0UL) - childregs->rsp = (unsigned long)childregs; + childregs->ax = 0; + childregs->sp = sp; + if (sp == ~0UL) + childregs->sp = (unsigned long)childregs; p->thread.rsp = (unsigned long) childregs; p->thread.rsp0 = (unsigned long) (childregs+1); @@ -525,7 +525,7 @@ int copy_thread(int nr, unsigned long clone_flags, unsigned long rsp, #ifdef CONFIG_IA32_EMULATION if (test_thread_flag(TIF_IA32)) err = do_set_thread_area(p, -1, - (struct user_desc __user *)childregs->rsi, 0); + (struct user_desc __user *)childregs->si, 0); else #endif err = do_arch_prctl(p, ARCH_SET_FS, childregs->r8); @@ -732,7 +732,7 @@ void set_personality_64bit(void) asmlinkage long sys_fork(struct pt_regs *regs) { - return do_fork(SIGCHLD, regs->rsp, regs, 0, NULL, NULL); + return do_fork(SIGCHLD, regs->sp, regs, 0, NULL, NULL); } asmlinkage long @@ -740,7 +740,7 @@ sys_clone(unsigned long clone_flags, unsigned long newsp, void __user *parent_tid, void __user *child_tid, struct pt_regs *regs) { if (!newsp) - newsp = regs->rsp; + newsp = regs->sp; return do_fork(clone_flags, newsp, regs, 0, parent_tid, child_tid); } @@ -756,14 +756,14 @@ sys_clone(unsigned long clone_flags, unsigned long newsp, */ asmlinkage long sys_vfork(struct pt_regs *regs) { - return do_fork(CLONE_VFORK | CLONE_VM | SIGCHLD, regs->rsp, regs, 0, + return do_fork(CLONE_VFORK | CLONE_VM | SIGCHLD, regs->sp, regs, 0, NULL, NULL); } unsigned long get_wchan(struct task_struct *p) { unsigned long stack; - u64 fp,rip; + u64 fp,ip; int count = 0; if (!p || p == current || p->state==TASK_RUNNING) @@ -776,9 +776,9 @@ unsigned long get_wchan(struct task_struct *p) if (fp < (unsigned long)stack || fp > (unsigned long)stack+THREAD_SIZE) return 0; - rip = *(u64 *)(fp+8); - if (!in_sched_functions(rip)) - return rip; 
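The get_wchan() hunks on both widths walk the saved frame-pointer chain: the caller's frame pointer is stored at *bp and the return address sits just above it (bp+4 on i386, fp+8 on x86-64, that is bp + sizeof(long)). A minimal user-space sketch of the same walk, assuming the binary keeps frame pointers (build with -O0 -fno-omit-frame-pointer):

#include <stdio.h>

static void __attribute__((noinline)) walk_frames(void)
{
        unsigned long bp = (unsigned long)__builtin_frame_address(0);
        int count = 0;

        while (bp && count++ < 3) {
                unsigned long next = *(unsigned long *)bp;                  /* saved caller bp */
                unsigned long ret = *(unsigned long *)(bp + sizeof(long));  /* return address */

                printf("frame %d: return address %#lx\n", count, ret);
                if (next <= bp)         /* stack grows down; bail on a broken chain */
                        break;
                bp = next;
        }
}

static void __attribute__((noinline)) leaf(void)  { walk_frames(); }
static void __attribute__((noinline)) outer(void) { leaf(); }

int main(void)
{
        outer();
        return 0;
}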
+ ip = *(u64 *)(fp+8); + if (!in_sched_functions(ip)) + return ip; fp = *(u64 *)fp; } while (count++ < 16); return 0; diff --git a/arch/x86/kernel/ptrace_32.c b/arch/x86/kernel/ptrace_32.c index 512f8412b799..f81e2f1827d4 100644 --- a/arch/x86/kernel/ptrace_32.c +++ b/arch/x86/kernel/ptrace_32.c @@ -39,10 +39,10 @@ static long *pt_regs_access(struct pt_regs *regs, unsigned long regno) { - BUILD_BUG_ON(offsetof(struct pt_regs, ebx) != 0); + BUILD_BUG_ON(offsetof(struct pt_regs, bx) != 0); if (regno > FS) --regno; - return ®s->ebx + regno; + return ®s->bx + regno; } static int putreg(struct task_struct *child, @@ -80,7 +80,7 @@ static int putreg(struct task_struct *child, clear_tsk_thread_flag(child, TIF_FORCED_TF); else if (test_tsk_thread_flag(child, TIF_FORCED_TF)) value |= X86_EFLAGS_TF; - value |= regs->eflags & ~FLAG_MASK; + value |= regs->flags & ~FLAG_MASK; break; } *pt_regs_access(regs, regno) = value; @@ -98,7 +98,7 @@ static unsigned long getreg(struct task_struct *child, unsigned long regno) /* * If the debugger set TF, hide it from the readout. */ - retval = regs->eflags; + retval = regs->flags; if (test_tsk_thread_flag(child, TIF_FORCED_TF)) retval &= ~X86_EFLAGS_TF; break; @@ -369,8 +369,8 @@ void send_sigtrap(struct task_struct *tsk, struct pt_regs *regs, int error_code) info.si_signo = SIGTRAP; info.si_code = TRAP_BRKPT; - /* User-mode eip? */ - info.si_addr = user_mode_vm(regs) ? (void __user *) regs->eip : NULL; + /* User-mode ip? */ + info.si_addr = user_mode_vm(regs) ? (void __user *) regs->ip : NULL; /* Send us the fake SIGTRAP */ force_sig_info(SIGTRAP, &info, tsk); @@ -392,12 +392,12 @@ int do_syscall_trace(struct pt_regs *regs, int entryexit) /* do the secure computing check first */ if (!entryexit) - secure_computing(regs->orig_eax); + secure_computing(regs->orig_ax); if (unlikely(current->audit_context)) { if (entryexit) - audit_syscall_exit(AUDITSC_RESULT(regs->eax), - regs->eax); + audit_syscall_exit(AUDITSC_RESULT(regs->ax), + regs->ax); /* Debug traps, when using PTRACE_SINGLESTEP, must be sent only * on the syscall exit path. Normally, when TIF_SYSCALL_AUDIT is * not used, entry.S will call us only on syscall exit, not @@ -445,13 +445,13 @@ int do_syscall_trace(struct pt_regs *regs, int entryexit) ret = is_sysemu; out: if (unlikely(current->audit_context) && !entryexit) - audit_syscall_entry(AUDIT_ARCH_I386, regs->orig_eax, - regs->ebx, regs->ecx, regs->edx, regs->esi); + audit_syscall_entry(AUDIT_ARCH_I386, regs->orig_ax, + regs->bx, regs->cx, regs->dx, regs->si); if (ret == 0) return 0; - regs->orig_eax = -1; /* force skip of syscall restarting */ + regs->orig_ax = -1; /* force skip of syscall restarting */ if (unlikely(current->audit_context)) - audit_syscall_exit(AUDITSC_RESULT(regs->eax), regs->eax); + audit_syscall_exit(AUDITSC_RESULT(regs->ax), regs->ax); return 1; } diff --git a/arch/x86/kernel/ptrace_64.c b/arch/x86/kernel/ptrace_64.c index 4ba66d8af717..bee20bb1a6c0 100644 --- a/arch/x86/kernel/ptrace_64.c +++ b/arch/x86/kernel/ptrace_64.c @@ -119,7 +119,7 @@ static int putreg(struct task_struct *child, clear_tsk_thread_flag(child, TIF_FORCED_TF); else if (test_tsk_thread_flag(child, TIF_FORCED_TF)) value |= X86_EFLAGS_TF; - value |= regs->eflags & ~FLAG_MASK; + value |= regs->flags & ~FLAG_MASK; break; case offsetof(struct user_regs_struct,cs): if ((value & 3) != 3) @@ -168,7 +168,7 @@ static unsigned long getreg(struct task_struct *child, unsigned long regno) /* * If the debugger set TF, hide it from the readout. 
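pt_regs_access() in the ptrace_32.c hunk leans on the layout implied by the rename: bx is the first member and the general registers are consecutive longs, so a ptrace register number indexes straight off &regs->bx, with the BUILD_BUG_ON() documenting the offset-zero assumption. A small sketch of the same pattern with a mock struct (not the kernel layout):

#include <assert.h>
#include <stddef.h>
#include <stdio.h>

struct mock_regs { long bx, cx, dx, si, di, bp, ax; };   /* illustrative layout */

static long *reg_slot(struct mock_regs *regs, unsigned int regno)
{
        /* Same guard as the BUILD_BUG_ON() above: bx must sit at offset 0. */
        static_assert(offsetof(struct mock_regs, bx) == 0, "bx must be first");
        return &regs->bx + regno;
}

int main(void)
{
        struct mock_regs regs = { .bx = 7, .ax = 42 };

        printf("slot 0 = %ld, slot 6 = %ld\n", *reg_slot(&regs, 0), *reg_slot(&regs, 6));
        return 0;
}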
*/ - val = regs->eflags; + val = regs->flags; if (test_tsk_thread_flag(child, TIF_IA32)) val &= 0xffffffff; if (test_tsk_thread_flag(child, TIF_FORCED_TF)) @@ -383,9 +383,9 @@ static void syscall_trace(struct pt_regs *regs) { #if 0 - printk("trace %s rip %lx rsp %lx rax %d origrax %d caller %lx tiflags %x ptrace %x\n", + printk("trace %s ip %lx sp %lx ax %d origrax %d caller %lx tiflags %x ptrace %x\n", current->comm, - regs->rip, regs->rsp, regs->rax, regs->orig_rax, __builtin_return_address(0), + regs->ip, regs->sp, regs->ax, regs->orig_ax, __builtin_return_address(0), current_thread_info()->flags, current->ptrace); #endif @@ -405,7 +405,7 @@ static void syscall_trace(struct pt_regs *regs) asmlinkage void syscall_trace_enter(struct pt_regs *regs) { /* do the secure computing check first */ - secure_computing(regs->orig_rax); + secure_computing(regs->orig_ax); if (test_thread_flag(TIF_SYSCALL_TRACE) && (current->ptrace & PT_PTRACED)) @@ -414,14 +414,14 @@ asmlinkage void syscall_trace_enter(struct pt_regs *regs) if (unlikely(current->audit_context)) { if (test_thread_flag(TIF_IA32)) { audit_syscall_entry(AUDIT_ARCH_I386, - regs->orig_rax, - regs->rbx, regs->rcx, - regs->rdx, regs->rsi); + regs->orig_ax, + regs->bx, regs->cx, + regs->dx, regs->si); } else { audit_syscall_entry(AUDIT_ARCH_X86_64, - regs->orig_rax, - regs->rdi, regs->rsi, - regs->rdx, regs->r10); + regs->orig_ax, + regs->di, regs->si, + regs->dx, regs->r10); } } } @@ -429,7 +429,7 @@ asmlinkage void syscall_trace_enter(struct pt_regs *regs) asmlinkage void syscall_trace_leave(struct pt_regs *regs) { if (unlikely(current->audit_context)) - audit_syscall_exit(AUDITSC_RESULT(regs->rax), regs->rax); + audit_syscall_exit(AUDITSC_RESULT(regs->ax), regs->ax); if ((test_thread_flag(TIF_SYSCALL_TRACE) || test_thread_flag(TIF_SINGLESTEP)) diff --git a/arch/x86/kernel/signal_32.c b/arch/x86/kernel/signal_32.c index 0a7c812212c9..40fd3515ccf1 100644 --- a/arch/x86/kernel/signal_32.c +++ b/arch/x86/kernel/signal_32.c @@ -82,14 +82,14 @@ sys_sigaction(int sig, const struct old_sigaction __user *act, } asmlinkage int -sys_sigaltstack(unsigned long ebx) +sys_sigaltstack(unsigned long bx) { /* This is needed to make gcc realize it doesn't own the "struct pt_regs" */ - struct pt_regs *regs = (struct pt_regs *)&ebx; - const stack_t __user *uss = (const stack_t __user *)ebx; - stack_t __user *uoss = (stack_t __user *)regs->ecx; + struct pt_regs *regs = (struct pt_regs *)&bx; + const stack_t __user *uss = (const stack_t __user *)bx; + stack_t __user *uoss = (stack_t __user *)regs->cx; - return do_sigaltstack(uss, uoss, regs->esp); + return do_sigaltstack(uss, uoss, regs->sp); } @@ -105,17 +105,17 @@ restore_sigcontext(struct pt_regs *regs, struct sigcontext __user *sc, int *peax /* Always make any pending restarted system calls return -EINTR */ current_thread_info()->restart_block.fn = do_no_restart_syscall; -#define COPY(x) err |= __get_user(regs->x, &sc->x) +#define COPY(x) err |= __get_user(regs->x, &sc->e ## x) #define COPY_SEG(seg) \ { unsigned short tmp; \ err |= __get_user(tmp, &sc->seg); \ - regs->x##seg = tmp; } + regs->seg = tmp; } #define COPY_SEG_STRICT(seg) \ { unsigned short tmp; \ err |= __get_user(tmp, &sc->seg); \ - regs->x##seg = tmp|3; } + regs->seg = tmp|3; } #define GET_SEG(seg) \ { unsigned short tmp; \ @@ -131,22 +131,22 @@ restore_sigcontext(struct pt_regs *regs, struct sigcontext __user *sc, int *peax COPY_SEG(fs); COPY_SEG(es); COPY_SEG(ds); - COPY(edi); - COPY(esi); - COPY(ebp); - COPY(esp); - COPY(ebx); - COPY(edx); - 
COPY(ecx); - COPY(eip); + COPY(di); + COPY(si); + COPY(bp); + COPY(sp); + COPY(bx); + COPY(dx); + COPY(cx); + COPY(ip); COPY_SEG_STRICT(cs); COPY_SEG_STRICT(ss); { unsigned int tmpflags; err |= __get_user(tmpflags, &sc->eflags); - regs->eflags = (regs->eflags & ~FIX_EFLAGS) | (tmpflags & FIX_EFLAGS); - regs->orig_eax = -1; /* disable syscall checks */ + regs->flags = (regs->flags & ~FIX_EFLAGS) | (tmpflags & FIX_EFLAGS); + regs->orig_ax = -1; /* disable syscall checks */ } { @@ -175,9 +175,9 @@ badframe: asmlinkage int sys_sigreturn(unsigned long __unused) { struct pt_regs *regs = (struct pt_regs *) &__unused; - struct sigframe __user *frame = (struct sigframe __user *)(regs->esp - 8); + struct sigframe __user *frame = (struct sigframe __user *)(regs->sp - 8); sigset_t set; - int eax; + int ax; if (!access_ok(VERIFY_READ, frame, sizeof(*frame))) goto badframe; @@ -193,17 +193,17 @@ asmlinkage int sys_sigreturn(unsigned long __unused) recalc_sigpending(); spin_unlock_irq(¤t->sighand->siglock); - if (restore_sigcontext(regs, &frame->sc, &eax)) + if (restore_sigcontext(regs, &frame->sc, &ax)) goto badframe; - return eax; + return ax; badframe: if (show_unhandled_signals && printk_ratelimit()) - printk("%s%s[%d] bad frame in sigreturn frame:%p eip:%lx" - " esp:%lx oeax:%lx\n", + printk("%s%s[%d] bad frame in sigreturn frame:%p ip:%lx" + " sp:%lx oeax:%lx\n", task_pid_nr(current) > 1 ? KERN_INFO : KERN_EMERG, - current->comm, task_pid_nr(current), frame, regs->eip, - regs->esp, regs->orig_eax); + current->comm, task_pid_nr(current), frame, regs->ip, + regs->sp, regs->orig_ax); force_sig(SIGSEGV, current); return 0; @@ -212,9 +212,9 @@ badframe: asmlinkage int sys_rt_sigreturn(unsigned long __unused) { struct pt_regs *regs = (struct pt_regs *) &__unused; - struct rt_sigframe __user *frame = (struct rt_sigframe __user *)(regs->esp - 4); + struct rt_sigframe __user *frame = (struct rt_sigframe __user *)(regs->sp - 4); sigset_t set; - int eax; + int ax; if (!access_ok(VERIFY_READ, frame, sizeof(*frame))) goto badframe; @@ -227,13 +227,13 @@ asmlinkage int sys_rt_sigreturn(unsigned long __unused) recalc_sigpending(); spin_unlock_irq(¤t->sighand->siglock); - if (restore_sigcontext(regs, &frame->uc.uc_mcontext, &eax)) + if (restore_sigcontext(regs, &frame->uc.uc_mcontext, &ax)) goto badframe; - if (do_sigaltstack(&frame->uc.uc_stack, NULL, regs->esp) == -EFAULT) + if (do_sigaltstack(&frame->uc.uc_stack, NULL, regs->sp) == -EFAULT) goto badframe; - return eax; + return ax; badframe: force_sig(SIGSEGV, current); @@ -250,27 +250,27 @@ setup_sigcontext(struct sigcontext __user *sc, struct _fpstate __user *fpstate, { int tmp, err = 0; - err |= __put_user(regs->xfs, (unsigned int __user *)&sc->fs); + err |= __put_user(regs->fs, (unsigned int __user *)&sc->fs); savesegment(gs, tmp); err |= __put_user(tmp, (unsigned int __user *)&sc->gs); - err |= __put_user(regs->xes, (unsigned int __user *)&sc->es); - err |= __put_user(regs->xds, (unsigned int __user *)&sc->ds); - err |= __put_user(regs->edi, &sc->edi); - err |= __put_user(regs->esi, &sc->esi); - err |= __put_user(regs->ebp, &sc->ebp); - err |= __put_user(regs->esp, &sc->esp); - err |= __put_user(regs->ebx, &sc->ebx); - err |= __put_user(regs->edx, &sc->edx); - err |= __put_user(regs->ecx, &sc->ecx); - err |= __put_user(regs->eax, &sc->eax); + err |= __put_user(regs->es, (unsigned int __user *)&sc->es); + err |= __put_user(regs->ds, (unsigned int __user *)&sc->ds); + err |= __put_user(regs->di, &sc->edi); + err |= __put_user(regs->si, &sc->esi); + err |= 
__put_user(regs->bp, &sc->ebp); + err |= __put_user(regs->sp, &sc->esp); + err |= __put_user(regs->bx, &sc->ebx); + err |= __put_user(regs->dx, &sc->edx); + err |= __put_user(regs->cx, &sc->ecx); + err |= __put_user(regs->ax, &sc->eax); err |= __put_user(current->thread.trap_no, &sc->trapno); err |= __put_user(current->thread.error_code, &sc->err); - err |= __put_user(regs->eip, &sc->eip); - err |= __put_user(regs->xcs, (unsigned int __user *)&sc->cs); - err |= __put_user(regs->eflags, &sc->eflags); - err |= __put_user(regs->esp, &sc->esp_at_signal); - err |= __put_user(regs->xss, (unsigned int __user *)&sc->ss); + err |= __put_user(regs->ip, &sc->eip); + err |= __put_user(regs->cs, (unsigned int __user *)&sc->cs); + err |= __put_user(regs->flags, &sc->eflags); + err |= __put_user(regs->sp, &sc->esp_at_signal); + err |= __put_user(regs->ss, (unsigned int __user *)&sc->ss); tmp = save_i387(fpstate); if (tmp < 0) @@ -291,36 +291,36 @@ setup_sigcontext(struct sigcontext __user *sc, struct _fpstate __user *fpstate, static inline void __user * get_sigframe(struct k_sigaction *ka, struct pt_regs * regs, size_t frame_size) { - unsigned long esp; + unsigned long sp; /* Default to using normal stack */ - esp = regs->esp; + sp = regs->sp; /* * If we are on the alternate signal stack and would overflow it, don't. * Return an always-bogus address instead so we will die with SIGSEGV. */ - if (on_sig_stack(esp) && !likely(on_sig_stack(esp - frame_size))) + if (on_sig_stack(sp) && !likely(on_sig_stack(sp - frame_size))) return (void __user *) -1L; /* This is the X/Open sanctioned signal stack switching. */ if (ka->sa.sa_flags & SA_ONSTACK) { - if (sas_ss_flags(esp) == 0) - esp = current->sas_ss_sp + current->sas_ss_size; + if (sas_ss_flags(sp) == 0) + sp = current->sas_ss_sp + current->sas_ss_size; } /* This is the legacy signal stack switching. */ - else if ((regs->xss & 0xffff) != __USER_DS && + else if ((regs->ss & 0xffff) != __USER_DS && !(ka->sa.sa_flags & SA_RESTORER) && ka->sa.sa_restorer) { - esp = (unsigned long) ka->sa.sa_restorer; + sp = (unsigned long) ka->sa.sa_restorer; } - esp -= frame_size; + sp -= frame_size; /* Align the stack pointer according to the i386 ABI, * i.e. so that on function entry ((sp + 4) & 15) == 0. */ - esp = ((esp + 4) & -16ul) - 4; - return (void __user *) esp; + sp = ((sp + 4) & -16ul) - 4; + return (void __user *) sp; } /* These symbols are defined with the addresses in the vsyscall page. @@ -387,16 +387,16 @@ static int setup_frame(int sig, struct k_sigaction *ka, goto give_sigsegv; /* Set up registers for signal handler */ - regs->esp = (unsigned long) frame; - regs->eip = (unsigned long) ka->sa.sa_handler; - regs->eax = (unsigned long) sig; - regs->edx = (unsigned long) 0; - regs->ecx = (unsigned long) 0; + regs->sp = (unsigned long) frame; + regs->ip = (unsigned long) ka->sa.sa_handler; + regs->ax = (unsigned long) sig; + regs->dx = (unsigned long) 0; + regs->cx = (unsigned long) 0; - regs->xds = __USER_DS; - regs->xes = __USER_DS; - regs->xss = __USER_DS; - regs->xcs = __USER_CS; + regs->ds = __USER_DS; + regs->es = __USER_DS; + regs->ss = __USER_DS; + regs->cs = __USER_CS; /* * Clear TF when entering the signal handler, but @@ -404,13 +404,13 @@ static int setup_frame(int sig, struct k_sigaction *ka, * The tracer may want to single-step inside the * handler too. 
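get_sigframe() above decides where the 32-bit signal frame lands, and the expression sp = ((sp + 4) & -16ul) - 4 deserves a second look: it rounds down so that (sp + 4) is a multiple of 16, which is exactly the i386 ABI condition quoted in the comment (16-byte alignment at function entry, after the return address slot). A quick numeric check with arbitrary sample stack pointers:

#include <stdio.h>

int main(void)
{
        unsigned long samples[] = { 0xbffff000UL, 0xbfffefefUL, 0xbffff123UL };

        for (unsigned int i = 0; i < sizeof(samples) / sizeof(samples[0]); i++) {
                unsigned long sp = samples[i];
                unsigned long aligned = ((sp + 4) & -16UL) - 4;

                /* (aligned + 4) % 16 is always 0, and aligned never exceeds sp. */
                printf("sp=%#lx -> sp'=%#lx, (sp'+4) %% 16 = %lu\n",
                       sp, aligned, (aligned + 4) % 16);
        }
        return 0;
}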
*/ - regs->eflags &= ~TF_MASK; + regs->flags &= ~TF_MASK; if (test_thread_flag(TIF_SINGLESTEP)) ptrace_notify(SIGTRAP); #if DEBUG_SIG printk("SIG deliver (%s:%d): sp=%p pc=%p ra=%p\n", - current->comm, current->pid, frame, regs->eip, frame->pretcode); + current->comm, current->pid, frame, regs->ip, frame->pretcode); #endif return 0; @@ -450,7 +450,7 @@ static int setup_rt_frame(int sig, struct k_sigaction *ka, siginfo_t *info, err |= __put_user(0, &frame->uc.uc_flags); err |= __put_user(0, &frame->uc.uc_link); err |= __put_user(current->sas_ss_sp, &frame->uc.uc_stack.ss_sp); - err |= __put_user(sas_ss_flags(regs->esp), + err |= __put_user(sas_ss_flags(regs->sp), &frame->uc.uc_stack.ss_flags); err |= __put_user(current->sas_ss_size, &frame->uc.uc_stack.ss_size); err |= setup_sigcontext(&frame->uc.uc_mcontext, &frame->fpstate, @@ -466,7 +466,7 @@ static int setup_rt_frame(int sig, struct k_sigaction *ka, siginfo_t *info, err |= __put_user(restorer, &frame->pretcode); /* - * This is movl $,%eax ; int $0x80 + * This is movl $,%ax ; int $0x80 * * WE DO NOT USE IT ANY MORE! It's only left here for historical * reasons and because gdb uses it as a signature to notice @@ -480,16 +480,16 @@ static int setup_rt_frame(int sig, struct k_sigaction *ka, siginfo_t *info, goto give_sigsegv; /* Set up registers for signal handler */ - regs->esp = (unsigned long) frame; - regs->eip = (unsigned long) ka->sa.sa_handler; - regs->eax = (unsigned long) usig; - regs->edx = (unsigned long) &frame->info; - regs->ecx = (unsigned long) &frame->uc; + regs->sp = (unsigned long) frame; + regs->ip = (unsigned long) ka->sa.sa_handler; + regs->ax = (unsigned long) usig; + regs->dx = (unsigned long) &frame->info; + regs->cx = (unsigned long) &frame->uc; - regs->xds = __USER_DS; - regs->xes = __USER_DS; - regs->xss = __USER_DS; - regs->xcs = __USER_CS; + regs->ds = __USER_DS; + regs->es = __USER_DS; + regs->ss = __USER_DS; + regs->cs = __USER_CS; /* * Clear TF when entering the signal handler, but @@ -497,13 +497,13 @@ static int setup_rt_frame(int sig, struct k_sigaction *ka, siginfo_t *info, * The tracer may want to single-step inside the * handler too. */ - regs->eflags &= ~TF_MASK; + regs->flags &= ~TF_MASK; if (test_thread_flag(TIF_SINGLESTEP)) ptrace_notify(SIGTRAP); #if DEBUG_SIG printk("SIG deliver (%s:%d): sp=%p pc=%p ra=%p\n", - current->comm, current->pid, frame, regs->eip, frame->pretcode); + current->comm, current->pid, frame, regs->ip, frame->pretcode); #endif return 0; @@ -524,23 +524,23 @@ handle_signal(unsigned long sig, siginfo_t *info, struct k_sigaction *ka, int ret; /* Are we from a system call? */ - if (regs->orig_eax >= 0) { + if (regs->orig_ax >= 0) { /* If so, check system call restarting.. */ - switch (regs->eax) { + switch (regs->ax) { case -ERESTART_RESTARTBLOCK: case -ERESTARTNOHAND: - regs->eax = -EINTR; + regs->ax = -EINTR; break; case -ERESTARTSYS: if (!(ka->sa.sa_flags & SA_RESTART)) { - regs->eax = -EINTR; + regs->ax = -EINTR; break; } /* fallthrough */ case -ERESTARTNOINTR: - regs->eax = regs->orig_eax; - regs->eip -= 2; + regs->ax = regs->orig_ax; + regs->ip -= 2; } } @@ -548,9 +548,9 @@ handle_signal(unsigned long sig, siginfo_t *info, struct k_sigaction *ka, * If TF is set due to a debugger (TIF_FORCED_TF), clear the TF * flag so that register information in the sigcontext is correct. 
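The restore_sigcontext() hunks keep the user-visible struct sigcontext field names (eax, eip, ...) while the kernel side now uses the short names, and bridge the two with token pasting: COPY(ax) expands to a __get_user() from sc->eax into regs->ax (the 64-bit restore path below adds a COPYR() that pastes an r instead). A stripped-down sketch of the trick, with mock structs and the __get_user() error accumulation left out:

#include <stdio.h>

struct mock_pt_regs    { unsigned long ax, bx, ip; };      /* new, short names */
struct mock_sigcontext { unsigned long eax, ebx, eip; };   /* old, user-visible names */

#define COPY(x) (regs->x = sc->e ## x)   /* e.g. COPY(ax): regs->ax = sc->eax */

static void restore(struct mock_pt_regs *regs, const struct mock_sigcontext *sc)
{
        COPY(ax);
        COPY(bx);
        COPY(ip);
}

int main(void)
{
        struct mock_sigcontext sc = { .eax = 1, .ebx = 2, .eip = 0xdeadbeefUL };
        struct mock_pt_regs regs = { 0 };

        restore(&regs, &sc);
        printf("ax=%lu bx=%lu ip=%#lx\n", regs.ax, regs.bx, regs.ip);
        return 0;
}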
*/ - if (unlikely(regs->eflags & X86_EFLAGS_TF) && + if (unlikely(regs->flags & X86_EFLAGS_TF) && likely(test_and_clear_thread_flag(TIF_FORCED_TF))) - regs->eflags &= ~X86_EFLAGS_TF; + regs->flags &= ~X86_EFLAGS_TF; /* Set up the stack frame */ if (ka->sa.sa_flags & SA_SIGINFO) @@ -622,19 +622,19 @@ static void fastcall do_signal(struct pt_regs *regs) } /* Did we come from a system call? */ - if (regs->orig_eax >= 0) { + if (regs->orig_ax >= 0) { /* Restart the system call - no handlers present */ - switch (regs->eax) { + switch (regs->ax) { case -ERESTARTNOHAND: case -ERESTARTSYS: case -ERESTARTNOINTR: - regs->eax = regs->orig_eax; - regs->eip -= 2; + regs->ax = regs->orig_ax; + regs->ip -= 2; break; case -ERESTART_RESTARTBLOCK: - regs->eax = __NR_restart_syscall; - regs->eip -= 2; + regs->ax = __NR_restart_syscall; + regs->ip -= 2; break; } } @@ -657,7 +657,7 @@ void do_notify_resume(struct pt_regs *regs, void *_unused, { /* Pending single-step? */ if (thread_info_flags & _TIF_SINGLESTEP) { - regs->eflags |= TF_MASK; + regs->flags |= TF_MASK; clear_thread_flag(TIF_SINGLESTEP); } diff --git a/arch/x86/kernel/signal_64.c b/arch/x86/kernel/signal_64.c index ab0178ebe00a..4b228fd83b31 100644 --- a/arch/x86/kernel/signal_64.c +++ b/arch/x86/kernel/signal_64.c @@ -39,7 +39,7 @@ asmlinkage long sys_sigaltstack(const stack_t __user *uss, stack_t __user *uoss, struct pt_regs *regs) { - return do_sigaltstack(uss, uoss, regs->rsp); + return do_sigaltstack(uss, uoss, regs->sp); } @@ -62,10 +62,11 @@ restore_sigcontext(struct pt_regs *regs, struct sigcontext __user *sc, unsigned /* Always make any pending restarted system calls return -EINTR */ current_thread_info()->restart_block.fn = do_no_restart_syscall; +#define COPYR(x) err |= __get_user(regs->x, &sc->r ## x) #define COPY(x) err |= __get_user(regs->x, &sc->x) - COPY(rdi); COPY(rsi); COPY(rbp); COPY(rsp); COPY(rbx); - COPY(rdx); COPY(rcx); COPY(rip); + COPYR(di); COPYR(si); COPYR(bp); COPYR(sp); COPYR(bx); + COPYR(dx); COPYR(cx); COPYR(ip); COPY(r8); COPY(r9); COPY(r10); @@ -87,8 +88,8 @@ restore_sigcontext(struct pt_regs *regs, struct sigcontext __user *sc, unsigned { unsigned int tmpflags; err |= __get_user(tmpflags, &sc->eflags); - regs->eflags = (regs->eflags & ~0x40DD5) | (tmpflags & 0x40DD5); - regs->orig_rax = -1; /* disable syscall checks */ + regs->flags = (regs->flags & ~0x40DD5) | (tmpflags & 0x40DD5); + regs->orig_ax = -1; /* disable syscall checks */ } { @@ -119,9 +120,9 @@ asmlinkage long sys_rt_sigreturn(struct pt_regs *regs) { struct rt_sigframe __user *frame; sigset_t set; - unsigned long eax; + unsigned long ax; - frame = (struct rt_sigframe __user *)(regs->rsp - 8); + frame = (struct rt_sigframe __user *)(regs->sp - 8); if (!access_ok(VERIFY_READ, frame, sizeof(*frame))) { goto badframe; } @@ -135,17 +136,17 @@ asmlinkage long sys_rt_sigreturn(struct pt_regs *regs) recalc_sigpending(); spin_unlock_irq(¤t->sighand->siglock); - if (restore_sigcontext(regs, &frame->uc.uc_mcontext, &eax)) + if (restore_sigcontext(regs, &frame->uc.uc_mcontext, &ax)) goto badframe; #ifdef DEBUG_SIG - printk("%d sigreturn rip:%lx rsp:%lx frame:%p rax:%lx\n",current->pid,regs->rip,regs->rsp,frame,eax); + printk("%d sigreturn ip:%lx sp:%lx frame:%p ax:%lx\n",current->pid,regs->ip,regs->sp,frame,ax); #endif - if (do_sigaltstack(&frame->uc.uc_stack, NULL, regs->rsp) == -EFAULT) + if (do_sigaltstack(&frame->uc.uc_stack, NULL, regs->sp) == -EFAULT) goto badframe; - return eax; + return ax; badframe: signal_fault(regs,frame,"sigreturn"); @@ -165,14 +166,14 
@@ setup_sigcontext(struct sigcontext __user *sc, struct pt_regs *regs, unsigned lo err |= __put_user(0, &sc->gs); err |= __put_user(0, &sc->fs); - err |= __put_user(regs->rdi, &sc->rdi); - err |= __put_user(regs->rsi, &sc->rsi); - err |= __put_user(regs->rbp, &sc->rbp); - err |= __put_user(regs->rsp, &sc->rsp); - err |= __put_user(regs->rbx, &sc->rbx); - err |= __put_user(regs->rdx, &sc->rdx); - err |= __put_user(regs->rcx, &sc->rcx); - err |= __put_user(regs->rax, &sc->rax); + err |= __put_user(regs->di, &sc->rdi); + err |= __put_user(regs->si, &sc->rsi); + err |= __put_user(regs->bp, &sc->rbp); + err |= __put_user(regs->sp, &sc->rsp); + err |= __put_user(regs->bx, &sc->rbx); + err |= __put_user(regs->dx, &sc->rdx); + err |= __put_user(regs->cx, &sc->rcx); + err |= __put_user(regs->ax, &sc->rax); err |= __put_user(regs->r8, &sc->r8); err |= __put_user(regs->r9, &sc->r9); err |= __put_user(regs->r10, &sc->r10); @@ -183,8 +184,8 @@ setup_sigcontext(struct sigcontext __user *sc, struct pt_regs *regs, unsigned lo err |= __put_user(regs->r15, &sc->r15); err |= __put_user(me->thread.trap_no, &sc->trapno); err |= __put_user(me->thread.error_code, &sc->err); - err |= __put_user(regs->rip, &sc->rip); - err |= __put_user(regs->eflags, &sc->eflags); + err |= __put_user(regs->ip, &sc->rip); + err |= __put_user(regs->flags, &sc->eflags); err |= __put_user(mask, &sc->oldmask); err |= __put_user(me->thread.cr2, &sc->cr2); @@ -198,18 +199,18 @@ setup_sigcontext(struct sigcontext __user *sc, struct pt_regs *regs, unsigned lo static void __user * get_stack(struct k_sigaction *ka, struct pt_regs *regs, unsigned long size) { - unsigned long rsp; + unsigned long sp; /* Default to using normal stack - redzone*/ - rsp = regs->rsp - 128; + sp = regs->sp - 128; /* This is the X/Open sanctioned signal stack switching. */ if (ka->sa.sa_flags & SA_ONSTACK) { - if (sas_ss_flags(rsp) == 0) - rsp = current->sas_ss_sp + current->sas_ss_size; + if (sas_ss_flags(sp) == 0) + sp = current->sas_ss_sp + current->sas_ss_size; } - return (void __user *)round_down(rsp - size, 16); + return (void __user *)round_down(sp - size, 16); } static int setup_rt_frame(int sig, struct k_sigaction *ka, siginfo_t *info, @@ -246,7 +247,7 @@ static int setup_rt_frame(int sig, struct k_sigaction *ka, siginfo_t *info, err |= __put_user(0, &frame->uc.uc_flags); err |= __put_user(0, &frame->uc.uc_link); err |= __put_user(me->sas_ss_sp, &frame->uc.uc_stack.ss_sp); - err |= __put_user(sas_ss_flags(regs->rsp), + err |= __put_user(sas_ss_flags(regs->sp), &frame->uc.uc_stack.ss_flags); err |= __put_user(me->sas_ss_size, &frame->uc.uc_stack.ss_size); err |= setup_sigcontext(&frame->uc.uc_mcontext, regs, set->sig[0], me); @@ -271,21 +272,21 @@ static int setup_rt_frame(int sig, struct k_sigaction *ka, siginfo_t *info, goto give_sigsegv; #ifdef DEBUG_SIG - printk("%d old rip %lx old rsp %lx old rax %lx\n", current->pid,regs->rip,regs->rsp,regs->rax); + printk("%d old ip %lx old sp %lx old ax %lx\n", current->pid,regs->ip,regs->sp,regs->ax); #endif /* Set up registers for signal handler */ - regs->rdi = sig; + regs->di = sig; /* In case the signal handler was declared without prototypes */ - regs->rax = 0; + regs->ax = 0; /* This also works for non SA_SIGINFO handlers because they expect the next argument after the signal number on the stack. 
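In the 64-bit get_stack() above, the search starts at regs->sp - 128 because the x86-64 ABI reserves a 128-byte red zone below the user stack pointer that leaf functions may use without moving %rsp, so the signal frame must not be written over it. A sketch of the placement logic, with the sigaltstack branch omitted, mock names, and the constants mirroring the hunk:

#include <stdio.h>

#define REDZONE 128UL

static unsigned long sig_frame_start(unsigned long user_sp, unsigned long frame_size)
{
        unsigned long sp = user_sp - REDZONE;   /* step over the ABI red zone */

        return (sp - frame_size) & ~15UL;       /* round_down(sp - size, 16) */
}

int main(void)
{
        unsigned long sp = 0x7fffffffe000UL;    /* arbitrary example value */

        printf("signal frame would start at %#lx\n", sig_frame_start(sp, 440));
        return 0;
}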
*/ - regs->rsi = (unsigned long)&frame->info; - regs->rdx = (unsigned long)&frame->uc; - regs->rip = (unsigned long) ka->sa.sa_handler; + regs->si = (unsigned long)&frame->info; + regs->dx = (unsigned long)&frame->uc; + regs->ip = (unsigned long) ka->sa.sa_handler; - regs->rsp = (unsigned long)frame; + regs->sp = (unsigned long)frame; /* Set up the CS register to run signal handlers in 64-bit mode, even if the handler happens to be interrupting 32-bit code. */ @@ -295,12 +296,12 @@ static int setup_rt_frame(int sig, struct k_sigaction *ka, siginfo_t *info, see include/asm-x86_64/uaccess.h for details. */ set_fs(USER_DS); - regs->eflags &= ~TF_MASK; + regs->flags &= ~TF_MASK; if (test_thread_flag(TIF_SINGLESTEP)) ptrace_notify(SIGTRAP); #ifdef DEBUG_SIG printk("SIG deliver (%s:%d): sp=%p pc=%lx ra=%p\n", - current->comm, current->pid, frame, regs->rip, frame->pretcode); + current->comm, current->pid, frame, regs->ip, frame->pretcode); #endif return 0; @@ -321,29 +322,29 @@ handle_signal(unsigned long sig, siginfo_t *info, struct k_sigaction *ka, int ret; #ifdef DEBUG_SIG - printk("handle_signal pid:%d sig:%lu rip:%lx rsp:%lx regs=%p\n", + printk("handle_signal pid:%d sig:%lu ip:%lx sp:%lx regs=%p\n", current->pid, sig, - regs->rip, regs->rsp, regs); + regs->ip, regs->sp, regs); #endif /* Are we from a system call? */ - if ((long)regs->orig_rax >= 0) { + if ((long)regs->orig_ax >= 0) { /* If so, check system call restarting.. */ - switch (regs->rax) { + switch (regs->ax) { case -ERESTART_RESTARTBLOCK: case -ERESTARTNOHAND: - regs->rax = -EINTR; + regs->ax = -EINTR; break; case -ERESTARTSYS: if (!(ka->sa.sa_flags & SA_RESTART)) { - regs->rax = -EINTR; + regs->ax = -EINTR; break; } /* fallthrough */ case -ERESTARTNOINTR: - regs->rax = regs->orig_rax; - regs->rip -= 2; + regs->ax = regs->orig_ax; + regs->ip -= 2; break; } } @@ -352,9 +353,9 @@ handle_signal(unsigned long sig, siginfo_t *info, struct k_sigaction *ka, * If TF is set due to a debugger (TIF_FORCED_TF), clear the TF * flag so that register information in the sigcontext is correct. */ - if (unlikely(regs->eflags & X86_EFLAGS_TF) && + if (unlikely(regs->flags & X86_EFLAGS_TF) && likely(test_and_clear_thread_flag(TIF_FORCED_TF))) - regs->eflags &= ~X86_EFLAGS_TF; + regs->flags &= ~X86_EFLAGS_TF; #ifdef CONFIG_IA32_EMULATION if (test_thread_flag(TIF_IA32)) { @@ -426,21 +427,21 @@ static void do_signal(struct pt_regs *regs) } /* Did we come from a system call? */ - if ((long)regs->orig_rax >= 0) { + if ((long)regs->orig_ax >= 0) { /* Restart the system call - no handlers present */ - long res = regs->rax; + long res = regs->ax; switch (res) { case -ERESTARTNOHAND: case -ERESTARTSYS: case -ERESTARTNOINTR: - regs->rax = regs->orig_rax; - regs->rip -= 2; + regs->ax = regs->orig_ax; + regs->ip -= 2; break; case -ERESTART_RESTARTBLOCK: - regs->rax = test_thread_flag(TIF_IA32) ? + regs->ax = test_thread_flag(TIF_IA32) ? __NR_ia32_restart_syscall : __NR_restart_syscall; - regs->rip -= 2; + regs->ip -= 2; break; } } @@ -457,13 +458,13 @@ void do_notify_resume(struct pt_regs *regs, void *unused, __u32 thread_info_flags) { #ifdef DEBUG_SIG - printk("do_notify_resume flags:%x rip:%lx rsp:%lx caller:%p pending:%x\n", - thread_info_flags, regs->rip, regs->rsp, __builtin_return_address(0),signal_pending(current)); + printk("do_notify_resume flags:%x ip:%lx sp:%lx caller:%p pending:%x\n", + thread_info_flags, regs->ip, regs->sp, __builtin_return_address(0),signal_pending(current)); #endif /* Pending single-step? 
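Both handle_signal() and do_signal(), in the 32-bit and 64-bit hunks, restart an interrupted system call by putting the saved number from orig_ax back into ax and rewinding ip by two bytes; two is the length of each kernel-entry instruction (int $0x80, sysenter, syscall), so backing up re-executes the trap. A toy model of that bookkeeping, with mock types and only one of the -ERESTART* cases:

#include <stdio.h>

#define ERESTARTNOINTR 513          /* kernel-internal restart code, shown for illustration */

struct mock_regs { long ax, orig_ax; unsigned long ip; };

static void maybe_restart(struct mock_regs *regs)
{
        if (regs->ax == -ERESTARTNOINTR) {
                regs->ax = regs->orig_ax;   /* restore the syscall number */
                regs->ip -= 2;              /* back over the 2-byte trap instruction */
        }
}

int main(void)
{
        struct mock_regs regs = { .ax = -ERESTARTNOINTR, .orig_ax = 4, .ip = 0x08048102UL };

        maybe_restart(&regs);
        printf("ax=%ld ip=%#lx\n", regs.ax, regs.ip);
        return 0;
}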
*/ if (thread_info_flags & _TIF_SINGLESTEP) { - regs->eflags |= TF_MASK; + regs->flags |= TF_MASK; clear_thread_flag(TIF_SINGLESTEP); } @@ -485,8 +486,8 @@ void signal_fault(struct pt_regs *regs, void __user *frame, char *where) { struct task_struct *me = current; if (show_unhandled_signals && printk_ratelimit()) - printk("%s[%d] bad frame in %s frame:%p rip:%lx rsp:%lx orax:%lx\n", - me->comm,me->pid,where,frame,regs->rip,regs->rsp,regs->orig_rax); + printk("%s[%d] bad frame in %s frame:%p ip:%lx sp:%lx orax:%lx\n", + me->comm,me->pid,where,frame,regs->ip,regs->sp,regs->orig_ax); force_sig(SIGSEGV, me); } diff --git a/arch/x86/kernel/smp_64.c b/arch/x86/kernel/smp_64.c index 7142447b5666..02a6533e8909 100644 --- a/arch/x86/kernel/smp_64.c +++ b/arch/x86/kernel/smp_64.c @@ -136,7 +136,7 @@ asmlinkage void smp_invalidate_interrupt(struct pt_regs *regs) * orig_rax contains the negated interrupt vector. * Use that to determine where the sender put the data. */ - sender = ~regs->orig_rax - INVALIDATE_TLB_VECTOR_START; + sender = ~regs->orig_ax - INVALIDATE_TLB_VECTOR_START; f = &per_cpu(flush_state, sender); if (!cpu_isset(cpu, f->flush_cpumask)) diff --git a/arch/x86/kernel/smpboot_32.c b/arch/x86/kernel/smpboot_32.c index 0bf7f20baba0..3566191832b3 100644 --- a/arch/x86/kernel/smpboot_32.c +++ b/arch/x86/kernel/smpboot_32.c @@ -447,7 +447,7 @@ void __devinit initialize_secondary(void) { /* * We don't actually need to load the full TSS, - * basically just the stack pointer and the eip. + * basically just the stack pointer and the ip. */ asm volatile( @@ -459,7 +459,7 @@ void __devinit initialize_secondary(void) /* Static state in head.S used to set up a CPU */ extern struct { - void * esp; + void * sp; unsigned short ss; } stack_start; @@ -667,7 +667,7 @@ wakeup_secondary_cpu(int phys_apicid, unsigned long start_eip) * target processor state. */ startup_ipi_hook(phys_apicid, (unsigned long) start_secondary, - (unsigned long) stack_start.esp); + (unsigned long) stack_start.sp); /* * Run STARTUP IPI loop. @@ -806,9 +806,9 @@ static int __cpuinit do_boot_cpu(int apicid, int cpu) alternatives_smp_switch(1); /* So we see what's up */ - printk("Booting processor %d/%d eip %lx\n", cpu, apicid, start_eip); + printk("Booting processor %d/%d ip %lx\n", cpu, apicid, start_eip); /* Stack for startup_32 can be just as for start_secondary onwards */ - stack_start.esp = (void *) idle->thread.esp; + stack_start.sp = (void *) idle->thread.esp; irq_ctx_init(cpu); diff --git a/arch/x86/kernel/step.c b/arch/x86/kernel/step.c index cf4b9dac4a05..f55c003f5b63 100644 --- a/arch/x86/kernel/step.c +++ b/arch/x86/kernel/step.c @@ -12,17 +12,12 @@ unsigned long convert_rip_to_linear(struct task_struct *child, struct pt_regs *r { unsigned long addr, seg; -#ifdef CONFIG_X86_64 - addr = regs->rip; + addr = regs->ip; seg = regs->cs & 0xffff; -#else - addr = regs->eip; - seg = regs->xcs & 0xffff; - if (regs->eflags & X86_EFLAGS_VM) { + if (v8086_mode(regs)) { addr = (addr & 0xffff) + (seg << 4); return addr; } -#endif /* * We'll assume that the code segments in the GDT @@ -124,11 +119,11 @@ static int enable_single_step(struct task_struct *child) /* * If TF was already set, don't do anything else */ - if (regs->eflags & X86_EFLAGS_TF) + if (regs->flags & X86_EFLAGS_TF) return 0; /* Set TF on the kernel stack.. 
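convert_rip_to_linear() in the step.c hunk now shares one body for both widths; the only special case left is virtual-8086 mode, where the linear address is formed the real-mode way, segment shifted left by four plus the 16-bit offset. A worked example of that calculation:

#include <stdio.h>

static unsigned long vm86_linear(unsigned long cs, unsigned long ip)
{
        return ((cs & 0xffff) << 4) + (ip & 0xffff);
}

int main(void)
{
        /* The classic reset vector F000:FFF0 maps to linear 0xFFFF0. */
        printf("%#lx\n", vm86_linear(0xf000, 0xfff0));
        return 0;
}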
*/ - regs->eflags |= X86_EFLAGS_TF; + regs->flags |= X86_EFLAGS_TF; /* * ..but if TF is changed by the instruction we will trace, @@ -203,5 +198,5 @@ void user_disable_single_step(struct task_struct *child) /* But touch TF only if it was set by us.. */ if (test_and_clear_tsk_thread_flag(child, TIF_FORCED_TF)) - task_pt_regs(child)->eflags &= ~X86_EFLAGS_TF; + task_pt_regs(child)->flags &= ~X86_EFLAGS_TF; } diff --git a/arch/x86/kernel/suspend_asm_64.S b/arch/x86/kernel/suspend_asm_64.S index 72f952103e50..aeb9a4d7681e 100644 --- a/arch/x86/kernel/suspend_asm_64.S +++ b/arch/x86/kernel/suspend_asm_64.S @@ -18,13 +18,13 @@ ENTRY(swsusp_arch_suspend) movq $saved_context, %rax - movq %rsp, pt_regs_rsp(%rax) - movq %rbp, pt_regs_rbp(%rax) - movq %rsi, pt_regs_rsi(%rax) - movq %rdi, pt_regs_rdi(%rax) - movq %rbx, pt_regs_rbx(%rax) - movq %rcx, pt_regs_rcx(%rax) - movq %rdx, pt_regs_rdx(%rax) + movq %rsp, pt_regs_sp(%rax) + movq %rbp, pt_regs_bp(%rax) + movq %rsi, pt_regs_si(%rax) + movq %rdi, pt_regs_di(%rax) + movq %rbx, pt_regs_bx(%rax) + movq %rcx, pt_regs_cx(%rax) + movq %rdx, pt_regs_dx(%rax) movq %r8, pt_regs_r8(%rax) movq %r9, pt_regs_r9(%rax) movq %r10, pt_regs_r10(%rax) @@ -34,7 +34,7 @@ ENTRY(swsusp_arch_suspend) movq %r14, pt_regs_r14(%rax) movq %r15, pt_regs_r15(%rax) pushfq - popq pt_regs_eflags(%rax) + popq pt_regs_flags(%rax) /* save the address of restore_registers */ movq $restore_registers, %rax @@ -115,13 +115,13 @@ ENTRY(restore_registers) /* We don't restore %rax, it must be 0 anyway */ movq $saved_context, %rax - movq pt_regs_rsp(%rax), %rsp - movq pt_regs_rbp(%rax), %rbp - movq pt_regs_rsi(%rax), %rsi - movq pt_regs_rdi(%rax), %rdi - movq pt_regs_rbx(%rax), %rbx - movq pt_regs_rcx(%rax), %rcx - movq pt_regs_rdx(%rax), %rdx + movq pt_regs_sp(%rax), %rsp + movq pt_regs_bp(%rax), %rbp + movq pt_regs_si(%rax), %rsi + movq pt_regs_di(%rax), %rdi + movq pt_regs_bx(%rax), %rbx + movq pt_regs_cx(%rax), %rcx + movq pt_regs_dx(%rax), %rdx movq pt_regs_r8(%rax), %r8 movq pt_regs_r9(%rax), %r9 movq pt_regs_r10(%rax), %r10 @@ -130,7 +130,7 @@ ENTRY(restore_registers) movq pt_regs_r13(%rax), %r13 movq pt_regs_r14(%rax), %r14 movq pt_regs_r15(%rax), %r15 - pushq pt_regs_eflags(%rax) + pushq pt_regs_flags(%rax) popfq xorq %rax, %rax diff --git a/arch/x86/kernel/time_32.c b/arch/x86/kernel/time_32.c index 2dcbb81b4cd3..1a89e93f3f1c 100644 --- a/arch/x86/kernel/time_32.c +++ b/arch/x86/kernel/time_32.c @@ -49,15 +49,15 @@ unsigned long profile_pc(struct pt_regs *regs) unsigned long pc = instruction_pointer(regs); #ifdef CONFIG_SMP - if (!v8086_mode(regs) && SEGMENT_IS_KERNEL_CODE(regs->xcs) && + if (!v8086_mode(regs) && SEGMENT_IS_KERNEL_CODE(regs->cs) && in_lock_functions(pc)) { #ifdef CONFIG_FRAME_POINTER - return *(unsigned long *)(regs->ebp + 4); + return *(unsigned long *)(regs->bp + 4); #else - unsigned long *sp = (unsigned long *)®s->esp; + unsigned long *sp = (unsigned long *)®s->sp; /* Return address is either directly at stack pointer - or above a saved eflags. Eflags has bits 22-31 zero, + or above a saved flags. Eflags has bits 22-31 zero, kernel addresses don't. 
*/ if (sp[0] >> 22) return sp[0]; diff --git a/arch/x86/kernel/time_64.c b/arch/x86/kernel/time_64.c index f88bf6b802e3..bf0bcc9bb001 100644 --- a/arch/x86/kernel/time_64.c +++ b/arch/x86/kernel/time_64.c @@ -29,10 +29,10 @@ unsigned long profile_pc(struct pt_regs *regs) unsigned long pc = instruction_pointer(regs); /* Assume the lock function has either no stack frame or a copy - of eflags from PUSHF + of flags from PUSHF Eflags always has bits 22 and up cleared unlike kernel addresses. */ if (!user_mode(regs) && in_lock_functions(pc)) { - unsigned long *sp = (unsigned long *)regs->rsp; + unsigned long *sp = (unsigned long *)regs->sp; if (sp[0] >> 22) return sp[0]; if (sp[1] >> 22) diff --git a/arch/x86/kernel/traps_32.c b/arch/x86/kernel/traps_32.c index 9b0bbd508cd5..931ef10960ee 100644 --- a/arch/x86/kernel/traps_32.c +++ b/arch/x86/kernel/traps_32.c @@ -114,11 +114,11 @@ struct stack_frame { }; static inline unsigned long print_context_stack(struct thread_info *tinfo, - unsigned long *stack, unsigned long ebp, + unsigned long *stack, unsigned long bp, const struct stacktrace_ops *ops, void *data) { #ifdef CONFIG_FRAME_POINTER - struct stack_frame *frame = (struct stack_frame *)ebp; + struct stack_frame *frame = (struct stack_frame *)bp; while (valid_stack_ptr(tinfo, frame, sizeof(*frame))) { struct stack_frame *next; unsigned long addr; @@ -145,7 +145,7 @@ static inline unsigned long print_context_stack(struct thread_info *tinfo, ops->address(data, addr); } #endif - return ebp; + return bp; } #define MSG(msg) ops->warning(data, msg) @@ -154,7 +154,7 @@ void dump_trace(struct task_struct *task, struct pt_regs *regs, unsigned long *stack, const struct stacktrace_ops *ops, void *data) { - unsigned long ebp = 0; + unsigned long bp = 0; if (!task) task = current; @@ -167,13 +167,13 @@ void dump_trace(struct task_struct *task, struct pt_regs *regs, } #ifdef CONFIG_FRAME_POINTER - if (!ebp) { + if (!bp) { if (task == current) { - /* Grab ebp right from our regs */ - asm ("movl %%ebp, %0" : "=r" (ebp) : ); + /* Grab bp right from our regs */ + asm ("movl %%ebp, %0" : "=r" (bp) : ); } else { - /* ebp is the last reg pushed by switch_to */ - ebp = *(unsigned long *) task->thread.esp; + /* bp is the last reg pushed by switch_to */ + bp = *(unsigned long *) task->thread.esp; } } #endif @@ -182,7 +182,7 @@ void dump_trace(struct task_struct *task, struct pt_regs *regs, struct thread_info *context; context = (struct thread_info *) ((unsigned long)stack & (~(THREAD_SIZE - 1))); - ebp = print_context_stack(context, stack, ebp, ops, data); + bp = print_context_stack(context, stack, bp, ops, data); /* Should be after the line below, but somewhere in early boot context comes out corrupted and we can't reference it -AK */ @@ -246,19 +246,19 @@ void show_trace(struct task_struct *task, struct pt_regs *regs, } static void show_stack_log_lvl(struct task_struct *task, struct pt_regs *regs, - unsigned long *esp, char *log_lvl) + unsigned long *sp, char *log_lvl) { unsigned long *stack; int i; - if (esp == NULL) { + if (sp == NULL) { if (task) - esp = (unsigned long*)task->thread.esp; + sp = (unsigned long*)task->thread.esp; else - esp = (unsigned long *)&esp; + sp = (unsigned long *)&sp; } - stack = esp; + stack = sp; for(i = 0; i < kstack_depth_to_print; i++) { if (kstack_end(stack)) break; @@ -267,13 +267,13 @@ static void show_stack_log_lvl(struct task_struct *task, struct pt_regs *regs, printk("%08lx ", *stack++); } printk("\n%sCall Trace:\n", log_lvl); - show_trace_log_lvl(task, regs, esp, log_lvl); + 
show_trace_log_lvl(task, regs, sp, log_lvl); } -void show_stack(struct task_struct *task, unsigned long *esp) +void show_stack(struct task_struct *task, unsigned long *sp) { printk(" "); - show_stack_log_lvl(task, NULL, esp, ""); + show_stack_log_lvl(task, NULL, sp, ""); } /* @@ -307,30 +307,30 @@ void show_registers(struct pt_regs *regs) * time of the fault.. */ if (!user_mode_vm(regs)) { - u8 *eip; + u8 *ip; unsigned int code_prologue = code_bytes * 43 / 64; unsigned int code_len = code_bytes; unsigned char c; printk("\n" KERN_EMERG "Stack: "); - show_stack_log_lvl(NULL, regs, ®s->esp, KERN_EMERG); + show_stack_log_lvl(NULL, regs, ®s->sp, KERN_EMERG); printk(KERN_EMERG "Code: "); - eip = (u8 *)regs->eip - code_prologue; - if (eip < (u8 *)PAGE_OFFSET || - probe_kernel_address(eip, c)) { + ip = (u8 *)regs->ip - code_prologue; + if (ip < (u8 *)PAGE_OFFSET || + probe_kernel_address(ip, c)) { /* try starting at EIP */ - eip = (u8 *)regs->eip; + ip = (u8 *)regs->ip; code_len = code_len - code_prologue + 1; } - for (i = 0; i < code_len; i++, eip++) { - if (eip < (u8 *)PAGE_OFFSET || - probe_kernel_address(eip, c)) { + for (i = 0; i < code_len; i++, ip++) { + if (ip < (u8 *)PAGE_OFFSET || + probe_kernel_address(ip, c)) { printk(" Bad EIP value."); break; } - if (eip == (u8 *)regs->eip) + if (ip == (u8 *)regs->ip) printk("<%02x> ", c); else printk("%02x ", c); @@ -339,13 +339,13 @@ void show_registers(struct pt_regs *regs) printk("\n"); } -int is_valid_bugaddr(unsigned long eip) +int is_valid_bugaddr(unsigned long ip) { unsigned short ud2; - if (eip < PAGE_OFFSET) + if (ip < PAGE_OFFSET) return 0; - if (probe_kernel_address((unsigned short *)eip, ud2)) + if (probe_kernel_address((unsigned short *)ip, ud2)) return 0; return ud2 == 0x0b0f; @@ -382,10 +382,10 @@ void die(const char * str, struct pt_regs * regs, long err) raw_local_irq_save(flags); if (++die.lock_owner_depth < 3) { - unsigned long esp; + unsigned long sp; unsigned short ss; - report_bug(regs->eip, regs); + report_bug(regs->ip, regs); printk(KERN_EMERG "%s: %04lx [#%d] ", str, err & 0xffff, ++die_counter); @@ -405,15 +405,15 @@ void die(const char * str, struct pt_regs * regs, long err) NOTIFY_STOP) { show_registers(regs); /* Executive summary in case the oops scrolled away */ - esp = (unsigned long) (®s->esp); + sp = (unsigned long) (®s->sp); savesegment(ss, ss); if (user_mode(regs)) { - esp = regs->esp; - ss = regs->xss & 0xffff; + sp = regs->sp; + ss = regs->ss & 0xffff; } - printk(KERN_EMERG "EIP: [<%08lx>] ", regs->eip); - print_symbol("%s", regs->eip); - printk(" SS:ESP %04x:%08lx\n", ss, esp); + printk(KERN_EMERG "EIP: [<%08lx>] ", regs->ip); + print_symbol("%s", regs->ip); + printk(" SS:ESP %04x:%08lx\n", ss, sp); } else regs = NULL; @@ -454,7 +454,7 @@ static void __kprobes do_trap(int trapnr, int signr, char *str, int vm86, { struct task_struct *tsk = current; - if (regs->eflags & VM_MASK) { + if (regs->flags & VM_MASK) { if (vm86) goto vm86_trap; goto trap_signal; @@ -548,13 +548,13 @@ fastcall void do_##name(struct pt_regs * regs, long error_code) \ do_trap(trapnr, signr, str, 1, regs, error_code, &info); \ } -DO_VM86_ERROR_INFO( 0, SIGFPE, "divide error", divide_error, FPE_INTDIV, regs->eip) +DO_VM86_ERROR_INFO( 0, SIGFPE, "divide error", divide_error, FPE_INTDIV, regs->ip) #ifndef CONFIG_KPROBES DO_VM86_ERROR( 3, SIGTRAP, "int3", int3) #endif DO_VM86_ERROR( 4, SIGSEGV, "overflow", overflow) DO_VM86_ERROR( 5, SIGSEGV, "bounds", bounds) -DO_ERROR_INFO( 6, SIGILL, "invalid opcode", invalid_op, ILL_ILLOPN, regs->eip, 0) 
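is_valid_bugaddr() above reads two bytes at the reported ip and compares them with 0x0b0f: BUG() plants the ud2 instruction, whose encoding is 0x0f 0x0b, and read as a little-endian 16-bit value on x86 that is 0x0b0f. A tiny demonstration of the check, using memcpy as a stand-in for probe_kernel_address():

#include <stdint.h>
#include <stdio.h>
#include <string.h>

int main(void)
{
        const uint8_t code[] = { 0x0f, 0x0b };   /* ud2 */
        uint16_t insn;

        memcpy(&insn, code, sizeof(insn));       /* stand-in for probe_kernel_address() */
        printf("insn=%#x, is ud2: %d\n", insn, insn == 0x0b0f);
        return 0;
}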
+DO_ERROR_INFO( 6, SIGILL, "invalid opcode", invalid_op, ILL_ILLOPN, regs->ip, 0) DO_ERROR( 9, SIGFPE, "coprocessor segment overrun", coprocessor_segment_overrun) DO_ERROR(10, SIGSEGV, "invalid TSS", invalid_TSS) DO_ERROR(11, SIGBUS, "segment not present", segment_not_present) @@ -596,7 +596,7 @@ fastcall void __kprobes do_general_protection(struct pt_regs * regs, } put_cpu(); - if (regs->eflags & VM_MASK) + if (regs->flags & VM_MASK) goto gp_in_vm86; if (!user_mode(regs)) @@ -607,9 +607,9 @@ fastcall void __kprobes do_general_protection(struct pt_regs * regs, if (show_unhandled_signals && unhandled_signal(current, SIGSEGV) && printk_ratelimit()) printk(KERN_INFO - "%s[%d] general protection eip:%lx esp:%lx error:%lx\n", + "%s[%d] general protection ip:%lx sp:%lx error:%lx\n", current->comm, task_pid_nr(current), - regs->eip, regs->esp, error_code); + regs->ip, regs->sp, error_code); force_sig(SIGSEGV, current); return; @@ -705,8 +705,8 @@ void __kprobes die_nmi(struct pt_regs *regs, const char *msg) */ bust_spinlocks(1); printk(KERN_EMERG "%s", msg); - printk(" on CPU%d, eip %08lx, registers:\n", - smp_processor_id(), regs->eip); + printk(" on CPU%d, ip %08lx, registers:\n", + smp_processor_id(), regs->ip); show_registers(regs); console_silent(); spin_unlock(&nmi_print_lock); @@ -847,7 +847,7 @@ fastcall void __kprobes do_debug(struct pt_regs * regs, long error_code) SIGTRAP) == NOTIFY_STOP) return; /* It's safe to allow irq's after DR6 has been saved */ - if (regs->eflags & X86_EFLAGS_IF) + if (regs->flags & X86_EFLAGS_IF) local_irq_enable(); /* Mask out spurious debug traps due to lazy DR7 setting */ @@ -856,7 +856,7 @@ fastcall void __kprobes do_debug(struct pt_regs * regs, long error_code) goto clear_dr7; } - if (regs->eflags & VM_MASK) + if (regs->flags & VM_MASK) goto debug_vm86; /* Save debug status register where ptrace can see it */ @@ -892,7 +892,7 @@ debug_vm86: clear_TF_reenable: set_tsk_thread_flag(tsk, TIF_SINGLESTEP); - regs->eflags &= ~TF_MASK; + regs->flags &= ~TF_MASK; return; } @@ -901,7 +901,7 @@ clear_TF_reenable: * the correct behaviour even in the presence of the asynchronous * IRQ13 behaviour */ -void math_error(void __user *eip) +void math_error(void __user *ip) { struct task_struct * task; siginfo_t info; @@ -917,7 +917,7 @@ void math_error(void __user *eip) info.si_signo = SIGFPE; info.si_errno = 0; info.si_code = __SI_FAULT; - info.si_addr = eip; + info.si_addr = ip; /* * (~cwd & swd) will mask out exceptions that are not set to unmasked * status. 0x3f is the exception bits in these regs, 0x200 is the @@ -963,10 +963,10 @@ void math_error(void __user *eip) fastcall void do_coprocessor_error(struct pt_regs * regs, long error_code) { ignore_fpu_irq = 1; - math_error((void __user *)regs->eip); + math_error((void __user *)regs->ip); } -static void simd_math_error(void __user *eip) +static void simd_math_error(void __user *ip) { struct task_struct * task; siginfo_t info; @@ -982,7 +982,7 @@ static void simd_math_error(void __user *eip) info.si_signo = SIGFPE; info.si_errno = 0; info.si_code = __SI_FAULT; - info.si_addr = eip; + info.si_addr = ip; /* * The SIMD FPU exceptions are handled a little differently, as there * is only a single status/control register. Thus, to determine which @@ -1020,13 +1020,13 @@ fastcall void do_simd_coprocessor_error(struct pt_regs * regs, if (cpu_has_xmm) { /* Handle SIMD FPU exceptions on PIII+ processors. 
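math_error() above reports only exceptions that are both raised and unmasked: a set bit in the x87 control word masks the corresponding exception, so (~cwd & swd) keeps exactly the raised-and-unmasked ones, and the & 0x3f restricts that to the six exception bits. A small sketch using the standard x87 bit positions (IE=0, DE=1, ZE=2, OE=3, UE=4, PE=5); the sample words are made up:

#include <stdio.h>

int main(void)
{
        unsigned short cwd = 0x037e;             /* default control word with IE unmasked */
        unsigned short swd = 0x0005;             /* IE and ZE raised in the status word */
        unsigned short pending = ~cwd & swd & 0x3f;

        if (pending & 0x01)
                printf("unmasked invalid operation\n");
        if (pending & 0x04)
                printf("unmasked divide-by-zero\n");   /* not printed: ZE is masked */
        return 0;
}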
*/ ignore_fpu_irq = 1; - simd_math_error((void __user *)regs->eip); + simd_math_error((void __user *)regs->ip); } else { /* * Handle strange cache flush from user space exception * in all other cases. This is undocumented behaviour. */ - if (regs->eflags & VM_MASK) { + if (regs->flags & VM_MASK) { handle_vm86_fault((struct kernel_vm86_regs *)regs, error_code); return; diff --git a/arch/x86/kernel/traps_64.c b/arch/x86/kernel/traps_64.c index 610a64d6bdf0..f7fecf9d47c3 100644 --- a/arch/x86/kernel/traps_64.c +++ b/arch/x86/kernel/traps_64.c @@ -76,20 +76,20 @@ asmlinkage void spurious_interrupt_bug(void); static inline void conditional_sti(struct pt_regs *regs) { - if (regs->eflags & X86_EFLAGS_IF) + if (regs->flags & X86_EFLAGS_IF) local_irq_enable(); } static inline void preempt_conditional_sti(struct pt_regs *regs) { preempt_disable(); - if (regs->eflags & X86_EFLAGS_IF) + if (regs->flags & X86_EFLAGS_IF) local_irq_enable(); } static inline void preempt_conditional_cli(struct pt_regs *regs) { - if (regs->eflags & X86_EFLAGS_IF) + if (regs->flags & X86_EFLAGS_IF) local_irq_disable(); /* Make sure to not schedule here because we could be running on an exception stack. */ @@ -353,7 +353,7 @@ show_trace(struct task_struct *tsk, struct pt_regs *regs, unsigned long *stack) } static void -_show_stack(struct task_struct *tsk, struct pt_regs *regs, unsigned long *rsp) +_show_stack(struct task_struct *tsk, struct pt_regs *regs, unsigned long *sp) { unsigned long *stack; int i; @@ -364,14 +364,14 @@ _show_stack(struct task_struct *tsk, struct pt_regs *regs, unsigned long *rsp) // debugging aid: "show_stack(NULL, NULL);" prints the // back trace for this cpu. - if (rsp == NULL) { + if (sp == NULL) { if (tsk) - rsp = (unsigned long *)tsk->thread.rsp; + sp = (unsigned long *)tsk->thread.rsp; else - rsp = (unsigned long *)&rsp; + sp = (unsigned long *)&sp; } - stack = rsp; + stack = sp; for(i=0; i < kstack_depth_to_print; i++) { if (stack >= irqstack && stack <= irqstack_end) { if (stack == irqstack_end) { @@ -387,12 +387,12 @@ _show_stack(struct task_struct *tsk, struct pt_regs *regs, unsigned long *rsp) printk(" %016lx", *stack++); touch_nmi_watchdog(); } - show_trace(tsk, regs, rsp); + show_trace(tsk, regs, sp); } -void show_stack(struct task_struct *tsk, unsigned long * rsp) +void show_stack(struct task_struct *tsk, unsigned long * sp) { - _show_stack(tsk, NULL, rsp); + _show_stack(tsk, NULL, sp); } /* @@ -416,11 +416,11 @@ void show_registers(struct pt_regs *regs) { int i; int in_kernel = !user_mode(regs); - unsigned long rsp; + unsigned long sp; const int cpu = smp_processor_id(); struct task_struct *cur = cpu_pda(cpu)->pcurrent; - rsp = regs->rsp; + sp = regs->sp; printk("CPU %d ", cpu); __show_regs(regs); printk("Process %s (pid: %d, threadinfo %p, task %p)\n", @@ -432,15 +432,15 @@ void show_registers(struct pt_regs *regs) */ if (in_kernel) { printk("Stack: "); - _show_stack(NULL, regs, (unsigned long*)rsp); + _show_stack(NULL, regs, (unsigned long*)sp); printk("\nCode: "); - if (regs->rip < PAGE_OFFSET) + if (regs->ip < PAGE_OFFSET) goto bad; for (i=0; i<20; i++) { unsigned char c; - if (__get_user(c, &((unsigned char*)regs->rip)[i])) { + if (__get_user(c, &((unsigned char*)regs->ip)[i])) { bad: printk(" Bad RIP value."); break; @@ -451,11 +451,11 @@ bad: printk("\n"); } -int is_valid_bugaddr(unsigned long rip) +int is_valid_bugaddr(unsigned long ip) { unsigned short ud2; - if (__copy_from_user(&ud2, (const void __user *) rip, sizeof(ud2))) + if (__copy_from_user(&ud2, (const void __user *) 
ip, sizeof(ud2))) return 0; return ud2 == 0x0b0f; @@ -521,8 +521,8 @@ void __kprobes __die(const char * str, struct pt_regs * regs, long err) add_taint(TAINT_DIE); /* Executive summary in case the oops scrolled away */ printk(KERN_ALERT "RIP "); - printk_address(regs->rip); - printk(" RSP <%016lx>\n", regs->rsp); + printk_address(regs->ip); + printk(" RSP <%016lx>\n", regs->sp); if (kexec_should_crash(current)) crash_kexec(regs); } @@ -532,7 +532,7 @@ void die(const char * str, struct pt_regs * regs, long err) unsigned long flags = oops_begin(); if (!user_mode(regs)) - report_bug(regs->rip, regs); + report_bug(regs->ip, regs); __die(str, regs, err); oops_end(flags); @@ -582,9 +582,9 @@ static void __kprobes do_trap(int trapnr, int signr, char *str, if (show_unhandled_signals && unhandled_signal(tsk, signr) && printk_ratelimit()) printk(KERN_INFO - "%s[%d] trap %s rip:%lx rsp:%lx error:%lx\n", + "%s[%d] trap %s ip:%lx sp:%lx error:%lx\n", tsk->comm, tsk->pid, str, - regs->rip, regs->rsp, error_code); + regs->ip, regs->sp, error_code); if (info) force_sig_info(signr, info, tsk); @@ -597,9 +597,9 @@ static void __kprobes do_trap(int trapnr, int signr, char *str, /* kernel trap */ { const struct exception_table_entry *fixup; - fixup = search_exception_tables(regs->rip); + fixup = search_exception_tables(regs->ip); if (fixup) - regs->rip = fixup->fixup; + regs->ip = fixup->fixup; else { tsk->thread.error_code = error_code; tsk->thread.trap_no = trapnr; @@ -635,10 +635,10 @@ asmlinkage void do_##name(struct pt_regs * regs, long error_code) \ do_trap(trapnr, signr, str, regs, error_code, &info); \ } -DO_ERROR_INFO( 0, SIGFPE, "divide error", divide_error, FPE_INTDIV, regs->rip) +DO_ERROR_INFO( 0, SIGFPE, "divide error", divide_error, FPE_INTDIV, regs->ip) DO_ERROR( 4, SIGSEGV, "overflow", overflow) DO_ERROR( 5, SIGSEGV, "bounds", bounds) -DO_ERROR_INFO( 6, SIGILL, "invalid opcode", invalid_op, ILL_ILLOPN, regs->rip) +DO_ERROR_INFO( 6, SIGILL, "invalid opcode", invalid_op, ILL_ILLOPN, regs->ip) DO_ERROR( 7, SIGSEGV, "device not available", device_not_available) DO_ERROR( 9, SIGFPE, "coprocessor segment overrun", coprocessor_segment_overrun) DO_ERROR(10, SIGSEGV, "invalid TSS", invalid_TSS) @@ -688,9 +688,9 @@ asmlinkage void __kprobes do_general_protection(struct pt_regs * regs, if (show_unhandled_signals && unhandled_signal(tsk, SIGSEGV) && printk_ratelimit()) printk(KERN_INFO - "%s[%d] general protection rip:%lx rsp:%lx error:%lx\n", + "%s[%d] general protection ip:%lx sp:%lx error:%lx\n", tsk->comm, tsk->pid, - regs->rip, regs->rsp, error_code); + regs->ip, regs->sp, error_code); force_sig(SIGSEGV, tsk); return; @@ -699,9 +699,9 @@ asmlinkage void __kprobes do_general_protection(struct pt_regs * regs, /* kernel gp */ { const struct exception_table_entry *fixup; - fixup = search_exception_tables(regs->rip); + fixup = search_exception_tables(regs->ip); if (fixup) { - regs->rip = fixup->fixup; + regs->ip = fixup->fixup; return; } @@ -824,15 +824,15 @@ asmlinkage __kprobes struct pt_regs *sync_regs(struct pt_regs *eregs) { struct pt_regs *regs = eregs; /* Did already sync */ - if (eregs == (struct pt_regs *)eregs->rsp) + if (eregs == (struct pt_regs *)eregs->sp) ; /* Exception from user space */ else if (user_mode(eregs)) regs = task_pt_regs(current); /* Exception from kernel and interrupts are enabled. Move to kernel process stack. 
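The kernel-mode branches of do_trap() and do_general_protection() above recover from expected faults by looking the trapping ip up in the exception table and, when an entry exists, pointing regs->ip at the fixup stub instead of killing the task. A mock of that lookup-and-redirect shape; the table, addresses, and linear search here are illustrative, while the real search_exception_tables() searches tables built from the uaccess fixup sections:

#include <stddef.h>
#include <stdio.h>

struct mock_exentry { unsigned long insn, fixup; };
struct mock_regs    { unsigned long ip; };

static const struct mock_exentry table[] = {
        { 0xc0102340UL, 0xc01fff00UL },          /* made-up addresses */
};

static const struct mock_exentry *search_table(unsigned long ip)
{
        for (size_t i = 0; i < sizeof(table) / sizeof(table[0]); i++)
                if (table[i].insn == ip)
                        return &table[i];
        return NULL;
}

int main(void)
{
        struct mock_regs regs = { .ip = 0xc0102340UL };
        const struct mock_exentry *fix = search_table(regs.ip);

        if (fix)
                regs.ip = fix->fixup;            /* resume at the fixup stub */
        printf("resume at %#lx\n", regs.ip);
        return 0;
}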
*/ - else if (eregs->eflags & X86_EFLAGS_IF) - regs = (struct pt_regs *)(eregs->rsp -= sizeof(struct pt_regs)); + else if (eregs->flags & X86_EFLAGS_IF) + regs = (struct pt_regs *)(eregs->sp -= sizeof(struct pt_regs)); if (eregs != regs) *regs = *eregs; return regs; @@ -887,7 +887,7 @@ asmlinkage void __kprobes do_debug(struct pt_regs * regs, info.si_signo = SIGTRAP; info.si_errno = 0; info.si_code = TRAP_BRKPT; - info.si_addr = user_mode(regs) ? (void __user *)regs->rip : NULL; + info.si_addr = user_mode(regs) ? (void __user *)regs->ip : NULL; force_sig_info(SIGTRAP, &info, tsk); clear_dr7: @@ -897,16 +897,16 @@ clear_dr7: clear_TF_reenable: set_tsk_thread_flag(tsk, TIF_SINGLESTEP); - regs->eflags &= ~TF_MASK; + regs->flags &= ~TF_MASK; preempt_conditional_cli(regs); } static int kernel_math_error(struct pt_regs *regs, const char *str, int trapnr) { const struct exception_table_entry *fixup; - fixup = search_exception_tables(regs->rip); + fixup = search_exception_tables(regs->ip); if (fixup) { - regs->rip = fixup->fixup; + regs->ip = fixup->fixup; return 1; } notify_die(DIE_GPF, str, regs, 0, trapnr, SIGFPE); @@ -923,7 +923,7 @@ static int kernel_math_error(struct pt_regs *regs, const char *str, int trapnr) */ asmlinkage void do_coprocessor_error(struct pt_regs *regs) { - void __user *rip = (void __user *)(regs->rip); + void __user *ip = (void __user *)(regs->ip); struct task_struct * task; siginfo_t info; unsigned short cwd, swd; @@ -943,7 +943,7 @@ asmlinkage void do_coprocessor_error(struct pt_regs *regs) info.si_signo = SIGFPE; info.si_errno = 0; info.si_code = __SI_FAULT; - info.si_addr = rip; + info.si_addr = ip; /* * (~cwd & swd) will mask out exceptions that are not set to unmasked * status. 0x3f is the exception bits in these regs, 0x200 is the @@ -992,7 +992,7 @@ asmlinkage void bad_intr(void) asmlinkage void do_simd_coprocessor_error(struct pt_regs *regs) { - void __user *rip = (void __user *)(regs->rip); + void __user *ip = (void __user *)(regs->ip); struct task_struct * task; siginfo_t info; unsigned short mxcsr; @@ -1012,7 +1012,7 @@ asmlinkage void do_simd_coprocessor_error(struct pt_regs *regs) info.si_signo = SIGFPE; info.si_errno = 0; info.si_code = __SI_FAULT; - info.si_addr = rip; + info.si_addr = ip; /* * The SIMD FPU exceptions are handled a little differently, as there * is only a single status/control register. Thus, to determine which diff --git a/arch/x86/kernel/vm86_32.c b/arch/x86/kernel/vm86_32.c index 157e4bedd3c5..980e85b90091 100644 --- a/arch/x86/kernel/vm86_32.c +++ b/arch/x86/kernel/vm86_32.c @@ -70,10 +70,10 @@ /* * 8- and 16-bit register defines.. */ -#define AL(regs) (((unsigned char *)&((regs)->pt.eax))[0]) -#define AH(regs) (((unsigned char *)&((regs)->pt.eax))[1]) -#define IP(regs) (*(unsigned short *)&((regs)->pt.eip)) -#define SP(regs) (*(unsigned short *)&((regs)->pt.esp)) +#define AL(regs) (((unsigned char *)&((regs)->pt.ax))[0]) +#define AH(regs) (((unsigned char *)&((regs)->pt.ax))[1]) +#define IP(regs) (*(unsigned short *)&((regs)->pt.ip)) +#define SP(regs) (*(unsigned short *)&((regs)->pt.sp)) /* * virtual flags (16 and 32-bit versions) @@ -93,12 +93,12 @@ static int copy_vm86_regs_to_user(struct vm86_regs __user *user, { int ret = 0; - /* kernel_vm86_regs is missing xgs, so copy everything up to + /* kernel_vm86_regs is missing gs, so copy everything up to (but not including) orig_eax, and then rest including orig_eax. 
*/ - ret += copy_to_user(user, regs, offsetof(struct kernel_vm86_regs, pt.orig_eax)); - ret += copy_to_user(&user->orig_eax, ®s->pt.orig_eax, + ret += copy_to_user(user, regs, offsetof(struct kernel_vm86_regs, pt.orig_ax)); + ret += copy_to_user(&user->orig_eax, ®s->pt.orig_ax, sizeof(struct kernel_vm86_regs) - - offsetof(struct kernel_vm86_regs, pt.orig_eax)); + offsetof(struct kernel_vm86_regs, pt.orig_ax)); return ret; } @@ -110,12 +110,12 @@ static int copy_vm86_regs_from_user(struct kernel_vm86_regs *regs, { int ret = 0; - /* copy eax-xfs inclusive */ - ret += copy_from_user(regs, user, offsetof(struct kernel_vm86_regs, pt.orig_eax)); - /* copy orig_eax-__gsh+extra */ - ret += copy_from_user(®s->pt.orig_eax, &user->orig_eax, + /* copy ax-fs inclusive */ + ret += copy_from_user(regs, user, offsetof(struct kernel_vm86_regs, pt.orig_ax)); + /* copy orig_ax-__gsh+extra */ + ret += copy_from_user(®s->pt.orig_ax, &user->orig_eax, sizeof(struct kernel_vm86_regs) - - offsetof(struct kernel_vm86_regs, pt.orig_eax) + + offsetof(struct kernel_vm86_regs, pt.orig_ax) + extra); return ret; } @@ -138,7 +138,7 @@ struct pt_regs * fastcall save_v86_state(struct kernel_vm86_regs * regs) printk("no vm86_info: BAD\n"); do_exit(SIGSEGV); } - set_flags(regs->pt.eflags, VEFLAGS, VIF_MASK | current->thread.v86mask); + set_flags(regs->pt.flags, VEFLAGS, VIF_MASK | current->thread.v86mask); tmp = copy_vm86_regs_to_user(¤t->thread.vm86_info->regs,regs); tmp += put_user(current->thread.screen_bitmap,¤t->thread.vm86_info->screen_bitmap); if (tmp) { @@ -155,7 +155,7 @@ struct pt_regs * fastcall save_v86_state(struct kernel_vm86_regs * regs) ret = KVM86->regs32; - ret->xfs = current->thread.saved_fs; + ret->fs = current->thread.saved_fs; loadsegment(gs, current->thread.saved_gs); return ret; @@ -197,7 +197,7 @@ static void do_sys_vm86(struct kernel_vm86_struct *info, struct task_struct *tsk asmlinkage int sys_vm86old(struct pt_regs regs) { - struct vm86_struct __user *v86 = (struct vm86_struct __user *)regs.ebx; + struct vm86_struct __user *v86 = (struct vm86_struct __user *)regs.bx; struct kernel_vm86_struct info; /* declare this _on top_, * this avoids wasting of stack space. * This remains on the stack until we @@ -237,12 +237,12 @@ asmlinkage int sys_vm86(struct pt_regs regs) struct vm86plus_struct __user *v86; tsk = current; - switch (regs.ebx) { + switch (regs.bx) { case VM86_REQUEST_IRQ: case VM86_FREE_IRQ: case VM86_GET_IRQ_BITS: case VM86_GET_AND_RESET_IRQ: - ret = do_vm86_irq_handling(regs.ebx, (int)regs.ecx); + ret = do_vm86_irq_handling(regs.bx, (int)regs.cx); goto out; case VM86_PLUS_INSTALL_CHECK: /* NOTE: on old vm86 stuff this will return the error @@ -258,7 +258,7 @@ asmlinkage int sys_vm86(struct pt_regs regs) ret = -EPERM; if (tsk->thread.saved_esp0) goto out; - v86 = (struct vm86plus_struct __user *)regs.ecx; + v86 = (struct vm86plus_struct __user *)regs.cx; tmp = copy_vm86_regs_from_user(&info.regs, &v86->regs, offsetof(struct kernel_vm86_struct, regs32) - sizeof(info.regs)); @@ -281,23 +281,23 @@ static void do_sys_vm86(struct kernel_vm86_struct *info, struct task_struct *tsk /* * make sure the vm86() system call doesn't try to do anything silly */ - info->regs.pt.xds = 0; - info->regs.pt.xes = 0; - info->regs.pt.xfs = 0; + info->regs.pt.ds = 0; + info->regs.pt.es = 0; + info->regs.pt.fs = 0; /* we are clearing gs later just before "jmp resume_userspace", * because it is not saved/restored. 
*/ /* - * The eflags register is also special: we cannot trust that the user + * The flags register is also special: we cannot trust that the user * has set it up safely, so this makes sure interrupt etc flags are * inherited from protected mode. */ - VEFLAGS = info->regs.pt.eflags; - info->regs.pt.eflags &= SAFE_MASK; - info->regs.pt.eflags |= info->regs32->eflags & ~SAFE_MASK; - info->regs.pt.eflags |= VM_MASK; + VEFLAGS = info->regs.pt.flags; + info->regs.pt.flags &= SAFE_MASK; + info->regs.pt.flags |= info->regs32->flags & ~SAFE_MASK; + info->regs.pt.flags |= VM_MASK; switch (info->cpu_type) { case CPU_286: @@ -315,11 +315,11 @@ static void do_sys_vm86(struct kernel_vm86_struct *info, struct task_struct *tsk } /* - * Save old state, set default return value (%eax) to 0 + * Save old state, set default return value (%ax) to 0 */ - info->regs32->eax = 0; + info->regs32->ax = 0; tsk->thread.saved_esp0 = tsk->thread.esp0; - tsk->thread.saved_fs = info->regs32->xfs; + tsk->thread.saved_fs = info->regs32->fs; savesegment(gs, tsk->thread.saved_gs); tss = &per_cpu(init_tss, get_cpu()); @@ -352,7 +352,7 @@ static inline void return_to_32bit(struct kernel_vm86_regs * regs16, int retval) struct pt_regs * regs32; regs32 = save_v86_state(regs16); - regs32->eax = retval; + regs32->ax = retval; __asm__ __volatile__("movl %0,%%esp\n\t" "movl %1,%%ebp\n\t" "jmp resume_userspace" @@ -373,12 +373,12 @@ static inline void clear_IF(struct kernel_vm86_regs * regs) static inline void clear_TF(struct kernel_vm86_regs * regs) { - regs->pt.eflags &= ~TF_MASK; + regs->pt.flags &= ~TF_MASK; } static inline void clear_AC(struct kernel_vm86_regs * regs) { - regs->pt.eflags &= ~AC_MASK; + regs->pt.flags &= ~AC_MASK; } /* It is correct to call set_IF(regs) from the set_vflags_* @@ -392,11 +392,11 @@ static inline void clear_AC(struct kernel_vm86_regs * regs) * [KD] */ -static inline void set_vflags_long(unsigned long eflags, struct kernel_vm86_regs * regs) +static inline void set_vflags_long(unsigned long flags, struct kernel_vm86_regs * regs) { - set_flags(VEFLAGS, eflags, current->thread.v86mask); - set_flags(regs->pt.eflags, eflags, SAFE_MASK); - if (eflags & IF_MASK) + set_flags(VEFLAGS, flags, current->thread.v86mask); + set_flags(regs->pt.flags, flags, SAFE_MASK); + if (flags & IF_MASK) set_IF(regs); else clear_IF(regs); @@ -405,7 +405,7 @@ static inline void set_vflags_long(unsigned long eflags, struct kernel_vm86_regs static inline void set_vflags_short(unsigned short flags, struct kernel_vm86_regs * regs) { set_flags(VFLAGS, flags, current->thread.v86mask); - set_flags(regs->pt.eflags, flags, SAFE_MASK); + set_flags(regs->pt.flags, flags, SAFE_MASK); if (flags & IF_MASK) set_IF(regs); else @@ -414,7 +414,7 @@ static inline void set_vflags_short(unsigned short flags, struct kernel_vm86_reg static inline unsigned long get_vflags(struct kernel_vm86_regs * regs) { - unsigned long flags = regs->pt.eflags & RETURN_MASK; + unsigned long flags = regs->pt.flags & RETURN_MASK; if (VEFLAGS & VIF_MASK) flags |= IF_MASK; @@ -518,7 +518,7 @@ static void do_int(struct kernel_vm86_regs *regs, int i, unsigned long __user *intr_ptr; unsigned long segoffs; - if (regs->pt.xcs == BIOSSEG) + if (regs->pt.cs == BIOSSEG) goto cannot_handle; if (is_revectored(i, &KVM86->int_revectored)) goto cannot_handle; @@ -530,9 +530,9 @@ static void do_int(struct kernel_vm86_regs *regs, int i, if ((segoffs >> 16) == BIOSSEG) goto cannot_handle; pushw(ssp, sp, get_vflags(regs), cannot_handle); - pushw(ssp, sp, regs->pt.xcs, cannot_handle); + 
pushw(ssp, sp, regs->pt.cs, cannot_handle); pushw(ssp, sp, IP(regs), cannot_handle); - regs->pt.xcs = segoffs >> 16; + regs->pt.cs = segoffs >> 16; SP(regs) -= 6; IP(regs) = segoffs & 0xffff; clear_TF(regs); @@ -549,7 +549,7 @@ int handle_vm86_trap(struct kernel_vm86_regs * regs, long error_code, int trapno if (VMPI.is_vm86pus) { if ( (trapno==3) || (trapno==1) ) return_to_32bit(regs, VM86_TRAP + (trapno << 8)); - do_int(regs, trapno, (unsigned char __user *) (regs->pt.xss << 4), SP(regs)); + do_int(regs, trapno, (unsigned char __user *) (regs->pt.ss << 4), SP(regs)); return 0; } if (trapno !=1) @@ -585,10 +585,10 @@ void handle_vm86_fault(struct kernel_vm86_regs * regs, long error_code) handle_vm86_trap(regs, 0, 1); \ return; } while (0) - orig_flags = *(unsigned short *)®s->pt.eflags; + orig_flags = *(unsigned short *)®s->pt.flags; - csp = (unsigned char __user *) (regs->pt.xcs << 4); - ssp = (unsigned char __user *) (regs->pt.xss << 4); + csp = (unsigned char __user *) (regs->pt.cs << 4); + ssp = (unsigned char __user *) (regs->pt.ss << 4); sp = SP(regs); ip = IP(regs); @@ -675,7 +675,7 @@ void handle_vm86_fault(struct kernel_vm86_regs * regs, long error_code) SP(regs) += 6; } IP(regs) = newip; - regs->pt.xcs = newcs; + regs->pt.cs = newcs; CHECK_IF_IN_TRAP; if (data32) { set_vflags_long(newflags, regs); diff --git a/arch/x86/kernel/vmi_32.c b/arch/x86/kernel/vmi_32.c index 87e5633805a9..599b6f2ed562 100644 --- a/arch/x86/kernel/vmi_32.c +++ b/arch/x86/kernel/vmi_32.c @@ -88,13 +88,13 @@ struct vmi_timer_ops vmi_timer_ops; #define IRQ_PATCH_DISABLE 5 static inline void patch_offset(void *insnbuf, - unsigned long eip, unsigned long dest) + unsigned long ip, unsigned long dest) { - *(unsigned long *)(insnbuf+1) = dest-eip-5; + *(unsigned long *)(insnbuf+1) = dest-ip-5; } static unsigned patch_internal(int call, unsigned len, void *insnbuf, - unsigned long eip) + unsigned long ip) { u64 reloc; struct vmi_relocation_info *const rel = (struct vmi_relocation_info *)&reloc; @@ -103,13 +103,13 @@ static unsigned patch_internal(int call, unsigned len, void *insnbuf, case VMI_RELOCATION_CALL_REL: BUG_ON(len < 5); *(char *)insnbuf = MNEM_CALL; - patch_offset(insnbuf, eip, (unsigned long)rel->eip); + patch_offset(insnbuf, ip, (unsigned long)rel->eip); return 5; case VMI_RELOCATION_JUMP_REL: BUG_ON(len < 5); *(char *)insnbuf = MNEM_JMP; - patch_offset(insnbuf, eip, (unsigned long)rel->eip); + patch_offset(insnbuf, ip, (unsigned long)rel->eip); return 5; case VMI_RELOCATION_NOP: @@ -131,25 +131,25 @@ static unsigned patch_internal(int call, unsigned len, void *insnbuf, * sequence. The callee does nop padding for us. 
*/ static unsigned vmi_patch(u8 type, u16 clobbers, void *insns, - unsigned long eip, unsigned len) + unsigned long ip, unsigned len) { switch (type) { case PARAVIRT_PATCH(pv_irq_ops.irq_disable): return patch_internal(VMI_CALL_DisableInterrupts, len, - insns, eip); + insns, ip); case PARAVIRT_PATCH(pv_irq_ops.irq_enable): return patch_internal(VMI_CALL_EnableInterrupts, len, - insns, eip); + insns, ip); case PARAVIRT_PATCH(pv_irq_ops.restore_fl): return patch_internal(VMI_CALL_SetInterruptMask, len, - insns, eip); + insns, ip); case PARAVIRT_PATCH(pv_irq_ops.save_fl): return patch_internal(VMI_CALL_GetInterruptMask, len, - insns, eip); + insns, ip); case PARAVIRT_PATCH(pv_cpu_ops.iret): - return patch_internal(VMI_CALL_IRET, len, insns, eip); + return patch_internal(VMI_CALL_IRET, len, insns, ip); case PARAVIRT_PATCH(pv_cpu_ops.irq_enable_syscall_ret): - return patch_internal(VMI_CALL_SYSEXIT, len, insns, eip); + return patch_internal(VMI_CALL_SYSEXIT, len, insns, ip); default: break; } @@ -157,29 +157,29 @@ static unsigned vmi_patch(u8 type, u16 clobbers, void *insns, } /* CPUID has non-C semantics, and paravirt-ops API doesn't match hardware ISA */ -static void vmi_cpuid(unsigned int *eax, unsigned int *ebx, - unsigned int *ecx, unsigned int *edx) +static void vmi_cpuid(unsigned int *ax, unsigned int *bx, + unsigned int *cx, unsigned int *dx) { int override = 0; - if (*eax == 1) + if (*ax == 1) override = 1; asm volatile ("call *%6" - : "=a" (*eax), - "=b" (*ebx), - "=c" (*ecx), - "=d" (*edx) - : "0" (*eax), "2" (*ecx), "r" (vmi_ops.cpuid)); + : "=a" (*ax), + "=b" (*bx), + "=c" (*cx), + "=d" (*dx) + : "0" (*ax), "2" (*cx), "r" (vmi_ops.cpuid)); if (override) { if (disable_pse) - *edx &= ~X86_FEATURE_PSE; + *dx &= ~X86_FEATURE_PSE; if (disable_pge) - *edx &= ~X86_FEATURE_PGE; + *dx &= ~X86_FEATURE_PGE; if (disable_sep) - *edx &= ~X86_FEATURE_SEP; + *dx &= ~X86_FEATURE_SEP; if (disable_tsc) - *edx &= ~X86_FEATURE_TSC; + *dx &= ~X86_FEATURE_TSC; if (disable_mtrr) - *edx &= ~X86_FEATURE_MTRR; + *dx &= ~X86_FEATURE_MTRR; } } diff --git a/arch/x86/kernel/vsyscall_64.c b/arch/x86/kernel/vsyscall_64.c index ad4005c6d4a1..018f7cf33790 100644 --- a/arch/x86/kernel/vsyscall_64.c +++ b/arch/x86/kernel/vsyscall_64.c @@ -43,7 +43,7 @@ #include #define __vsyscall(nr) __attribute__ ((unused,__section__(".vsyscall_" #nr))) -#define __syscall_clobber "r11","rcx","memory" +#define __syscall_clobber "r11","cx","memory" #define __pa_vsymbol(x) \ ({unsigned long v; \ extern char __vsyscall_0; \ diff --git a/arch/x86/lguest/boot.c b/arch/x86/lguest/boot.c index df04bf884dd4..ea46d05853bb 100644 --- a/arch/x86/lguest/boot.c +++ b/arch/x86/lguest/boot.c @@ -175,8 +175,8 @@ static void lguest_leave_lazy_mode(void) * check there when it wants to deliver an interrupt. */ -/* save_flags() is expected to return the processor state (ie. "eflags"). The - * eflags word contains all kind of stuff, but in practice Linux only cares +/* save_flags() is expected to return the processor state (ie. "flags"). The + * flags word contains all kind of stuff, but in practice Linux only cares * about the interrupt flag. Our "save_flags()" just returns that. */ static unsigned long save_fl(void) { @@ -323,30 +323,30 @@ static void lguest_load_tr_desc(void) * anyone (including userspace) can just use the raw "cpuid" instruction and * the Host won't even notice since it isn't privileged. So we try not to get * too worked up about it. 
*/ -static void lguest_cpuid(unsigned int *eax, unsigned int *ebx, - unsigned int *ecx, unsigned int *edx) +static void lguest_cpuid(unsigned int *ax, unsigned int *bx, + unsigned int *cx, unsigned int *dx) { - int function = *eax; + int function = *ax; - native_cpuid(eax, ebx, ecx, edx); + native_cpuid(ax, bx, cx, dx); switch (function) { case 1: /* Basic feature request. */ /* We only allow kernel to see SSE3, CMPXCHG16B and SSSE3 */ - *ecx &= 0x00002201; + *cx &= 0x00002201; /* SSE, SSE2, FXSR, MMX, CMOV, CMPXCHG8B, FPU. */ - *edx &= 0x07808101; + *dx &= 0x07808101; /* The Host can do a nice optimization if it knows that the * kernel mappings (addresses above 0xC0000000 or whatever * PAGE_OFFSET is set to) haven't changed. But Linux calls * flush_tlb_user() for both user and kernel mappings unless * the Page Global Enable (PGE) feature bit is set. */ - *edx |= 0x00002000; + *dx |= 0x00002000; break; case 0x80000000: /* Futureproof this a little: if they ask how much extended * processor information there is, limit it to known fields. */ - if (*eax > 0x80000008) - *eax = 0x80000008; + if (*ax > 0x80000008) + *ax = 0x80000008; break; } } diff --git a/arch/x86/mach-voyager/voyager_smp.c b/arch/x86/mach-voyager/voyager_smp.c index b472a2df0b7f..f2c13482acc0 100644 --- a/arch/x86/mach-voyager/voyager_smp.c +++ b/arch/x86/mach-voyager/voyager_smp.c @@ -526,7 +526,7 @@ static void __init do_boot_cpu(__u8 cpu) * initial kernel stack. We need to alter this to give the * booting CPU a new stack (taken from its idle process) */ extern struct { - __u8 *esp; + __u8 *sp; unsigned short ss; } stack_start; /* This is the format of the CPI IDT gate (in real mode) which @@ -555,9 +555,9 @@ static void __init do_boot_cpu(__u8 cpu) idle = fork_idle(cpu); if (IS_ERR(idle)) panic("failed fork for CPU%d", cpu); - idle->thread.eip = (unsigned long)start_secondary; + idle->thread.ip = (unsigned long)start_secondary; /* init_tasks (in sched.c) is indexed logically */ - stack_start.esp = (void *)idle->thread.esp; + stack_start.sp = (void *)idle->thread.sp; init_gdt(cpu); per_cpu(current_task, cpu) = idle; @@ -567,7 +567,7 @@ static void __init do_boot_cpu(__u8 cpu) /* Note: Don't modify initial ss override */ VDEBUG(("VOYAGER SMP: Booting CPU%d at 0x%lx[%x:%x], stack %p\n", cpu, (unsigned long)hijack_source.val, hijack_source.idt.Segment, - hijack_source.idt.Offset, stack_start.esp)); + hijack_source.idt.Offset, stack_start.sp)); /* init lowmem identity mapping */ clone_pgd_range(swapper_pg_dir, swapper_pg_dir + USER_PGD_PTRS, @@ -745,8 +745,8 @@ void __init initialize_secondary(void) */ asm volatile ("movl %0,%%esp\n\t" - "jmp *%1"::"r" (current->thread.esp), - "r"(current->thread.eip)); + "jmp *%1"::"r" (current->thread.sp), + "r"(current->thread.ip)); } /* handle a Voyager SYS_INT -- If we don't, the base board will diff --git a/arch/x86/mm/extable_32.c b/arch/x86/mm/extable_32.c index 0ce4f22a2635..41685461f8b2 100644 --- a/arch/x86/mm/extable_32.c +++ b/arch/x86/mm/extable_32.c @@ -11,7 +11,7 @@ int fixup_exception(struct pt_regs *regs) const struct exception_table_entry *fixup; #ifdef CONFIG_PNPBIOS - if (unlikely(SEGMENT_IS_PNP_CODE(regs->xcs))) + if (unlikely(SEGMENT_IS_PNP_CODE(regs->cs))) { extern u32 pnp_bios_fault_eip, pnp_bios_fault_esp; extern u32 pnp_bios_is_utter_crap; @@ -25,9 +25,9 @@ int fixup_exception(struct pt_regs *regs) } #endif - fixup = search_exception_tables(regs->eip); + fixup = search_exception_tables(regs->ip); if (fixup) { - regs->eip = fixup->fixup; + regs->ip = fixup->fixup; 
return 1; } diff --git a/arch/x86/mm/fault_32.c b/arch/x86/mm/fault_32.c index a2273d44aa27..6056c6d71835 100644 --- a/arch/x86/mm/fault_32.c +++ b/arch/x86/mm/fault_32.c @@ -72,15 +72,15 @@ static inline int notify_page_fault(struct pt_regs *regs) static inline unsigned long get_segment_eip(struct pt_regs *regs, unsigned long *eip_limit) { - unsigned long eip = regs->eip; - unsigned seg = regs->xcs & 0xffff; + unsigned long ip = regs->ip; + unsigned seg = regs->cs & 0xffff; u32 seg_ar, seg_limit, base, *desc; /* Unlikely, but must come before segment checks. */ - if (unlikely(regs->eflags & VM_MASK)) { + if (unlikely(regs->flags & VM_MASK)) { base = seg << 4; *eip_limit = base + 0xffff; - return base + (eip & 0xffff); + return base + (ip & 0xffff); } /* The standard kernel/user address space limit. */ @@ -88,16 +88,16 @@ static inline unsigned long get_segment_eip(struct pt_regs *regs, /* By far the most common cases. */ if (likely(SEGMENT_IS_FLAT_CODE(seg))) - return eip; + return ip; /* Check the segment exists, is within the current LDT/GDT size, that kernel/user (ring 0..3) has the appropriate privilege, that it's a code segment, and get the limit. */ __asm__ ("larl %3,%0; lsll %3,%1" : "=&r" (seg_ar), "=r" (seg_limit) : "0" (0), "rm" (seg)); - if ((~seg_ar & 0x9800) || eip > seg_limit) { + if ((~seg_ar & 0x9800) || ip > seg_limit) { *eip_limit = 0; - return 1; /* So that returned eip > *eip_limit. */ + return 1; /* So that returned ip > *eip_limit. */ } /* Get the GDT/LDT descriptor base. @@ -127,7 +127,7 @@ static inline unsigned long get_segment_eip(struct pt_regs *regs, seg_limit += base; if (seg_limit < *eip_limit && seg_limit >= base) *eip_limit = seg_limit; - return eip + base; + return ip + base; } /* @@ -345,7 +345,7 @@ fastcall void __kprobes do_page_fault(struct pt_regs *regs, /* It's safe to allow irq's after cr2 has been saved and the vmalloc fault has been handled. */ - if (regs->eflags & (X86_EFLAGS_IF|VM_MASK)) + if (regs->flags & (X86_EFLAGS_IF|VM_MASK)) local_irq_enable(); mm = tsk->mm; @@ -374,7 +374,7 @@ fastcall void __kprobes do_page_fault(struct pt_regs *regs, */ if (!down_read_trylock(&mm->mmap_sem)) { if ((error_code & 4) == 0 && - !search_exception_tables(regs->eip)) + !search_exception_tables(regs->ip)) goto bad_area_nosemaphore; down_read(&mm->mmap_sem); } @@ -388,12 +388,12 @@ fastcall void __kprobes do_page_fault(struct pt_regs *regs, goto bad_area; if (error_code & 4) { /* - * Accessing the stack below %esp is always a bug. + * Accessing the stack below %sp is always a bug. * The large cushion allows instructions like enter * and pusha to work. ("enter $65535,$31" pushes - * 32 pointers and then decrements %esp by 65535.) + * 32 pointers and then decrements %sp by 65535.) */ - if (address + 65536 + 32 * sizeof(unsigned long) < regs->esp) + if (address + 65536 + 32 * sizeof(unsigned long) < regs->sp) goto bad_area; } if (expand_stack(vma, address)) @@ -442,7 +442,7 @@ good_area: /* * Did it hit the DOS screen memory VA from vm86 mode? */ - if (regs->eflags & VM_MASK) { + if (regs->flags & VM_MASK) { unsigned long bit = (address - 0xA0000) >> PAGE_SHIFT; if (bit < 32) tsk->thread.screen_bitmap |= 1 << bit; @@ -474,11 +474,11 @@ bad_area_nosemaphore: if (show_unhandled_signals && unhandled_signal(tsk, SIGSEGV) && printk_ratelimit()) { - printk("%s%s[%d]: segfault at %08lx eip %08lx " - "esp %08lx error %lx\n", + printk("%s%s[%d]: segfault at %08lx ip %08lx " + "sp %08lx error %lx\n", task_pid_nr(tsk) > 1 ? 
KERN_INFO : KERN_EMERG, - tsk->comm, task_pid_nr(tsk), address, regs->eip, - regs->esp, error_code); + tsk->comm, task_pid_nr(tsk), address, regs->ip, + regs->sp, error_code); } tsk->thread.cr2 = address; /* Kernel addresses are always protection faults */ @@ -544,7 +544,7 @@ no_context: printk(KERN_ALERT "BUG: unable to handle kernel paging" " request"); printk(" at virtual address %08lx\n",address); - printk(KERN_ALERT "printing eip: %08lx ", regs->eip); + printk(KERN_ALERT "printing ip: %08lx ", regs->ip); page = read_cr3(); page = ((__typeof__(page) *) __va(page))[address >> PGDIR_SHIFT]; diff --git a/arch/x86/mm/fault_64.c b/arch/x86/mm/fault_64.c index 0e26230669ca..88a7abda29ce 100644 --- a/arch/x86/mm/fault_64.c +++ b/arch/x86/mm/fault_64.c @@ -198,7 +198,7 @@ KERN_ERR "******* Disabling USB legacy in the BIOS may also help.\n"; static int is_errata93(struct pt_regs *regs, unsigned long address) { static int warned; - if (address != regs->rip) + if (address != regs->ip) return 0; if ((address >> 32) != 0) return 0; @@ -209,7 +209,7 @@ static int is_errata93(struct pt_regs *regs, unsigned long address) printk(errata93_warning); warned = 1; } - regs->rip = address; + regs->ip = address; return 1; } return 0; @@ -355,7 +355,7 @@ asmlinkage void __kprobes do_page_fault(struct pt_regs *regs, if (notify_page_fault(regs)) return; - if (likely(regs->eflags & X86_EFLAGS_IF)) + if (likely(regs->flags & X86_EFLAGS_IF)) local_irq_enable(); if (unlikely(error_code & PF_RSVD)) @@ -393,7 +393,7 @@ asmlinkage void __kprobes do_page_fault(struct pt_regs *regs, */ if (!down_read_trylock(&mm->mmap_sem)) { if ((error_code & PF_USER) == 0 && - !search_exception_tables(regs->rip)) + !search_exception_tables(regs->ip)) goto bad_area_nosemaphore; down_read(&mm->mmap_sem); } @@ -409,7 +409,7 @@ asmlinkage void __kprobes do_page_fault(struct pt_regs *regs, /* Allow userspace just enough access below the stack pointer * to let the 'enter' instruction work. */ - if (address + 65536 + 32 * sizeof(unsigned long) < regs->rsp) + if (address + 65536 + 32 * sizeof(unsigned long) < regs->sp) goto bad_area; } if (expand_stack(vma, address)) @@ -488,10 +488,10 @@ bad_area_nosemaphore: if (show_unhandled_signals && unhandled_signal(tsk, SIGSEGV) && printk_ratelimit()) { printk( - "%s%s[%d]: segfault at %lx rip %lx rsp %lx error %lx\n", + "%s%s[%d]: segfault at %lx ip %lx sp %lx error %lx\n", tsk->pid > 1 ? KERN_INFO : KERN_EMERG, - tsk->comm, tsk->pid, address, regs->rip, - regs->rsp, error_code); + tsk->comm, tsk->pid, address, regs->ip, + regs->sp, error_code); } tsk->thread.cr2 = address; @@ -509,9 +509,9 @@ bad_area_nosemaphore: no_context: /* Are we prepared to handle this kernel fault? 
*/ - fixup = search_exception_tables(regs->rip); + fixup = search_exception_tables(regs->ip); if (fixup) { - regs->rip = fixup->fixup; + regs->ip = fixup->fixup; return; } @@ -537,7 +537,7 @@ no_context: else printk(KERN_ALERT "Unable to handle kernel paging request"); printk(" at %016lx RIP: \n" KERN_ALERT,address); - printk_address(regs->rip); + printk_address(regs->ip); dump_pagetable(address); tsk->thread.cr2 = address; tsk->thread.trap_no = 14; diff --git a/arch/x86/oprofile/backtrace.c b/arch/x86/oprofile/backtrace.c index 0ed046a187f7..cc353a0b183e 100644 --- a/arch/x86/oprofile/backtrace.c +++ b/arch/x86/oprofile/backtrace.c @@ -48,7 +48,7 @@ static struct stacktrace_ops backtrace_ops = { }; struct frame_head { - struct frame_head *ebp; + struct frame_head *bp; unsigned long ret; } __attribute__((packed)); @@ -67,10 +67,10 @@ dump_user_backtrace(struct frame_head * head) /* frame pointers should strictly progress back up the stack * (towards higher addresses) */ - if (head >= bufhead[0].ebp) + if (head >= bufhead[0].bp) return NULL; - return bufhead[0].ebp; + return bufhead[0].bp; } void diff --git a/arch/x86/xen/enlighten.c b/arch/x86/xen/enlighten.c index d3574485cb15..29517faaa735 100644 --- a/arch/x86/xen/enlighten.c +++ b/arch/x86/xen/enlighten.c @@ -141,8 +141,8 @@ static void __init xen_banner(void) printk(KERN_INFO "Hypervisor signature: %s\n", xen_start_info->magic); } -static void xen_cpuid(unsigned int *eax, unsigned int *ebx, - unsigned int *ecx, unsigned int *edx) +static void xen_cpuid(unsigned int *ax, unsigned int *bx, + unsigned int *cx, unsigned int *dx) { unsigned maskedx = ~0; @@ -150,18 +150,18 @@ static void xen_cpuid(unsigned int *eax, unsigned int *ebx, * Mask out inconvenient features, to try and disable as many * unsupported kernel subsystems as possible. */ - if (*eax == 1) + if (*ax == 1) maskedx = ~((1 << X86_FEATURE_APIC) | /* disable APIC */ (1 << X86_FEATURE_ACPI) | /* disable ACPI */ (1 << X86_FEATURE_ACC)); /* thermal monitoring */ asm(XEN_EMULATE_PREFIX "cpuid" - : "=a" (*eax), - "=b" (*ebx), - "=c" (*ecx), - "=d" (*edx) - : "0" (*eax), "2" (*ecx)); - *edx &= maskedx; + : "=a" (*ax), + "=b" (*bx), + "=c" (*cx), + "=d" (*dx) + : "0" (*ax), "2" (*cx)); + *dx &= maskedx; } static void xen_set_debugreg(int reg, unsigned long val) diff --git a/arch/x86/xen/events.c b/arch/x86/xen/events.c index 6d1da5809e6f..aebab9704dd7 100644 --- a/arch/x86/xen/events.c +++ b/arch/x86/xen/events.c @@ -487,7 +487,7 @@ fastcall void xen_evtchn_do_upcall(struct pt_regs *regs) int irq = evtchn_to_irq[port]; if (irq != -1) { - regs->orig_eax = ~irq; + regs->orig_ax = ~irq; do_IRQ(regs); } } diff --git a/include/asm-x86/compat.h b/include/asm-x86/compat.h index 66ba7987184a..b270ee04959e 100644 --- a/include/asm-x86/compat.h +++ b/include/asm-x86/compat.h @@ -207,7 +207,7 @@ static inline compat_uptr_t ptr_to_compat(void __user *uptr) static __inline__ void __user *compat_alloc_user_space(long len) { struct pt_regs *regs = task_pt_regs(current); - return (void __user *)regs->rsp - len; + return (void __user *)regs->sp - len; } static inline int is_compat_task(void) diff --git a/include/asm-x86/elf.h b/include/asm-x86/elf.h index 60f5101d9483..5e5705bf082a 100644 --- a/include/asm-x86/elf.h +++ b/include/asm-x86/elf.h @@ -99,32 +99,32 @@ typedef struct user_fxsr_struct elf_fpxregset_t; just to make things more deterministic. 
*/ #define ELF_PLAT_INIT(_r, load_addr) do { \ - _r->ebx = 0; _r->ecx = 0; _r->edx = 0; \ - _r->esi = 0; _r->edi = 0; _r->ebp = 0; \ - _r->eax = 0; \ + _r->bx = 0; _r->cx = 0; _r->dx = 0; \ + _r->si = 0; _r->di = 0; _r->bp = 0; \ + _r->ax = 0; \ } while (0) /* regs is struct pt_regs, pr_reg is elf_gregset_t (which is now struct_user_regs, they are different) */ #define ELF_CORE_COPY_REGS(pr_reg, regs) \ - pr_reg[0] = regs->ebx; \ - pr_reg[1] = regs->ecx; \ - pr_reg[2] = regs->edx; \ - pr_reg[3] = regs->esi; \ - pr_reg[4] = regs->edi; \ - pr_reg[5] = regs->ebp; \ - pr_reg[6] = regs->eax; \ - pr_reg[7] = regs->xds & 0xffff; \ - pr_reg[8] = regs->xes & 0xffff; \ - pr_reg[9] = regs->xfs & 0xffff; \ + pr_reg[0] = regs->bx; \ + pr_reg[1] = regs->cx; \ + pr_reg[2] = regs->dx; \ + pr_reg[3] = regs->si; \ + pr_reg[4] = regs->di; \ + pr_reg[5] = regs->bp; \ + pr_reg[6] = regs->ax; \ + pr_reg[7] = regs->ds & 0xffff; \ + pr_reg[8] = regs->es & 0xffff; \ + pr_reg[9] = regs->fs & 0xffff; \ savesegment(gs,pr_reg[10]); \ - pr_reg[11] = regs->orig_eax; \ - pr_reg[12] = regs->eip; \ - pr_reg[13] = regs->xcs & 0xffff; \ - pr_reg[14] = regs->eflags; \ - pr_reg[15] = regs->esp; \ - pr_reg[16] = regs->xss & 0xffff; + pr_reg[11] = regs->orig_ax; \ + pr_reg[12] = regs->ip; \ + pr_reg[13] = regs->cs & 0xffff; \ + pr_reg[14] = regs->flags; \ + pr_reg[15] = regs->sp; \ + pr_reg[16] = regs->ss & 0xffff; #define ELF_PLATFORM (utsname()->machine) #define set_personality_64bit() do { } while (0) @@ -142,9 +142,9 @@ extern unsigned int vdso_enabled; #define ELF_PLAT_INIT(_r, load_addr) do { \ struct task_struct *cur = current; \ - (_r)->rbx = 0; (_r)->rcx = 0; (_r)->rdx = 0; \ - (_r)->rsi = 0; (_r)->rdi = 0; (_r)->rbp = 0; \ - (_r)->rax = 0; \ + (_r)->bx = 0; (_r)->cx = 0; (_r)->dx = 0; \ + (_r)->si = 0; (_r)->di = 0; (_r)->bp = 0; \ + (_r)->ax = 0; \ (_r)->r8 = 0; \ (_r)->r9 = 0; \ (_r)->r10 = 0; \ @@ -169,22 +169,22 @@ extern unsigned int vdso_enabled; (pr_reg)[1] = (regs)->r14; \ (pr_reg)[2] = (regs)->r13; \ (pr_reg)[3] = (regs)->r12; \ - (pr_reg)[4] = (regs)->rbp; \ - (pr_reg)[5] = (regs)->rbx; \ + (pr_reg)[4] = (regs)->bp; \ + (pr_reg)[5] = (regs)->bx; \ (pr_reg)[6] = (regs)->r11; \ (pr_reg)[7] = (regs)->r10; \ (pr_reg)[8] = (regs)->r9; \ (pr_reg)[9] = (regs)->r8; \ - (pr_reg)[10] = (regs)->rax; \ - (pr_reg)[11] = (regs)->rcx; \ - (pr_reg)[12] = (regs)->rdx; \ - (pr_reg)[13] = (regs)->rsi; \ - (pr_reg)[14] = (regs)->rdi; \ - (pr_reg)[15] = (regs)->orig_rax; \ - (pr_reg)[16] = (regs)->rip; \ + (pr_reg)[10] = (regs)->ax; \ + (pr_reg)[11] = (regs)->cx; \ + (pr_reg)[12] = (regs)->dx; \ + (pr_reg)[13] = (regs)->si; \ + (pr_reg)[14] = (regs)->di; \ + (pr_reg)[15] = (regs)->orig_ax; \ + (pr_reg)[16] = (regs)->ip; \ (pr_reg)[17] = (regs)->cs; \ - (pr_reg)[18] = (regs)->eflags; \ - (pr_reg)[19] = (regs)->rsp; \ + (pr_reg)[18] = (regs)->flags; \ + (pr_reg)[19] = (regs)->sp; \ (pr_reg)[20] = (regs)->ss; \ (pr_reg)[21] = current->thread.fs; \ (pr_reg)[22] = current->thread.gs; \ diff --git a/include/asm-x86/kexec_32.h b/include/asm-x86/kexec_32.h index 4b9dc9e6b701..ff39d2f88022 100644 --- a/include/asm-x86/kexec_32.h +++ b/include/asm-x86/kexec_32.h @@ -45,7 +45,7 @@ /* We can also handle crash dumps from 64 bit kernel. */ #define vmcore_elf_check_arch_cross(x) ((x)->e_machine == EM_X86_64) -/* CPU does not save ss and esp on stack if execution is already +/* CPU does not save ss and sp on stack if execution is already * running in kernel mode at the time of NMI occurrence. This code * fixes it. 
*/ @@ -53,16 +53,16 @@ static inline void crash_fixup_ss_esp(struct pt_regs *newregs, struct pt_regs *oldregs) { memcpy(newregs, oldregs, sizeof(*newregs)); - newregs->esp = (unsigned long)&(oldregs->esp); + newregs->sp = (unsigned long)&(oldregs->sp); __asm__ __volatile__( "xorl %%eax, %%eax\n\t" "movw %%ss, %%ax\n\t" - :"=a"(newregs->xss)); + :"=a"(newregs->ss)); } /* * This function is responsible for capturing register states if coming - * via panic otherwise just fix up the ss and esp if coming via kernel + * via panic otherwise just fix up the ss and sp if coming via kernel * mode exception. */ static inline void crash_setup_regs(struct pt_regs *newregs, @@ -71,21 +71,21 @@ static inline void crash_setup_regs(struct pt_regs *newregs, if (oldregs) crash_fixup_ss_esp(newregs, oldregs); else { - __asm__ __volatile__("movl %%ebx,%0" : "=m"(newregs->ebx)); - __asm__ __volatile__("movl %%ecx,%0" : "=m"(newregs->ecx)); - __asm__ __volatile__("movl %%edx,%0" : "=m"(newregs->edx)); - __asm__ __volatile__("movl %%esi,%0" : "=m"(newregs->esi)); - __asm__ __volatile__("movl %%edi,%0" : "=m"(newregs->edi)); - __asm__ __volatile__("movl %%ebp,%0" : "=m"(newregs->ebp)); - __asm__ __volatile__("movl %%eax,%0" : "=m"(newregs->eax)); - __asm__ __volatile__("movl %%esp,%0" : "=m"(newregs->esp)); - __asm__ __volatile__("movw %%ss, %%ax;" :"=a"(newregs->xss)); - __asm__ __volatile__("movw %%cs, %%ax;" :"=a"(newregs->xcs)); - __asm__ __volatile__("movw %%ds, %%ax;" :"=a"(newregs->xds)); - __asm__ __volatile__("movw %%es, %%ax;" :"=a"(newregs->xes)); - __asm__ __volatile__("pushfl; popl %0" :"=m"(newregs->eflags)); + __asm__ __volatile__("movl %%ebx,%0" : "=m"(newregs->bx)); + __asm__ __volatile__("movl %%ecx,%0" : "=m"(newregs->cx)); + __asm__ __volatile__("movl %%edx,%0" : "=m"(newregs->dx)); + __asm__ __volatile__("movl %%esi,%0" : "=m"(newregs->si)); + __asm__ __volatile__("movl %%edi,%0" : "=m"(newregs->di)); + __asm__ __volatile__("movl %%ebp,%0" : "=m"(newregs->bp)); + __asm__ __volatile__("movl %%eax,%0" : "=m"(newregs->ax)); + __asm__ __volatile__("movl %%esp,%0" : "=m"(newregs->sp)); + __asm__ __volatile__("movl %%ss, %%eax;" :"=a"(newregs->ss)); + __asm__ __volatile__("movl %%cs, %%eax;" :"=a"(newregs->cs)); + __asm__ __volatile__("movl %%ds, %%eax;" :"=a"(newregs->ds)); + __asm__ __volatile__("movl %%es, %%eax;" :"=a"(newregs->es)); + __asm__ __volatile__("pushfl; popl %0" :"=m"(newregs->flags)); - newregs->eip = (unsigned long)current_text_addr(); + newregs->ip = (unsigned long)current_text_addr(); } } asmlinkage NORET_TYPE void diff --git a/include/asm-x86/kexec_64.h b/include/asm-x86/kexec_64.h index 738e581b67f8..b5f989b15c0b 100644 --- a/include/asm-x86/kexec_64.h +++ b/include/asm-x86/kexec_64.h @@ -60,14 +60,14 @@ static inline void crash_setup_regs(struct pt_regs *newregs, if (oldregs) memcpy(newregs, oldregs, sizeof(*newregs)); else { - __asm__ __volatile__("movq %%rbx,%0" : "=m"(newregs->rbx)); - __asm__ __volatile__("movq %%rcx,%0" : "=m"(newregs->rcx)); - __asm__ __volatile__("movq %%rdx,%0" : "=m"(newregs->rdx)); - __asm__ __volatile__("movq %%rsi,%0" : "=m"(newregs->rsi)); - __asm__ __volatile__("movq %%rdi,%0" : "=m"(newregs->rdi)); - __asm__ __volatile__("movq %%rbp,%0" : "=m"(newregs->rbp)); - __asm__ __volatile__("movq %%rax,%0" : "=m"(newregs->rax)); - __asm__ __volatile__("movq %%rsp,%0" : "=m"(newregs->rsp)); + __asm__ __volatile__("movq %%rbx,%0" : "=m"(newregs->bx)); + __asm__ __volatile__("movq %%rcx,%0" : "=m"(newregs->cx)); + __asm__ __volatile__("movq %%rdx,%0" : 
"=m"(newregs->dx)); + __asm__ __volatile__("movq %%rsi,%0" : "=m"(newregs->si)); + __asm__ __volatile__("movq %%rdi,%0" : "=m"(newregs->di)); + __asm__ __volatile__("movq %%rbp,%0" : "=m"(newregs->bp)); + __asm__ __volatile__("movq %%rax,%0" : "=m"(newregs->ax)); + __asm__ __volatile__("movq %%rsp,%0" : "=m"(newregs->sp)); __asm__ __volatile__("movq %%r8,%0" : "=m"(newregs->r8)); __asm__ __volatile__("movq %%r9,%0" : "=m"(newregs->r9)); __asm__ __volatile__("movq %%r10,%0" : "=m"(newregs->r10)); @@ -78,9 +78,9 @@ static inline void crash_setup_regs(struct pt_regs *newregs, __asm__ __volatile__("movq %%r15,%0" : "=m"(newregs->r15)); __asm__ __volatile__("movl %%ss, %%eax;" :"=a"(newregs->ss)); __asm__ __volatile__("movl %%cs, %%eax;" :"=a"(newregs->cs)); - __asm__ __volatile__("pushfq; popq %0" :"=m"(newregs->eflags)); + __asm__ __volatile__("pushfq; popq %0" :"=m"(newregs->flags)); - newregs->rip = (unsigned long)current_text_addr(); + newregs->ip = (unsigned long)current_text_addr(); } } diff --git a/include/asm-x86/kprobes_32.h b/include/asm-x86/kprobes_32.h index 9fe8f3bddfd5..2f38315bc39f 100644 --- a/include/asm-x86/kprobes_32.h +++ b/include/asm-x86/kprobes_32.h @@ -84,7 +84,7 @@ struct kprobe_ctlblk { */ static inline void restore_interrupts(struct pt_regs *regs) { - if (regs->eflags & IF_MASK) + if (regs->flags & IF_MASK) local_irq_enable(); } diff --git a/include/asm-x86/kprobes_64.h b/include/asm-x86/kprobes_64.h index 743d76218fc9..8c919d35cdd3 100644 --- a/include/asm-x86/kprobes_64.h +++ b/include/asm-x86/kprobes_64.h @@ -77,7 +77,7 @@ struct kprobe_ctlblk { */ static inline void restore_interrupts(struct pt_regs *regs) { - if (regs->eflags & IF_MASK) + if (regs->flags & IF_MASK) local_irq_enable(); } diff --git a/include/asm-x86/mce.h b/include/asm-x86/mce.h index e6ff507a73b0..94f1fd79e22a 100644 --- a/include/asm-x86/mce.h +++ b/include/asm-x86/mce.h @@ -13,7 +13,7 @@ #define MCG_CTL_P (1UL<<8) /* MCG_CAP register available */ #define MCG_STATUS_RIPV (1UL<<0) /* restart ip valid */ -#define MCG_STATUS_EIPV (1UL<<1) /* eip points to correct instruction */ +#define MCG_STATUS_EIPV (1UL<<1) /* ip points to correct instruction */ #define MCG_STATUS_MCIP (1UL<<2) /* machine check in progress */ #define MCI_STATUS_VAL (1UL<<63) /* valid error */ @@ -30,7 +30,7 @@ struct mce { __u64 misc; __u64 addr; __u64 mcgstatus; - __u64 rip; + __u64 ip; __u64 tsc; /* cpu time stamp counter */ __u64 res1; /* for future extension */ __u64 res2; /* dito. */ diff --git a/include/asm-x86/processor_32.h b/include/asm-x86/processor_32.h index 3c67eacb3168..c85400fe58c4 100644 --- a/include/asm-x86/processor_32.h +++ b/include/asm-x86/processor_32.h @@ -398,14 +398,14 @@ struct thread_struct { #define start_thread(regs, new_eip, new_esp) do { \ __asm__("movl %0,%%gs": :"r" (0)); \ - regs->xfs = 0; \ + regs->fs = 0; \ set_fs(USER_DS); \ - regs->xds = __USER_DS; \ - regs->xes = __USER_DS; \ - regs->xss = __USER_DS; \ - regs->xcs = __USER_CS; \ - regs->eip = new_eip; \ - regs->esp = new_esp; \ + regs->ds = __USER_DS; \ + regs->es = __USER_DS; \ + regs->ss = __USER_DS; \ + regs->cs = __USER_CS; \ + regs->ip = new_eip; \ + regs->sp = new_esp; \ } while (0) /* Forward declaration, a strange C thing */ @@ -440,7 +440,7 @@ unsigned long get_wchan(struct task_struct *p); * is accessable even if the CPU haven't stored the SS/ESP registers * on the stack (interrupt gate does not save these registers * when switching to the same priv ring). 
- * Therefore beware: accessing the xss/esp fields of the + * Therefore beware: accessing the ss/esp fields of the * "struct pt_regs" is possible, but they may contain the * completely wrong values. */ @@ -451,8 +451,8 @@ unsigned long get_wchan(struct task_struct *p); __regs__ - 1; \ }) -#define KSTK_EIP(task) (task_pt_regs(task)->eip) -#define KSTK_ESP(task) (task_pt_regs(task)->esp) +#define KSTK_EIP(task) (task_pt_regs(task)->ip) +#define KSTK_ESP(task) (task_pt_regs(task)->sp) struct microcode_header { diff --git a/include/asm-x86/processor_64.h b/include/asm-x86/processor_64.h index e7bea4fed642..797770113e6d 100644 --- a/include/asm-x86/processor_64.h +++ b/include/asm-x86/processor_64.h @@ -258,12 +258,12 @@ struct thread_struct { #define start_thread(regs,new_rip,new_rsp) do { \ asm volatile("movl %0,%%fs; movl %0,%%es; movl %0,%%ds": :"r" (0)); \ load_gs_index(0); \ - (regs)->rip = (new_rip); \ - (regs)->rsp = (new_rsp); \ + (regs)->ip = (new_rip); \ + (regs)->sp = (new_rsp); \ write_pda(oldrsp, (new_rsp)); \ (regs)->cs = __USER_CS; \ (regs)->ss = __USER_DS; \ - (regs)->eflags = 0x200; \ + (regs)->flags = 0x200; \ set_fs(USER_DS); \ } while(0) @@ -297,7 +297,7 @@ extern long kernel_thread(int (*fn)(void *), void * arg, unsigned long flags); extern unsigned long get_wchan(struct task_struct *p); #define task_pt_regs(tsk) ((struct pt_regs *)(tsk)->thread.rsp0 - 1) -#define KSTK_EIP(tsk) (task_pt_regs(tsk)->rip) +#define KSTK_EIP(tsk) (task_pt_regs(tsk)->ip) #define KSTK_ESP(tsk) -1 /* sorry. doesn't work for syscall. */ diff --git a/include/asm-x86/ptrace.h b/include/asm-x86/ptrace.h index 04204f359298..9187b2fab754 100644 --- a/include/asm-x86/ptrace.h +++ b/include/asm-x86/ptrace.h @@ -10,6 +10,8 @@ /* this struct defines the way the registers are stored on the stack during a system call. 
*/ +#ifndef __KERNEL__ + struct pt_regs { long ebx; long ecx; @@ -21,7 +23,7 @@ struct pt_regs { int xds; int xes; int xfs; - /* int xgs; */ + /* int gs; */ long orig_eax; long eip; int xcs; @@ -30,7 +32,27 @@ struct pt_regs { int xss; }; -#ifdef __KERNEL__ +#else /* __KERNEL__ */ + +struct pt_regs { + long bx; + long cx; + long dx; + long si; + long di; + long bp; + long ax; + int ds; + int es; + int fs; + /* int gs; */ + long orig_ax; + long ip; + int cs; + long flags; + long sp; + int ss; +}; #include #include @@ -47,27 +69,30 @@ extern void send_sigtrap(struct task_struct *tsk, struct pt_regs *regs, int erro */ static inline int user_mode(struct pt_regs *regs) { - return (regs->xcs & SEGMENT_RPL_MASK) == USER_RPL; + return (regs->cs & SEGMENT_RPL_MASK) == USER_RPL; } static inline int user_mode_vm(struct pt_regs *regs) { - return ((regs->xcs & SEGMENT_RPL_MASK) | (regs->eflags & VM_MASK)) >= USER_RPL; + return ((regs->cs & SEGMENT_RPL_MASK) | + (regs->flags & VM_MASK)) >= USER_RPL; } static inline int v8086_mode(struct pt_regs *regs) { - return (regs->eflags & VM_MASK); + return (regs->flags & VM_MASK); } -#define instruction_pointer(regs) ((regs)->eip) -#define frame_pointer(regs) ((regs)->ebp) +#define instruction_pointer(regs) ((regs)->ip) +#define frame_pointer(regs) ((regs)->bp) #define stack_pointer(regs) ((unsigned long)(regs)) -#define regs_return_value(regs) ((regs)->eax) +#define regs_return_value(regs) ((regs)->ax) extern unsigned long profile_pc(struct pt_regs *regs); #endif /* __KERNEL__ */ #else /* __i386__ */ +#ifndef __KERNEL__ + struct pt_regs { unsigned long r15; unsigned long r14; @@ -96,14 +121,43 @@ struct pt_regs { /* top of stack page */ }; -#ifdef __KERNEL__ +#else /* __KERNEL__ */ + +struct pt_regs { + unsigned long r15; + unsigned long r14; + unsigned long r13; + unsigned long r12; + unsigned long bp; + unsigned long bx; +/* arguments: non interrupts/non tracing syscalls only save upto here*/ + unsigned long r11; + unsigned long r10; + unsigned long r9; + unsigned long r8; + unsigned long ax; + unsigned long cx; + unsigned long dx; + unsigned long si; + unsigned long di; + unsigned long orig_ax; +/* end of arguments */ +/* cpu exception frame or undefined */ + unsigned long ip; + unsigned long cs; + unsigned long flags; + unsigned long sp; + unsigned long ss; +/* top of stack page */ +}; #define user_mode(regs) (!!((regs)->cs & 3)) #define user_mode_vm(regs) user_mode(regs) -#define instruction_pointer(regs) ((regs)->rip) -#define frame_pointer(regs) ((regs)->rbp) -#define stack_pointer(regs) ((regs)->rsp) -#define regs_return_value(regs) ((regs)->rax) +#define v8086_mode(regs) 0 /* No V86 mode support in long mode */ +#define instruction_pointer(regs) ((regs)->ip) +#define frame_pointer(regs) ((regs)->bp) +#define stack_pointer(regs) ((regs)->sp) +#define regs_return_value(regs) ((regs)->ax) extern unsigned long profile_pc(struct pt_regs *regs); void signal_fault(struct pt_regs *regs, void __user *frame, char *where); diff --git a/kernel/signal.c b/kernel/signal.c index afa4f781f924..bf49ce6f016b 100644 --- a/kernel/signal.c +++ b/kernel/signal.c @@ -733,13 +733,13 @@ static void print_fatal_signal(struct pt_regs *regs, int signr) current->comm, task_pid_nr(current), signr); #if defined(__i386__) && !defined(__arch_um__) - printk("code at %08lx: ", regs->eip); + printk("code at %08lx: ", regs->ip); { int i; for (i = 0; i < 16; i++) { unsigned char insn; - __get_user(insn, (unsigned char *)(regs->eip + i)); + __get_user(insn, (unsigned char *)(regs->ip + i)); 
printk("%02x ", insn); } } -- cgit v1.2.3 From 95c354fe9f7d6decc08a92aa26eb233ecc2155bf Mon Sep 17 00:00:00 2001 From: Nick Piggin Date: Wed, 30 Jan 2008 13:31:20 +0100 Subject: spinlock: lockbreak cleanup The break_lock data structure and code for spinlocks is quite nasty. Not only does it double the size of a spinlock but it changes locking to a potentially less optimal trylock. Put all of that under CONFIG_GENERIC_LOCKBREAK, and introduce a __raw_spin_is_contended that uses the lock data itself to determine whether there are waiters on the lock, to be used if CONFIG_GENERIC_LOCKBREAK is not set. Rename need_lockbreak to spin_needbreak, make it use spin_is_contended to decouple it from the spinlock implementation, and make it typesafe (rwlocks do not have any need_lockbreak sites -- why do they even get bloated up with that break_lock then?). Signed-off-by: Nick Piggin Signed-off-by: Ingo Molnar Signed-off-by: Thomas Gleixner --- arch/arm/Kconfig | 5 +++++ arch/ia64/Kconfig | 5 +++++ arch/m32r/Kconfig | 5 +++++ arch/mips/Kconfig | 5 +++++ arch/parisc/Kconfig | 5 +++++ arch/powerpc/Kconfig | 5 +++++ arch/sparc64/Kconfig | 5 +++++ arch/x86/Kconfig | 4 ++++ fs/jbd/checkpoint.c | 3 ++- fs/jbd/commit.c | 2 +- fs/jbd2/checkpoint.c | 3 ++- fs/jbd2/commit.c | 2 +- include/linux/sched.h | 21 +++++++-------------- include/linux/spinlock.h | 6 ++++++ include/linux/spinlock_types.h | 4 ++-- include/linux/spinlock_up.h | 2 ++ kernel/sched.c | 16 ++++++---------- kernel/spinlock.c | 3 +-- mm/memory.c | 8 +++----- 19 files changed, 72 insertions(+), 37 deletions(-) (limited to 'kernel') diff --git a/arch/arm/Kconfig b/arch/arm/Kconfig index de211ac3853e..77201d3f7479 100644 --- a/arch/arm/Kconfig +++ b/arch/arm/Kconfig @@ -91,6 +91,11 @@ config GENERIC_IRQ_PROBE bool default y +config GENERIC_LOCKBREAK + bool + default y + depends on SMP && PREEMPT + config RWSEM_GENERIC_SPINLOCK bool default y diff --git a/arch/ia64/Kconfig b/arch/ia64/Kconfig index bef47725d4ad..4a81b7fb191a 100644 --- a/arch/ia64/Kconfig +++ b/arch/ia64/Kconfig @@ -42,6 +42,11 @@ config MMU config SWIOTLB bool +config GENERIC_LOCKBREAK + bool + default y + depends on SMP && PREEMPT + config RWSEM_XCHGADD_ALGORITHM bool default y diff --git a/arch/m32r/Kconfig b/arch/m32r/Kconfig index ab9a264cb194..f7237c5f531e 100644 --- a/arch/m32r/Kconfig +++ b/arch/m32r/Kconfig @@ -235,6 +235,11 @@ config IRAM_SIZE # Define implied options from the CPU selection here # +config GENERIC_LOCKBREAK + bool + default y + depends on SMP && PREEMPT + config RWSEM_GENERIC_SPINLOCK bool depends on M32R diff --git a/arch/mips/Kconfig b/arch/mips/Kconfig index 6b0f85f02c79..4fad0a34b997 100644 --- a/arch/mips/Kconfig +++ b/arch/mips/Kconfig @@ -694,6 +694,11 @@ source "arch/mips/vr41xx/Kconfig" endmenu +config GENERIC_LOCKBREAK + bool + default y + depends on SMP && PREEMPT + config RWSEM_GENERIC_SPINLOCK bool default y diff --git a/arch/parisc/Kconfig b/arch/parisc/Kconfig index b8ef1787a191..2b649c46631c 100644 --- a/arch/parisc/Kconfig +++ b/arch/parisc/Kconfig @@ -19,6 +19,11 @@ config MMU config STACK_GROWSUP def_bool y +config GENERIC_LOCKBREAK + bool + default y + depends on SMP && PREEMPT + config RWSEM_GENERIC_SPINLOCK def_bool y diff --git a/arch/powerpc/Kconfig b/arch/powerpc/Kconfig index 232c298c933f..c17a194beb0e 100644 --- a/arch/powerpc/Kconfig +++ b/arch/powerpc/Kconfig @@ -53,6 +53,11 @@ config RWSEM_XCHGADD_ALGORITHM bool default y +config GENERIC_LOCKBREAK + bool + default y + depends on SMP && PREEMPT + config ARCH_HAS_ILOG2_U32 bool 
default y diff --git a/arch/sparc64/Kconfig b/arch/sparc64/Kconfig index 10b212a1f9f5..1e25bce0366d 100644 --- a/arch/sparc64/Kconfig +++ b/arch/sparc64/Kconfig @@ -200,6 +200,11 @@ config US2E_FREQ If in doubt, say N. # Global things across all Sun machines. +config GENERIC_LOCKBREAK + bool + default y + depends on SMP && PREEMPT + config RWSEM_GENERIC_SPINLOCK bool diff --git a/arch/x86/Kconfig b/arch/x86/Kconfig index 23936301db56..db434f8171d3 100644 --- a/arch/x86/Kconfig +++ b/arch/x86/Kconfig @@ -19,6 +19,10 @@ config X86_64 config X86 def_bool y +config GENERIC_LOCKBREAK + def_bool y + depends on SMP && PREEMPT + config GENERIC_TIME def_bool y diff --git a/fs/jbd/checkpoint.c b/fs/jbd/checkpoint.c index 0f69c416eebc..a5432bbbfb88 100644 --- a/fs/jbd/checkpoint.c +++ b/fs/jbd/checkpoint.c @@ -347,7 +347,8 @@ restart: break; } retry = __process_buffer(journal, jh, bhs,&batch_count); - if (!retry && lock_need_resched(&journal->j_list_lock)){ + if (!retry && (need_resched() || + spin_needbreak(&journal->j_list_lock))) { spin_unlock(&journal->j_list_lock); retry = 1; break; diff --git a/fs/jbd/commit.c b/fs/jbd/commit.c index 610264b99a8e..31853eb65b4c 100644 --- a/fs/jbd/commit.c +++ b/fs/jbd/commit.c @@ -265,7 +265,7 @@ write_out_data: put_bh(bh); } - if (lock_need_resched(&journal->j_list_lock)) { + if (need_resched() || spin_needbreak(&journal->j_list_lock)) { spin_unlock(&journal->j_list_lock); goto write_out_data; } diff --git a/fs/jbd2/checkpoint.c b/fs/jbd2/checkpoint.c index 1b7f282c1ae9..6914598022ce 100644 --- a/fs/jbd2/checkpoint.c +++ b/fs/jbd2/checkpoint.c @@ -353,7 +353,8 @@ restart: } retry = __process_buffer(journal, jh, bhs, &batch_count, transaction); - if (!retry && lock_need_resched(&journal->j_list_lock)){ + if (!retry && (need_resched() || + spin_needbreak(&journal->j_list_lock))) { spin_unlock(&journal->j_list_lock); retry = 1; break; diff --git a/fs/jbd2/commit.c b/fs/jbd2/commit.c index da8d0eb3b7b9..4f302d279279 100644 --- a/fs/jbd2/commit.c +++ b/fs/jbd2/commit.c @@ -341,7 +341,7 @@ write_out_data: put_bh(bh); } - if (lock_need_resched(&journal->j_list_lock)) { + if (need_resched() || spin_needbreak(&journal->j_list_lock)) { spin_unlock(&journal->j_list_lock); goto write_out_data; } diff --git a/include/linux/sched.h b/include/linux/sched.h index 2d0546e884ea..9d4797609aa5 100644 --- a/include/linux/sched.h +++ b/include/linux/sched.h @@ -1922,23 +1922,16 @@ extern int cond_resched_softirq(void); /* * Does a critical section need to be broken due to another - * task waiting?: + * task waiting?: (technically does not depend on CONFIG_PREEMPT, + * but a general need for low latency) */ -#if defined(CONFIG_PREEMPT) && defined(CONFIG_SMP) -# define need_lockbreak(lock) ((lock)->break_lock) -#else -# define need_lockbreak(lock) 0 -#endif - -/* - * Does a critical section need to be broken due to another - * task waiting or preemption being signalled: - */ -static inline int lock_need_resched(spinlock_t *lock) +static inline int spin_needbreak(spinlock_t *lock) { - if (need_lockbreak(lock) || need_resched()) - return 1; +#ifdef CONFIG_PREEMPT + return spin_is_contended(lock); +#else return 0; +#endif } /* diff --git a/include/linux/spinlock.h b/include/linux/spinlock.h index c376f3b36c89..124449733c55 100644 --- a/include/linux/spinlock.h +++ b/include/linux/spinlock.h @@ -120,6 +120,12 @@ do { \ #define spin_is_locked(lock) __raw_spin_is_locked(&(lock)->raw_lock) +#ifdef CONFIG_GENERIC_LOCKBREAK +#define spin_is_contended(lock) ((lock)->break_lock) +#else +#define 
spin_is_contended(lock) __raw_spin_is_contended(&(lock)->raw_lock) +#endif + /** * spin_unlock_wait - wait until the spinlock gets unlocked * @lock: the spinlock in question. diff --git a/include/linux/spinlock_types.h b/include/linux/spinlock_types.h index f6a3a951b79e..68d88f71f1a2 100644 --- a/include/linux/spinlock_types.h +++ b/include/linux/spinlock_types.h @@ -19,7 +19,7 @@ typedef struct { raw_spinlock_t raw_lock; -#if defined(CONFIG_PREEMPT) && defined(CONFIG_SMP) +#ifdef CONFIG_GENERIC_LOCKBREAK unsigned int break_lock; #endif #ifdef CONFIG_DEBUG_SPINLOCK @@ -35,7 +35,7 @@ typedef struct { typedef struct { raw_rwlock_t raw_lock; -#if defined(CONFIG_PREEMPT) && defined(CONFIG_SMP) +#ifdef CONFIG_GENERIC_LOCKBREAK unsigned int break_lock; #endif #ifdef CONFIG_DEBUG_SPINLOCK diff --git a/include/linux/spinlock_up.h b/include/linux/spinlock_up.h index ea54c4c9a4ec..938234c4a996 100644 --- a/include/linux/spinlock_up.h +++ b/include/linux/spinlock_up.h @@ -64,6 +64,8 @@ static inline void __raw_spin_unlock(raw_spinlock_t *lock) # define __raw_spin_trylock(lock) ({ (void)(lock); 1; }) #endif /* DEBUG_SPINLOCK */ +#define __raw_spin_is_contended(lock) (((void)(lock), 0)) + #define __raw_read_can_lock(lock) (((void)(lock), 1)) #define __raw_write_can_lock(lock) (((void)(lock), 1)) diff --git a/kernel/sched.c b/kernel/sched.c index 524285e46fa7..ba4c88088f62 100644 --- a/kernel/sched.c +++ b/kernel/sched.c @@ -4945,19 +4945,15 @@ EXPORT_SYMBOL(_cond_resched); */ int cond_resched_lock(spinlock_t *lock) { + int resched = need_resched() && system_state == SYSTEM_RUNNING; int ret = 0; - if (need_lockbreak(lock)) { + if (spin_needbreak(lock) || resched) { spin_unlock(lock); - cpu_relax(); - ret = 1; - spin_lock(lock); - } - if (need_resched() && system_state == SYSTEM_RUNNING) { - spin_release(&lock->dep_map, 1, _THIS_IP_); - _raw_spin_unlock(lock); - preempt_enable_no_resched(); - __cond_resched(); + if (resched && need_resched()) + __cond_resched(); + else + cpu_relax(); ret = 1; spin_lock(lock); } diff --git a/kernel/spinlock.c b/kernel/spinlock.c index cd72424c2662..ae28c8245123 100644 --- a/kernel/spinlock.c +++ b/kernel/spinlock.c @@ -65,8 +65,7 @@ EXPORT_SYMBOL(_write_trylock); * even on CONFIG_PREEMPT, because lockdep assumes that interrupts are * not re-enabled during lock-acquire (which the preempt-spin-ops do): */ -#if !defined(CONFIG_PREEMPT) || !defined(CONFIG_SMP) || \ - defined(CONFIG_DEBUG_LOCK_ALLOC) +#if !defined(CONFIG_GENERIC_LOCKBREAK) || defined(CONFIG_DEBUG_LOCK_ALLOC) void __lockfunc _read_lock(rwlock_t *lock) { diff --git a/mm/memory.c b/mm/memory.c index 4b0144b24c12..673ebbf499c7 100644 --- a/mm/memory.c +++ b/mm/memory.c @@ -513,8 +513,7 @@ again: if (progress >= 32) { progress = 0; if (need_resched() || - need_lockbreak(src_ptl) || - need_lockbreak(dst_ptl)) + spin_needbreak(src_ptl) || spin_needbreak(dst_ptl)) break; } if (pte_none(*src_pte)) { @@ -853,7 +852,7 @@ unsigned long unmap_vmas(struct mmu_gather **tlbp, tlb_finish_mmu(*tlbp, tlb_start, start); if (need_resched() || - (i_mmap_lock && need_lockbreak(i_mmap_lock))) { + (i_mmap_lock && spin_needbreak(i_mmap_lock))) { if (i_mmap_lock) { *tlbp = NULL; goto out; @@ -1768,8 +1767,7 @@ again: restart_addr = zap_page_range(vma, start_addr, end_addr - start_addr, details); - need_break = need_resched() || - need_lockbreak(details->i_mmap_lock); + need_break = need_resched() || spin_needbreak(details->i_mmap_lock); if (restart_addr >= end_addr) { /* We have now completed this vma: mark it so */ -- cgit v1.2.3 From 
16c3e389e7a7254ff8dc7029ac4fbe996c3c75bf Mon Sep 17 00:00:00 2001 From: Roland McGrath Date: Wed, 30 Jan 2008 13:31:47 +0100 Subject: x86: ptrace_request peekdata/pokedata This makes ptrace_request handle {PEEK,POKE}{TEXT,DATA} directly. Every arch_ptrace that could call generic_ptrace_peekdata already has a default case calling ptrace_request, so this keeps things simpler for the arch code. Signed-off-by: Roland McGrath Signed-off-by: Ingo Molnar Signed-off-by: Thomas Gleixner --- kernel/ptrace.c | 7 +++++++ 1 file changed, 7 insertions(+) (limited to 'kernel') diff --git a/kernel/ptrace.c b/kernel/ptrace.c index 973d727f5e84..e6a99d2793b3 100644 --- a/kernel/ptrace.c +++ b/kernel/ptrace.c @@ -426,6 +426,13 @@ int ptrace_request(struct task_struct *child, long request, int ret = -EIO; switch (request) { + case PTRACE_PEEKTEXT: + case PTRACE_PEEKDATA: + return generic_ptrace_peekdata(child, addr, data); + case PTRACE_POKETEXT: + case PTRACE_POKEDATA: + return generic_ptrace_pokedata(child, addr, data); + #ifdef PTRACE_OLDSETOPTIONS case PTRACE_OLDSETOPTIONS: #endif -- cgit v1.2.3 From 032d82d9065dec0e26718eca376c2029e4bd0595 Mon Sep 17 00:00:00 2001 From: Roland McGrath Date: Wed, 30 Jan 2008 13:31:47 +0100 Subject: x86: compat_ptrace_request This adds a compat_ptrace_request that is the analogue of ptrace_request for the things that 32-on-64 ptrace implementations can share in common. So far there are just a couple of requests handled generically. Signed-off-by: Roland McGrath Signed-off-by: Ingo Molnar Signed-off-by: Thomas Gleixner --- include/linux/compat.h | 4 ++++ kernel/ptrace.c | 38 ++++++++++++++++++++++++++++++++++++++ 2 files changed, 42 insertions(+) (limited to 'kernel') diff --git a/include/linux/compat.h b/include/linux/compat.h index ba29d4c59643..a907fbede6c3 100644 --- a/include/linux/compat.h +++ b/include/linux/compat.h @@ -243,6 +243,10 @@ asmlinkage long compat_sys_migrate_pages(compat_pid_t pid, compat_ulong_t maxnode, const compat_ulong_t __user *old_nodes, const compat_ulong_t __user *new_nodes); +extern int compat_ptrace_request(struct task_struct *child, + compat_long_t request, + compat_ulong_t addr, compat_ulong_t data); + /* * epoll (fs/eventpoll.c) compat bits follow ... */ diff --git a/kernel/ptrace.c b/kernel/ptrace.c index e6a99d2793b3..ed1c3d56c2cd 100644 --- a/kernel/ptrace.c +++ b/kernel/ptrace.c @@ -607,3 +607,41 @@ int generic_ptrace_pokedata(struct task_struct *tsk, long addr, long data) copied = access_process_vm(tsk, addr, &data, sizeof(data), 1); return (copied == sizeof(data)) ? 0 : -EIO; } + +#ifdef CONFIG_COMPAT +#include + +int compat_ptrace_request(struct task_struct *child, compat_long_t request, + compat_ulong_t addr, compat_ulong_t data) +{ + compat_ulong_t __user *datap = compat_ptr(data); + compat_ulong_t word; + int ret; + + switch (request) { + case PTRACE_PEEKTEXT: + case PTRACE_PEEKDATA: + ret = access_process_vm(child, addr, &word, sizeof(word), 0); + if (ret != sizeof(word)) + ret = -EIO; + else + ret = put_user(word, datap); + break; + + case PTRACE_POKETEXT: + case PTRACE_POKEDATA: + ret = access_process_vm(child, addr, &data, sizeof(data), 1); + ret = (ret != sizeof(data) ? 
-EIO : 0); + break; + + case PTRACE_GETEVENTMSG: + ret = put_user((compat_ulong_t) child->ptrace_message, datap); + break; + + default: + ret = ptrace_request(child, request, addr, data); + } + + return ret; +} +#endif /* CONFIG_COMPAT */ -- cgit v1.2.3 From c269f19617f508cc5c29c0b064c1a437d7011a46 Mon Sep 17 00:00:00 2001 From: Roland McGrath Date: Wed, 30 Jan 2008 13:31:48 +0100 Subject: x86: compat_sys_ptrace This adds a generic definition of compat_sys_ptrace that calls compat_arch_ptrace, parallel to sys_ptrace/arch_ptrace. Some machines needing this already define a function by that name. The new generic function is defined only on machines that put #define __ARCH_WANT_COMPAT_SYS_PTRACE into asm/ptrace.h. Signed-off-by: Roland McGrath Signed-off-by: Ingo Molnar Signed-off-by: Thomas Gleixner --- include/linux/compat.h | 7 +++++++ kernel/ptrace.c | 46 ++++++++++++++++++++++++++++++++++++++++++++++ 2 files changed, 53 insertions(+) (limited to 'kernel') diff --git a/include/linux/compat.h b/include/linux/compat.h index a907fbede6c3..d38655f2be70 100644 --- a/include/linux/compat.h +++ b/include/linux/compat.h @@ -247,6 +247,13 @@ extern int compat_ptrace_request(struct task_struct *child, compat_long_t request, compat_ulong_t addr, compat_ulong_t data); +#ifdef __ARCH_WANT_COMPAT_SYS_PTRACE +extern long compat_arch_ptrace(struct task_struct *child, compat_long_t request, + compat_ulong_t addr, compat_ulong_t data); +asmlinkage long compat_sys_ptrace(compat_long_t request, compat_long_t pid, + compat_long_t addr, compat_long_t data); +#endif /* __ARCH_WANT_COMPAT_SYS_PTRACE */ + /* * epoll (fs/eventpoll.c) compat bits follow ... */ diff --git a/kernel/ptrace.c b/kernel/ptrace.c index ed1c3d56c2cd..e6e9b8be4b05 100644 --- a/kernel/ptrace.c +++ b/kernel/ptrace.c @@ -644,4 +644,50 @@ int compat_ptrace_request(struct task_struct *child, compat_long_t request, return ret; } + +#ifdef __ARCH_WANT_COMPAT_SYS_PTRACE +asmlinkage long compat_sys_ptrace(compat_long_t request, compat_long_t pid, + compat_long_t addr, compat_long_t data) +{ + struct task_struct *child; + long ret; + + /* + * This lock_kernel fixes a subtle race with suid exec + */ + lock_kernel(); + if (request == PTRACE_TRACEME) { + ret = ptrace_traceme(); + goto out; + } + + child = ptrace_get_task_struct(pid); + if (IS_ERR(child)) { + ret = PTR_ERR(child); + goto out; + } + + if (request == PTRACE_ATTACH) { + ret = ptrace_attach(child); + /* + * Some architectures need to do book-keeping after + * a ptrace attach. + */ + if (!ret) + arch_ptrace_attach(child); + goto out_put_task_struct; + } + + ret = ptrace_check_attach(child, request == PTRACE_KILL); + if (!ret) + ret = compat_arch_ptrace(child, request, addr, data); + + out_put_task_struct: + put_task_struct(child); + out: + unlock_kernel(); + return ret; +} +#endif /* __ARCH_WANT_COMPAT_SYS_PTRACE */ + #endif /* CONFIG_COMPAT */ -- cgit v1.2.3 From 9e094c17ee2b85130ab7b2ea37456f6867eb687a Mon Sep 17 00:00:00 2001 From: Andi Kleen Date: Wed, 30 Jan 2008 13:32:48 +0100 Subject: genirq: turn irq debugging options into module params This allows changing them at runtime using sysfs. No need to reboot to set them. I only added aliases (kernel.noirqdebug etc.) so the old options still work.
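As a rough sketch of the pattern being applied here (illustrative only, with made-up names rather than the real noirqdebug/irqfixup code), a flag that used to be boot-time only via __setup() can additionally be registered with module_param() so it shows up as a writable file under /sys/module/.../parameters/:

#include <linux/init.h>
#include <linux/moduleparam.h>

static int example_debug;	/* hypothetical flag, analogous to noirqdebug */

/* Old-style boot option: "example_debug" on the kernel command line. */
static int __init example_debug_setup(char *str)
{
	example_debug = 1;
	return 1;
}
__setup("example_debug", example_debug_setup);

/* New: also expose it as a parameter, writable at runtime through sysfs. */
module_param(example_debug, int, 0644);
MODULE_PARM_DESC(example_debug, "Enable extra debugging when non-zero");

With that in place the flag can be flipped with something like "echo 1 > /sys/module/<object name>/parameters/example_debug" without a reboot; the exact directory name depends on where the code is built.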
Signed-off-by: Andi Kleen Signed-off-by: Ingo Molnar Signed-off-by: Thomas Gleixner --- kernel/irq/spurious.c | 5 +++++ 1 file changed, 5 insertions(+) (limited to 'kernel') diff --git a/kernel/irq/spurious.c b/kernel/irq/spurious.c index 32b161972fad..a6b2bc831dd0 100644 --- a/kernel/irq/spurious.c +++ b/kernel/irq/spurious.c @@ -10,6 +10,7 @@ #include #include #include +#include <linux/moduleparam.h> static int irqfixup __read_mostly; @@ -225,6 +226,8 @@ int noirqdebug_setup(char *str) } __setup("noirqdebug", noirqdebug_setup); +module_param(noirqdebug, bool, 0644); +MODULE_PARM_DESC(noirqdebug, "Disable irq lockup detection when true"); static int __init irqfixup_setup(char *str) { @@ -236,6 +239,8 @@ static int __init irqfixup_setup(char *str) } __setup("irqfixup", irqfixup_setup); +module_param(irqfixup, int, 0644); +MODULE_PARM_DESC("irqfixup", "0: No fixup, 1: irqfixup mode 2: irqpoll mode"); static int __init irqpoll_setup(char *str) { -- cgit v1.2.3 From 96d97cf03b3d68e6c857623da93acd522b2b7e1a Mon Sep 17 00:00:00 2001 From: Andi Kleen Date: Wed, 30 Jan 2008 13:32:48 +0100 Subject: x86: add /proc/irq/*/spurious to dump the spurious irq debugging state This is useful to debug problems with interrupt handlers that sometimes return IRQ_NONE. Signed-off-by: Andi Kleen Signed-off-by: Ingo Molnar Signed-off-by: Thomas Gleixner --- kernel/irq/proc.c | 21 +++++++++++++++++++-- 1 file changed, 19 insertions(+), 2 deletions(-) (limited to 'kernel') diff --git a/kernel/irq/proc.c b/kernel/irq/proc.c index 50b81b98046a..c2f2ccb0549a 100644 --- a/kernel/irq/proc.c +++ b/kernel/irq/proc.c @@ -75,6 +75,18 @@ static int irq_affinity_write_proc(struct file *file, const char __user *buffer, #endif +static int irq_spurious_read(char *page, char **start, off_t off, + int count, int *eof, void *data) +{ + struct irq_desc *d = &irq_desc[(long) data]; + return sprintf(page, "count %u\n" + "unhandled %u\n" + "last_unhandled %u ms\n", + d->irq_count, + d->irqs_unhandled, + jiffies_to_msecs(d->last_unhandled)); +} + #define MAX_NAMELEN 128 static int name_unique(unsigned int irq, struct irqaction *new_action) @@ -118,6 +130,7 @@ void register_handler_proc(unsigned int irq, struct irqaction *action) void register_irq_proc(unsigned int irq) { char name [MAX_NAMELEN]; + struct proc_dir_entry *entry; if (!root_irq_dir || (irq_desc[irq].chip == &no_irq_chip) || @@ -132,8 +145,6 @@ void register_irq_proc(unsigned int irq) #ifdef CONFIG_SMP { - struct proc_dir_entry *entry; - /* create /proc/irq//smp_affinity */ entry = create_proc_entry("smp_affinity", 0600, irq_desc[irq].dir); @@ -144,6 +155,12 @@ void register_irq_proc(unsigned int irq) } } #endif + + entry = create_proc_entry("spurious", 0444, irq_desc[irq].dir); + if (entry) { + entry->data = (void *)(long)irq; + entry->read_proc = irq_spurious_read; + } } #undef MAX_NAMELEN -- cgit v1.2.3 From 79b4cc5ee7a8086ac2c9c0afa52e6d687ce1ffef Mon Sep 17 00:00:00 2001 From: Arjan van de Ven Date: Wed, 30 Jan 2008 13:32:50 +0100 Subject: debug: move WARN_ON() out of line A quick grep shows that there are currently 1145 instances of WARN_ON in the kernel. Currently, WARN_ON is pretty much entirely inlined, which makes it hard to enhance it without growing the size of the kernel (and getting Andrew unhappy). This patch builds on top of Olof's patch that introduces __WARN, and places the slowpath out of line.
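For reference, the generic WARN_ON() wrapper that expands to __WARN() looks roughly like this (a simplified sketch from memory, not part of this patch), so after the change every WARN_ON() call site compiles down to a condition test plus one call into the out-of-line helper:

#ifndef WARN_ON
#define WARN_ON(condition) ({					\
	int __ret_warn_on = !!(condition);			\
	if (unlikely(__ret_warn_on))				\
		__WARN();					\
	unlikely(__ret_warn_on);				\
})
#endif

The hunk below supplies the warn_on_slowpath() body that __WARN() now calls.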
It also uses Ingo's suggestion to not use __FUNCTION__ but to use kallsyms to do the lookup; this saves a ton of extra space since gcc doesn't need to store the function string twice now: 3936367 833603 624736 5394706 525112 vmlinux.before 3917508 833603 624736 5375847 520767 vmlinux-slowpath 15Kb savings... Signed-off-by: Arjan van de Ven CC: Andrew Morton CC: Olof Johansson Acked-by: Matt Mackall Signed-off-by: Ingo Molnar Signed-off-by: Thomas Gleixner --- include/asm-generic/bug.h | 10 +++++----- kernel/panic.c | 15 +++++++++++++++ 2 files changed, 20 insertions(+), 5 deletions(-) (limited to 'kernel') diff --git a/include/asm-generic/bug.h b/include/asm-generic/bug.h index 1a0e1a7684bd..2632328d8646 100644 --- a/include/asm-generic/bug.h +++ b/include/asm-generic/bug.h @@ -32,11 +32,11 @@ struct bug_entry { #endif #ifndef __WARN -#define __WARN() do { \ - printk("WARNING: at %s:%d %s()\n", __FILE__, \ - __LINE__, __FUNCTION__); \ - dump_stack(); \ -} while (0) +#ifndef __ASSEMBLY__ +extern void warn_on_slowpath(const char *file, const int line); +#define WANT_WARN_ON_SLOWPATH +#endif +#define __WARN() warn_on_slowpath(__FILE__, __LINE__) #endif #ifndef WARN_ON diff --git a/kernel/panic.c b/kernel/panic.c index da4d6bac270e..0ebea438278a 100644 --- a/kernel/panic.c +++ b/kernel/panic.c @@ -20,6 +20,7 @@ #include #include #include +#include <linux/kallsyms.h> int panic_on_oops; int tainted; @@ -292,6 +293,20 @@ void oops_exit(void) (unsigned long long)oops_id); } +#ifdef WANT_WARN_ON_SLOWPATH +void warn_on_slowpath(const char *file, int line) +{ + char function[KSYM_SYMBOL_LEN]; + unsigned long caller = (unsigned long) __builtin_return_address(0); + + sprint_symbol(function, caller); + printk(KERN_WARNING "WARNING: at %s:%d %s()\n", file, + line, function); + dump_stack(); +} +EXPORT_SYMBOL(warn_on_slowpath); +#endif + #ifdef CONFIG_CC_STACKPROTECTOR /* * Called when gcc's -fstack-protector feature is used, and -- cgit v1.2.3 From 71c339116a216b181fc5e203ef51a033fe5e38cf Mon Sep 17 00:00:00 2001 From: Arjan van de Ven Date: Wed, 30 Jan 2008 13:32:50 +0100 Subject: debug: add the end-of-trace marker and the module list to WARN_ON() Unlike oopses, WARN_ON() currently doesn't print the loaded modules list. This makes it harder to take action on certain bug reports. For example, recently there were a set of WARN_ON()s reported in the mac80211 stack, which were just signalling a driver bug. It then takes another round trip to the bug reporter (if he responds at all) to find out which driver is at fault. Another issue is that, unlike oopses, WARN_ON() doesn't currently printk the helpful "cut here" line, nor the "end of trace" marker. Now that WARN_ON() is out of line, the size increase due to this is minimal and it's worth adding. Signed-off-by: Arjan van de Ven Signed-off-by: Ingo Molnar Signed-off-by: Thomas Gleixner --- kernel/panic.c | 16 ++++++++++++---- 1 file changed, 12 insertions(+), 4 deletions(-) (limited to 'kernel') diff --git a/kernel/panic.c b/kernel/panic.c index 0ebea438278a..d9e90cfe3298 100644 --- a/kernel/panic.c +++ b/kernel/panic.c @@ -281,6 +281,13 @@ static int init_oops_id(void) } late_initcall(init_oops_id); +static void print_oops_end_marker(void) +{ + init_oops_id(); + printk(KERN_WARNING "---[ end trace %016llx ]---\n", + (unsigned long long)oops_id); +} + /* * Called when the architecture exits its oops handler, after printing * everything.
@@ -288,9 +295,7 @@ late_initcall(init_oops_id); void oops_exit(void) { do_oops_enter_exit(); - init_oops_id(); - printk(KERN_WARNING "---[ end trace %016llx ]---\n", - (unsigned long long)oops_id); + print_oops_end_marker(); } #ifdef WANT_WARN_ON_SLOWPATH @@ -298,11 +303,14 @@ void warn_on_slowpath(const char *file, int line) { char function[KSYM_SYMBOL_LEN]; unsigned long caller = (unsigned long) __builtin_return_address(0); - sprint_symbol(function, caller); + + printk(KERN_WARNING "------------[ cut here ]------------\n"); printk(KERN_WARNING "WARNING: at %s:%d %s()\n", file, line, function); + print_modules(); dump_stack(); + print_oops_end_marker(); } EXPORT_SYMBOL(warn_on_slowpath); #endif -- cgit v1.2.3 From 8c1c9356429741a82ff176d0f3400fb9e06b2a30 Mon Sep 17 00:00:00 2001 From: Ananth N Mavinakayanahalli Date: Wed, 30 Jan 2008 13:32:53 +0100 Subject: x86: kprobes: add kprobes smoke tests that run on boot Here is a quick and naive smoke test for kprobes. This is intended to just verify if some unrelated change broke the *probes subsystem. It is self contained, architecture agnostic and isn't of any great use by itself. This needs to be built in the kernel and runs a basic set of tests to verify if kprobes, jprobes and kretprobes run fine on the kernel. In case of an error, it'll print out a message with a "BUG" prefix. This is a start; we intend to add more tests to this bucket over time. Thanks to Jim Keniston and Masami Hiramatsu for comments and suggestions. Tested on x86 (32/64) and powerpc. Signed-off-by: Ananth N Mavinakayanahalli Acked-by: Masami Hiramatsu Signed-off-by: Thomas Gleixner Signed-off-by: Ingo Molnar --- include/linux/kprobes.h | 10 +++ kernel/Makefile | 1 + kernel/kprobes.c | 2 + kernel/test_kprobes.c | 216 ++++++++++++++++++++++++++++++++++++++++++++++++ lib/Kconfig.debug | 12 +++ 5 files changed, 241 insertions(+) create mode 100644 kernel/test_kprobes.c (limited to 'kernel') diff --git a/include/linux/kprobes.h b/include/linux/kprobes.h index 81891581e89b..6168c0a44172 100644 --- a/include/linux/kprobes.h +++ b/include/linux/kprobes.h @@ -182,6 +182,15 @@ static inline void kretprobe_assert(struct kretprobe_instance *ri, } } +#ifdef CONFIG_KPROBES_SANITY_TEST +extern int init_test_probes(void); +#else +static inline int init_test_probes(void) +{ + return 0; +} +#endif /* CONFIG_KPROBES_SANITY_TEST */ + extern spinlock_t kretprobe_lock; extern struct mutex kprobe_mutex; extern int arch_prepare_kprobe(struct kprobe *p); @@ -227,6 +236,7 @@ void unregister_kretprobe(struct kretprobe *rp); void kprobe_flush_task(struct task_struct *tk); void recycle_rp_inst(struct kretprobe_instance *ri, struct hlist_head *head); + #else /* CONFIG_KPROBES */ #define __kprobes /**/ diff --git a/kernel/Makefile b/kernel/Makefile index 390d42146267..62015c3d8d91 100644 --- a/kernel/Makefile +++ b/kernel/Makefile @@ -43,6 +43,7 @@ obj-$(CONFIG_CPUSETS) += cpuset.o obj-$(CONFIG_CGROUP_NS) += ns_cgroup.o obj-$(CONFIG_IKCONFIG) += configs.o obj-$(CONFIG_STOP_MACHINE) += stop_machine.o +obj-$(CONFIG_KPROBES_SANITY_TEST) += test_kprobes.o obj-$(CONFIG_AUDIT) += audit.o auditfilter.o obj-$(CONFIG_AUDITSYSCALL) += auditsc.o obj-$(CONFIG_AUDIT_TREE) += audit_tree.o diff --git a/kernel/kprobes.c b/kernel/kprobes.c index e3a5d817ac9b..d0493eafea3e 100644 --- a/kernel/kprobes.c +++ b/kernel/kprobes.c @@ -824,6 +824,8 @@ static int __init init_kprobes(void) if (!err) err = register_die_notifier(&kprobe_exceptions_nb); + if (!err) + init_test_probes(); return err; } diff --git 
a/kernel/test_kprobes.c b/kernel/test_kprobes.c new file mode 100644 index 000000000000..88cdb109e13c --- /dev/null +++ b/kernel/test_kprobes.c @@ -0,0 +1,216 @@ +/* + * test_kprobes.c - simple sanity test for *probes + * + * Copyright IBM Corp. 2008 + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation; either version 2 of the License, or + * (at your option) any later version. + * + * This program is distributed in the hope that it would be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See + * the GNU General Public License for more details. + */ + +#include +#include +#include + +#define div_factor 3 + +static u32 rand1, preh_val, posth_val, jph_val; +static int errors, handler_errors, num_tests; + +static noinline u32 kprobe_target(u32 value) +{ + /* + * gcc ignores noinline on some architectures unless we stuff + * sufficient lard into the function. The get_kprobe() here is + * just for that. + * + * NOTE: We aren't concerned about the correctness of get_kprobe() + * here; hence, this call is neither under !preempt nor with the + * kprobe_mutex held. This is fine(tm) + */ + if (get_kprobe((void *)0xdeadbeef)) + printk(KERN_INFO "Kprobe smoke test: probe on 0xdeadbeef!\n"); + + return (value / div_factor); +} + +static int kp_pre_handler(struct kprobe *p, struct pt_regs *regs) +{ + preh_val = (rand1 / div_factor); + return 0; +} + +static void kp_post_handler(struct kprobe *p, struct pt_regs *regs, + unsigned long flags) +{ + if (preh_val != (rand1 / div_factor)) { + handler_errors++; + printk(KERN_ERR "Kprobe smoke test failed: " + "incorrect value in post_handler\n"); + } + posth_val = preh_val + div_factor; +} + +static struct kprobe kp = { + .symbol_name = "kprobe_target", + .pre_handler = kp_pre_handler, + .post_handler = kp_post_handler +}; + +static int test_kprobe(void) +{ + int ret; + + ret = register_kprobe(&kp); + if (ret < 0) { + printk(KERN_ERR "Kprobe smoke test failed: " + "register_kprobe returned %d\n", ret); + return ret; + } + + ret = kprobe_target(rand1); + unregister_kprobe(&kp); + + if (preh_val == 0) { + printk(KERN_ERR "Kprobe smoke test failed: " + "kprobe pre_handler not called\n"); + handler_errors++; + } + + if (posth_val == 0) { + printk(KERN_ERR "Kprobe smoke test failed: " + "kprobe post_handler not called\n"); + handler_errors++; + } + + return 0; +} + +static u32 j_kprobe_target(u32 value) +{ + if (value != rand1) { + handler_errors++; + printk(KERN_ERR "Kprobe smoke test failed: " + "incorrect value in jprobe handler\n"); + } + + jph_val = rand1; + jprobe_return(); + return 0; +} + +static struct jprobe jp = { + .entry = j_kprobe_target, + .kp.symbol_name = "kprobe_target" +}; + +static int test_jprobe(void) +{ + int ret; + + ret = register_jprobe(&jp); + if (ret < 0) { + printk(KERN_ERR "Kprobe smoke test failed: " + "register_jprobe returned %d\n", ret); + return ret; + } + + ret = kprobe_target(rand1); + unregister_jprobe(&jp); + if (jph_val == 0) { + printk(KERN_ERR "Kprobe smoke test failed: " + "jprobe handler not called\n"); + handler_errors++; + } + + return 0; +} + +#ifdef CONFIG_KRETPROBES +static u32 krph_val; + +static int return_handler(struct kretprobe_instance *ri, struct pt_regs *regs) +{ + unsigned long ret = regs_return_value(regs); + + if (ret != (rand1 / div_factor)) { + handler_errors++; + printk(KERN_ERR "Kprobe smoke 
test failed: " + "incorrect value in kretprobe handler\n"); + } + + krph_val = (rand1 / div_factor); + return 0; +} + +static struct kretprobe rp = { + .handler = return_handler, + .kp.symbol_name = "kprobe_target" +}; + +static int test_kretprobe(void) +{ + int ret; + + ret = register_kretprobe(&rp); + if (ret < 0) { + printk(KERN_ERR "Kprobe smoke test failed: " + "register_kretprobe returned %d\n", ret); + return ret; + } + + ret = kprobe_target(rand1); + unregister_kretprobe(&rp); + if (krph_val == 0) { + printk(KERN_ERR "Kprobe smoke test failed: " + "kretprobe handler not called\n"); + handler_errors++; + } + + return 0; +} +#endif /* CONFIG_KRETPROBES */ + +int init_test_probes(void) +{ + int ret; + + do { + rand1 = random32(); + } while (rand1 <= div_factor); + + printk(KERN_INFO "Kprobe smoke test started\n"); + num_tests++; + ret = test_kprobe(); + if (ret < 0) + errors++; + + num_tests++; + ret = test_jprobe(); + if (ret < 0) + errors++; + +#ifdef CONFIG_KRETPROBES + num_tests++; + ret = test_kretprobe(); + if (ret < 0) + errors++; +#endif /* CONFIG_KRETPROBES */ + + if (errors) + printk(KERN_ERR "BUG: Kprobe smoke test: %d out of " + "%d tests failed\n", errors, num_tests); + else if (handler_errors) + printk(KERN_ERR "BUG: Kprobe smoke test: %d error(s) " + "running handlers\n", handler_errors); + else + printk(KERN_INFO "Kprobe smoke test passed successfully\n"); + + return 0; +} diff --git a/lib/Kconfig.debug b/lib/Kconfig.debug index c4ecb2994ba3..f535b9b5eb00 100644 --- a/lib/Kconfig.debug +++ b/lib/Kconfig.debug @@ -494,6 +494,18 @@ config RCU_TORTURE_TEST Say M if you want the RCU torture tests to build as a module. Say N if you are unsure. +config KPROBES_SANITY_TEST + bool "Kprobes sanity tests" + depends on DEBUG_KERNEL + depends on KPROBES + default n + help + This option provides for testing basic kprobes functionality on + boot. A sample kprobe, jprobe and kretprobe are inserted and + verified for functionality. + + Say N if you are unsure. + config LKDTM tristate "Linux Kernel Dump Test Tool Module" depends on DEBUG_KERNEL -- cgit v1.2.3 From 076f9776f5d8d131b36955db8641aba3893c2c1b Mon Sep 17 00:00:00 2001 From: Ingo Molnar Date: Wed, 30 Jan 2008 13:33:06 +0100 Subject: x86: make early printk selectable on 64-bit as well Enable CONFIG_EMBEDDED to select CONFIG_EARLY_PRINTK on 64-bit as well. 
saves ~2K: text data bss dec hex filename 7290283 3672091 1907848 12870222 c4624e vmlinux.before 7288373 3671795 1907848 12868016 c459b0 vmlinux.after Signed-off-by: Ingo Molnar Signed-off-by: Thomas Gleixner --- arch/x86/Kconfig.debug | 2 +- arch/x86/kernel/head64.c | 7 ++++++- arch/x86/kernel/head_64.S | 7 +++++++ include/asm-x86/kdebug.h | 1 - include/linux/kernel.h | 3 +++ kernel/printk.c | 7 +++++++ 6 files changed, 24 insertions(+), 3 deletions(-) (limited to 'kernel') diff --git a/arch/x86/Kconfig.debug b/arch/x86/Kconfig.debug index 15854b53badc..88420af98140 100644 --- a/arch/x86/Kconfig.debug +++ b/arch/x86/Kconfig.debug @@ -6,7 +6,7 @@ config TRACE_IRQFLAGS_SUPPORT source "lib/Kconfig.debug" config EARLY_PRINTK - bool "Early printk" if EMBEDDED && DEBUG_KERNEL && X86_32 + bool "Early printk" if EMBEDDED default y help Write kernel log output directly into the VGA buffer or to a serial diff --git a/arch/x86/kernel/head64.c b/arch/x86/kernel/head64.c index 85c1e6bf8022..87e031d4abf1 100644 --- a/arch/x86/kernel/head64.c +++ b/arch/x86/kernel/head64.c @@ -58,8 +58,13 @@ void __init x86_64_start_kernel(char * real_mode_data) /* Make NULL pointers segfault */ zap_identity_mappings(); - for (i = 0; i < IDT_ENTRIES; i++) + for (i = 0; i < IDT_ENTRIES; i++) { +#ifdef CONFIG_EARLY_PRINTK set_intr_gate(i, &early_idt_handlers[i]); +#else + set_intr_gate(i, early_idt_handler); +#endif + } load_idt((const struct desc_ptr *)&idt_descr); early_printk("Kernel alive\n"); diff --git a/arch/x86/kernel/head_64.S b/arch/x86/kernel/head_64.S index 8b4c35cb519a..1d5a7a361200 100644 --- a/arch/x86/kernel/head_64.S +++ b/arch/x86/kernel/head_64.S @@ -267,6 +267,7 @@ init_rsp: bad_address: jmp bad_address +#ifdef CONFIG_EARLY_PRINTK .macro early_idt_tramp first, last .ifgt \last-\first early_idt_tramp \first, \last-1 @@ -281,8 +282,10 @@ early_idt_handlers: early_idt_tramp 64, 127 early_idt_tramp 128, 191 early_idt_tramp 192, 255 +#endif ENTRY(early_idt_handler) +#ifdef CONFIG_EARLY_PRINTK cmpl $2,early_recursion_flag(%rip) jz 1f incl early_recursion_flag(%rip) @@ -311,8 +314,11 @@ ENTRY(early_idt_handler) movq 8(%rsp),%rsi # get rip again call __print_symbol #endif +#endif /* EARLY_PRINTK */ 1: hlt jmp 1b + +#ifdef CONFIG_EARLY_PRINTK early_recursion_flag: .long 0 @@ -320,6 +326,7 @@ early_idt_msg: .asciz "PANIC: early exception %02lx rip %lx:%lx error %lx cr2 %lx\n" early_idt_ripmsg: .asciz "RIP %s\n" +#endif /* CONFIG_EARLY_PRINTK */ .balign PAGE_SIZE diff --git a/include/asm-x86/kdebug.h b/include/asm-x86/kdebug.h index a5e5e3b7eb23..e9f42d1ac38f 100644 --- a/include/asm-x86/kdebug.h +++ b/include/asm-x86/kdebug.h @@ -22,7 +22,6 @@ enum die_val { DIE_PAGE_FAULT, }; -extern void early_printk(const char *fmt, ...) 
__attribute__((format(printf,1,2))); extern void printk_address(unsigned long address); extern void die(const char *,struct pt_regs *,long); extern int __must_check __die(const char *, struct pt_regs *, long); diff --git a/include/linux/kernel.h b/include/linux/kernel.h index a7283c9beadf..ff356b2ee478 100644 --- a/include/linux/kernel.h +++ b/include/linux/kernel.h @@ -194,6 +194,9 @@ static inline int log_buf_read(int idx) { return 0; } static inline int log_buf_copy(char *dest, int idx, int len) { return 0; } #endif +extern void __attribute__((format(printf, 1, 2))) + early_printk(const char *fmt, ...); + unsigned long int_sqrt(unsigned long); extern int printk_ratelimit(void); diff --git a/kernel/printk.c b/kernel/printk.c index 3b7c968d0ef9..58bbec684119 100644 --- a/kernel/printk.c +++ b/kernel/printk.c @@ -36,6 +36,13 @@ #include +/* + * Architectures can override it: + */ +void __attribute__((weak)) early_printk(const char *fmt, ...) +{ +} + #define __LOG_BUF_LEN (1 << CONFIG_LOG_BUF_SHIFT) /* printk's without a loglevel use this.. */ -- cgit v1.2.3 From 6dab27784b2a97823b522e1cb88e40be40a93d45 Mon Sep 17 00:00:00 2001 From: Arjan van de Ven Date: Wed, 30 Jan 2008 13:33:08 +0100 Subject: x86: add a simple backtrace test module During the work on the x86 32 and 64 bit backtrace code I found it useful to have a simple test module to test a process and irq context backtrace. Since the existing backtrace code was buggy, I figure it might be useful to have such a test module in the kernel so that maybe we can even detect such bugs earlier.. [ mingo@elte.hu: build fix ] Signed-off-by: Arjan van de Ven Signed-off-by: Ingo Molnar Signed-off-by: Thomas Gleixner --- kernel/Makefile | 1 + kernel/backtracetest.c | 48 ++++++++++++++++++++++++++++++++++++++++++++++++ lib/Kconfig.debug | 12 ++++++++++++ 3 files changed, 61 insertions(+) create mode 100644 kernel/backtracetest.c (limited to 'kernel') diff --git a/kernel/Makefile b/kernel/Makefile index 62015c3d8d91..8885627ea021 100644 --- a/kernel/Makefile +++ b/kernel/Makefile @@ -36,6 +36,7 @@ obj-$(CONFIG_KALLSYMS) += kallsyms.o obj-$(CONFIG_PM) += power/ obj-$(CONFIG_BSD_PROCESS_ACCT) += acct.o obj-$(CONFIG_KEXEC) += kexec.o +obj-$(CONFIG_BACKTRACE_SELF_TEST) += backtracetest.o obj-$(CONFIG_COMPAT) += compat.o obj-$(CONFIG_CGROUPS) += cgroup.o obj-$(CONFIG_CGROUP_DEBUG) += cgroup_debug.o diff --git a/kernel/backtracetest.c b/kernel/backtracetest.c new file mode 100644 index 000000000000..d1a7605c5b8f --- /dev/null +++ b/kernel/backtracetest.c @@ -0,0 +1,48 @@ +/* + * Simple stack backtrace regression test module + * + * (C) Copyright 2008 Intel Corporation + * Author: Arjan van de Ven + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * as published by the Free Software Foundation; version 2 + * of the License. 
+ */ + +#include +#include +#include + +static struct timer_list backtrace_timer; + +static void backtrace_test_timer(unsigned long data) +{ + printk("Testing a backtrace from irq context.\n"); + printk("The following trace is a kernel self test and not a bug!\n"); + dump_stack(); +} +static int backtrace_regression_test(void) +{ + printk("====[ backtrace testing ]===========\n"); + printk("Testing a backtrace from process context.\n"); + printk("The following trace is a kernel self test and not a bug!\n"); + dump_stack(); + + init_timer(&backtrace_timer); + backtrace_timer.function = backtrace_test_timer; + mod_timer(&backtrace_timer, jiffies + 10); + + msleep(10); + printk("====[ end of backtrace testing ]====\n"); + return 0; +} + +static void exitf(void) +{ +} + +module_init(backtrace_regression_test); +module_exit(exitf); +MODULE_LICENSE("GPL"); +MODULE_AUTHOR("Arjan van de Ven "); diff --git a/lib/Kconfig.debug b/lib/Kconfig.debug index f535b9b5eb00..aa56e631580d 100644 --- a/lib/Kconfig.debug +++ b/lib/Kconfig.debug @@ -506,6 +506,18 @@ config KPROBES_SANITY_TEST Say N if you are unsure. +config BACKTRACE_SELF_TEST + tristate "Self test for the backtrace code" + depends on DEBUG_KERNEL + default n + help + This option provides a kernel module that can be used to test + the kernel stack backtrace code. This option is not useful + for distributions or general kernels, but only for kernel + developers working on architecture code. + + Say N if you are unsure. + config LKDTM tristate "Linux Kernel Dump Test Tool Module" depends on DEBUG_KERNEL -- cgit v1.2.3 From 70edcd77a0d6d0f8731c826764f5eb6732f521e9 Mon Sep 17 00:00:00 2001 From: Ingo Molnar Date: Wed, 30 Jan 2008 13:33:24 +0100 Subject: genirq: stackdump after the "Trying to free already-free IRQ" message these bugs are harder to find than they seem, a stackdump helps. make it dependent on CONFIG_DEBUG_SHIRQ so that people can turn it off if it annoys them. 
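The typical offender is a driver that ends up releasing the same interrupt twice, for instance once in an error path and once more in its regular teardown; a contrived sketch (hypothetical driver code, not taken from this patch):

#include <linux/interrupt.h>

static unsigned int example_irq;	/* assumed to have been set up with request_irq() */
static void *example_cookie;		/* dev_id that was passed to request_irq() */

static void example_teardown(void)
{
	free_irq(example_irq, example_cookie);	/* first, legitimate free */
	/* ... an error path later re-runs the same cleanup ... */
	free_irq(example_irq, example_cookie);	/* already free: prints the message above */
}

With this change the second call also dumps a stack trace (when CONFIG_DEBUG_SHIRQ is enabled), which points straight at the duplicated cleanup path instead of leaving only the one-line message to go on.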
Signed-off-by: Ingo Molnar Signed-off-by: Thomas Gleixner --- kernel/irq/manage.c | 3 +++ 1 file changed, 3 insertions(+) (limited to 'kernel') diff --git a/kernel/irq/manage.c b/kernel/irq/manage.c index 1f314221d534..438a01464287 100644 --- a/kernel/irq/manage.c +++ b/kernel/irq/manage.c @@ -479,6 +479,9 @@ void free_irq(unsigned int irq, void *dev_id) return; } printk(KERN_ERR "Trying to free already-free IRQ %d\n", irq); +#ifdef CONFIG_DEBUG_SHIRQ + dump_stack(); +#endif spin_unlock_irqrestore(&desc->lock, flags); return; } -- cgit v1.2.3 From dd5af90a7f3d79e04b7eace9a98644dbf2038f4d Mon Sep 17 00:00:00 2001 From: Mike Travis Date: Wed, 30 Jan 2008 13:33:32 +0100 Subject: x86/non-x86: percpu, node ids, apic ids x86.git fixup Signed-off-by: Ingo Molnar Signed-off-by: Thomas Gleixner --- arch/x86/Kconfig | 2 +- include/asm-generic/percpu.h | 12 ++---------- init/main.c | 4 ++-- kernel/module.c | 8 ++++++++ 4 files changed, 13 insertions(+), 13 deletions(-) (limited to 'kernel') diff --git a/arch/x86/Kconfig b/arch/x86/Kconfig index f0887d12a5bb..8e1b33c5405f 100644 --- a/arch/x86/Kconfig +++ b/arch/x86/Kconfig @@ -97,7 +97,7 @@ config GENERIC_TIME_VSYSCALL bool default X86_64 -config ARCH_SETS_UP_PER_CPU_AREA +config HAVE_SETUP_PER_CPU_AREA def_bool X86_64 config ARCH_SUPPORTS_OPROFILE diff --git a/include/asm-generic/percpu.h b/include/asm-generic/percpu.h index c41b1a731129..4b8d31cda1a0 100644 --- a/include/asm-generic/percpu.h +++ b/include/asm-generic/percpu.h @@ -47,7 +47,7 @@ extern unsigned long __per_cpu_offset[NR_CPUS]; #endif /* - * A percpu variable may point to a discarded reghions. The following are + * A percpu variable may point to a discarded regions. The following are * established ways to produce a usable pointer from the percpu variable * offset. */ @@ -59,18 +59,10 @@ extern unsigned long __per_cpu_offset[NR_CPUS]; (*SHIFT_PERCPU_PTR(&per_cpu_var(var), __my_cpu_offset)) -#ifdef CONFIG_ARCH_SETS_UP_PER_CPU_AREA +#ifdef CONFIG_HAVE_SETUP_PER_CPU_AREA extern void setup_per_cpu_areas(void); #endif -/* A macro to avoid #include hell... */ -#define percpu_modcopy(pcpudst, src, size) \ -do { \ - unsigned int __i; \ - for_each_possible_cpu(__i) \ - memcpy((pcpudst)+per_cpu_offset(__i), \ - (src), (size)); \ -} while (0) #else /* ! SMP */ #define per_cpu(var, cpu) (*((void)(cpu), &per_cpu_var(var))) diff --git a/init/main.c b/init/main.c index 5843fe996703..3316dffe3e57 100644 --- a/init/main.c +++ b/init/main.c @@ -363,7 +363,7 @@ static inline void smp_prepare_cpus(unsigned int maxcpus) { } #else -#ifndef CONFIG_ARCH_SETS_UP_PER_CPU_AREA +#ifndef CONFIG_HAVE_SETUP_PER_CPU_AREA unsigned long __per_cpu_offset[NR_CPUS] __read_mostly; EXPORT_SYMBOL(__per_cpu_offset); @@ -384,7 +384,7 @@ static void __init setup_per_cpu_areas(void) ptr += size; } } -#endif /* CONFIG_ARCH_SETS_UP_CPU_AREA */ +#endif /* CONFIG_HAVE_SETUP_PER_CPU_AREA */ /* Called by boot processor to activate the rest. 
*/ static void __init smp_init(void) diff --git a/kernel/module.c b/kernel/module.c index f6a4e721fd49..bd60278ee703 100644 --- a/kernel/module.c +++ b/kernel/module.c @@ -430,6 +430,14 @@ static unsigned int find_pcpusec(Elf_Ehdr *hdr, return find_sec(hdr, sechdrs, secstrings, ".data.percpu"); } +static void percpu_modcopy(void *pcpudest, const void *from, unsigned long size) +{ + int cpu; + + for_each_possible_cpu(cpu) + memcpy(pcpudest + per_cpu_offset(cpu), from, size); +} + static int percpu_modinit(void) { pcpu_num_used = 2; -- cgit v1.2.3 From 6d4e4c4fca5be806b888d606894d914847e82d78 Mon Sep 17 00:00:00 2001 From: Avi Kivity Date: Wed, 21 Nov 2007 16:41:05 +0200 Subject: KVM: Disallow fork() and similar games when using a VM We don't want the meaning of guest userspace changing under our feet. Signed-off-by: Avi Kivity --- drivers/kvm/kvm.h | 1 + drivers/kvm/kvm_main.c | 9 +++++++++ kernel/fork.c | 1 + 3 files changed, 11 insertions(+) (limited to 'kernel') diff --git a/drivers/kvm/kvm.h b/drivers/kvm/kvm.h index 1fd8158ced89..be18620bd656 100644 --- a/drivers/kvm/kvm.h +++ b/drivers/kvm/kvm.h @@ -305,6 +305,7 @@ struct kvm_vm_stat { struct kvm { struct mutex lock; /* protects everything except vcpus */ + struct mm_struct *mm; /* userspace tied to this vm */ int naliases; struct kvm_mem_alias aliases[KVM_ALIAS_SLOTS]; int nmemslots; diff --git a/drivers/kvm/kvm_main.c b/drivers/kvm/kvm_main.c index aec6b67cfebb..0efd759e585f 100644 --- a/drivers/kvm/kvm_main.c +++ b/drivers/kvm/kvm_main.c @@ -165,6 +165,8 @@ static struct kvm *kvm_create_vm(void) if (IS_ERR(kvm)) goto out; + kvm->mm = current->mm; + atomic_inc(&kvm->mm->mm_count); kvm_io_bus_init(&kvm->pio_bus); mutex_init(&kvm->lock); kvm_io_bus_init(&kvm->mmio_bus); @@ -202,12 +204,15 @@ void kvm_free_physmem(struct kvm *kvm) static void kvm_destroy_vm(struct kvm *kvm) { + struct mm_struct *mm = kvm->mm; + spin_lock(&kvm_lock); list_del(&kvm->vm_list); spin_unlock(&kvm_lock); kvm_io_bus_destroy(&kvm->pio_bus); kvm_io_bus_destroy(&kvm->mmio_bus); kvm_arch_destroy_vm(kvm); + mmdrop(mm); } static int kvm_vm_release(struct inode *inode, struct file *filp) @@ -818,6 +823,8 @@ static long kvm_vcpu_ioctl(struct file *filp, void __user *argp = (void __user *)arg; int r; + if (vcpu->kvm->mm != current->mm) + return -EIO; switch (ioctl) { case KVM_RUN: r = -EINVAL; @@ -976,6 +983,8 @@ static long kvm_vm_ioctl(struct file *filp, void __user *argp = (void __user *)arg; int r; + if (kvm->mm != current->mm) + return -EIO; switch (ioctl) { case KVM_CREATE_VCPU: r = kvm_vm_ioctl_create_vcpu(kvm, arg); diff --git a/kernel/fork.c b/kernel/fork.c index 314f5101d2b0..05e0b6f4365b 100644 --- a/kernel/fork.c +++ b/kernel/fork.c @@ -393,6 +393,7 @@ void fastcall __mmdrop(struct mm_struct *mm) destroy_context(mm); free_mm(mm); } +EXPORT_SYMBOL_GPL(__mmdrop); /* * Decrement the use count and release all resources for an mm. -- cgit v1.2.3
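The effect of the mm check above is easy to see from userspace; a minimal sketch (standard KVM ioctls, minimal error handling, assumes /dev/kvm is present): after fork() the child shares the VM file descriptor but runs on a copied mm, so VM and vcpu ioctls issued from the child are now rejected with EIO instead of silently acting on the wrong address space.

#include <fcntl.h>
#include <stdio.h>
#include <sys/ioctl.h>
#include <unistd.h>
#include <linux/kvm.h>

int main(void)
{
	int kvm = open("/dev/kvm", O_RDWR);
	int vm = ioctl(kvm, KVM_CREATE_VM, 0);	/* the VM is tied to this process's mm */

	if (fork() == 0) {
		/* Child: same fd, different mm -> kvm_vm_ioctl() now returns -EIO. */
		if (ioctl(vm, KVM_CREATE_VCPU, 0) < 0)
			perror("KVM_CREATE_VCPU in child");	/* expected: EIO */
		_exit(0);
	}

	/* Parent: still the creating mm, so the request goes through as before. */
	if (ioctl(vm, KVM_CREATE_VCPU, 0) < 0)
		perror("KVM_CREATE_VCPU in parent");
	return 0;
}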