From b36e4758dc1b9ff1f6d97e951edba22366230d11 Mon Sep 17 00:00:00 2001
From: Russell King <rmk@dyn-67.arm.linux.org.uk>
Date: Sun, 27 Aug 2006 12:26:34 +0100
Subject: [ARM] Fix kernel/fork.c for lockdep on ARM

ARM has interrupts enabled over context switches (iow, has
__ARCH_WANT_INTERRUPTS_ON_CTXSW defined.)  The lockdep code in fork.c
 assumes that interrupts are always disabled.  Fix this wrong
assumption by making the initialisation of 'p->hardirqs_enabled'
depend on __ARCH_WANT_INTERRUPTS_ON_CTXSW.

Acked-by: Ingo Molnar <mingo@elte.hu>
Signed-off-by: Russell King <rmk+kernel@arm.linux.org.uk>
---
 kernel/fork.c | 4 ++++
 1 file changed, 4 insertions(+)

(limited to 'kernel')

diff --git a/kernel/fork.c b/kernel/fork.c
index f9b014e3e700..8f76adf1c6a6 100644
--- a/kernel/fork.c
+++ b/kernel/fork.c
@@ -1056,7 +1056,11 @@ static struct task_struct *copy_process(unsigned long clone_flags,
 #endif
 #ifdef CONFIG_TRACE_IRQFLAGS
 	p->irq_events = 0;
+#ifdef __ARCH_WANT_INTERRUPTS_ON_CTXSW
+	p->hardirqs_enabled = 1;
+#else
 	p->hardirqs_enabled = 0;
+#endif
 	p->hardirq_enable_ip = 0;
 	p->hardirq_enable_event = 0;
 	p->hardirq_disable_ip = _THIS_IP_;
-- 
cgit v1.2.3


From fe4944e59c357f945f81bc67edb7ed1392e875ad Mon Sep 17 00:00:00 2001
From: Thomas Graf <tgraf@suug.ch>
Date: Fri, 4 Aug 2006 23:03:05 -0700
Subject: [NETLINK]: Extend netlink messaging interface

Adds:
 nlmsg_get_pos()                 return current position in message
 nlmsg_trim()                    trim part of message
 nla_reserve_nohdr(skb, len)     reserve room for an attribute w/o hdr
 nla_put_nohdr(skb, len, data)   add attribute w/o hdr
 nla_find_nested()               find attribute in nested attributes

Fixes nlmsg_new() to take allocation flags and consider size.

Signed-off-by: Thomas Graf <tgraf@suug.ch>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/net/netlink.h   | 74 +++++++++++++++++++++++++++++++++++++++++-------
 kernel/taskstats.c      |  2 +-
 net/netlink/attr.c      | 75 +++++++++++++++++++++++++++++++++++++++++++++++++
 net/netlink/genetlink.c |  2 +-
 4 files changed, 141 insertions(+), 12 deletions(-)

(limited to 'kernel')

diff --git a/include/net/netlink.h b/include/net/netlink.h
index 640c26a90cf1..3a5e40b1e045 100644
--- a/include/net/netlink.h
+++ b/include/net/netlink.h
@@ -35,6 +35,8 @@
  *   nlmsg_put()			add a netlink message to an skb
  *   nlmsg_put_answer()			callback based nlmsg_put()
  *   nlmsg_end()			finanlize netlink message
+ *   nlmsg_get_pos()			return current position in message
+ *   nlmsg_trim()			trim part of message
  *   nlmsg_cancel()			cancel message construction
  *   nlmsg_free()			free a netlink message
  *
@@ -80,8 +82,10 @@
  *   struct nlattr			netlink attribtue header
  *
  * Attribute Construction:
- *   nla_reserve(skb, type, len)	reserve skb tailroom for an attribute
+ *   nla_reserve(skb, type, len)	reserve room for an attribute
+ *   nla_reserve_nohdr(skb, len)	reserve room for an attribute w/o hdr
  *   nla_put(skb, type, len, data)	add attribute to skb
+ *   nla_put_nohdr(skb, len, data)	add attribute w/o hdr
  *
  * Attribute Construction for Basic Types:
  *   nla_put_u8(skb, type, value)	add u8 attribute to skb
@@ -139,6 +143,7 @@
  *   nla_next(nla, remaining)		get next netlink attribute
  *   nla_validate()			validate a stream of attributes
  *   nla_find()				find attribute in stream of attributes
+ *   nla_find_nested()			find attribute in nested attributes
  *   nla_parse()			parse and validate stream of attrs
  *   nla_parse_nested()			parse nested attribuets
  *   nla_for_each_attr()		loop over all attributes
@@ -203,12 +208,18 @@ extern int		nla_memcmp(const struct nlattr *nla, const void *data,
 extern int		nla_strcmp(const struct nlattr *nla, const char *str);
 extern struct nlattr *	__nla_reserve(struct sk_buff *skb, int attrtype,
 				      int attrlen);
+extern void *		__nla_reserve_nohdr(struct sk_buff *skb, int attrlen);
 extern struct nlattr *	nla_reserve(struct sk_buff *skb, int attrtype,
 				    int attrlen);
+extern void *		nla_reserve_nohdr(struct sk_buff *skb, int attrlen);
 extern void		__nla_put(struct sk_buff *skb, int attrtype,
 				  int attrlen, const void *data);
+extern void		__nla_put_nohdr(struct sk_buff *skb, int attrlen,
+					const void *data);
 extern int		nla_put(struct sk_buff *skb, int attrtype,
 				int attrlen, const void *data);
+extern int		nla_put_nohdr(struct sk_buff *skb, int attrlen,
+				      const void *data);
 
 /**************************************************************************
  * Netlink Messages
@@ -453,12 +464,13 @@ static inline struct nlmsghdr *nlmsg_put_answer(struct sk_buff *skb,
 /**
  * nlmsg_new - Allocate a new netlink message
  * @size: maximum size of message
+ * @flags: the type of memory to allocate.
  *
  * Use NLMSG_GOODSIZE if size isn't know and you need a good default size.
  */
-static inline struct sk_buff *nlmsg_new(int size)
+static inline struct sk_buff *nlmsg_new(int size, gfp_t flags)
 {
-	return alloc_skb(NLMSG_GOODSIZE, GFP_KERNEL);
+	return alloc_skb(size, flags);
 }
 
 /**
@@ -479,6 +491,32 @@ static inline int nlmsg_end(struct sk_buff *skb, struct nlmsghdr *nlh)
 	return skb->len;
 }
 
+/**
+ * nlmsg_get_pos - return current position in netlink message
+ * @skb: socket buffer the message is stored in
+ *
+ * Returns a pointer to the current tail of the message.
+ */
+static inline void *nlmsg_get_pos(struct sk_buff *skb)
+{
+	return skb->tail;
+}
+
+/**
+ * nlmsg_trim - Trim message to a mark
+ * @skb: socket buffer the message is stored in
+ * @mark: mark to trim to
+ *
+ * Trims the message to the provided mark. Returns -1.
+ */
+static inline int nlmsg_trim(struct sk_buff *skb, void *mark)
+{
+	if (mark)
+		skb_trim(skb, (unsigned char *) mark - skb->data);
+
+	return -1;
+}
+
 /**
  * nlmsg_cancel - Cancel construction of a netlink message
  * @skb: socket buffer the message is stored in
@@ -489,9 +527,7 @@ static inline int nlmsg_end(struct sk_buff *skb, struct nlmsghdr *nlh)
  */
 static inline int nlmsg_cancel(struct sk_buff *skb, struct nlmsghdr *nlh)
 {
-	skb_trim(skb, (unsigned char *) nlh - skb->data);
-
-	return -1;
+	return nlmsg_trim(skb, nlh);
 }
 
 /**
@@ -630,6 +666,18 @@ static inline struct nlattr *nla_next(const struct nlattr *nla, int *remaining)
 	return (struct nlattr *) ((char *) nla + totlen);
 }
 
+/**
+ * nla_find_nested - find attribute in a set of nested attributes
+ * @nla: attribute containing the nested attributes
+ * @attrtype: type of attribute to look for
+ *
+ * Returns the first attribute which matches the specified type.
+ */
+static inline struct nlattr *nla_find_nested(struct nlattr *nla, int attrtype)
+{
+	return nla_find(nla_data(nla), nla_len(nla), attrtype);
+}
+
 /**
  * nla_parse_nested - parse nested attributes
  * @tb: destination array with maxtype+1 elements
@@ -862,10 +910,7 @@ static inline int nla_nest_end(struct sk_buff *skb, struct nlattr *start)
  */
 static inline int nla_nest_cancel(struct sk_buff *skb, struct nlattr *start)
 {
-	if (start)
-		skb_trim(skb, (unsigned char *) start - skb->data);
-
-	return -1;
+	return nlmsg_trim(skb, start);
 }
 
 /**
@@ -880,4 +925,13 @@ static inline int nla_nest_cancel(struct sk_buff *skb, struct nlattr *start)
 	     nla_ok(pos, rem); \
 	     pos = nla_next(pos, &(rem)))
 
+/**
+ * nla_for_each_nested - iterate over nested attributes
+ * @pos: loop counter, set to current attribute
+ * @nla: attribute containing the nested attributes
+ * @rem: initialized to len, holds bytes currently remaining in stream
+ */
+#define nla_for_each_nested(pos, nla, rem) \
+	nla_for_each_attr(pos, nla_data(nla), nla_len(nla), rem)
+
 #endif
diff --git a/kernel/taskstats.c b/kernel/taskstats.c
index e78187657330..2ed4040d0dc5 100644
--- a/kernel/taskstats.c
+++ b/kernel/taskstats.c
@@ -75,7 +75,7 @@ static int prepare_reply(struct genl_info *info, u8 cmd, struct sk_buff **skbp,
 	/*
 	 * If new attributes are added, please revisit this allocation
 	 */
-	skb = nlmsg_new(size);
+	skb = nlmsg_new(size, GFP_KERNEL);
 	if (!skb)
 		return -ENOMEM;
 
diff --git a/net/netlink/attr.c b/net/netlink/attr.c
index dddbd15135a8..136e529e5780 100644
--- a/net/netlink/attr.c
+++ b/net/netlink/attr.c
@@ -254,6 +254,26 @@ struct nlattr *__nla_reserve(struct sk_buff *skb, int attrtype, int attrlen)
 	return nla;
 }
 
+/**
+ * __nla_reserve_nohdr - reserve room for attribute without header
+ * @skb: socket buffer to reserve room on
+ * @attrlen: length of attribute payload
+ *
+ * Reserves room for attribute payload without a header.
+ *
+ * The caller is responsible to ensure that the skb provides enough
+ * tailroom for the payload.
+ */
+void *__nla_reserve_nohdr(struct sk_buff *skb, int attrlen)
+{
+	void *start;
+
+	start = skb_put(skb, NLA_ALIGN(attrlen));
+	memset(start, 0, NLA_ALIGN(attrlen));
+
+	return start;
+}
+
 /**
  * nla_reserve - reserve room for attribute on the skb
  * @skb: socket buffer to reserve room on
@@ -274,6 +294,24 @@ struct nlattr *nla_reserve(struct sk_buff *skb, int attrtype, int attrlen)
 	return __nla_reserve(skb, attrtype, attrlen);
 }
 
+/**
+ * nla_reserve - reserve room for attribute without header
+ * @skb: socket buffer to reserve room on
+ * @len: length of attribute payload
+ *
+ * Reserves room for attribute payload without a header.
+ *
+ * Returns NULL if the tailroom of the skb is insufficient to store
+ * the attribute payload.
+ */
+void *nla_reserve_nohdr(struct sk_buff *skb, int attrlen)
+{
+	if (unlikely(skb_tailroom(skb) < NLA_ALIGN(attrlen)))
+		return NULL;
+
+	return __nla_reserve_nohdr(skb, attrlen);
+}
+
 /**
  * __nla_put - Add a netlink attribute to a socket buffer
  * @skb: socket buffer to add attribute to
@@ -293,6 +331,22 @@ void __nla_put(struct sk_buff *skb, int attrtype, int attrlen,
 	memcpy(nla_data(nla), data, attrlen);
 }
 
+/**
+ * __nla_put_nohdr - Add a netlink attribute without header
+ * @skb: socket buffer to add attribute to
+ * @attrlen: length of attribute payload
+ * @data: head of attribute payload
+ *
+ * The caller is responsible to ensure that the skb provides enough
+ * tailroom for the attribute payload.
+ */
+void __nla_put_nohdr(struct sk_buff *skb, int attrlen, const void *data)
+{
+	void *start;
+
+	start = __nla_reserve_nohdr(skb, attrlen);
+	memcpy(start, data, attrlen);
+}
 
 /**
  * nla_put - Add a netlink attribute to a socket buffer
@@ -313,15 +367,36 @@ int nla_put(struct sk_buff *skb, int attrtype, int attrlen, const void *data)
 	return 0;
 }
 
+/**
+ * nla_put_nohdr - Add a netlink attribute without header
+ * @skb: socket buffer to add attribute to
+ * @attrlen: length of attribute payload
+ * @data: head of attribute payload
+ *
+ * Returns -1 if the tailroom of the skb is insufficient to store
+ * the attribute payload.
+ */
+int nla_put_nohdr(struct sk_buff *skb, int attrlen, const void *data)
+{
+	if (unlikely(skb_tailroom(skb) < NLA_ALIGN(attrlen)))
+		return -1;
+
+	__nla_put_nohdr(skb, attrlen, data);
+	return 0;
+}
 
 EXPORT_SYMBOL(nla_validate);
 EXPORT_SYMBOL(nla_parse);
 EXPORT_SYMBOL(nla_find);
 EXPORT_SYMBOL(nla_strlcpy);
 EXPORT_SYMBOL(__nla_reserve);
+EXPORT_SYMBOL(__nla_reserve_nohdr);
 EXPORT_SYMBOL(nla_reserve);
+EXPORT_SYMBOL(nla_reserve_nohdr);
 EXPORT_SYMBOL(__nla_put);
+EXPORT_SYMBOL(__nla_put_nohdr);
 EXPORT_SYMBOL(nla_put);
+EXPORT_SYMBOL(nla_put_nohdr);
 EXPORT_SYMBOL(nla_memcpy);
 EXPORT_SYMBOL(nla_memcmp);
 EXPORT_SYMBOL(nla_strcmp);
diff --git a/net/netlink/genetlink.c b/net/netlink/genetlink.c
index a298f77cc3e3..75bb47a898dd 100644
--- a/net/netlink/genetlink.c
+++ b/net/netlink/genetlink.c
@@ -440,7 +440,7 @@ static struct sk_buff *ctrl_build_msg(struct genl_family *family, u32 pid,
 	struct sk_buff *skb;
 	int err;
 
-	skb = nlmsg_new(NLMSG_GOODSIZE);
+	skb = nlmsg_new(NLMSG_GOODSIZE, GFP_KERNEL);
 	if (skb == NULL)
 		return ERR_PTR(-ENOBUFS);
 
-- 
cgit v1.2.3


From 1cc5f7142eca352109895fe20b1fc6405dd17727 Mon Sep 17 00:00:00 2001
From: Ed Swierk <eswierk@arastra.com>
Date: Mon, 25 Sep 2006 16:25:36 -0700
Subject: [PATCH] load_module: no BUG if module_subsys uninitialized

Invoking load_module() before param_sysfs_init() is called crashes in
mod_sysfs_setup(), since the kset in module_subsys is not initialized yet.

In my case, net-pf-1 is getting modprobed as a result of hotplug trying to
create a UNIX socket.  Calls to hotplug begin after the topology_init
initcall.

Another patch for the same symptom (module_subsys-initialize-earlier.patch)
moves param_sysfs_init() to the subsys initcalls, but this is still not
early enough in the boot process in some cases.  In particular,
topology_init() causes /sbin/hotplug to run, which requests net-pf-1 (the
UNIX socket protocol) which can be compiled as a module.  Moving
param_sysfs_init() to the postcore initcalls fixes this particular race,
but there might well be other cases where a usermodehelper causes a module
to load earlier still.

The patch makes load_module() return an error rather than crashing the
kernel if invoked before module_subsys is initialized.

Cc: Mark Huang <mlhuang@cs.princeton.edu>
Cc: Greg KH <greg@kroah.com>
Cc: <stable@kernel.org>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
---
 kernel/module.c | 6 ++++++
 1 file changed, 6 insertions(+)

(limited to 'kernel')

diff --git a/kernel/module.c b/kernel/module.c
index 2a19cd47c046..b7fe6e840963 100644
--- a/kernel/module.c
+++ b/kernel/module.c
@@ -1054,6 +1054,12 @@ static int mod_sysfs_setup(struct module *mod,
 {
 	int err;
 
+	if (!module_subsys.kset.subsys) {
+		printk(KERN_ERR "%s: module_subsys not initialized\n",
+		       mod->name);
+		err = -EINVAL;
+		goto out;
+	}
 	memset(&mod->mkobj.kobj, 0, sizeof(mod->mkobj.kobj));
 	err = kobject_set_name(&mod->mkobj.kobj, "%s", mod->name);
 	if (err)
-- 
cgit v1.2.3


From 7c8265f51073bc8632a99de78d5fd19117ed78b7 Mon Sep 17 00:00:00 2001
From: Linus Torvalds <torvalds@osdl.org>
Date: Sat, 24 Jun 2006 14:50:29 -0700
Subject: Suspend infrastructure cleanup and extension

Allow devices to participate in the suspend process more intimately,
in particular, allow the final phase (with interrupts disabled) to
also be open to normal devices, not just system devices.

Also, allow classes to participate in device suspend.

Signed-off-by: Linus Torvalds <torvalds@osdl.org>
Signed-off-by: Greg Kroah-Hartman <gregkh@suse.de>
---
 drivers/base/power/resume.c  |  28 ++++++++--
 drivers/base/power/suspend.c | 122 ++++++++++++++++++++++++++++++++-----------
 include/linux/device.h       |  11 +++-
 include/linux/pm.h           |   1 +
 kernel/power/main.c          |   4 ++
 5 files changed, 130 insertions(+), 36 deletions(-)

(limited to 'kernel')

diff --git a/drivers/base/power/resume.c b/drivers/base/power/resume.c
index 826093ef4c7e..48e3d49d7b65 100644
--- a/drivers/base/power/resume.c
+++ b/drivers/base/power/resume.c
@@ -38,13 +38,35 @@ int resume_device(struct device * dev)
 		dev_dbg(dev,"resuming\n");
 		error = dev->bus->resume(dev);
 	}
+	if (dev->class && dev->class->resume) {
+		dev_dbg(dev,"class resume\n");
+		error = dev->class->resume(dev);
+	}
 	up(&dev->sem);
 	TRACE_RESUME(error);
 	return error;
 }
 
 
+static int resume_device_early(struct device * dev)
+{
+	int error = 0;
 
+	TRACE_DEVICE(dev);
+	TRACE_RESUME(0);
+	if (dev->bus && dev->bus->resume_early) {
+		dev_dbg(dev,"EARLY resume\n");
+		error = dev->bus->resume_early(dev);
+	}
+	TRACE_RESUME(error);
+	return error;
+}
+
+/*
+ * Resume the devices that have either not gone through
+ * the late suspend, or that did go through it but also
+ * went through the early resume
+ */
 void dpm_resume(void)
 {
 	down(&dpm_list_sem);
@@ -99,10 +121,8 @@ void dpm_power_up(void)
 		struct list_head * entry = dpm_off_irq.next;
 		struct device * dev = to_device(entry);
 
-		get_device(dev);
-		list_move_tail(entry, &dpm_active);
-		resume_device(dev);
-		put_device(dev);
+		list_move_tail(entry, &dpm_off);
+		resume_device_early(dev);
 	}
 }
 
diff --git a/drivers/base/power/suspend.c b/drivers/base/power/suspend.c
index 69509e02f703..10e8032aee1a 100644
--- a/drivers/base/power/suspend.c
+++ b/drivers/base/power/suspend.c
@@ -65,7 +65,19 @@ int suspend_device(struct device * dev, pm_message_t state)
 
 	dev->power.prev_state = dev->power.power_state;
 
-	if (dev->bus && dev->bus->suspend && !dev->power.power_state.event) {
+	if (dev->class && dev->class->suspend && !dev->power.power_state.event) {
+		dev_dbg(dev, "class %s%s\n",
+			suspend_verb(state.event),
+			((state.event == PM_EVENT_SUSPEND)
+					&& device_may_wakeup(dev))
+				? ", may wakeup"
+				: ""
+			);
+		error = dev->class->suspend(dev, state);
+		suspend_report_result(dev->class->suspend, error);
+	}
+
+	if (!error && dev->bus && dev->bus->suspend && !dev->power.power_state.event) {
 		dev_dbg(dev, "%s%s\n",
 			suspend_verb(state.event),
 			((state.event == PM_EVENT_SUSPEND)
@@ -81,15 +93,74 @@ int suspend_device(struct device * dev, pm_message_t state)
 }
 
 
+/*
+ * This is called with interrupts off, only a single CPU
+ * running. We can't do down() on a semaphore (and we don't
+ * need the protection)
+ */
+static int suspend_device_late(struct device *dev, pm_message_t state)
+{
+	int error = 0;
+
+	if (dev->power.power_state.event) {
+		dev_dbg(dev, "PM: suspend_late %d-->%d\n",
+			dev->power.power_state.event, state.event);
+	}
+
+	if (dev->bus && dev->bus->suspend_late && !dev->power.power_state.event) {
+		dev_dbg(dev, "LATE %s%s\n",
+			suspend_verb(state.event),
+			((state.event == PM_EVENT_SUSPEND)
+					&& device_may_wakeup(dev))
+				? ", may wakeup"
+				: ""
+			);
+		error = dev->bus->suspend_late(dev, state);
+		suspend_report_result(dev->bus->suspend_late, error);
+	}
+	return error;
+}
+
+/**
+ *	device_prepare_suspend - save state and prepare to suspend
+ *
+ *	NOTE! Devices cannot detach at this point - not only do we
+ *	hold the device list semaphores over the whole prepare, but
+ *	the whole point is to do non-invasive preparatory work, not
+ *	the actual suspend.
+ */
+int device_prepare_suspend(pm_message_t state)
+{
+	int error = 0;
+	struct device * dev;
+
+	down(&dpm_sem);
+	down(&dpm_list_sem);
+	list_for_each_entry_reverse(dev, &dpm_active, power.entry) {
+		if (!dev->bus || !dev->bus->suspend_prepare)
+			continue;
+		error = dev->bus->suspend_prepare(dev, state);
+		if (error)
+			break;
+	}
+	up(&dpm_list_sem);
+	up(&dpm_sem);
+	return error;
+}
+
 /**
  *	device_suspend - Save state and stop all devices in system.
  *	@state:		Power state to put each device in.
  *
  *	Walk the dpm_active list, call ->suspend() for each device, and move
- *	it to dpm_off.
- *	Check the return value for each. If it returns 0, then we move the
- *	the device to the dpm_off list. If it returns -EAGAIN, we move it to
- *	the dpm_off_irq list. If we get a different error, try and back out.
+ *	it to the dpm_off list.
+ *
+ *	(For historical reasons, if it returns -EAGAIN, that used to mean
+ *	that the device would be called again with interrupts disabled.
+ *	These days, we use the "suspend_late()" callback for that, so we
+ *	print a warning and consider it an error).
+ *
+ *	If we get a different error, try and back out.
  *
  *	If we hit a failure with any of the devices, call device_resume()
  *	above to bring the suspended devices back to life.
@@ -115,39 +186,27 @@ int device_suspend(pm_message_t state)
 
 		/* Check if the device got removed */
 		if (!list_empty(&dev->power.entry)) {
-			/* Move it to the dpm_off or dpm_off_irq list */
+			/* Move it to the dpm_off list */
 			if (!error)
 				list_move(&dev->power.entry, &dpm_off);
-			else if (error == -EAGAIN) {
-				list_move(&dev->power.entry, &dpm_off_irq);
-				error = 0;
-			}
 		}
 		if (error)
 			printk(KERN_ERR "Could not suspend device %s: "
-				"error %d\n", kobject_name(&dev->kobj), error);
+				"error %d%s\n",
+				kobject_name(&dev->kobj), error,
+				error == -EAGAIN ? " (please convert to suspend_late)" : "");
 		put_device(dev);
 	}
 	up(&dpm_list_sem);
-	if (error) {
-		/* we failed... before resuming, bring back devices from
-		 * dpm_off_irq list back to main dpm_off list, we do want
-		 * to call resume() on them, in case they partially suspended
-		 * despite returning -EAGAIN
-		 */
-		while (!list_empty(&dpm_off_irq)) {
-			struct list_head * entry = dpm_off_irq.next;
-			list_move(entry, &dpm_off);
-		}
+	if (error)
 		dpm_resume();
-	}
+
 	up(&dpm_sem);
 	return error;
 }
 
 EXPORT_SYMBOL_GPL(device_suspend);
 
-
 /**
  *	device_power_down - Shut down special devices.
  *	@state:		Power state to enter.
@@ -162,14 +221,17 @@ int device_power_down(pm_message_t state)
 	int error = 0;
 	struct device * dev;
 
-	list_for_each_entry_reverse(dev, &dpm_off_irq, power.entry) {
-		if ((error = suspend_device(dev, state)))
-			break;
+	while (!list_empty(&dpm_off)) {
+		struct list_head * entry = dpm_off.prev;
+
+		dev = to_device(entry);
+		error = suspend_device_late(dev, state);
+		if (error)
+			goto Error;
+		list_move(&dev->power.entry, &dpm_off_irq);
 	}
-	if (error)
-		goto Error;
-	if ((error = sysdev_suspend(state)))
-		goto Error;
+
+	error = sysdev_suspend(state);
  Done:
 	return error;
  Error:
diff --git a/include/linux/device.h b/include/linux/device.h
index 8a648cd94fa9..b40be6fca6fa 100644
--- a/include/linux/device.h
+++ b/include/linux/device.h
@@ -51,8 +51,12 @@ struct bus_type {
 	int		(*probe)(struct device * dev);
 	int		(*remove)(struct device * dev);
 	void		(*shutdown)(struct device * dev);
-	int		(*suspend)(struct device * dev, pm_message_t state);
-	int		(*resume)(struct device * dev);
+
+	int (*suspend_prepare)(struct device * dev, pm_message_t state);
+	int (*suspend)(struct device * dev, pm_message_t state);
+	int (*suspend_late)(struct device * dev, pm_message_t state);
+	int (*resume_early)(struct device * dev);
+	int (*resume)(struct device * dev);
 };
 
 extern int bus_register(struct bus_type * bus);
@@ -154,6 +158,9 @@ struct class {
 
 	void	(*release)(struct class_device *dev);
 	void	(*class_release)(struct class *class);
+
+	int	(*suspend)(struct device *, pm_message_t state);
+	int	(*resume)(struct device *);
 };
 
 extern int class_register(struct class *);
diff --git a/include/linux/pm.h b/include/linux/pm.h
index 658c1b93d5bb..096fb6f754cf 100644
--- a/include/linux/pm.h
+++ b/include/linux/pm.h
@@ -190,6 +190,7 @@ extern void device_resume(void);
 extern suspend_disk_method_t pm_disk_mode;
 
 extern int device_suspend(pm_message_t state);
+extern int device_prepare_suspend(pm_message_t state);
 
 #define device_set_wakeup_enable(dev,val) \
 	((dev)->power.should_wakeup = !!(val))
diff --git a/kernel/power/main.c b/kernel/power/main.c
index 6d295c776794..0c3ed6ac938e 100644
--- a/kernel/power/main.c
+++ b/kernel/power/main.c
@@ -57,6 +57,10 @@ static int suspend_prepare(suspend_state_t state)
 	if (!pm_ops || !pm_ops->enter)
 		return -EPERM;
 
+	error = device_prepare_suspend(PMSG_SUSPEND);
+	if (error)
+		return error;
+
 	pm_prepare_console();
 
 	disable_nonboot_cpus();
-- 
cgit v1.2.3


From f1cc0a894c963923b766eb2d455747495e6e982d Mon Sep 17 00:00:00 2001
From: David Brownell <david-b@pacbell.net>
Date: Mon, 14 Aug 2006 23:11:08 -0700
Subject: PM: issue PM_EVENT_PRETHAW

This patch is the first of this series that should actually change any
behavior ...  by issuing the new event, now tha the rest of the kernel is
prepared to receive it.

This converts the PM core to issue the new PRETHAW message, which the rest of
the kernel is now ready to receive.

Signed-off-by: David Brownell <dbrownell@users.sourceforge.net>
Cc: "Rafael J. Wysocki" <rjw@sisk.pl>
Cc: Pavel Machek <pavel@ucw.cz>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Greg Kroah-Hartman <gregkh@suse.de>
---
 drivers/base/power/suspend.c | 1 +
 kernel/power/disk.c          | 4 ++--
 kernel/power/swsusp.c        | 9 ++++++++-
 kernel/power/user.c          | 2 +-
 4 files changed, 12 insertions(+), 4 deletions(-)

(limited to 'kernel')

diff --git a/drivers/base/power/suspend.c b/drivers/base/power/suspend.c
index 0bda4a7f2042..e86db83746ac 100644
--- a/drivers/base/power/suspend.c
+++ b/drivers/base/power/suspend.c
@@ -34,6 +34,7 @@ static inline char *suspend_verb(u32 event)
 	switch (event) {
 	case PM_EVENT_SUSPEND:	return "suspend";
 	case PM_EVENT_FREEZE:	return "freeze";
+	case PM_EVENT_PRETHAW:	return "prethaw";
 	default:		return "(unknown suspend event)";
 	}
 }
diff --git a/kernel/power/disk.c b/kernel/power/disk.c
index e13e74067845..a3c34fb14321 100644
--- a/kernel/power/disk.c
+++ b/kernel/power/disk.c
@@ -98,7 +98,7 @@ static void unprepare_processes(void)
 }
 
 /**
- *	pm_suspend_disk - The granpappy of power management.
+ *	pm_suspend_disk - The granpappy of hibernation power management.
  *
  *	If we're going through the firmware, then get it over with quickly.
  *
@@ -207,7 +207,7 @@ static int software_resume(void)
 
 	pr_debug("PM: Preparing devices for restore.\n");
 
-	if ((error = device_suspend(PMSG_FREEZE))) {
+	if ((error = device_suspend(PMSG_PRETHAW))) {
 		printk("Some devices failed to suspend\n");
 		swsusp_free();
 		goto Thaw;
diff --git a/kernel/power/swsusp.c b/kernel/power/swsusp.c
index 17f669c83012..62752899b1a1 100644
--- a/kernel/power/swsusp.c
+++ b/kernel/power/swsusp.c
@@ -248,6 +248,9 @@ int swsusp_suspend(void)
 	restore_processor_state();
 Restore_highmem:
 	restore_highmem();
+	/* NOTE:  device_power_up() is just a resume() for devices
+	 * that suspended with irqs off ... no overall powerup.
+	 */
 	device_power_up();
 Enable_irqs:
 	local_irq_enable();
@@ -257,8 +260,12 @@ Enable_irqs:
 int swsusp_resume(void)
 {
 	int error;
+
 	local_irq_disable();
-	if (device_power_down(PMSG_FREEZE))
+	/* NOTE:  device_power_down() is just a suspend() with irqs off;
+	 * it has no special "power things down" semantics
+	 */
+	if (device_power_down(PMSG_PRETHAW))
 		printk(KERN_ERR "Some devices failed to power down, very bad\n");
 	/* We'll ignore saved state, but this gets preempt count (etc) right */
 	save_processor_state();
diff --git a/kernel/power/user.c b/kernel/power/user.c
index 3f1539fbe48a..5a8d060d7909 100644
--- a/kernel/power/user.c
+++ b/kernel/power/user.c
@@ -191,7 +191,7 @@ static int snapshot_ioctl(struct inode *inode, struct file *filp,
 		}
 		down(&pm_sem);
 		pm_prepare_console();
-		error = device_suspend(PMSG_FREEZE);
+		error = device_suspend(PMSG_PRETHAW);
 		if (!error) {
 			error = swsusp_resume();
 			device_resume();
-- 
cgit v1.2.3


From 2bca293e56b6a8cd16bb6e70a09b2adac9c723b5 Mon Sep 17 00:00:00 2001
From: David Brownell <david-b@pacbell.net>
Date: Wed, 30 Aug 2006 13:54:36 -0700
Subject: PM: add kconfig option for deprecated .../power/state files

Add a new PM_SYSFS_DEPRECATED config option to control whether or
not the /sys/devices/.../power/state files are provided.  This will
make it easier to get rid of that mechanism when the time comes,
and to verify that userspace tools work right without it.

Signed-off-by: David Brownell <dbrownell@users.sourceforge.net>
Acked-by: Pavel Machek <pavel@suse.cz>
Signed-off-by: Greg Kroah-Hartman <gregkh@suse.de>
---
 drivers/base/power/sysfs.c |  6 ++++++
 kernel/power/Kconfig       | 11 +++++++++++
 2 files changed, 17 insertions(+)

(limited to 'kernel')

diff --git a/drivers/base/power/sysfs.c b/drivers/base/power/sysfs.c
index e55b3c2779e9..2d47517dbe32 100644
--- a/drivers/base/power/sysfs.c
+++ b/drivers/base/power/sysfs.c
@@ -7,6 +7,8 @@
 #include "power.h"
 
 
+#ifdef	CONFIG_PM_SYSFS_DEPRECATED
+
 /**
  *	state - Control current power state of device
  *
@@ -66,6 +68,8 @@ static ssize_t state_store(struct device * dev, struct device_attribute *attr, c
 static DEVICE_ATTR(state, 0644, state_show, state_store);
 
 
+#endif	/* CONFIG_PM_SYSFS_DEPRECATED */
+
 /*
  *	wakeup - Report/change current wakeup option for device
  *
@@ -139,7 +143,9 @@ static DEVICE_ATTR(wakeup, 0644, wake_show, wake_store);
 
 
 static struct attribute * power_attrs[] = {
+#ifdef	CONFIG_PM_SYSFS_DEPRECATED
 	&dev_attr_state.attr,
+#endif
 	&dev_attr_wakeup.attr,
 	NULL,
 };
diff --git a/kernel/power/Kconfig b/kernel/power/Kconfig
index 619ecabf7c58..1ed972070d19 100644
--- a/kernel/power/Kconfig
+++ b/kernel/power/Kconfig
@@ -53,6 +53,17 @@ config PM_TRACE
 	CAUTION: this option will cause your machine's real-time clock to be
 	set to an invalid time after a resume.
 
+config PM_SYSFS_DEPRECATED
+	bool "Driver model /sys/devices/.../power/state files (DEPRECATED)"
+	depends on PM && SYSFS
+	default n
+	help
+	  The driver model started out with a sysfs file intended to provide
+	  a userspace hook for device power management.  This feature has never
+	  worked very well, except for limited testing purposes, and so it will
+	  be removed.   It's not clear that a generic mechanism could really
+	  handle the wide variability of device power states; any replacements
+	  are likely to be bus or driver specific.
 
 config SOFTWARE_SUSPEND
 	bool "Software Suspend"
-- 
cgit v1.2.3


From 1d3a82af45428c5e8deaa119cdeb79611ae46371 Mon Sep 17 00:00:00 2001
From: David Brownell <david-b@pacbell.net>
Date: Wed, 30 Aug 2006 14:09:47 -0700
Subject: PM: no suspend_prepare() phase

Remove the new suspend_prepare() phase.  It doesn't seem very usable,
has never been tested, doesn't address fault cleanup, and would need
a sibling resume_complete(); plus there are no real use cases.  It
could be restored later if those issues get resolved.

Signed-off-by: David Brownell <dbrownell@users.sourceforge.net>
Cc: Linus Torvalds <torvalds@osdl.org>
Signed-off-by: Greg Kroah-Hartman <gregkh@suse.de>
---
 drivers/base/power/suspend.c | 27 ---------------------------
 drivers/pci/pci-driver.c     | 14 --------------
 include/linux/device.h       |  1 -
 include/linux/pci.h          |  1 -
 kernel/power/main.c          |  4 ----
 5 files changed, 47 deletions(-)

(limited to 'kernel')

diff --git a/drivers/base/power/suspend.c b/drivers/base/power/suspend.c
index e86db83746ac..6453c25103d2 100644
--- a/drivers/base/power/suspend.c
+++ b/drivers/base/power/suspend.c
@@ -117,33 +117,6 @@ static int suspend_device_late(struct device *dev, pm_message_t state)
 	return error;
 }
 
-/**
- *	device_prepare_suspend - save state and prepare to suspend
- *
- *	NOTE! Devices cannot detach at this point - not only do we
- *	hold the device list semaphores over the whole prepare, but
- *	the whole point is to do non-invasive preparatory work, not
- *	the actual suspend.
- */
-int device_prepare_suspend(pm_message_t state)
-{
-	int error = 0;
-	struct device * dev;
-
-	down(&dpm_sem);
-	down(&dpm_list_sem);
-	list_for_each_entry_reverse(dev, &dpm_active, power.entry) {
-		if (!dev->bus || !dev->bus->suspend_prepare)
-			continue;
-		error = dev->bus->suspend_prepare(dev, state);
-		if (error)
-			break;
-	}
-	up(&dpm_list_sem);
-	up(&dpm_sem);
-	return error;
-}
-
 /**
  *	device_suspend - Save state and stop all devices in system.
  *	@state:		Power state to put each device in.
diff --git a/drivers/pci/pci-driver.c b/drivers/pci/pci-driver.c
index 9e7d6ceb3805..8948ac9ab681 100644
--- a/drivers/pci/pci-driver.c
+++ b/drivers/pci/pci-driver.c
@@ -264,19 +264,6 @@ static int pci_device_remove(struct device * dev)
 	return 0;
 }
 
-static int pci_device_suspend_prepare(struct device * dev, pm_message_t state)
-{
-	struct pci_dev * pci_dev = to_pci_dev(dev);
-	struct pci_driver * drv = pci_dev->driver;
-	int i = 0;
-
-	if (drv && drv->suspend_prepare) {
-		i = drv->suspend_prepare(pci_dev, state);
-		suspend_report_result(drv->suspend_prepare, i);
-	}
-	return i;
-}
-
 static int pci_device_suspend(struct device * dev, pm_message_t state)
 {
 	struct pci_dev * pci_dev = to_pci_dev(dev);
@@ -544,7 +531,6 @@ struct bus_type pci_bus_type = {
 	.uevent		= pci_uevent,
 	.probe		= pci_device_probe,
 	.remove		= pci_device_remove,
-	.suspend_prepare= pci_device_suspend_prepare,
 	.suspend	= pci_device_suspend,
 	.suspend_late	= pci_device_suspend_late,
 	.resume_early	= pci_device_resume_early,
diff --git a/include/linux/device.h b/include/linux/device.h
index b40be6fca6fa..b3646462d6dc 100644
--- a/include/linux/device.h
+++ b/include/linux/device.h
@@ -52,7 +52,6 @@ struct bus_type {
 	int		(*remove)(struct device * dev);
 	void		(*shutdown)(struct device * dev);
 
-	int (*suspend_prepare)(struct device * dev, pm_message_t state);
 	int (*suspend)(struct device * dev, pm_message_t state);
 	int (*suspend_late)(struct device * dev, pm_message_t state);
 	int (*resume_early)(struct device * dev);
diff --git a/include/linux/pci.h b/include/linux/pci.h
index 4b2e629467c7..9514bbfe96e2 100644
--- a/include/linux/pci.h
+++ b/include/linux/pci.h
@@ -345,7 +345,6 @@ struct pci_driver {
 	const struct pci_device_id *id_table;	/* must be non-NULL for probe to be called */
 	int  (*probe)  (struct pci_dev *dev, const struct pci_device_id *id);	/* New device inserted */
 	void (*remove) (struct pci_dev *dev);	/* Device removed (NULL if not a hot-plug capable driver) */
-	int  (*suspend_prepare) (struct pci_dev *dev, pm_message_t state);
 	int  (*suspend) (struct pci_dev *dev, pm_message_t state);	/* Device suspended */
 	int  (*suspend_late) (struct pci_dev *dev, pm_message_t state);
 	int  (*resume_early) (struct pci_dev *dev);
diff --git a/kernel/power/main.c b/kernel/power/main.c
index 0c3ed6ac938e..6d295c776794 100644
--- a/kernel/power/main.c
+++ b/kernel/power/main.c
@@ -57,10 +57,6 @@ static int suspend_prepare(suspend_state_t state)
 	if (!pm_ops || !pm_ops->enter)
 		return -EPERM;
 
-	error = device_prepare_suspend(PMSG_SUSPEND);
-	if (error)
-		return error;
-
 	pm_prepare_console();
 
 	disable_nonboot_cpus();
-- 
cgit v1.2.3


From 2fbe7b25c8edaf2d10e6c1a4cc9f8afe714c4764 Mon Sep 17 00:00:00 2001
From: Don Zickus <dzickus@redhat.com>
Date: Tue, 26 Sep 2006 10:52:27 +0200
Subject: [PATCH] i386/x86-64: Remove un/set_nmi_callback and
 reserve/release_lapic_nmi functions

Removes the un/set_nmi_callback and reserve/release_lapic_nmi functions as
they are no longer needed.  The various subsystems are modified to register
with the die_notifier instead.

Also includes compile fixes by Andrew Morton.

Signed-off-by:  Don Zickus <dzickus@redhat.com>
Signed-off-by: Andi Kleen <ak@suse.de>
---
 arch/i386/kernel/crash.c           |  20 +++++++-
 arch/i386/kernel/nmi.c             |  85 ++++---------------------------
 arch/i386/kernel/traps.c           |  23 +--------
 arch/i386/oprofile/nmi_int.c       |  47 ++++++++++-------
 arch/i386/oprofile/nmi_timer_int.c |  33 +++++++++---
 arch/x86_64/kernel/crash.c         |  20 +++++++-
 arch/x86_64/kernel/nmi.c           | 102 +++----------------------------------
 include/asm-i386/nmi.h             |  21 ++------
 include/asm-x86_64/nmi.h           |  21 --------
 kernel/sysctl.c                    |   4 +-
 10 files changed, 116 insertions(+), 260 deletions(-)

(limited to 'kernel')

diff --git a/arch/i386/kernel/crash.c b/arch/i386/kernel/crash.c
index 5b96f038367f..736c76d6b31d 100644
--- a/arch/i386/kernel/crash.c
+++ b/arch/i386/kernel/crash.c
@@ -22,6 +22,8 @@
 #include <asm/nmi.h>
 #include <asm/hw_irq.h>
 #include <asm/apic.h>
+#include <asm/kdebug.h>
+
 #include <mach_ipi.h>
 
 
@@ -93,9 +95,18 @@ static void crash_save_self(struct pt_regs *regs)
 #if defined(CONFIG_SMP) && defined(CONFIG_X86_LOCAL_APIC)
 static atomic_t waiting_for_crash_ipi;
 
-static int crash_nmi_callback(struct pt_regs *regs, int cpu)
+static int crash_nmi_callback(struct notifier_block *self,
+			unsigned long val, void *data)
 {
+	struct pt_regs *regs;
 	struct pt_regs fixed_regs;
+	int cpu;
+
+	if (val != DIE_NMI)
+		return NOTIFY_OK;
+
+	regs = ((struct die_args *)data)->regs;
+	cpu = raw_smp_processor_id();
 
 	/* Don't do anything if this handler is invoked on crashing cpu.
 	 * Otherwise, system will completely hang. Crashing cpu can get
@@ -125,13 +136,18 @@ static void smp_send_nmi_allbutself(void)
 	send_IPI_allbutself(NMI_VECTOR);
 }
 
+static struct notifier_block crash_nmi_nb = {
+	.notifier_call = crash_nmi_callback,
+};
+
 static void nmi_shootdown_cpus(void)
 {
 	unsigned long msecs;
 
 	atomic_set(&waiting_for_crash_ipi, num_online_cpus() - 1);
 	/* Would it be better to replace the trap vector here? */
-	set_nmi_callback(crash_nmi_callback);
+	if (register_die_notifier(&crash_nmi_nb))
+		return;		/* return what? */
 	/* Ensure the new callback function is set before sending
 	 * out the NMI
 	 */
diff --git a/arch/i386/kernel/nmi.c b/arch/i386/kernel/nmi.c
index bd96ea4f2942..acd3fdea2a21 100644
--- a/arch/i386/kernel/nmi.c
+++ b/arch/i386/kernel/nmi.c
@@ -42,20 +42,6 @@ static DEFINE_PER_CPU(unsigned long, evntsel_nmi_owner[3]);
  */
 #define NMI_MAX_COUNTER_BITS 66
 
-/*
- * lapic_nmi_owner tracks the ownership of the lapic NMI hardware:
- * - it may be reserved by some other driver, or not
- * - when not reserved by some other driver, it may be used for
- *   the NMI watchdog, or not
- *
- * This is maintained separately from nmi_active because the NMI
- * watchdog may also be driven from the I/O APIC timer.
- */
-static DEFINE_SPINLOCK(lapic_nmi_owner_lock);
-static unsigned int lapic_nmi_owner;
-#define LAPIC_NMI_WATCHDOG	(1<<0)
-#define LAPIC_NMI_RESERVED	(1<<1)
-
 /* nmi_active:
  * >0: the lapic NMI watchdog is active, but can be disabled
  * <0: the lapic NMI watchdog has not been set up, and cannot
@@ -325,33 +311,6 @@ static void enable_lapic_nmi_watchdog(void)
 	touch_nmi_watchdog();
 }
 
-int reserve_lapic_nmi(void)
-{
-	unsigned int old_owner;
-
-	spin_lock(&lapic_nmi_owner_lock);
-	old_owner = lapic_nmi_owner;
-	lapic_nmi_owner |= LAPIC_NMI_RESERVED;
-	spin_unlock(&lapic_nmi_owner_lock);
-	if (old_owner & LAPIC_NMI_RESERVED)
-		return -EBUSY;
-	if (old_owner & LAPIC_NMI_WATCHDOG)
-		disable_lapic_nmi_watchdog();
-	return 0;
-}
-
-void release_lapic_nmi(void)
-{
-	unsigned int new_owner;
-
-	spin_lock(&lapic_nmi_owner_lock);
-	new_owner = lapic_nmi_owner & ~LAPIC_NMI_RESERVED;
-	lapic_nmi_owner = new_owner;
-	spin_unlock(&lapic_nmi_owner_lock);
-	if (new_owner & LAPIC_NMI_WATCHDOG)
-		enable_lapic_nmi_watchdog();
-}
-
 void disable_timer_nmi_watchdog(void)
 {
 	BUG_ON(nmi_watchdog != NMI_IO_APIC);
@@ -866,6 +825,15 @@ done:
 	return rc;
 }
 
+int do_nmi_callback(struct pt_regs * regs, int cpu)
+{
+#ifdef CONFIG_SYSCTL
+	if (unknown_nmi_panic)
+		return unknown_nmi_panic_callback(regs, cpu);
+#endif
+	return 0;
+}
+
 #ifdef CONFIG_SYSCTL
 
 static int unknown_nmi_panic_callback(struct pt_regs *regs, int cpu)
@@ -873,37 +841,8 @@ static int unknown_nmi_panic_callback(struct pt_regs *regs, int cpu)
 	unsigned char reason = get_nmi_reason();
 	char buf[64];
 
-	if (!(reason & 0xc0)) {
-		sprintf(buf, "NMI received for unknown reason %02x\n", reason);
-		die_nmi(regs, buf);
-	}
-	return 0;
-}
-
-/*
- * proc handler for /proc/sys/kernel/unknown_nmi_panic
- */
-int proc_unknown_nmi_panic(ctl_table *table, int write, struct file *file,
-			void __user *buffer, size_t *length, loff_t *ppos)
-{
-	int old_state;
-
-	old_state = unknown_nmi_panic;
-	proc_dointvec(table, write, file, buffer, length, ppos);
-	if (!!old_state == !!unknown_nmi_panic)
-		return 0;
-
-	if (unknown_nmi_panic) {
-		if (reserve_lapic_nmi() < 0) {
-			unknown_nmi_panic = 0;
-			return -EBUSY;
-		} else {
-			set_nmi_callback(unknown_nmi_panic_callback);
-		}
-	} else {
-		release_lapic_nmi();
-		unset_nmi_callback();
-	}
+	sprintf(buf, "NMI received for unknown reason %02x\n", reason);
+	die_nmi(regs, buf);
 	return 0;
 }
 
@@ -917,7 +856,5 @@ EXPORT_SYMBOL(reserve_perfctr_nmi);
 EXPORT_SYMBOL(release_perfctr_nmi);
 EXPORT_SYMBOL(reserve_evntsel_nmi);
 EXPORT_SYMBOL(release_evntsel_nmi);
-EXPORT_SYMBOL(reserve_lapic_nmi);
-EXPORT_SYMBOL(release_lapic_nmi);
 EXPORT_SYMBOL(disable_timer_nmi_watchdog);
 EXPORT_SYMBOL(enable_timer_nmi_watchdog);
diff --git a/arch/i386/kernel/traps.c b/arch/i386/kernel/traps.c
index 282f0bd40dfd..7db664d0b25c 100644
--- a/arch/i386/kernel/traps.c
+++ b/arch/i386/kernel/traps.c
@@ -706,13 +706,6 @@ void die_nmi (struct pt_regs *regs, const char *msg)
 	do_exit(SIGSEGV);
 }
 
-static int dummy_nmi_callback(struct pt_regs * regs, int cpu)
-{
-	return 0;
-}
-
-static nmi_callback_t nmi_callback = dummy_nmi_callback;
-
 static void default_do_nmi(struct pt_regs * regs)
 {
 	unsigned char reason = 0;
@@ -732,9 +725,10 @@ static void default_do_nmi(struct pt_regs * regs)
 		 */
 		if (nmi_watchdog_tick(regs, reason))
 			return;
+		if (!do_nmi_callback(regs, smp_processor_id()))
 #endif
-		if (!rcu_dereference(nmi_callback)(regs, smp_processor_id()))
 			unknown_nmi_error(reason, regs);
+
 		return;
 	}
 	if (notify_die(DIE_NMI, "nmi", regs, reason, 2, SIGINT) == NOTIFY_STOP)
@@ -765,19 +759,6 @@ fastcall void do_nmi(struct pt_regs * regs, long error_code)
 	nmi_exit();
 }
 
-void set_nmi_callback(nmi_callback_t callback)
-{
-	vmalloc_sync_all();
-	rcu_assign_pointer(nmi_callback, callback);
-}
-EXPORT_SYMBOL_GPL(set_nmi_callback);
-
-void unset_nmi_callback(void)
-{
-	nmi_callback = dummy_nmi_callback;
-}
-EXPORT_SYMBOL_GPL(unset_nmi_callback);
-
 #ifdef CONFIG_KPROBES
 fastcall void __kprobes do_int3(struct pt_regs *regs, long error_code)
 {
diff --git a/arch/i386/oprofile/nmi_int.c b/arch/i386/oprofile/nmi_int.c
index 8710ca081b1e..b3610188bcf0 100644
--- a/arch/i386/oprofile/nmi_int.c
+++ b/arch/i386/oprofile/nmi_int.c
@@ -17,14 +17,15 @@
 #include <asm/nmi.h>
 #include <asm/msr.h>
 #include <asm/apic.h>
+#include <asm/kdebug.h>
  
 #include "op_counter.h"
 #include "op_x86_model.h"
- 
+
 static struct op_x86_model_spec const * model;
 static struct op_msrs cpu_msrs[NR_CPUS];
 static unsigned long saved_lvtpc[NR_CPUS];
- 
+
 static int nmi_start(void);
 static void nmi_stop(void);
 
@@ -82,13 +83,24 @@ static void exit_driverfs(void)
 #define exit_driverfs() do { } while (0)
 #endif /* CONFIG_PM */
 
-
-static int nmi_callback(struct pt_regs * regs, int cpu)
+int profile_exceptions_notify(struct notifier_block *self,
+				unsigned long val, void *data)
 {
-	return model->check_ctrs(regs, &cpu_msrs[cpu]);
+	struct die_args *args = (struct die_args *)data;
+	int ret = NOTIFY_DONE;
+	int cpu = smp_processor_id();
+
+	switch(val) {
+	case DIE_NMI:
+		if (model->check_ctrs(args->regs, &cpu_msrs[cpu]))
+			ret = NOTIFY_STOP;
+		break;
+	default:
+		break;
+	}
+	return ret;
 }
- 
- 
+
 static void nmi_cpu_save_registers(struct op_msrs * msrs)
 {
 	unsigned int const nr_ctrs = model->num_counters;
@@ -174,27 +186,29 @@ static void nmi_cpu_setup(void * dummy)
 	apic_write(APIC_LVTPC, APIC_DM_NMI);
 }
 
+static struct notifier_block profile_exceptions_nb = {
+	.notifier_call = profile_exceptions_notify,
+	.next = NULL,
+	.priority = 0
+};
 
 static int nmi_setup(void)
 {
+	int err=0;
+
 	if (!allocate_msrs())
 		return -ENOMEM;
 
-	/* We walk a thin line between law and rape here.
-	 * We need to be careful to install our NMI handler
-	 * without actually triggering any NMIs as this will
-	 * break the core code horrifically.
-	 */
-	if (reserve_lapic_nmi() < 0) {
+	if ((err = register_die_notifier(&profile_exceptions_nb))){
 		free_msrs();
-		return -EBUSY;
+		return err;
 	}
+
 	/* We need to serialize save and setup for HT because the subset
 	 * of msrs are distinct for save and setup operations
 	 */
 	on_each_cpu(nmi_save_registers, NULL, 0, 1);
 	on_each_cpu(nmi_cpu_setup, NULL, 0, 1);
-	set_nmi_callback(nmi_callback);
 	nmi_enabled = 1;
 	return 0;
 }
@@ -250,8 +264,7 @@ static void nmi_shutdown(void)
 {
 	nmi_enabled = 0;
 	on_each_cpu(nmi_cpu_shutdown, NULL, 0, 1);
-	unset_nmi_callback();
-	release_lapic_nmi();
+	unregister_die_notifier(&profile_exceptions_nb);
 	free_msrs();
 }
 
diff --git a/arch/i386/oprofile/nmi_timer_int.c b/arch/i386/oprofile/nmi_timer_int.c
index a33a73bb502d..93ca48f804ac 100644
--- a/arch/i386/oprofile/nmi_timer_int.c
+++ b/arch/i386/oprofile/nmi_timer_int.c
@@ -17,32 +17,49 @@
 #include <asm/nmi.h>
 #include <asm/apic.h>
 #include <asm/ptrace.h>
+#include <asm/kdebug.h>
  
-static int nmi_timer_callback(struct pt_regs * regs, int cpu)
+int profile_timer_exceptions_notify(struct notifier_block *self,
+				unsigned long val, void *data)
 {
-	oprofile_add_sample(regs, 0);
-	return 1;
+	struct die_args *args = (struct die_args *)data;
+	int ret = NOTIFY_DONE;
+
+	switch(val) {
+	case DIE_NMI:
+		oprofile_add_sample(args->regs, 0);
+		ret = NOTIFY_STOP;
+		break;
+	default:
+		break;
+	}
+	return ret;
 }
 
+static struct notifier_block profile_timer_exceptions_nb = {
+	.notifier_call = profile_timer_exceptions_notify,
+	.next = NULL,
+	.priority = 0
+};
+
 static int timer_start(void)
 {
-	disable_timer_nmi_watchdog();
-	set_nmi_callback(nmi_timer_callback);
+	if (register_die_notifier(&profile_timer_exceptions_nb))
+		return 1;
 	return 0;
 }
 
 
 static void timer_stop(void)
 {
-	enable_timer_nmi_watchdog();
-	unset_nmi_callback();
+	unregister_die_notifier(&profile_timer_exceptions_nb);
 	synchronize_sched();  /* Allow already-started NMIs to complete. */
 }
 
 
 int __init op_nmi_timer_init(struct oprofile_operations * ops)
 {
-	if (atomic_read(&nmi_active) <= 0)
+	if ((nmi_watchdog != NMI_IO_APIC) || (atomic_read(&nmi_active) <= 0))
 		return -ENODEV;
 
 	ops->start = timer_start;
diff --git a/arch/x86_64/kernel/crash.c b/arch/x86_64/kernel/crash.c
index d8d5750d6106..44c8af65325e 100644
--- a/arch/x86_64/kernel/crash.c
+++ b/arch/x86_64/kernel/crash.c
@@ -23,6 +23,7 @@
 #include <asm/nmi.h>
 #include <asm/hw_irq.h>
 #include <asm/mach_apic.h>
+#include <asm/kdebug.h>
 
 /* This keeps a track of which one is crashing cpu. */
 static int crashing_cpu;
@@ -95,8 +96,18 @@ static void crash_save_self(struct pt_regs *regs)
 #ifdef CONFIG_SMP
 static atomic_t waiting_for_crash_ipi;
 
-static int crash_nmi_callback(struct pt_regs *regs, int cpu)
+static int crash_nmi_callback(struct notifier_block *self,
+				unsigned long val, void *data)
 {
+	struct pt_regs *regs;
+	int cpu;
+
+	if (val != DIE_NMI)
+		return NOTIFY_OK;
+
+	regs = ((struct die_args *)data)->regs;
+	cpu = raw_smp_processor_id();
+
 	/*
 	 * Don't do anything if this handler is invoked on crashing cpu.
 	 * Otherwise, system will completely hang. Crashing cpu can get
@@ -127,12 +138,17 @@ static void smp_send_nmi_allbutself(void)
  * cpu hotplug shouldn't matter.
  */
 
+static struct notifier_block crash_nmi_nb = {
+	.notifier_call = crash_nmi_callback,
+};
+
 static void nmi_shootdown_cpus(void)
 {
 	unsigned long msecs;
 
 	atomic_set(&waiting_for_crash_ipi, num_online_cpus() - 1);
-	set_nmi_callback(crash_nmi_callback);
+	if (register_die_notifier(&crash_nmi_nb))
+		return;         /* return what? */
 
 	/*
 	 * Ensure the new callback function is set before sending
diff --git a/arch/x86_64/kernel/nmi.c b/arch/x86_64/kernel/nmi.c
index f6b881b23a70..9d175dcf3a2d 100644
--- a/arch/x86_64/kernel/nmi.c
+++ b/arch/x86_64/kernel/nmi.c
@@ -41,20 +41,6 @@ static DEFINE_PER_CPU(unsigned, evntsel_nmi_owner[2]);
  */
 #define NMI_MAX_COUNTER_BITS 66
 
-/*
- * lapic_nmi_owner tracks the ownership of the lapic NMI hardware:
- * - it may be reserved by some other driver, or not
- * - when not reserved by some other driver, it may be used for
- *   the NMI watchdog, or not
- *
- * This is maintained separately from nmi_active because the NMI
- * watchdog may also be driven from the I/O APIC timer.
- */
-static DEFINE_SPINLOCK(lapic_nmi_owner_lock);
-static unsigned int lapic_nmi_owner;
-#define LAPIC_NMI_WATCHDOG	(1<<0)
-#define LAPIC_NMI_RESERVED	(1<<1)
-
 /* nmi_active:
  * >0: the lapic NMI watchdog is active, but can be disabled
  * <0: the lapic NMI watchdog has not been set up, and cannot
@@ -321,33 +307,6 @@ static void enable_lapic_nmi_watchdog(void)
 	touch_nmi_watchdog();
 }
 
-int reserve_lapic_nmi(void)
-{
-	unsigned int old_owner;
-
-	spin_lock(&lapic_nmi_owner_lock);
-	old_owner = lapic_nmi_owner;
-	lapic_nmi_owner |= LAPIC_NMI_RESERVED;
-	spin_unlock(&lapic_nmi_owner_lock);
-	if (old_owner & LAPIC_NMI_RESERVED)
-		return -EBUSY;
-	if (old_owner & LAPIC_NMI_WATCHDOG)
-		disable_lapic_nmi_watchdog();
-	return 0;
-}
-
-void release_lapic_nmi(void)
-{
-	unsigned int new_owner;
-
-	spin_lock(&lapic_nmi_owner_lock);
-	new_owner = lapic_nmi_owner & ~LAPIC_NMI_RESERVED;
-	lapic_nmi_owner = new_owner;
-	spin_unlock(&lapic_nmi_owner_lock);
-	if (new_owner & LAPIC_NMI_WATCHDOG)
-		enable_lapic_nmi_watchdog();
-}
-
 void disable_timer_nmi_watchdog(void)
 {
 	BUG_ON(nmi_watchdog != NMI_IO_APIC);
@@ -762,13 +721,6 @@ done:
 	return rc;
 }
 
-static __kprobes int dummy_nmi_callback(struct pt_regs * regs, int cpu)
-{
-	return 0;
-}
- 
-static nmi_callback_t nmi_callback = dummy_nmi_callback;
- 
 asmlinkage __kprobes void do_nmi(struct pt_regs * regs, long error_code)
 {
 	nmi_enter();
@@ -779,21 +731,12 @@ asmlinkage __kprobes void do_nmi(struct pt_regs * regs, long error_code)
 
 int do_nmi_callback(struct pt_regs * regs, int cpu)
 {
-	return rcu_dereference(nmi_callback)(regs, cpu);
-}
-
-void set_nmi_callback(nmi_callback_t callback)
-{
-	vmalloc_sync_all();
-	rcu_assign_pointer(nmi_callback, callback);
-}
-EXPORT_SYMBOL_GPL(set_nmi_callback);
-
-void unset_nmi_callback(void)
-{
-	nmi_callback = dummy_nmi_callback;
+#ifdef CONFIG_SYSCTL
+	if (unknown_nmi_panic)
+		return unknown_nmi_panic_callback(regs, cpu);
+#endif
+	return 0;
 }
-EXPORT_SYMBOL_GPL(unset_nmi_callback);
 
 #ifdef CONFIG_SYSCTL
 
@@ -802,37 +745,8 @@ static int unknown_nmi_panic_callback(struct pt_regs *regs, int cpu)
 	unsigned char reason = get_nmi_reason();
 	char buf[64];
 
-	if (!(reason & 0xc0)) {
-		sprintf(buf, "NMI received for unknown reason %02x\n", reason);
-		die_nmi(buf,regs);
-	}
-	return 0;
-}
-
-/*
- * proc handler for /proc/sys/kernel/unknown_nmi_panic
- */
-int proc_unknown_nmi_panic(struct ctl_table *table, int write, struct file *file,
-			void __user *buffer, size_t *length, loff_t *ppos)
-{
-	int old_state;
-
-	old_state = unknown_nmi_panic;
-	proc_dointvec(table, write, file, buffer, length, ppos);
-	if (!!old_state == !!unknown_nmi_panic)
-		return 0;
-
-	if (unknown_nmi_panic) {
-		if (reserve_lapic_nmi() < 0) {
-			unknown_nmi_panic = 0;
-			return -EBUSY;
-		} else {
-			set_nmi_callback(unknown_nmi_panic_callback);
-		}
-	} else {
-		release_lapic_nmi();
-		unset_nmi_callback();
-	}
+	sprintf(buf, "NMI received for unknown reason %02x\n", reason);
+	die_nmi(buf,regs);
 	return 0;
 }
 
@@ -846,8 +760,6 @@ EXPORT_SYMBOL(reserve_perfctr_nmi);
 EXPORT_SYMBOL(release_perfctr_nmi);
 EXPORT_SYMBOL(reserve_evntsel_nmi);
 EXPORT_SYMBOL(release_evntsel_nmi);
-EXPORT_SYMBOL(reserve_lapic_nmi);
-EXPORT_SYMBOL(release_lapic_nmi);
 EXPORT_SYMBOL(disable_timer_nmi_watchdog);
 EXPORT_SYMBOL(enable_timer_nmi_watchdog);
 EXPORT_SYMBOL(touch_nmi_watchdog);
diff --git a/include/asm-i386/nmi.h b/include/asm-i386/nmi.h
index da0e0b4e9139..34d6bf063b6e 100644
--- a/include/asm-i386/nmi.h
+++ b/include/asm-i386/nmi.h
@@ -6,24 +6,13 @@
 
 #include <linux/pm.h>
 
-struct pt_regs;
-
-typedef int (*nmi_callback_t)(struct pt_regs * regs, int cpu);
-
-/**
- * set_nmi_callback
- *
- * Set a handler for an NMI. Only one handler may be
- * set. Return 1 if the NMI was handled.
- */
-void set_nmi_callback(nmi_callback_t callback);
-
 /**
- * unset_nmi_callback
+ * do_nmi_callback
  *
- * Remove the handler previously set.
+ * Check to see if a callback exists and execute it.  Return 1
+ * if the handler exists and was handled successfully.
  */
-void unset_nmi_callback(void);
+int do_nmi_callback(struct pt_regs *regs, int cpu);
 
 extern int avail_to_resrv_perfctr_nmi_bit(unsigned int);
 extern int avail_to_resrv_perfctr_nmi(unsigned int);
@@ -33,8 +22,6 @@ extern int reserve_evntsel_nmi(unsigned int);
 extern void release_evntsel_nmi(unsigned int);
 
 extern void setup_apic_nmi_watchdog (void *);
-extern int reserve_lapic_nmi(void);
-extern void release_lapic_nmi(void);
 extern void disable_timer_nmi_watchdog(void);
 extern void enable_timer_nmi_watchdog(void);
 extern int nmi_watchdog_tick (struct pt_regs * regs, unsigned reason);
diff --git a/include/asm-x86_64/nmi.h b/include/asm-x86_64/nmi.h
index 8f02a2a416e6..8818c39d34e0 100644
--- a/include/asm-x86_64/nmi.h
+++ b/include/asm-x86_64/nmi.h
@@ -7,25 +7,6 @@
 #include <linux/pm.h>
 #include <asm/io.h>
  
-struct pt_regs;
-
-typedef int (*nmi_callback_t)(struct pt_regs * regs, int cpu);
-
-/**
- * set_nmi_callback
- *
- * Set a handler for an NMI. Only one handler may be
- * set. Return 1 if the NMI was handled.
- */
-void set_nmi_callback(nmi_callback_t callback);
-
-/**
- * unset_nmi_callback
- *
- * Remove the handler previously set.
- */
-void unset_nmi_callback(void);
-
 /**
  * do_nmi_callback
  *
@@ -72,8 +53,6 @@ extern int reserve_evntsel_nmi(unsigned int);
 extern void release_evntsel_nmi(unsigned int);
 
 extern void setup_apic_nmi_watchdog (void *);
-extern int reserve_lapic_nmi(void);
-extern void release_lapic_nmi(void);
 extern void disable_timer_nmi_watchdog(void);
 extern void enable_timer_nmi_watchdog(void);
 extern int nmi_watchdog_tick (struct pt_regs * regs, unsigned reason);
diff --git a/kernel/sysctl.c b/kernel/sysctl.c
index 362a0cc37138..83f168361624 100644
--- a/kernel/sysctl.c
+++ b/kernel/sysctl.c
@@ -76,8 +76,6 @@ extern int compat_log;
 
 #if defined(CONFIG_X86_LOCAL_APIC) && defined(CONFIG_X86)
 int unknown_nmi_panic;
-extern int proc_unknown_nmi_panic(ctl_table *, int, struct file *,
-				  void __user *, size_t *, loff_t *);
 #endif
 
 /* this is needed for the proc_dointvec_minmax for [fs_]overflow UID and GID */
@@ -628,7 +626,7 @@ static ctl_table kern_table[] = {
 		.data           = &unknown_nmi_panic,
 		.maxlen         = sizeof (int),
 		.mode           = 0644,
-		.proc_handler   = &proc_unknown_nmi_panic,
+		.proc_handler   = &proc_dointvec,
 	},
 #endif
 #if defined(CONFIG_X86)
-- 
cgit v1.2.3


From 407984f1af259b31957c7c05075a454a751bb801 Mon Sep 17 00:00:00 2001
From: Don Zickus <dzickus@redhat.com>
Date: Tue, 26 Sep 2006 10:52:27 +0200
Subject: [PATCH] x86: Add abilty to enable/disable nmi watchdog with sysctl

Adds a new /proc/sys/kernel/nmi call that will enable/disable the nmi
watchdog.

Signed-off-by:  Don Zickus <dzickus@redhat.com>
Signed-off-by: Andi Kleen <ak@suse.de>
---
 arch/i386/kernel/nmi.c   | 52 ++++++++++++++++++++++++++++++++++++++++++++++++
 arch/x86_64/kernel/nmi.c | 48 ++++++++++++++++++++++++++++++++++++++++++++
 include/asm-i386/nmi.h   |  1 +
 include/asm-x86_64/nmi.h |  1 +
 include/linux/sysctl.h   |  1 +
 kernel/sysctl.c          | 11 ++++++++++
 6 files changed, 114 insertions(+)

(limited to 'kernel')

diff --git a/arch/i386/kernel/nmi.c b/arch/i386/kernel/nmi.c
index acd3fdea2a21..28065d0b71a9 100644
--- a/arch/i386/kernel/nmi.c
+++ b/arch/i386/kernel/nmi.c
@@ -846,6 +846,58 @@ static int unknown_nmi_panic_callback(struct pt_regs *regs, int cpu)
 	return 0;
 }
 
+/*
+ * proc handler for /proc/sys/kernel/nmi_watchdog
+ */
+int proc_nmi_enabled(struct ctl_table *table, int write, struct file *file,
+			void __user *buffer, size_t *length, loff_t *ppos)
+{
+	int old_state;
+
+	nmi_watchdog_enabled = (atomic_read(&nmi_active) > 0) ? 1 : 0;
+	old_state = nmi_watchdog_enabled;
+	proc_dointvec(table, write, file, buffer, length, ppos);
+	if (!!old_state == !!nmi_watchdog_enabled)
+		return 0;
+
+	if (atomic_read(&nmi_active) < 0) {
+		printk(KERN_WARNING "NMI watchdog is permanently disabled\n");
+		return -EINVAL;
+	}
+
+	if (nmi_watchdog == NMI_DEFAULT) {
+		if (nmi_known_cpu() > 0)
+			nmi_watchdog = NMI_LOCAL_APIC;
+		else
+			nmi_watchdog = NMI_IO_APIC;
+	}
+
+	if (nmi_watchdog == NMI_LOCAL_APIC)
+	{
+		if (nmi_watchdog_enabled)
+			enable_lapic_nmi_watchdog();
+		else
+			disable_lapic_nmi_watchdog();
+	} else if (nmi_watchdog == NMI_IO_APIC) {
+		/* FIXME
+		 * for some reason these functions don't work
+		 */
+		printk("Can not enable/disable NMI on IO APIC\n");
+		return -EINVAL;
+#if 0
+		if (nmi_watchdog_enabled)
+			enable_timer_nmi_watchdog();
+		else
+			disable_timer_nmi_watchdog();
+#endif
+	} else {
+		printk( KERN_WARNING
+			"NMI watchdog doesn't know what hardware to touch\n");
+		return -EIO;
+	}
+	return 0;
+}
+
 #endif
 
 EXPORT_SYMBOL(nmi_active);
diff --git a/arch/x86_64/kernel/nmi.c b/arch/x86_64/kernel/nmi.c
index 9d175dcf3a2d..3a17411a9a19 100644
--- a/arch/x86_64/kernel/nmi.c
+++ b/arch/x86_64/kernel/nmi.c
@@ -750,6 +750,54 @@ static int unknown_nmi_panic_callback(struct pt_regs *regs, int cpu)
 	return 0;
 }
 
+/*
+ * proc handler for /proc/sys/kernel/nmi
+ */
+int proc_nmi_enabled(struct ctl_table *table, int write, struct file *file,
+			void __user *buffer, size_t *length, loff_t *ppos)
+{
+	int old_state;
+
+	nmi_watchdog_enabled = (atomic_read(&nmi_active) > 0) ? 1 : 0;
+	old_state = nmi_watchdog_enabled;
+	proc_dointvec(table, write, file, buffer, length, ppos);
+	if (!!old_state == !!nmi_watchdog_enabled)
+		return 0;
+
+	if (atomic_read(&nmi_active) < 0) {
+		printk( KERN_WARNING "NMI watchdog is permanently disabled\n");
+		return -EINVAL;
+	}
+
+	/* if nmi_watchdog is not set yet, then set it */
+	nmi_watchdog_default();
+
+	if (nmi_watchdog == NMI_LOCAL_APIC)
+	{
+		if (nmi_watchdog_enabled)
+			enable_lapic_nmi_watchdog();
+		else
+			disable_lapic_nmi_watchdog();
+	} else if (nmi_watchdog == NMI_IO_APIC) {
+		/* FIXME
+		 * for some reason these functions don't work
+		 */
+		printk("Can not enable/disable NMI on IO APIC\n");
+		return -EIO;
+#if 0
+		if (nmi_watchdog_enabled)
+			enable_timer_nmi_watchdog();
+		else
+			disable_timer_nmi_watchdog();
+#endif
+	} else {
+		printk(KERN_WARNING
+			"NMI watchdog doesn't know what hardware to touch\n");
+		return -EIO;
+	}
+	return 0;
+}
+
 #endif
 
 EXPORT_SYMBOL(nmi_active);
diff --git a/include/asm-i386/nmi.h b/include/asm-i386/nmi.h
index 34d6bf063b6e..13b5d8311bf7 100644
--- a/include/asm-i386/nmi.h
+++ b/include/asm-i386/nmi.h
@@ -14,6 +14,7 @@
  */
 int do_nmi_callback(struct pt_regs *regs, int cpu);
 
+extern int nmi_watchdog_enabled;
 extern int avail_to_resrv_perfctr_nmi_bit(unsigned int);
 extern int avail_to_resrv_perfctr_nmi(unsigned int);
 extern int reserve_perfctr_nmi(unsigned int);
diff --git a/include/asm-x86_64/nmi.h b/include/asm-x86_64/nmi.h
index 8818c39d34e0..2c23b0df87d2 100644
--- a/include/asm-x86_64/nmi.h
+++ b/include/asm-x86_64/nmi.h
@@ -43,6 +43,7 @@ extern void die_nmi(char *str, struct pt_regs *regs);
 
 extern int panic_on_timeout;
 extern int unknown_nmi_panic;
+extern int nmi_watchdog_enabled;
 
 extern int check_nmi_watchdog(void);
 extern int avail_to_resrv_perfctr_nmi_bit(unsigned int);
diff --git a/include/linux/sysctl.h b/include/linux/sysctl.h
index 736ed917a4f8..ecb79ba52ae1 100644
--- a/include/linux/sysctl.h
+++ b/include/linux/sysctl.h
@@ -150,6 +150,7 @@ enum
 	KERN_IA64_UNALIGNED=72, /* int: ia64 unaligned userland trap enable */
 	KERN_COMPAT_LOG=73,	/* int: print compat layer  messages */
 	KERN_MAX_LOCK_DEPTH=74,
+	KERN_NMI_WATCHDOG=75, /* int: enable/disable nmi watchdog */
 };
 
 
diff --git a/kernel/sysctl.c b/kernel/sysctl.c
index 83f168361624..040de6bd74dd 100644
--- a/kernel/sysctl.c
+++ b/kernel/sysctl.c
@@ -76,6 +76,9 @@ extern int compat_log;
 
 #if defined(CONFIG_X86_LOCAL_APIC) && defined(CONFIG_X86)
 int unknown_nmi_panic;
+int nmi_watchdog_enabled;
+extern int proc_nmi_enabled(struct ctl_table *, int , struct file *,
+			void __user *, size_t *, loff_t *);
 #endif
 
 /* this is needed for the proc_dointvec_minmax for [fs_]overflow UID and GID */
@@ -628,6 +631,14 @@ static ctl_table kern_table[] = {
 		.mode           = 0644,
 		.proc_handler   = &proc_dointvec,
 	},
+	{
+		.ctl_name       = KERN_NMI_WATCHDOG,
+		.procname       = "nmi_watchdog",
+		.data           = &nmi_watchdog_enabled,
+		.maxlen         = sizeof (int),
+		.mode           = 0644,
+		.proc_handler   = &proc_nmi_enabled,
+	},
 #endif
 #if defined(CONFIG_X86)
 	{
-- 
cgit v1.2.3


From 8da5adda91df3d2fcc5300e68da491694c9af019 Mon Sep 17 00:00:00 2001
From: Don Zickus <dzickus@redhat.com>
Date: Tue, 26 Sep 2006 10:52:27 +0200
Subject: [PATCH] x86: Allow users to force a panic on NMI

To quote Alan Cox:

The default Linux behaviour on an NMI of either memory or unknown is to
continue operation. For many environments such as scientific computing
it is preferable that the box is taken out and the error dealt with than
an uncorrected parity/ECC error get propogated.

A small number of systems do generate NMI's for bizarre random reasons
such as power management so the default is unchanged. In other respects
the new proc/sys entry works like the existing panic controls already in
that directory.

This is separate to the edac support - EDAC allows supported chipsets to
handle ECC errors well, this change allows unsupported cases to at least
panic rather than cause problems further down the line.

Signed-off-by: Don Zickus <dzickus@redhat.com>
Signed-off-by: Andi Kleen <ak@suse.de>
---
 arch/i386/kernel/traps.c   | 6 ++++++
 arch/x86_64/kernel/traps.c | 6 ++++++
 include/linux/kernel.h     | 1 +
 include/linux/sysctl.h     | 1 +
 kernel/panic.c             | 1 +
 kernel/sysctl.c            | 8 ++++++++
 6 files changed, 23 insertions(+)

(limited to 'kernel')

diff --git a/arch/i386/kernel/traps.c b/arch/i386/kernel/traps.c
index 7db664d0b25c..2f6cb8276480 100644
--- a/arch/i386/kernel/traps.c
+++ b/arch/i386/kernel/traps.c
@@ -635,6 +635,8 @@ static void mem_parity_error(unsigned char reason, struct pt_regs * regs)
 			"to continue\n");
 	printk(KERN_EMERG "You probably have a hardware problem with your RAM "
 			"chips\n");
+	if (panic_on_unrecovered_nmi)
+                panic("NMI: Not continuing");
 
 	/* Clear and disable the memory parity error line. */
 	clear_mem_error(reason);
@@ -670,6 +672,10 @@ static void unknown_nmi_error(unsigned char reason, struct pt_regs * regs)
 		reason, smp_processor_id());
 	printk("Dazed and confused, but trying to continue\n");
 	printk("Do you have a strange power saving mode enabled?\n");
+
+	if (panic_on_unrecovered_nmi)
+                panic("NMI: Not continuing");
+
 }
 
 static DEFINE_SPINLOCK(nmi_print_lock);
diff --git a/arch/x86_64/kernel/traps.c b/arch/x86_64/kernel/traps.c
index 42bc070fdf11..b18829db2a6a 100644
--- a/arch/x86_64/kernel/traps.c
+++ b/arch/x86_64/kernel/traps.c
@@ -732,6 +732,8 @@ mem_parity_error(unsigned char reason, struct pt_regs * regs)
 {
 	printk("Uhhuh. NMI received. Dazed and confused, but trying to continue\n");
 	printk("You probably have a hardware problem with your RAM chips\n");
+	if (panic_on_unrecovered_nmi)
+               panic("NMI: Not continuing");
 
 	/* Clear and disable the memory parity error line. */
 	reason = (reason & 0xf) | 4;
@@ -757,6 +759,10 @@ unknown_nmi_error(unsigned char reason, struct pt_regs * regs)
 {	printk("Uhhuh. NMI received for unknown reason %02x.\n", reason);
 	printk("Dazed and confused, but trying to continue\n");
 	printk("Do you have a strange power saving mode enabled?\n");
+
+	if (panic_on_unrecovered_nmi)
+                panic("NMI: Not continuing");
+
 }
 
 /* Runs on IST stack. This code must keep interrupts off all the time.
diff --git a/include/linux/kernel.h b/include/linux/kernel.h
index 2b2ae4fdce8b..1ff9609300b4 100644
--- a/include/linux/kernel.h
+++ b/include/linux/kernel.h
@@ -186,6 +186,7 @@ extern void bust_spinlocks(int yes);
 extern int oops_in_progress;		/* If set, an oops, panic(), BUG() or die() is in progress */
 extern int panic_timeout;
 extern int panic_on_oops;
+extern int panic_on_unrecovered_nmi;
 extern int tainted;
 extern const char *print_tainted(void);
 extern void add_taint(unsigned);
diff --git a/include/linux/sysctl.h b/include/linux/sysctl.h
index ecb79ba52ae1..432778446ad2 100644
--- a/include/linux/sysctl.h
+++ b/include/linux/sysctl.h
@@ -151,6 +151,7 @@ enum
 	KERN_COMPAT_LOG=73,	/* int: print compat layer  messages */
 	KERN_MAX_LOCK_DEPTH=74,
 	KERN_NMI_WATCHDOG=75, /* int: enable/disable nmi watchdog */
+	KERN_PANIC_ON_NMI=76, /* int: whether we will panic on an unrecovered */
 };
 
 
diff --git a/kernel/panic.c b/kernel/panic.c
index 8010b9b17aca..d2db3e2209e0 100644
--- a/kernel/panic.c
+++ b/kernel/panic.c
@@ -21,6 +21,7 @@
 #include <linux/debug_locks.h>
 
 int panic_on_oops;
+int panic_on_unrecovered_nmi;
 int tainted;
 static int pause_on_oops;
 static int pause_on_oops_flag;
diff --git a/kernel/sysctl.c b/kernel/sysctl.c
index 040de6bd74dd..220e20564124 100644
--- a/kernel/sysctl.c
+++ b/kernel/sysctl.c
@@ -641,6 +641,14 @@ static ctl_table kern_table[] = {
 	},
 #endif
 #if defined(CONFIG_X86)
+	{
+		.ctl_name	= KERN_PANIC_ON_NMI,
+		.procname	= "panic_on_unrecovered_nmi",
+		.data		= &panic_on_unrecovered_nmi,
+		.maxlen		= sizeof(int),
+		.mode		= 0644,
+		.proc_handler	= &proc_dointvec,
+	},
 	{
 		.ctl_name	= KERN_BOOTLOADER_TYPE,
 		.procname	= "bootloader_type",
-- 
cgit v1.2.3


From 3cfc348bf90ffaa777c188652aa297f04eb94de8 Mon Sep 17 00:00:00 2001
From: Andi Kleen <ak@suse.de>
Date: Tue, 26 Sep 2006 10:52:28 +0200
Subject: [PATCH] x86: Add portable getcpu call

For NUMA optimization and some other algorithms it is useful to have a fast
to get the current CPU and node numbers in user space.

x86-64 added a fast way to do this in a vsyscall. This adds a generic
syscall for other architectures to make it a generic portable facility.

I expect some of them will also implement it as a faster vsyscall.

The cache is an optimization for the x86-64 vsyscall optimization. Since
what the syscall returns is an approximation anyways and user space
often wants very fast results it can be cached for some time.  The norma
methods to get this information in user space are relatively slow

The vsyscall is in a better position to manage the cache because it has direct
access to a fast time stamp (jiffies). For the generic syscall optimization
it doesn't help much, but enforce a valid argument to keep programs
portable

I only added an i386 syscall entry for now. Other architectures can follow
as needed.

AK: Also added some cleanups from Andrew Morton

Signed-off-by: Andi Kleen <ak@suse.de>
---
 arch/i386/kernel/syscall_table.S |  1 +
 arch/x86_64/ia32/ia32entry.S     |  1 +
 include/asm-i386/unistd.h        |  3 ++-
 include/linux/syscalls.h         |  2 ++
 kernel/sys.c                     | 31 +++++++++++++++++++++++++++++++
 5 files changed, 37 insertions(+), 1 deletion(-)

(limited to 'kernel')

diff --git a/arch/i386/kernel/syscall_table.S b/arch/i386/kernel/syscall_table.S
index dd63d4775398..7e639f78b0b9 100644
--- a/arch/i386/kernel/syscall_table.S
+++ b/arch/i386/kernel/syscall_table.S
@@ -317,3 +317,4 @@ ENTRY(sys_call_table)
 	.long sys_tee			/* 315 */
 	.long sys_vmsplice
 	.long sys_move_pages
+	.long sys_getcpu
diff --git a/arch/x86_64/ia32/ia32entry.S b/arch/x86_64/ia32/ia32entry.S
index 30ed0f6f4a2b..32fd32bea07c 100644
--- a/arch/x86_64/ia32/ia32entry.S
+++ b/arch/x86_64/ia32/ia32entry.S
@@ -713,4 +713,5 @@ ia32_sys_call_table:
 	.quad sys_tee
 	.quad compat_sys_vmsplice
 	.quad compat_sys_move_pages
+	.quad sys_getcpu
 ia32_syscall_end:		
diff --git a/include/asm-i386/unistd.h b/include/asm-i386/unistd.h
index fc1c8ddae149..565d0897b205 100644
--- a/include/asm-i386/unistd.h
+++ b/include/asm-i386/unistd.h
@@ -323,10 +323,11 @@
 #define __NR_tee		315
 #define __NR_vmsplice		316
 #define __NR_move_pages		317
+#define __NR_getcpu		318
 
 #ifdef __KERNEL__
 
-#define NR_syscalls 318
+#define NR_syscalls 319
 
 /*
  * user-visible error numbers are in the range -1 - -128: see
diff --git a/include/linux/syscalls.h b/include/linux/syscalls.h
index 008f04c56737..3f0f716225ec 100644
--- a/include/linux/syscalls.h
+++ b/include/linux/syscalls.h
@@ -53,6 +53,7 @@ struct mq_attr;
 struct compat_stat;
 struct compat_timeval;
 struct robust_list_head;
+struct getcpu_cache;
 
 #include <linux/types.h>
 #include <linux/aio_abi.h>
@@ -596,5 +597,6 @@ asmlinkage long sys_get_robust_list(int pid,
 				    size_t __user *len_ptr);
 asmlinkage long sys_set_robust_list(struct robust_list_head __user *head,
 				    size_t len);
+asmlinkage long sys_getcpu(unsigned *cpu, unsigned *node, struct getcpu_cache *cache);
 
 #endif
diff --git a/kernel/sys.c b/kernel/sys.c
index e236f98f7ec5..3f894775488d 100644
--- a/kernel/sys.c
+++ b/kernel/sys.c
@@ -28,6 +28,7 @@
 #include <linux/tty.h>
 #include <linux/signal.h>
 #include <linux/cn_proc.h>
+#include <linux/getcpu.h>
 
 #include <linux/compat.h>
 #include <linux/syscalls.h>
@@ -2062,3 +2063,33 @@ asmlinkage long sys_prctl(int option, unsigned long arg2, unsigned long arg3,
 	}
 	return error;
 }
+
+asmlinkage long sys_getcpu(unsigned __user *cpup, unsigned __user *nodep,
+	   		   struct getcpu_cache __user *cache)
+{
+	int err = 0;
+	int cpu = raw_smp_processor_id();
+	if (cpup)
+		err |= put_user(cpu, cpup);
+	if (nodep)
+		err |= put_user(cpu_to_node(cpu), nodep);
+	if (cache) {
+		/*
+		 * The cache is not needed for this implementation,
+		 * but make sure user programs pass something
+		 * valid. vsyscall implementations can instead make
+		 * good use of the cache. Only use t0 and t1 because
+		 * these are available in both 32bit and 64bit ABI (no
+		 * need for a compat_getcpu). 32bit has enough
+		 * padding
+		 */
+		unsigned long t0, t1;
+		get_user(t0, &cache->t0);
+		get_user(t1, &cache->t1);
+		t0++;
+		t1++;
+		put_user(t0, &cache->t0);
+		put_user(t1, &cache->t1);
+	}
+	return err ? -EFAULT : 0;
+}
-- 
cgit v1.2.3


From 0cb91a2293648507886563ccb91979cfc94d6a4b Mon Sep 17 00:00:00 2001
From: Andi Kleen <ak@suse.de>
Date: Tue, 26 Sep 2006 10:52:28 +0200
Subject: [PATCH] i386: Account spinlocks to the caller during profiling for
 !FP kernels

This ports the algorithm from x86-64 (with improvements) to i386.
Previously this only worked for frame pointer enabled kernels.
But spinlocks have a very simple stack frame that can be manually
analyzed. Do this.

Signed-off-by: Andi Kleen <ak@suse.de>
---
 arch/i386/kernel/time.c   | 23 +++++++++++++++++++----
 include/asm-i386/ptrace.h |  4 ----
 kernel/spinlock.c         |  5 +++++
 3 files changed, 24 insertions(+), 8 deletions(-)

(limited to 'kernel')

diff --git a/arch/i386/kernel/time.c b/arch/i386/kernel/time.c
index edd00f6cee37..5af802ef00b2 100644
--- a/arch/i386/kernel/time.c
+++ b/arch/i386/kernel/time.c
@@ -130,18 +130,33 @@ static int set_rtc_mmss(unsigned long nowtime)
 
 int timer_ack;
 
-#if defined(CONFIG_SMP) && defined(CONFIG_FRAME_POINTER)
 unsigned long profile_pc(struct pt_regs *regs)
 {
 	unsigned long pc = instruction_pointer(regs);
 
-	if (!user_mode_vm(regs) && in_lock_functions(pc))
+#ifdef CONFIG_SMP
+	if (!user_mode_vm(regs) && in_lock_functions(pc)) {
+#ifdef CONFIG_FRAME_POINTER
 		return *(unsigned long *)(regs->ebp + 4);
-
+#else
+		unsigned long *sp;
+		if ((regs->xcs & 3) == 0)
+			sp = (unsigned long *)&regs->esp;
+		else
+			sp = (unsigned long *)regs->esp;
+		/* Return address is either directly at stack pointer
+		   or above a saved eflags. Eflags has bits 22-31 zero,
+		   kernel addresses don't. */
+ 		if (sp[0] >> 22)
+			return sp[0];
+		if (sp[1] >> 22)
+			return sp[1];
+#endif
+	}
+#endif
 	return pc;
 }
 EXPORT_SYMBOL(profile_pc);
-#endif
 
 /*
  * This is the same as the above, except we _also_ save the current
diff --git a/include/asm-i386/ptrace.h b/include/asm-i386/ptrace.h
index f324c53b6f9a..30a442ec2059 100644
--- a/include/asm-i386/ptrace.h
+++ b/include/asm-i386/ptrace.h
@@ -80,11 +80,7 @@ static inline int user_mode_vm(struct pt_regs *regs)
 	return ((regs->xcs & 3) | (regs->eflags & VM_MASK)) != 0;
 }
 #define instruction_pointer(regs) ((regs)->eip)
-#if defined(CONFIG_SMP) && defined(CONFIG_FRAME_POINTER)
 extern unsigned long profile_pc(struct pt_regs *regs);
-#else
-#define profile_pc(regs) instruction_pointer(regs)
-#endif
 #endif /* __KERNEL__ */
 
 #endif
diff --git a/kernel/spinlock.c b/kernel/spinlock.c
index fb524b009eef..9644a41e0bef 100644
--- a/kernel/spinlock.c
+++ b/kernel/spinlock.c
@@ -7,6 +7,11 @@
  *
  * This file contains the spinlock/rwlock implementations for the
  * SMP and the DEBUG_SPINLOCK cases. (UP-nondebug inlines them)
+ *
+ * Note that some architectures have special knowledge about the
+ * stack frames of these functions in their profile_pc. If you
+ * change anything significant here that could change the stack
+ * frame contact the architecture maintainers.
  */
 
 #include <linux/linkage.h>
-- 
cgit v1.2.3


From 5a1b3999d6cb7ab87f1f3b1700bc91839fd6fa29 Mon Sep 17 00:00:00 2001
From: Andi Kleen <ak@suse.de>
Date: Tue, 26 Sep 2006 10:52:34 +0200
Subject: [PATCH] x86: Some preparationary cleanup for stack trace

- Remove unused all_contexts parameter
No caller used it
- Move skip argument into the structure (needed for
followon patches)

Cc: mingo@elte.hu

Signed-off-by: Andi Kleen <ak@suse.de>
---
 arch/i386/kernel/stacktrace.c   | 11 +++--------
 arch/s390/kernel/stacktrace.c   | 17 ++++++++---------
 arch/x86_64/kernel/stacktrace.c | 14 +++++---------
 include/linux/stacktrace.h      |  7 ++++---
 kernel/lockdep.c                |  5 ++++-
 5 files changed, 24 insertions(+), 30 deletions(-)

(limited to 'kernel')

diff --git a/arch/i386/kernel/stacktrace.c b/arch/i386/kernel/stacktrace.c
index e62a037ab399..ae3c32a87add 100644
--- a/arch/i386/kernel/stacktrace.c
+++ b/arch/i386/kernel/stacktrace.c
@@ -61,12 +61,8 @@ save_context_stack(struct stack_trace *trace, unsigned int skip,
 
 /*
  * Save stack-backtrace addresses into a stack_trace buffer.
- * If all_contexts is set, all contexts (hardirq, softirq and process)
- * are saved. If not set then only the current context is saved.
  */
-void save_stack_trace(struct stack_trace *trace,
-		      struct task_struct *task, int all_contexts,
-		      unsigned int skip)
+void save_stack_trace(struct stack_trace *trace, struct task_struct *task)
 {
 	unsigned long ebp;
 	unsigned long *stack = &ebp;
@@ -85,10 +81,9 @@ void save_stack_trace(struct stack_trace *trace,
 		struct thread_info *context = (struct thread_info *)
 				((unsigned long)stack & (~(THREAD_SIZE - 1)));
 
-		ebp = save_context_stack(trace, skip, context, stack, ebp);
+		ebp = save_context_stack(trace, trace->skip, context, stack, ebp);
 		stack = (unsigned long *)context->previous_esp;
-		if (!all_contexts || !stack ||
-				trace->nr_entries >= trace->max_entries)
+		if (!stack || trace->nr_entries >= trace->max_entries)
 			break;
 		trace->entries[trace->nr_entries++] = ULONG_MAX;
 		if (trace->nr_entries >= trace->max_entries)
diff --git a/arch/s390/kernel/stacktrace.c b/arch/s390/kernel/stacktrace.c
index de83f38288d0..d9428a0fc8fb 100644
--- a/arch/s390/kernel/stacktrace.c
+++ b/arch/s390/kernel/stacktrace.c
@@ -59,9 +59,7 @@ static inline unsigned long save_context_stack(struct stack_trace *trace,
 	}
 }
 
-void save_stack_trace(struct stack_trace *trace,
-		      struct task_struct *task, int all_contexts,
-		      unsigned int skip)
+void save_stack_trace(struct stack_trace *trace, struct task_struct *task)
 {
 	register unsigned long sp asm ("15");
 	unsigned long orig_sp;
@@ -69,22 +67,23 @@ void save_stack_trace(struct stack_trace *trace,
 	sp &= PSW_ADDR_INSN;
 	orig_sp = sp;
 
-	sp = save_context_stack(trace, &skip, sp,
+	sp = save_context_stack(trace, &trace->skip, sp,
 				S390_lowcore.panic_stack - PAGE_SIZE,
 				S390_lowcore.panic_stack);
-	if ((sp != orig_sp) && !all_contexts)
+	if ((sp != orig_sp) && !trace->all_contexts)
 		return;
-	sp = save_context_stack(trace, &skip, sp,
+	sp = save_context_stack(trace, &trace->skip, sp,
 				S390_lowcore.async_stack - ASYNC_SIZE,
 				S390_lowcore.async_stack);
-	if ((sp != orig_sp) && !all_contexts)
+	if ((sp != orig_sp) && !trace->all_contexts)
 		return;
 	if (task)
-		save_context_stack(trace, &skip, sp,
+		save_context_stack(trace, &trace->skip, sp,
 				   (unsigned long) task_stack_page(task),
 				   (unsigned long) task_stack_page(task) + THREAD_SIZE);
 	else
-		save_context_stack(trace, &skip, sp, S390_lowcore.thread_info,
+		save_context_stack(trace, &trace->skip, sp,
+				   S390_lowcore.thread_info,
 				   S390_lowcore.thread_info + THREAD_SIZE);
 	return;
 }
diff --git a/arch/x86_64/kernel/stacktrace.c b/arch/x86_64/kernel/stacktrace.c
index 32cf55eb9af8..1c022af8fe1e 100644
--- a/arch/x86_64/kernel/stacktrace.c
+++ b/arch/x86_64/kernel/stacktrace.c
@@ -109,9 +109,10 @@ out_restore:
  * Save stack-backtrace addresses into a stack_trace buffer:
  */
 static inline unsigned long
-save_context_stack(struct stack_trace *trace, unsigned int skip,
+save_context_stack(struct stack_trace *trace,
 		   unsigned long stack, unsigned long stack_end)
 {
+	int skip = trace->skip;
 	unsigned long addr;
 
 #ifdef CONFIG_FRAME_POINTER
@@ -159,12 +160,8 @@ save_context_stack(struct stack_trace *trace, unsigned int skip,
 
 /*
  * Save stack-backtrace addresses into a stack_trace buffer.
- * If all_contexts is set, all contexts (hardirq, softirq and process)
- * are saved. If not set then only the current context is saved.
  */
-void save_stack_trace(struct stack_trace *trace,
-		      struct task_struct *task, int all_contexts,
-		      unsigned int skip)
+void save_stack_trace(struct stack_trace *trace, struct task_struct *task)
 {
 	unsigned long stack = (unsigned long)&stack;
 	int i, nr_stacks = 0, stacks_done[MAX_STACKS];
@@ -207,9 +204,8 @@ void save_stack_trace(struct stack_trace *trace,
 				return;
 		stacks_done[nr_stacks] = stack_end;
 
-		stack = save_context_stack(trace, skip, stack, stack_end);
-		if (!all_contexts || !stack ||
-				trace->nr_entries >= trace->max_entries)
+		stack = save_context_stack(trace, stack, stack_end);
+		if (!stack || trace->nr_entries >= trace->max_entries)
 			return;
 		trace->entries[trace->nr_entries++] = ULONG_MAX;
 		if (trace->nr_entries >= trace->max_entries)
diff --git a/include/linux/stacktrace.h b/include/linux/stacktrace.h
index 9cc81e572224..50e2b01e517c 100644
--- a/include/linux/stacktrace.h
+++ b/include/linux/stacktrace.h
@@ -5,15 +5,16 @@
 struct stack_trace {
 	unsigned int nr_entries, max_entries;
 	unsigned long *entries;
+	int skip;	/* input argument: How many entries to skip */
+	int all_contexts; /* input argument: if true do than one stack */
 };
 
 extern void save_stack_trace(struct stack_trace *trace,
-			     struct task_struct *task, int all_contexts,
-			     unsigned int skip);
+			     struct task_struct *task);
 
 extern void print_stack_trace(struct stack_trace *trace, int spaces);
 #else
-# define save_stack_trace(trace, task, all, skip)	do { } while (0)
+# define save_stack_trace(trace, task)			do { } while (0)
 # define print_stack_trace(trace)			do { } while (0)
 #endif
 
diff --git a/kernel/lockdep.c b/kernel/lockdep.c
index 9bad17884513..900b4cb1a024 100644
--- a/kernel/lockdep.c
+++ b/kernel/lockdep.c
@@ -224,7 +224,10 @@ static int save_trace(struct stack_trace *trace)
 	trace->max_entries = MAX_STACK_TRACE_ENTRIES - nr_stack_trace_entries;
 	trace->entries = stack_trace + nr_stack_trace_entries;
 
-	save_stack_trace(trace, NULL, 0, 3);
+	trace->skip = 3;
+	trace->all_contexts = 0;
+
+	save_stack_trace(trace, NULL);
 
 	trace->max_entries = trace->nr_entries;
 
-- 
cgit v1.2.3


From 3fa7c794fe4dc127f7fac3fad4d13628e68f89ce Mon Sep 17 00:00:00 2001
From: Andi Kleen <ak@suse.de>
Date: Tue, 26 Sep 2006 10:52:34 +0200
Subject: [PATCH] Avoid recursion in lockdep when stack tracer takes locks

The new dwarf2 unwinder needs to take locks to do backtraces
inside modules. This patch makes sure lockdep which calls
stacktrace is not reentered.

Thanks to Ingo for suggesting this simpler approach.

Cc: mingo@elte.hu
Signed-off-by: Andi Kleen <ak@suse.de>
---
 kernel/lockdep.c | 4 ++++
 1 file changed, 4 insertions(+)

(limited to 'kernel')

diff --git a/kernel/lockdep.c b/kernel/lockdep.c
index 900b4cb1a024..c088e5542e84 100644
--- a/kernel/lockdep.c
+++ b/kernel/lockdep.c
@@ -227,7 +227,11 @@ static int save_trace(struct stack_trace *trace)
 	trace->skip = 3;
 	trace->all_contexts = 0;
 
+	/* Make sure to not recurse in case the the unwinder needs to tak
+e	   locks. */
+	lockdep_off();
 	save_stack_trace(trace, NULL);
+	lockdep_on();
 
 	trace->max_entries = trace->nr_entries;
 
-- 
cgit v1.2.3


From 0a4254058037eb172758961d0a5b94f4320a1425 Mon Sep 17 00:00:00 2001
From: Arjan van de Ven <arjan@linux.intel.com>
Date: Tue, 26 Sep 2006 10:52:38 +0200
Subject: [PATCH] Add the canary field to the PDA area and the task struct

This patch adds the per thread cookie field to the task struct and the PDA.
Also it makes sure that the PDA value gets the new cookie value at context
switch, and that a new task gets a new cookie at task creation time.

Signed-off-by: Arjan van Ven <arjan@linux.intel.com>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
Signed-off-by: Andi Kleen <ak@suse.de>
CC: Andi Kleen <ak@suse.de>
---
 arch/x86_64/kernel/process.c | 8 ++++++++
 include/asm-x86_64/pda.h     | 7 ++++++-
 include/linux/sched.h        | 5 +++++
 kernel/fork.c                | 5 +++++
 4 files changed, 24 insertions(+), 1 deletion(-)

(limited to 'kernel')

diff --git a/arch/x86_64/kernel/process.c b/arch/x86_64/kernel/process.c
index 9e9a70e50c72..fba8dfeda67c 100644
--- a/arch/x86_64/kernel/process.c
+++ b/arch/x86_64/kernel/process.c
@@ -625,6 +625,14 @@ __switch_to(struct task_struct *prev_p, struct task_struct *next_p)
 	unlazy_fpu(prev_p);
 	write_pda(kernelstack,
 		  task_stack_page(next_p) + THREAD_SIZE - PDA_STACKOFFSET);
+#ifdef CONFIG_CC_STACKPROTECTOR
+	write_pda(stack_canary, next_p->stack_canary);
+	/*
+	 * Build time only check to make sure the stack_canary is at
+	 * offset 40 in the pda; this is a gcc ABI requirement
+	 */
+	BUILD_BUG_ON(offsetof(struct x8664_pda, stack_canary) != 40);
+#endif
 
 	/*
 	 * Now maybe reload the debug registers and handle I/O bitmaps
diff --git a/include/asm-x86_64/pda.h b/include/asm-x86_64/pda.h
index 6794ffaae433..e7773e0af865 100644
--- a/include/asm-x86_64/pda.h
+++ b/include/asm-x86_64/pda.h
@@ -16,7 +16,12 @@ struct x8664_pda {
 	unsigned long oldrsp; 	    /* 24 user rsp for system call */
         int irqcount;		    /* 32 Irq nesting counter. Starts with -1 */
 	int cpunumber;		    /* 36 Logical CPU number */
-	char *irqstackptr;	/* 40 top of irqstack */
+#ifdef CONFIG_CC_STACKPROTECTOR
+	unsigned long stack_canary;	/* 40 stack canary value */
+					/* gcc-ABI: this canary MUST be at
+					   offset 40!!! */
+#endif
+	char *irqstackptr;
 	int nodenumber;		    /* number of current node */
 	unsigned int __softirq_pending;
 	unsigned int __nmi_count;	/* number of NMI on this CPUs */
diff --git a/include/linux/sched.h b/include/linux/sched.h
index 807556c5bcd2..9d4aa7f95bc8 100644
--- a/include/linux/sched.h
+++ b/include/linux/sched.h
@@ -819,6 +819,11 @@ struct task_struct {
 	unsigned did_exec:1;
 	pid_t pid;
 	pid_t tgid;
+
+#ifdef CONFIG_CC_STACKPROTECTOR
+	/* Canary value for the -fstack-protector gcc feature */
+	unsigned long stack_canary;
+#endif
 	/* 
 	 * pointers to (original) parent process, youngest child, younger sibling,
 	 * older sibling, respectively.  (p->father can be replaced with 
diff --git a/kernel/fork.c b/kernel/fork.c
index f9b014e3e700..a0dad84567c9 100644
--- a/kernel/fork.c
+++ b/kernel/fork.c
@@ -45,6 +45,7 @@
 #include <linux/cn_proc.h>
 #include <linux/delayacct.h>
 #include <linux/taskstats_kern.h>
+#include <linux/random.h>
 
 #include <asm/pgtable.h>
 #include <asm/pgalloc.h>
@@ -175,6 +176,10 @@ static struct task_struct *dup_task_struct(struct task_struct *orig)
 	tsk->thread_info = ti;
 	setup_thread_stack(tsk, orig);
 
+#ifdef CONFIG_CC_STACKPROTECTOR
+	tsk->stack_canary = get_random_int();
+#endif
+
 	/* One for us, one for whoever does the "release_task()" (usually parent) */
 	atomic_set(&tsk->usage,2);
 	atomic_set(&tsk->fs_excl, 0);
-- 
cgit v1.2.3


From 3162f751d04086a9d006342de63ac8f44fe0f72a Mon Sep 17 00:00:00 2001
From: Arjan van de Ven <arjan@linux.intel.com>
Date: Tue, 26 Sep 2006 10:52:39 +0200
Subject: [PATCH] Add the __stack_chk_fail() function

GCC emits a call to a __stack_chk_fail() function when the stack canary is
not matching the expected value.

Since this is a bad security issue; lets panic the kernel rather than limping
along; the kernel really can't be trusted anymore when this happens.

Signed-off-by: Arjan van de Ven <arjan@linux.intel.com>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
Signed-off-by: Andi Kleen <ak@suse.de>
CC: Andi Kleen <ak@suse.de>
---
 kernel/panic.c | 12 ++++++++++++
 1 file changed, 12 insertions(+)

(limited to 'kernel')

diff --git a/kernel/panic.c b/kernel/panic.c
index d2db3e2209e0..6ceb664fb52a 100644
--- a/kernel/panic.c
+++ b/kernel/panic.c
@@ -271,3 +271,15 @@ void oops_exit(void)
 {
 	do_oops_enter_exit();
 }
+
+#ifdef CONFIG_CC_STACKPROTECTOR
+/*
+ * Called when gcc's -fstack-protector feature is used, and
+ * gcc detects corruption of the on-stack canary value
+ */
+void __stack_chk_fail(void)
+{
+	panic("stack-protector: Kernel stack is corrupted");
+}
+EXPORT_SYMBOL(__stack_chk_fail);
+#endif
-- 
cgit v1.2.3


From adf1423698f00d00b267f7dca8231340ce7d65ef Mon Sep 17 00:00:00 2001
From: Jan Beulich <jbeulich@novell.com>
Date: Tue, 26 Sep 2006 10:52:41 +0200
Subject: [PATCH] i386/x86-64: Work around gcc bug with noreturn functions in
 unwinder

Current gcc generates calls not jumps to noreturn functions. When that happens the
return address can point to the next function, which confuses the unwinder.

This patch works around it by marking asynchronous exception
frames in contrast normal call frames in the unwind information.  Then teach
the unwinder to decode this.

For normal call frames the unwinder now subtracts one from the address which avoids
this problem.  The standard libgcc unwinder uses the same trick.

It doesn't include adjustment of the printed address (i.e. for the original
example, it'd still be kernel_math_error+0 that gets displayed, but the
unwinder wouldn't get confused anymore.

This only works with binutils 2.6.17+ and some versions of H.J.Lu's 2.6.16
unfortunately because earlier binutils don't support .cfi_signal_frame

[AK: added automatic detection of the new binutils and wrote description]

Signed-off-by: Jan Beulich <jbeulich@novell.com>
Signed-off-by: Andi Kleen <ak@suse.de>
---
 arch/i386/Makefile           |  4 ++++
 arch/i386/kernel/entry.S     |  4 ++++
 arch/x86_64/Makefile         |  4 ++++
 arch/x86_64/ia32/ia32entry.S |  4 ++++
 arch/x86_64/kernel/entry.S   |  4 ++++
 include/asm-i386/dwarf2.h    |  7 +++++++
 include/asm-i386/unwind.h    |  5 +++++
 include/asm-x86_64/dwarf2.h  |  6 ++++++
 include/asm-x86_64/unwind.h  |  5 +++++
 kernel/unwind.c              | 35 ++++++++++++++++++++++++++++-------
 scripts/Kbuild.include       |  4 ++--
 11 files changed, 73 insertions(+), 9 deletions(-)

(limited to 'kernel')

diff --git a/arch/i386/Makefile b/arch/i386/Makefile
index 508cdbeb3a09..7cc0b189b82b 100644
--- a/arch/i386/Makefile
+++ b/arch/i386/Makefile
@@ -50,6 +50,10 @@ CFLAGS				+= $(shell if [ $(call cc-version) -lt 0400 ] ; then echo $(call cc-op
 cflags-y += $(call as-instr,.cfi_startproc\n.cfi_endproc,-DCONFIG_AS_CFI=1,)
 AFLAGS += $(call as-instr,.cfi_startproc\n.cfi_endproc,-DCONFIG_AS_CFI=1,)
 
+# is .cfi_signal_frame supported too?
+cflags-y += $(call as-instr,.cfi_startproc\n.cfi_endproc,-DCONFIG_AS_CFI=1,)
+AFLAGS += $(call as-instr,.cfi_startproc\n.cfi_endproc,-DCONFIG_AS_CFI=1,)
+
 CFLAGS += $(cflags-y)
 
 # Default subarch .c files
diff --git a/arch/i386/kernel/entry.S b/arch/i386/kernel/entry.S
index 284f2e908ad0..5a63d6fdb70e 100644
--- a/arch/i386/kernel/entry.S
+++ b/arch/i386/kernel/entry.S
@@ -183,18 +183,21 @@ VM_MASK		= 0x00020000
 
 #define RING0_INT_FRAME \
 	CFI_STARTPROC simple;\
+	CFI_SIGNAL_FRAME;\
 	CFI_DEF_CFA esp, 3*4;\
 	/*CFI_OFFSET cs, -2*4;*/\
 	CFI_OFFSET eip, -3*4
 
 #define RING0_EC_FRAME \
 	CFI_STARTPROC simple;\
+	CFI_SIGNAL_FRAME;\
 	CFI_DEF_CFA esp, 4*4;\
 	/*CFI_OFFSET cs, -2*4;*/\
 	CFI_OFFSET eip, -3*4
 
 #define RING0_PTREGS_FRAME \
 	CFI_STARTPROC simple;\
+	CFI_SIGNAL_FRAME;\
 	CFI_DEF_CFA esp, OLDESP-EBX;\
 	/*CFI_OFFSET cs, CS-OLDESP;*/\
 	CFI_OFFSET eip, EIP-OLDESP;\
@@ -275,6 +278,7 @@ need_resched:
 	# sysenter call handler stub
 ENTRY(sysenter_entry)
 	CFI_STARTPROC simple
+	CFI_SIGNAL_FRAME
 	CFI_DEF_CFA esp, 0
 	CFI_REGISTER esp, ebp
 	movl TSS_sysenter_esp0(%esp),%esp
diff --git a/arch/x86_64/Makefile b/arch/x86_64/Makefile
index 2b8d07c70106..1c0f18d4f887 100644
--- a/arch/x86_64/Makefile
+++ b/arch/x86_64/Makefile
@@ -58,6 +58,10 @@ cflags-y += $(call cc-option,-mno-sse -mno-mmx -mno-sse2 -mno-3dnow,)
 cflags-y += $(call as-instr,.cfi_startproc\n.cfi_endproc,-DCONFIG_AS_CFI=1,)
 AFLAGS += $(call as-instr,.cfi_startproc\n.cfi_endproc,-DCONFIG_AS_CFI=1,)
 
+# is .cfi_signal_frame supported too?
+cflags-y += $(call as-instr,.cfi_startproc\n.cfi_signal_frame\n.cfi_endproc,-DCONFIG_AS_CFI_SIGNAL_FRAME=1,)
+AFLAGS += $(call as-instr,.cfi_startproc\n.cfi_signal_frame\n.cfi_endproc,-DCONFIG_AS_CFI_SIGNAL_FRAME=1,)
+
 cflags-$(CONFIG_CC_STACKPROTECTOR) += $(shell $(CONFIG_SHELL) $(srctree)/scripts/gcc-x86_64-has-stack-protector.sh $(CC) -fstack-protector )
 cflags-$(CONFIG_CC_STACKPROTECTOR_ALL) += $(shell $(CONFIG_SHELL) $(srctree)/scripts/gcc-x86_64-has-stack-protector.sh $(CC) -fstack-protector-all )
 
diff --git a/arch/x86_64/ia32/ia32entry.S b/arch/x86_64/ia32/ia32entry.S
index 32fd32bea07c..b4aa875e175b 100644
--- a/arch/x86_64/ia32/ia32entry.S
+++ b/arch/x86_64/ia32/ia32entry.S
@@ -71,6 +71,7 @@
  */ 	
 ENTRY(ia32_sysenter_target)
 	CFI_STARTPROC32	simple
+	CFI_SIGNAL_FRAME
 	CFI_DEF_CFA	rsp,0
 	CFI_REGISTER	rsp,rbp
 	swapgs
@@ -186,6 +187,7 @@ ENDPROC(ia32_sysenter_target)
  */ 	
 ENTRY(ia32_cstar_target)
 	CFI_STARTPROC32	simple
+	CFI_SIGNAL_FRAME
 	CFI_DEF_CFA	rsp,PDA_STACKOFFSET
 	CFI_REGISTER	rip,rcx
 	/*CFI_REGISTER	rflags,r11*/
@@ -293,6 +295,7 @@ ia32_badarg:
 
 ENTRY(ia32_syscall)
 	CFI_STARTPROC	simple
+	CFI_SIGNAL_FRAME
 	CFI_DEF_CFA	rsp,SS+8-RIP
 	/*CFI_REL_OFFSET	ss,SS-RIP*/
 	CFI_REL_OFFSET	rsp,RSP-RIP
@@ -370,6 +373,7 @@ ENTRY(ia32_ptregs_common)
 	popq %r11
 	CFI_ENDPROC
 	CFI_STARTPROC32	simple
+	CFI_SIGNAL_FRAME
 	CFI_DEF_CFA	rsp,SS+8-ARGOFFSET
 	CFI_REL_OFFSET	rax,RAX-ARGOFFSET
 	CFI_REL_OFFSET	rcx,RCX-ARGOFFSET
diff --git a/arch/x86_64/kernel/entry.S b/arch/x86_64/kernel/entry.S
index ea32688386fd..4cbc65290ae7 100644
--- a/arch/x86_64/kernel/entry.S
+++ b/arch/x86_64/kernel/entry.S
@@ -123,6 +123,7 @@
 	.macro	CFI_DEFAULT_STACK start=1
 	.if \start
 	CFI_STARTPROC	simple
+	CFI_SIGNAL_FRAME
 	CFI_DEF_CFA	rsp,SS+8
 	.else
 	CFI_DEF_CFA_OFFSET SS+8
@@ -207,6 +208,7 @@ END(ret_from_fork)
 
 ENTRY(system_call)
 	CFI_STARTPROC	simple
+	CFI_SIGNAL_FRAME
 	CFI_DEF_CFA	rsp,PDA_STACKOFFSET
 	CFI_REGISTER	rip,rcx
 	/*CFI_REGISTER	rflags,r11*/
@@ -324,6 +326,7 @@ END(system_call)
  */ 	
 ENTRY(int_ret_from_sys_call)
 	CFI_STARTPROC	simple
+	CFI_SIGNAL_FRAME
 	CFI_DEF_CFA	rsp,SS+8-ARGOFFSET
 	/*CFI_REL_OFFSET	ss,SS-ARGOFFSET*/
 	CFI_REL_OFFSET	rsp,RSP-ARGOFFSET
@@ -484,6 +487,7 @@ END(stub_rt_sigreturn)
  */
 	.macro _frame ref
 	CFI_STARTPROC simple
+	CFI_SIGNAL_FRAME
 	CFI_DEF_CFA rsp,SS+8-\ref
 	/*CFI_REL_OFFSET ss,SS-\ref*/
 	CFI_REL_OFFSET rsp,RSP-\ref
diff --git a/include/asm-i386/dwarf2.h b/include/asm-i386/dwarf2.h
index 5d1a8db5a9b0..6d66398a307d 100644
--- a/include/asm-i386/dwarf2.h
+++ b/include/asm-i386/dwarf2.h
@@ -28,6 +28,12 @@
 #define CFI_RESTORE_STATE .cfi_restore_state
 #define CFI_UNDEFINED .cfi_undefined
 
+#ifdef CONFIG_AS_CFI_SIGNAL_FRAME
+#define CFI_SIGNAL_FRAME .cfi_signal_frame
+#else
+#define CFI_SIGNAL_FRAME
+#endif
+
 #else
 
 /* Due to the structure of pre-exisiting code, don't use assembler line
@@ -48,6 +54,7 @@
 #define CFI_REMEMBER_STATE ignore
 #define CFI_RESTORE_STATE ignore
 #define CFI_UNDEFINED ignore
+#define CFI_SIGNAL_FRAME ignore
 
 #endif
 
diff --git a/include/asm-i386/unwind.h b/include/asm-i386/unwind.h
index f0ac399bae3c..5031d693b89d 100644
--- a/include/asm-i386/unwind.h
+++ b/include/asm-i386/unwind.h
@@ -18,6 +18,7 @@ struct unwind_frame_info
 {
 	struct pt_regs regs;
 	struct task_struct *task;
+	unsigned call_frame:1;
 };
 
 #define UNW_PC(frame)        (frame)->regs.eip
@@ -44,6 +45,10 @@ struct unwind_frame_info
 	PTREGS_INFO(edi), \
 	PTREGS_INFO(eip)
 
+#define UNW_DEFAULT_RA(raItem, dataAlign) \
+	((raItem).where == Memory && \
+	 !((raItem).value * (dataAlign) + 4))
+
 static inline void arch_unw_init_frame_info(struct unwind_frame_info *info,
                                             /*const*/ struct pt_regs *regs)
 {
diff --git a/include/asm-x86_64/dwarf2.h b/include/asm-x86_64/dwarf2.h
index 2b9368365fad..eedc08526b0b 100644
--- a/include/asm-x86_64/dwarf2.h
+++ b/include/asm-x86_64/dwarf2.h
@@ -28,6 +28,11 @@
 #define CFI_REMEMBER_STATE .cfi_remember_state
 #define CFI_RESTORE_STATE .cfi_restore_state
 #define CFI_UNDEFINED .cfi_undefined
+#ifdef CONFIG_AS_CFI_SIGNAL_FRAME
+#define CFI_SIGNAL_FRAME .cfi_signal_frame
+#else
+#define CFI_SIGNAL_FRAME
+#endif
 
 #else
 
@@ -45,6 +50,7 @@
 #define CFI_REMEMBER_STATE	#
 #define CFI_RESTORE_STATE	#
 #define CFI_UNDEFINED	#
+#define CFI_SIGNAL_FRAME	#
 
 #endif
 
diff --git a/include/asm-x86_64/unwind.h b/include/asm-x86_64/unwind.h
index 1f6e9bfb569e..b8fa5cb7ff88 100644
--- a/include/asm-x86_64/unwind.h
+++ b/include/asm-x86_64/unwind.h
@@ -18,6 +18,7 @@ struct unwind_frame_info
 {
 	struct pt_regs regs;
 	struct task_struct *task;
+	unsigned call_frame:1;
 };
 
 #define UNW_PC(frame)        (frame)->regs.rip
@@ -57,6 +58,10 @@ struct unwind_frame_info
 	PTREGS_INFO(r15), \
 	PTREGS_INFO(rip)
 
+#define UNW_DEFAULT_RA(raItem, dataAlign) \
+	((raItem).where == Memory && \
+	 !((raItem).value * (dataAlign) + 8))
+
 static inline void arch_unw_init_frame_info(struct unwind_frame_info *info,
                                             /*const*/ struct pt_regs *regs)
 {
diff --git a/kernel/unwind.c b/kernel/unwind.c
index f69c804c8e62..3430475fcd88 100644
--- a/kernel/unwind.c
+++ b/kernel/unwind.c
@@ -603,6 +603,7 @@ int unwind(struct unwind_frame_info *frame)
 #define FRAME_REG(r, t) (((t *)frame)[reg_info[r].offs])
 	const u32 *fde = NULL, *cie = NULL;
 	const u8 *ptr = NULL, *end = NULL;
+	unsigned long pc = UNW_PC(frame) - frame->call_frame;
 	unsigned long startLoc = 0, endLoc = 0, cfa;
 	unsigned i;
 	signed ptrType = -1;
@@ -612,7 +613,7 @@ int unwind(struct unwind_frame_info *frame)
 
 	if (UNW_PC(frame) == 0)
 		return -EINVAL;
-	if ((table = find_table(UNW_PC(frame))) != NULL
+	if ((table = find_table(pc)) != NULL
 	    && !(table->size & (sizeof(*fde) - 1))) {
 		unsigned long tableSize = table->size;
 
@@ -647,7 +648,7 @@ int unwind(struct unwind_frame_info *frame)
 			                        ptrType & DW_EH_PE_indirect
 			                        ? ptrType
 			                        : ptrType & (DW_EH_PE_FORM|DW_EH_PE_signed));
-			if (UNW_PC(frame) >= startLoc && UNW_PC(frame) < endLoc)
+			if (pc >= startLoc && pc < endLoc)
 				break;
 			cie = NULL;
 		}
@@ -657,16 +658,28 @@ int unwind(struct unwind_frame_info *frame)
 		state.cieEnd = ptr; /* keep here temporarily */
 		ptr = (const u8 *)(cie + 2);
 		end = (const u8 *)(cie + 1) + *cie;
+		frame->call_frame = 1;
 		if ((state.version = *ptr) != 1)
 			cie = NULL; /* unsupported version */
 		else if (*++ptr) {
 			/* check if augmentation size is first (and thus present) */
 			if (*ptr == 'z') {
-				/* check for ignorable (or already handled)
-				 * nul-terminated augmentation string */
-				while (++ptr < end && *ptr)
-					if (strchr("LPR", *ptr) == NULL)
+				while (++ptr < end && *ptr) {
+					switch(*ptr) {
+					/* check for ignorable (or already handled)
+					 * nul-terminated augmentation string */
+					case 'L':
+					case 'P':
+					case 'R':
+						continue;
+					case 'S':
+						frame->call_frame = 0;
+						continue;
+					default:
 						break;
+					}
+					break;
+				}
 			}
 			if (ptr >= end || *ptr)
 				cie = NULL;
@@ -755,7 +768,7 @@ int unwind(struct unwind_frame_info *frame)
 	state.org = startLoc;
 	memcpy(&state.cfa, &badCFA, sizeof(state.cfa));
 	/* process instructions */
-	if (!processCFI(ptr, end, UNW_PC(frame), ptrType, &state)
+	if (!processCFI(ptr, end, pc, ptrType, &state)
 	   || state.loc > endLoc
 	   || state.regs[retAddrReg].where == Nowhere
 	   || state.cfa.reg >= ARRAY_SIZE(reg_info)
@@ -763,6 +776,11 @@ int unwind(struct unwind_frame_info *frame)
 	   || state.cfa.offs % sizeof(unsigned long))
 		return -EIO;
 	/* update frame */
+#ifndef CONFIG_AS_CFI_SIGNAL_FRAME
+	if(frame->call_frame
+	   && !UNW_DEFAULT_RA(state.regs[retAddrReg], state.dataAlign))
+		frame->call_frame = 0;
+#endif
 	cfa = FRAME_REG(state.cfa.reg, unsigned long) + state.cfa.offs;
 	startLoc = min((unsigned long)UNW_SP(frame), cfa);
 	endLoc = max((unsigned long)UNW_SP(frame), cfa);
@@ -866,6 +884,7 @@ int unwind_init_frame_info(struct unwind_frame_info *info,
                            /*const*/ struct pt_regs *regs)
 {
 	info->task = tsk;
+	info->call_frame = 0;
 	arch_unw_init_frame_info(info, regs);
 
 	return 0;
@@ -879,6 +898,7 @@ int unwind_init_blocked(struct unwind_frame_info *info,
                         struct task_struct *tsk)
 {
 	info->task = tsk;
+	info->call_frame = 0;
 	arch_unw_init_blocked(info);
 
 	return 0;
@@ -894,6 +914,7 @@ int unwind_init_running(struct unwind_frame_info *info,
                         void *arg)
 {
 	info->task = current;
+	info->call_frame = 0;
 
 	return arch_unwind_init_running(info, callback, arg);
 }
diff --git a/scripts/Kbuild.include b/scripts/Kbuild.include
index 7adef12a0c26..4f5ff19b992b 100644
--- a/scripts/Kbuild.include
+++ b/scripts/Kbuild.include
@@ -66,8 +66,8 @@ as-option = $(shell if $(CC) $(CFLAGS) $(1) -Wa,-Z -c -o /dev/null \
 # as-instr
 # Usage: cflags-y += $(call as-instr, instr, option1, option2)
 
-as-instr = $(shell if echo -e "$(1)" | $(AS) -Z -o astest$$$$.out \
-		   2>&1 >/dev/null ; then echo "$(2)"; else echo "$(3)"; fi; \
+as-instr = $(shell if echo -e "$(1)" | $(AS) >/dev/null 2>&1 -W -Z -o astest$$$$.out ; \
+		   then echo "$(2)"; else echo "$(3)"; fi; \
 	           rm -f astest$$$$.out)
 
 # cc-option
-- 
cgit v1.2.3


From 0a2966b48fb784e437520e400ddc94874ddbd4e8 Mon Sep 17 00:00:00 2001
From: Christoph Lameter <christoph@sgi.com>
Date: Mon, 25 Sep 2006 23:30:51 -0700
Subject: [PATCH] Fix longstanding load balancing bug in the scheduler

The scheduler will stop load balancing if the most busy processor contains
processes pinned via processor affinity.

The scheduler currently only does one search for busiest cpu.  If it cannot
pull any tasks away from the busiest cpu because they were pinned then the
scheduler goes into a corner and sulks leaving the idle processors idle.

F.e.  If you have processor 0 busy running four tasks pinned via taskset,
there are none on processor 1 and one just started two processes on
processor 2 then the scheduler will not move one of the two processes away
from processor 2.

This patch fixes that issue by forcing the scheduler to come out of its
corner and retrying the load balancing by considering other processors for
load balancing.

This patch was originally developed by John Hawkes and discussed at

    http://marc.theaimsgroup.com/?l=linux-kernel&m=113901368523205&w=2.

I have removed extraneous material and gone back to equipping struct rq
with the cpu the queue is associated with since this makes the patch much
easier and it is likely that others in the future will have the same
difficulty of figuring out which processor owns which runqueue.

The overhead added through these patches is a single word on the stack if
the kernel is configured to support 32 cpus or less (32 bit).  For 32 bit
environments the maximum number of cpus that can be configued is 255 which
would result in the use of 32 bytes additional on the stack.  On IA64 up to
1k cpus can be configured which will result in the use of 128 additional
bytes on the stack.  The maximum additional cache footprint is one
cacheline.  Typically memory use will be much less than a cacheline and the
additional cpumask will be placed on the stack in a cacheline that already
contains other local variable.

Signed-off-by: Christoph Lameter <clameter@sgi.com>
Cc: John Hawkes <hawkes@sgi.com>
Cc: "Siddha, Suresh B" <suresh.b.siddha@intel.com>
Cc: Ingo Molnar <mingo@elte.hu>
Cc: Nick Piggin <nickpiggin@yahoo.com.au>
Cc: Peter Williams <pwil3058@bigpond.net.au>
Cc: <stable@kernel.org>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
---
 kernel/sched.c | 54 ++++++++++++++++++++++++++++++++++++++++++++++--------
 1 file changed, 46 insertions(+), 8 deletions(-)

(limited to 'kernel')

diff --git a/kernel/sched.c b/kernel/sched.c
index a234fbee1238..5c848fd4e461 100644
--- a/kernel/sched.c
+++ b/kernel/sched.c
@@ -238,6 +238,7 @@ struct rq {
 	/* For active balancing */
 	int active_balance;
 	int push_cpu;
+	int cpu;		/* cpu of this runqueue */
 
 	struct task_struct *migration_thread;
 	struct list_head migration_queue;
@@ -267,6 +268,15 @@ struct rq {
 
 static DEFINE_PER_CPU(struct rq, runqueues);
 
+static inline int cpu_of(struct rq *rq)
+{
+#ifdef CONFIG_SMP
+	return rq->cpu;
+#else
+	return 0;
+#endif
+}
+
 /*
  * The domain tree (rq->sd) is protected by RCU's quiescent state transition.
  * See detach_destroy_domains: synchronize_sched for details.
@@ -2211,7 +2221,8 @@ out:
  */
 static struct sched_group *
 find_busiest_group(struct sched_domain *sd, int this_cpu,
-		   unsigned long *imbalance, enum idle_type idle, int *sd_idle)
+		   unsigned long *imbalance, enum idle_type idle, int *sd_idle,
+		   cpumask_t *cpus)
 {
 	struct sched_group *busiest = NULL, *this = NULL, *group = sd->groups;
 	unsigned long max_load, avg_load, total_load, this_load, total_pwr;
@@ -2248,7 +2259,12 @@ find_busiest_group(struct sched_domain *sd, int this_cpu,
 		sum_weighted_load = sum_nr_running = avg_load = 0;
 
 		for_each_cpu_mask(i, group->cpumask) {
-			struct rq *rq = cpu_rq(i);
+			struct rq *rq;
+
+			if (!cpu_isset(i, *cpus))
+				continue;
+
+			rq = cpu_rq(i);
 
 			if (*sd_idle && !idle_cpu(i))
 				*sd_idle = 0;
@@ -2466,13 +2482,17 @@ ret:
  */
 static struct rq *
 find_busiest_queue(struct sched_group *group, enum idle_type idle,
-		   unsigned long imbalance)
+		   unsigned long imbalance, cpumask_t *cpus)
 {
 	struct rq *busiest = NULL, *rq;
 	unsigned long max_load = 0;
 	int i;
 
 	for_each_cpu_mask(i, group->cpumask) {
+
+		if (!cpu_isset(i, *cpus))
+			continue;
+
 		rq = cpu_rq(i);
 
 		if (rq->nr_running == 1 && rq->raw_weighted_load > imbalance)
@@ -2511,6 +2531,7 @@ static int load_balance(int this_cpu, struct rq *this_rq,
 	struct sched_group *group;
 	unsigned long imbalance;
 	struct rq *busiest;
+	cpumask_t cpus = CPU_MASK_ALL;
 
 	if (idle != NOT_IDLE && sd->flags & SD_SHARE_CPUPOWER &&
 	    !sched_smt_power_savings)
@@ -2518,13 +2539,15 @@ static int load_balance(int this_cpu, struct rq *this_rq,
 
 	schedstat_inc(sd, lb_cnt[idle]);
 
-	group = find_busiest_group(sd, this_cpu, &imbalance, idle, &sd_idle);
+redo:
+	group = find_busiest_group(sd, this_cpu, &imbalance, idle, &sd_idle,
+							&cpus);
 	if (!group) {
 		schedstat_inc(sd, lb_nobusyg[idle]);
 		goto out_balanced;
 	}
 
-	busiest = find_busiest_queue(group, idle, imbalance);
+	busiest = find_busiest_queue(group, idle, imbalance, &cpus);
 	if (!busiest) {
 		schedstat_inc(sd, lb_nobusyq[idle]);
 		goto out_balanced;
@@ -2549,8 +2572,12 @@ static int load_balance(int this_cpu, struct rq *this_rq,
 		double_rq_unlock(this_rq, busiest);
 
 		/* All tasks on this runqueue were pinned by CPU affinity */
-		if (unlikely(all_pinned))
+		if (unlikely(all_pinned)) {
+			cpu_clear(cpu_of(busiest), cpus);
+			if (!cpus_empty(cpus))
+				goto redo;
 			goto out_balanced;
+		}
 	}
 
 	if (!nr_moved) {
@@ -2639,18 +2666,22 @@ load_balance_newidle(int this_cpu, struct rq *this_rq, struct sched_domain *sd)
 	unsigned long imbalance;
 	int nr_moved = 0;
 	int sd_idle = 0;
+	cpumask_t cpus = CPU_MASK_ALL;
 
 	if (sd->flags & SD_SHARE_CPUPOWER && !sched_smt_power_savings)
 		sd_idle = 1;
 
 	schedstat_inc(sd, lb_cnt[NEWLY_IDLE]);
-	group = find_busiest_group(sd, this_cpu, &imbalance, NEWLY_IDLE, &sd_idle);
+redo:
+	group = find_busiest_group(sd, this_cpu, &imbalance, NEWLY_IDLE,
+				&sd_idle, &cpus);
 	if (!group) {
 		schedstat_inc(sd, lb_nobusyg[NEWLY_IDLE]);
 		goto out_balanced;
 	}
 
-	busiest = find_busiest_queue(group, NEWLY_IDLE, imbalance);
+	busiest = find_busiest_queue(group, NEWLY_IDLE, imbalance,
+				&cpus);
 	if (!busiest) {
 		schedstat_inc(sd, lb_nobusyq[NEWLY_IDLE]);
 		goto out_balanced;
@@ -2668,6 +2699,12 @@ load_balance_newidle(int this_cpu, struct rq *this_rq, struct sched_domain *sd)
 					minus_1_or_zero(busiest->nr_running),
 					imbalance, sd, NEWLY_IDLE, NULL);
 		spin_unlock(&busiest->lock);
+
+		if (!nr_moved) {
+			cpu_clear(cpu_of(busiest), cpus);
+			if (!cpus_empty(cpus))
+				goto redo;
+		}
 	}
 
 	if (!nr_moved) {
@@ -6747,6 +6784,7 @@ void __init sched_init(void)
 			rq->cpu_load[j] = 0;
 		rq->active_balance = 0;
 		rq->push_cpu = 0;
+		rq->cpu = i;
 		rq->migration_thread = NULL;
 		INIT_LIST_HEAD(&rq->migration_queue);
 #endif
-- 
cgit v1.2.3


From 9b819d204cf602eab1a53a9ec4b8d2ca51e02a1d Mon Sep 17 00:00:00 2001
From: Christoph Lameter <clameter@sgi.com>
Date: Mon, 25 Sep 2006 23:31:40 -0700
Subject: [PATCH] Add __GFP_THISNODE to avoid fallback to other nodes and
 ignore cpuset/memory policy restrictions

Add a new gfp flag __GFP_THISNODE to avoid fallback to other nodes.  This
flag is essential if a kernel component requires memory to be located on a
certain node.  It will be needed for alloc_pages_node() to force allocation
on the indicated node and for alloc_pages() to force allocation on the
current node.

Signed-off-by: Christoph Lameter <clameter@sgi.com>
Cc: Andy Whitcroft <apw@shadowen.org>
Cc: Mel Gorman <mel@csn.ul.ie>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
---
 include/linux/gfp.h | 3 ++-
 kernel/cpuset.c     | 2 +-
 mm/mempolicy.c      | 2 +-
 mm/page_alloc.c     | 3 +++
 4 files changed, 7 insertions(+), 3 deletions(-)

(limited to 'kernel')

diff --git a/include/linux/gfp.h b/include/linux/gfp.h
index 63ab88a36f39..0eda5b6dedbd 100644
--- a/include/linux/gfp.h
+++ b/include/linux/gfp.h
@@ -45,6 +45,7 @@ struct vm_area_struct;
 #define __GFP_ZERO	((__force gfp_t)0x8000u)/* Return zeroed page on success */
 #define __GFP_NOMEMALLOC ((__force gfp_t)0x10000u) /* Don't use emergency reserves */
 #define __GFP_HARDWALL   ((__force gfp_t)0x20000u) /* Enforce hardwall cpuset memory allocs */
+#define __GFP_THISNODE	((__force gfp_t)0x40000u)/* No fallback, no policies */
 
 #define __GFP_BITS_SHIFT 20	/* Room for 20 __GFP_FOO bits */
 #define __GFP_BITS_MASK ((__force gfp_t)((1 << __GFP_BITS_SHIFT) - 1))
@@ -53,7 +54,7 @@ struct vm_area_struct;
 #define GFP_LEVEL_MASK (__GFP_WAIT|__GFP_HIGH|__GFP_IO|__GFP_FS| \
 			__GFP_COLD|__GFP_NOWARN|__GFP_REPEAT| \
 			__GFP_NOFAIL|__GFP_NORETRY|__GFP_NO_GROW|__GFP_COMP| \
-			__GFP_NOMEMALLOC|__GFP_HARDWALL)
+			__GFP_NOMEMALLOC|__GFP_HARDWALL|__GFP_THISNODE)
 
 /* This equals 0, but use constants in case they ever change */
 #define GFP_NOWAIT	(GFP_ATOMIC & ~__GFP_HIGH)
diff --git a/kernel/cpuset.c b/kernel/cpuset.c
index 4ea6f0dc2fc5..76940361273e 100644
--- a/kernel/cpuset.c
+++ b/kernel/cpuset.c
@@ -2316,7 +2316,7 @@ int __cpuset_zone_allowed(struct zone *z, gfp_t gfp_mask)
 	const struct cpuset *cs;	/* current cpuset ancestors */
 	int allowed;			/* is allocation in zone z allowed? */
 
-	if (in_interrupt())
+	if (in_interrupt() || (gfp_mask & __GFP_THISNODE))
 		return 1;
 	node = z->zone_pgdat->node_id;
 	might_sleep_if(!(gfp_mask & __GFP_HARDWALL));
diff --git a/mm/mempolicy.c b/mm/mempolicy.c
index c3429a710ab1..8002e1faccda 100644
--- a/mm/mempolicy.c
+++ b/mm/mempolicy.c
@@ -1290,7 +1290,7 @@ struct page *alloc_pages_current(gfp_t gfp, unsigned order)
 
 	if ((gfp & __GFP_WAIT) && !in_interrupt())
 		cpuset_update_task_memory_state();
-	if (!pol || in_interrupt())
+	if (!pol || in_interrupt() || (gfp & __GFP_THISNODE))
 		pol = &default_policy;
 	if (pol->policy == MPOL_INTERLEAVE)
 		return alloc_page_interleave(gfp, order, interleave_nodes(pol));
diff --git a/mm/page_alloc.c b/mm/page_alloc.c
index 208a6b03aa78..ea498788af53 100644
--- a/mm/page_alloc.c
+++ b/mm/page_alloc.c
@@ -893,6 +893,9 @@ get_page_from_freelist(gfp_t gfp_mask, unsigned int order,
 	 * See also cpuset_zone_allowed() comment in kernel/cpuset.c.
 	 */
 	do {
+		if (unlikely((gfp_mask & __GFP_THISNODE) &&
+			(*z)->zone_pgdat != zonelist->zones[0]->zone_pgdat))
+				break;
 		if ((alloc_flags & ALLOC_CPUSET) &&
 				!cpuset_zone_allowed(*z, gfp_mask))
 			continue;
-- 
cgit v1.2.3


From fbd98167e653535c5816be154f2149c0efa7757d Mon Sep 17 00:00:00 2001
From: Christoph Lameter <clameter@sgi.com>
Date: Mon, 25 Sep 2006 23:31:45 -0700
Subject: [PATCH] Profiling: require buffer allocation on the correct node

Profiling really suffers with off node buffers.  Fail if no memory is
available on the nodes.  The profiling code can deal with these failures
should they occur.

Signed-off-by: Christoph Lameter <clameter@sgi.com>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
---
 kernel/profile.c | 16 ++++++++++++----
 1 file changed, 12 insertions(+), 4 deletions(-)

(limited to 'kernel')

diff --git a/kernel/profile.c b/kernel/profile.c
index d5bd75e7501c..fb660c7d35ba 100644
--- a/kernel/profile.c
+++ b/kernel/profile.c
@@ -309,13 +309,17 @@ static int __devinit profile_cpu_callback(struct notifier_block *info,
 		node = cpu_to_node(cpu);
 		per_cpu(cpu_profile_flip, cpu) = 0;
 		if (!per_cpu(cpu_profile_hits, cpu)[1]) {
-			page = alloc_pages_node(node, GFP_KERNEL | __GFP_ZERO, 0);
+			page = alloc_pages_node(node,
+					GFP_KERNEL | __GFP_ZERO | GFP_THISNODE,
+					0);
 			if (!page)
 				return NOTIFY_BAD;
 			per_cpu(cpu_profile_hits, cpu)[1] = page_address(page);
 		}
 		if (!per_cpu(cpu_profile_hits, cpu)[0]) {
-			page = alloc_pages_node(node, GFP_KERNEL | __GFP_ZERO, 0);
+			page = alloc_pages_node(node,
+					GFP_KERNEL | __GFP_ZERO | GFP_THISNODE,
+					0);
 			if (!page)
 				goto out_free;
 			per_cpu(cpu_profile_hits, cpu)[0] = page_address(page);
@@ -491,12 +495,16 @@ static int __init create_hash_tables(void)
 		int node = cpu_to_node(cpu);
 		struct page *page;
 
-		page = alloc_pages_node(node, GFP_KERNEL | __GFP_ZERO, 0);
+		page = alloc_pages_node(node,
+				GFP_KERNEL | __GFP_ZERO | GFP_THISNODE,
+				0);
 		if (!page)
 			goto out_cleanup;
 		per_cpu(cpu_profile_hits, cpu)[1]
 				= (struct profile_hit *)page_address(page);
-		page = alloc_pages_node(node, GFP_KERNEL | __GFP_ZERO, 0);
+		page = alloc_pages_node(node,
+				GFP_KERNEL | __GFP_ZERO | GFP_THISNODE,
+				0);
 		if (!page)
 			goto out_cleanup;
 		per_cpu(cpu_profile_hits, cpu)[0]
-- 
cgit v1.2.3


From 0ff38490c836dc379ff7ec45b10a15a662f4e5f6 Mon Sep 17 00:00:00 2001
From: Christoph Lameter <clameter@sgi.com>
Date: Mon, 25 Sep 2006 23:31:52 -0700
Subject: [PATCH] zone_reclaim: dynamic slab reclaim

Currently one can enable slab reclaim by setting an explicit option in
/proc/sys/vm/zone_reclaim_mode.  Slab reclaim is then used as a final
option if the freeing of unmapped file backed pages is not enough to free
enough pages to allow a local allocation.

However, that means that the slab can grow excessively and that most memory
of a node may be used by slabs.  We have had a case where a machine with
46GB of memory was using 40-42GB for slab.  Zone reclaim was effective in
dealing with pagecache pages.  However, slab reclaim was only done during
global reclaim (which is a bit rare on NUMA systems).

This patch implements slab reclaim during zone reclaim.  Zone reclaim
occurs if there is a danger of an off node allocation.  At that point we

1. Shrink the per node page cache if the number of pagecache
   pages is more than min_unmapped_ratio percent of pages in a zone.

2. Shrink the slab cache if the number of the nodes reclaimable slab pages
   (patch depends on earlier one that implements that counter)
   are more than min_slab_ratio (a new /proc/sys/vm tunable).

The shrinking of the slab cache is a bit problematic since it is not node
specific.  So we simply calculate what point in the slab we want to reach
(current per node slab use minus the number of pages that neeed to be
allocated) and then repeately run the global reclaim until that is
unsuccessful or we have reached the limit.  I hope we will have zone based
slab reclaim at some point which will make that easier.

The default for the min_slab_ratio is 5%

Also remove the slab option from /proc/sys/vm/zone_reclaim_mode.

[akpm@osdl.org: cleanups]
Signed-off-by: Christoph Lameter <clameter@sgi.com>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
---
 Documentation/sysctl/vm.txt | 27 +++++++++++++++------
 include/linux/mmzone.h      |  3 +++
 include/linux/swap.h        |  1 +
 include/linux/sysctl.h      |  1 +
 kernel/sysctl.c             | 11 +++++++++
 mm/page_alloc.c             | 17 +++++++++++++
 mm/vmscan.c                 | 58 +++++++++++++++++++++++++++++----------------
 7 files changed, 90 insertions(+), 28 deletions(-)

(limited to 'kernel')

diff --git a/Documentation/sysctl/vm.txt b/Documentation/sysctl/vm.txt
index 7cee90223d3a..20d0d797f539 100644
--- a/Documentation/sysctl/vm.txt
+++ b/Documentation/sysctl/vm.txt
@@ -29,6 +29,7 @@ Currently, these files are in /proc/sys/vm:
 - drop-caches
 - zone_reclaim_mode
 - min_unmapped_ratio
+- min_slab_ratio
 - panic_on_oom
 
 ==============================================================
@@ -138,7 +139,6 @@ This is value ORed together of
 1	= Zone reclaim on
 2	= Zone reclaim writes dirty pages out
 4	= Zone reclaim swaps pages
-8	= Also do a global slab reclaim pass
 
 zone_reclaim_mode is set during bootup to 1 if it is determined that pages
 from remote zones will cause a measurable performance reduction. The
@@ -162,18 +162,13 @@ Allowing regular swap effectively restricts allocations to the local
 node unless explicitly overridden by memory policies or cpuset
 configurations.
 
-It may be advisable to allow slab reclaim if the system makes heavy
-use of files and builds up large slab caches. However, the slab
-shrink operation is global, may take a long time and free slabs
-in all nodes of the system.
-
 =============================================================
 
 min_unmapped_ratio:
 
 This is available only on NUMA kernels.
 
-A percentage of the file backed pages in each zone.  Zone reclaim will only
+A percentage of the total pages in each zone.  Zone reclaim will only
 occur if more than this percentage of pages are file backed and unmapped.
 This is to insure that a minimal amount of local pages is still available for
 file I/O even if the node is overallocated.
@@ -182,6 +177,24 @@ The default is 1 percent.
 
 =============================================================
 
+min_slab_ratio:
+
+This is available only on NUMA kernels.
+
+A percentage of the total pages in each zone.  On Zone reclaim
+(fallback from the local zone occurs) slabs will be reclaimed if more
+than this percentage of pages in a zone are reclaimable slab pages.
+This insures that the slab growth stays under control even in NUMA
+systems that rarely perform global reclaim.
+
+The default is 5 percent.
+
+Note that slab reclaim is triggered in a per zone / node fashion.
+The process of reclaiming slab memory is currently not node specific
+and may not be fast.
+
+=============================================================
+
 panic_on_oom
 
 This enables or disables panic on out-of-memory feature.  If this is set to 1,
diff --git a/include/linux/mmzone.h b/include/linux/mmzone.h
index 08c41b9f92e0..3693f1a52788 100644
--- a/include/linux/mmzone.h
+++ b/include/linux/mmzone.h
@@ -171,6 +171,7 @@ struct zone {
 	 * zone reclaim becomes active if more unmapped pages exist.
 	 */
 	unsigned long		min_unmapped_pages;
+	unsigned long		min_slab_pages;
 	struct per_cpu_pageset	*pageset[NR_CPUS];
 #else
 	struct per_cpu_pageset	pageset[NR_CPUS];
@@ -448,6 +449,8 @@ int percpu_pagelist_fraction_sysctl_handler(struct ctl_table *, int, struct file
 					void __user *, size_t *, loff_t *);
 int sysctl_min_unmapped_ratio_sysctl_handler(struct ctl_table *, int,
 			struct file *, void __user *, size_t *, loff_t *);
+int sysctl_min_slab_ratio_sysctl_handler(struct ctl_table *, int,
+			struct file *, void __user *, size_t *, loff_t *);
 
 #include <linux/topology.h>
 /* Returns the number of the current Node. */
diff --git a/include/linux/swap.h b/include/linux/swap.h
index 32db06c8ffe0..a2f5ad7c2d2e 100644
--- a/include/linux/swap.h
+++ b/include/linux/swap.h
@@ -193,6 +193,7 @@ extern long vm_total_pages;
 #ifdef CONFIG_NUMA
 extern int zone_reclaim_mode;
 extern int sysctl_min_unmapped_ratio;
+extern int sysctl_min_slab_ratio;
 extern int zone_reclaim(struct zone *, gfp_t, unsigned int);
 #else
 #define zone_reclaim_mode 0
diff --git a/include/linux/sysctl.h b/include/linux/sysctl.h
index 736ed917a4f8..eca555781d05 100644
--- a/include/linux/sysctl.h
+++ b/include/linux/sysctl.h
@@ -191,6 +191,7 @@ enum
 	VM_MIN_UNMAPPED=32,	/* Set min percent of unmapped pages */
 	VM_PANIC_ON_OOM=33,	/* panic at out-of-memory */
 	VM_VDSO_ENABLED=34,	/* map VDSO into new processes? */
+	VM_MIN_SLAB=35,		 /* Percent pages ignored by zone reclaim */
 };
 
 
diff --git a/kernel/sysctl.c b/kernel/sysctl.c
index 362a0cc37138..fd43c3e6786b 100644
--- a/kernel/sysctl.c
+++ b/kernel/sysctl.c
@@ -943,6 +943,17 @@ static ctl_table vm_table[] = {
 		.extra1		= &zero,
 		.extra2		= &one_hundred,
 	},
+	{
+		.ctl_name	= VM_MIN_SLAB,
+		.procname	= "min_slab_ratio",
+		.data		= &sysctl_min_slab_ratio,
+		.maxlen		= sizeof(sysctl_min_slab_ratio),
+		.mode		= 0644,
+		.proc_handler	= &sysctl_min_slab_ratio_sysctl_handler,
+		.strategy	= &sysctl_intvec,
+		.extra1		= &zero,
+		.extra2		= &one_hundred,
+	},
 #endif
 #ifdef CONFIG_X86_32
 	{
diff --git a/mm/page_alloc.c b/mm/page_alloc.c
index 47e98423b30d..cf913bdd433e 100644
--- a/mm/page_alloc.c
+++ b/mm/page_alloc.c
@@ -2005,6 +2005,7 @@ static void __meminit free_area_init_core(struct pglist_data *pgdat,
 #ifdef CONFIG_NUMA
 		zone->min_unmapped_pages = (realsize*sysctl_min_unmapped_ratio)
 						/ 100;
+		zone->min_slab_pages = (realsize * sysctl_min_slab_ratio) / 100;
 #endif
 		zone->name = zone_names[j];
 		spin_lock_init(&zone->lock);
@@ -2318,6 +2319,22 @@ int sysctl_min_unmapped_ratio_sysctl_handler(ctl_table *table, int write,
 				sysctl_min_unmapped_ratio) / 100;
 	return 0;
 }
+
+int sysctl_min_slab_ratio_sysctl_handler(ctl_table *table, int write,
+	struct file *file, void __user *buffer, size_t *length, loff_t *ppos)
+{
+	struct zone *zone;
+	int rc;
+
+	rc = proc_dointvec_minmax(table, write, file, buffer, length, ppos);
+	if (rc)
+		return rc;
+
+	for_each_zone(zone)
+		zone->min_slab_pages = (zone->present_pages *
+				sysctl_min_slab_ratio) / 100;
+	return 0;
+}
 #endif
 
 /*
diff --git a/mm/vmscan.c b/mm/vmscan.c
index 349797ba4bac..089e943c4d38 100644
--- a/mm/vmscan.c
+++ b/mm/vmscan.c
@@ -1527,7 +1527,6 @@ int zone_reclaim_mode __read_mostly;
 #define RECLAIM_ZONE (1<<0)	/* Run shrink_cache on the zone */
 #define RECLAIM_WRITE (1<<1)	/* Writeout pages during reclaim */
 #define RECLAIM_SWAP (1<<2)	/* Swap pages out during reclaim */
-#define RECLAIM_SLAB (1<<3)	/* Do a global slab shrink if the zone is out of memory */
 
 /*
  * Priority for ZONE_RECLAIM. This determines the fraction of pages
@@ -1542,6 +1541,12 @@ int zone_reclaim_mode __read_mostly;
  */
 int sysctl_min_unmapped_ratio = 1;
 
+/*
+ * If the number of slab pages in a zone grows beyond this percentage then
+ * slab reclaim needs to occur.
+ */
+int sysctl_min_slab_ratio = 5;
+
 /*
  * Try to free up some pages from this zone through reclaim.
  */
@@ -1573,29 +1578,37 @@ static int __zone_reclaim(struct zone *zone, gfp_t gfp_mask, unsigned int order)
 	reclaim_state.reclaimed_slab = 0;
 	p->reclaim_state = &reclaim_state;
 
-	/*
-	 * Free memory by calling shrink zone with increasing priorities
-	 * until we have enough memory freed.
-	 */
-	priority = ZONE_RECLAIM_PRIORITY;
-	do {
-		nr_reclaimed += shrink_zone(priority, zone, &sc);
-		priority--;
-	} while (priority >= 0 && nr_reclaimed < nr_pages);
+	if (zone_page_state(zone, NR_FILE_PAGES) -
+		zone_page_state(zone, NR_FILE_MAPPED) >
+		zone->min_unmapped_pages) {
+		/*
+		 * Free memory by calling shrink zone with increasing
+		 * priorities until we have enough memory freed.
+		 */
+		priority = ZONE_RECLAIM_PRIORITY;
+		do {
+			nr_reclaimed += shrink_zone(priority, zone, &sc);
+			priority--;
+		} while (priority >= 0 && nr_reclaimed < nr_pages);
+	}
 
-	if (nr_reclaimed < nr_pages && (zone_reclaim_mode & RECLAIM_SLAB)) {
+	if (zone_page_state(zone, NR_SLAB_RECLAIMABLE) > zone->min_slab_pages) {
 		/*
 		 * shrink_slab() does not currently allow us to determine how
-		 * many pages were freed in this zone. So we just shake the slab
-		 * a bit and then go off node for this particular allocation
-		 * despite possibly having freed enough memory to allocate in
-		 * this zone.  If we freed local memory then the next
-		 * allocations will be local again.
+		 * many pages were freed in this zone. So we take the current
+		 * number of slab pages and shake the slab until it is reduced
+		 * by the same nr_pages that we used for reclaiming unmapped
+		 * pages.
 		 *
-		 * shrink_slab will free memory on all zones and may take
-		 * a long time.
+		 * Note that shrink_slab will free memory on all zones and may
+		 * take a long time.
 		 */
-		shrink_slab(sc.nr_scanned, gfp_mask, order);
+		unsigned long limit = zone_page_state(zone,
+				NR_SLAB_RECLAIMABLE) - nr_pages;
+
+		while (shrink_slab(sc.nr_scanned, gfp_mask, order) &&
+			zone_page_state(zone, NR_SLAB_RECLAIMABLE) > limit)
+			;
 	}
 
 	p->reclaim_state = NULL;
@@ -1609,7 +1622,8 @@ int zone_reclaim(struct zone *zone, gfp_t gfp_mask, unsigned int order)
 	int node_id;
 
 	/*
-	 * Zone reclaim reclaims unmapped file backed pages.
+	 * Zone reclaim reclaims unmapped file backed pages and
+	 * slab pages if we are over the defined limits.
 	 *
 	 * A small portion of unmapped file backed pages is needed for
 	 * file I/O otherwise pages read by file I/O will be immediately
@@ -1618,7 +1632,9 @@ int zone_reclaim(struct zone *zone, gfp_t gfp_mask, unsigned int order)
 	 * unmapped file backed pages.
 	 */
 	if (zone_page_state(zone, NR_FILE_PAGES) -
-	    zone_page_state(zone, NR_FILE_MAPPED) <= zone->min_unmapped_pages)
+	    zone_page_state(zone, NR_FILE_MAPPED) <= zone->min_unmapped_pages
+	    && zone_page_state(zone, NR_SLAB_RECLAIMABLE)
+			<= zone->min_slab_pages)
 		return 0;
 
 	/*
-- 
cgit v1.2.3


From 89fa30242facca249aead2aac03c4c69764f911c Mon Sep 17 00:00:00 2001
From: Christoph Lameter <clameter@sgi.com>
Date: Mon, 25 Sep 2006 23:31:55 -0700
Subject: [PATCH] NUMA: Add zone_to_nid function

There are many places where we need to determine the node of a zone.
Currently we use a difficult to read sequence of pointer dereferencing.
Put that into an inline function and use throughout VM.  Maybe we can find
a way to optimize the lookup in the future.

Signed-off-by: Christoph Lameter <clameter@sgi.com>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
---
 arch/i386/mm/discontig.c | 2 +-
 arch/parisc/mm/init.c    | 2 +-
 include/linux/mm.h       | 7 ++++++-
 kernel/cpuset.c          | 4 ++--
 mm/hugetlb.c             | 2 +-
 mm/mempolicy.c           | 6 +++---
 mm/oom_kill.c            | 3 +--
 mm/page_alloc.c          | 2 +-
 mm/vmscan.c              | 2 +-
 9 files changed, 17 insertions(+), 13 deletions(-)

(limited to 'kernel')

diff --git a/arch/i386/mm/discontig.c b/arch/i386/mm/discontig.c
index 07c300f93764..fb5d8b747de4 100644
--- a/arch/i386/mm/discontig.c
+++ b/arch/i386/mm/discontig.c
@@ -422,7 +422,7 @@ void __init set_highmem_pages_init(int bad_ppro)
 		zone_end_pfn = zone_start_pfn + zone->spanned_pages;
 
 		printk("Initializing %s for node %d (%08lx:%08lx)\n",
-				zone->name, zone->zone_pgdat->node_id,
+				zone->name, zone_to_nid(zone),
 				zone_start_pfn, zone_end_pfn);
 
 		for (node_pfn = zone_start_pfn; node_pfn < zone_end_pfn; node_pfn++) {
diff --git a/arch/parisc/mm/init.c b/arch/parisc/mm/init.c
index c7329615ef94..25ad28d63e88 100644
--- a/arch/parisc/mm/init.c
+++ b/arch/parisc/mm/init.c
@@ -551,7 +551,7 @@ void show_mem(void)
 
 				printk("Zone list for zone %d on node %d: ", j, i);
 				for (k = 0; zl->zones[k] != NULL; k++) 
-					printk("[%d/%s] ", zl->zones[k]->zone_pgdat->node_id, zl->zones[k]->name);
+					printk("[%d/%s] ", zone_to_nid(zl->zones[k]), zl->zones[k]->name);
 				printk("\n");
 			}
 		}
diff --git a/include/linux/mm.h b/include/linux/mm.h
index f2018775b995..856f0ee7e84a 100644
--- a/include/linux/mm.h
+++ b/include/linux/mm.h
@@ -499,12 +499,17 @@ static inline struct zone *page_zone(struct page *page)
 	return zone_table[page_zone_id(page)];
 }
 
+static inline unsigned long zone_to_nid(struct zone *zone)
+{
+	return zone->zone_pgdat->node_id;
+}
+
 static inline unsigned long page_to_nid(struct page *page)
 {
 	if (FLAGS_HAS_NODE)
 		return (page->flags >> NODES_PGSHIFT) & NODES_MASK;
 	else
-		return page_zone(page)->zone_pgdat->node_id;
+		return zone_to_nid(page_zone(page));
 }
 static inline unsigned long page_to_section(struct page *page)
 {
diff --git a/kernel/cpuset.c b/kernel/cpuset.c
index 76940361273e..cff41511269f 100644
--- a/kernel/cpuset.c
+++ b/kernel/cpuset.c
@@ -2245,7 +2245,7 @@ int cpuset_zonelist_valid_mems_allowed(struct zonelist *zl)
 	int i;
 
 	for (i = 0; zl->zones[i]; i++) {
-		int nid = zl->zones[i]->zone_pgdat->node_id;
+		int nid = zone_to_nid(zl->zones[i]);
 
 		if (node_isset(nid, current->mems_allowed))
 			return 1;
@@ -2318,7 +2318,7 @@ int __cpuset_zone_allowed(struct zone *z, gfp_t gfp_mask)
 
 	if (in_interrupt() || (gfp_mask & __GFP_THISNODE))
 		return 1;
-	node = z->zone_pgdat->node_id;
+	node = zone_to_nid(z);
 	might_sleep_if(!(gfp_mask & __GFP_HARDWALL));
 	if (node_isset(node, current->mems_allowed))
 		return 1;
diff --git a/mm/hugetlb.c b/mm/hugetlb.c
index 3aceadce1a76..7c7d03dbf73d 100644
--- a/mm/hugetlb.c
+++ b/mm/hugetlb.c
@@ -72,7 +72,7 @@ static struct page *dequeue_huge_page(struct vm_area_struct *vma,
 	struct zone **z;
 
 	for (z = zonelist->zones; *z; z++) {
-		nid = (*z)->zone_pgdat->node_id;
+		nid = zone_to_nid(*z);
 		if (cpuset_zone_allowed(*z, GFP_HIGHUSER) &&
 		    !list_empty(&hugepage_freelists[nid]))
 			break;
diff --git a/mm/mempolicy.c b/mm/mempolicy.c
index 8002e1faccda..38f89650bc84 100644
--- a/mm/mempolicy.c
+++ b/mm/mempolicy.c
@@ -487,7 +487,7 @@ static void get_zonemask(struct mempolicy *p, nodemask_t *nodes)
 	switch (p->policy) {
 	case MPOL_BIND:
 		for (i = 0; p->v.zonelist->zones[i]; i++)
-			node_set(p->v.zonelist->zones[i]->zone_pgdat->node_id,
+			node_set(zone_to_nid(p->v.zonelist->zones[i]),
 				*nodes);
 		break;
 	case MPOL_DEFAULT:
@@ -1145,7 +1145,7 @@ unsigned slab_node(struct mempolicy *policy)
 		 * Follow bind policy behavior and start allocation at the
 		 * first node.
 		 */
-		return policy->v.zonelist->zones[0]->zone_pgdat->node_id;
+		return zone_to_nid(policy->v.zonelist->zones[0]);
 
 	case MPOL_PREFERRED:
 		if (policy->v.preferred_node >= 0)
@@ -1649,7 +1649,7 @@ void mpol_rebind_policy(struct mempolicy *pol, const nodemask_t *newmask)
 
 		nodes_clear(nodes);
 		for (z = pol->v.zonelist->zones; *z; z++)
-			node_set((*z)->zone_pgdat->node_id, nodes);
+			node_set(zone_to_nid(*z), nodes);
 		nodes_remap(tmp, nodes, *mpolmask, *newmask);
 		nodes = tmp;
 
diff --git a/mm/oom_kill.c b/mm/oom_kill.c
index f1c0ef1fd21f..bada3d03119f 100644
--- a/mm/oom_kill.c
+++ b/mm/oom_kill.c
@@ -177,8 +177,7 @@ static inline int constrained_alloc(struct zonelist *zonelist, gfp_t gfp_mask)
 
 	for (z = zonelist->zones; *z; z++)
 		if (cpuset_zone_allowed(*z, gfp_mask))
-			node_clear((*z)->zone_pgdat->node_id,
-					nodes);
+			node_clear(zone_to_nid(*z), nodes);
 		else
 			return CONSTRAINT_CPUSET;
 
diff --git a/mm/page_alloc.c b/mm/page_alloc.c
index cf913bdd433e..51070b6d593f 100644
--- a/mm/page_alloc.c
+++ b/mm/page_alloc.c
@@ -1217,7 +1217,7 @@ unsigned int nr_free_pagecache_pages(void)
 #ifdef CONFIG_NUMA
 static void show_node(struct zone *zone)
 {
-	printk("Node %d ", zone->zone_pgdat->node_id);
+	printk("Node %ld ", zone_to_nid(zone));
 }
 #else
 #define show_node(zone)	do { } while (0)
diff --git a/mm/vmscan.c b/mm/vmscan.c
index b950f193816e..87779dda4ec6 100644
--- a/mm/vmscan.c
+++ b/mm/vmscan.c
@@ -1661,7 +1661,7 @@ int zone_reclaim(struct zone *zone, gfp_t gfp_mask, unsigned int order)
 	 * over remote processors and spread off node memory allocations
 	 * as wide as possible.
 	 */
-	node_id = zone->zone_pgdat->node_id;
+	node_id = zone_to_nid(zone);
 	mask = node_to_cpumask(node_id);
 	if (!cpus_empty(mask) && node_id != numa_node_id())
 		return 0;
-- 
cgit v1.2.3


From 62bac0185ad3dfef11d9602980445c54d45199c6 Mon Sep 17 00:00:00 2001
From: Stephen Smalley <sds@tycho.nsa.gov>
Date: Mon, 25 Sep 2006 23:31:56 -0700
Subject: [PATCH] selinux: eliminate selinux_task_ctxid

Eliminate selinux_task_ctxid since it duplicates selinux_task_get_sid.

Signed-off-by: Stephen Smalley <sds@tycho.nsa.gov>
Acked-by: James Morris <jmorris@namei.org>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
---
 include/linux/selinux.h    | 15 ---------------
 kernel/auditsc.c           |  2 +-
 security/selinux/exports.c |  9 ---------
 3 files changed, 1 insertion(+), 25 deletions(-)

(limited to 'kernel')

diff --git a/include/linux/selinux.h b/include/linux/selinux.h
index aad4e390d6a5..79e4707ca772 100644
--- a/include/linux/selinux.h
+++ b/include/linux/selinux.h
@@ -69,16 +69,6 @@ int selinux_audit_rule_match(u32 ctxid, u32 field, u32 op,
  */
 void selinux_audit_set_callback(int (*callback)(void));
 
-/**
- *	selinux_task_ctxid - determine a context ID for a process.
- *	@tsk: the task object
- *	@ctxid: ID value returned via this
- *
- *	On return, ctxid will contain an ID for the context.  This value
- *	should only be used opaquely.
- */
-void selinux_task_ctxid(struct task_struct *tsk, u32 *ctxid);
-
 /**
  *     selinux_ctxid_to_string - map a security context ID to a string
  *     @ctxid: security context ID to be converted.
@@ -166,11 +156,6 @@ static inline void selinux_audit_set_callback(int (*callback)(void))
 	return;
 }
 
-static inline void selinux_task_ctxid(struct task_struct *tsk, u32 *ctxid)
-{
-	*ctxid = 0;
-}
-
 static inline int selinux_ctxid_to_string(u32 ctxid, char **ctx, u32 *ctxlen)
 {
        *ctx = NULL;
diff --git a/kernel/auditsc.c b/kernel/auditsc.c
index 1bd8827a0102..331e17010393 100644
--- a/kernel/auditsc.c
+++ b/kernel/auditsc.c
@@ -385,7 +385,7 @@ static int audit_filter_rules(struct task_struct *tsk,
 			   logged upon error */
 			if (f->se_rule) {
 				if (need_sid) {
-					selinux_task_ctxid(tsk, &sid);
+					selinux_get_task_sid(tsk, &sid);
 					need_sid = 0;
 				}
 				result = selinux_audit_rule_match(sid, f->type,
diff --git a/security/selinux/exports.c b/security/selinux/exports.c
index 9d7737db5e51..ee0fb47f81ae 100644
--- a/security/selinux/exports.c
+++ b/security/selinux/exports.c
@@ -21,15 +21,6 @@
 #include "security.h"
 #include "objsec.h"
 
-void selinux_task_ctxid(struct task_struct *tsk, u32 *ctxid)
-{
-	struct task_security_struct *tsec = tsk->security;
-	if (selinux_enabled)
-		*ctxid = tsec->sid;
-	else
-		*ctxid = 0;
-}
-
 int selinux_ctxid_to_string(u32 ctxid, char **ctx, u32 *ctxlen)
 {
 	if (selinux_enabled)
-- 
cgit v1.2.3


From 1a70cd40cb291c25b67ec0da715a49d76719329d Mon Sep 17 00:00:00 2001
From: Stephen Smalley <sds@tycho.nsa.gov>
Date: Mon, 25 Sep 2006 23:31:57 -0700
Subject: [PATCH] selinux: rename selinux_ctxid_to_string

Rename selinux_ctxid_to_string to selinux_sid_to_string to be
consistent with other interfaces.

Signed-off-by: Stephen Smalley <sds@tycho.nsa.gov>
Acked-by: James Morris <jmorris@namei.org>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
---
 include/linux/selinux.h    |  8 ++++----
 kernel/audit.c             | 14 +++++++-------
 kernel/auditfilter.c       |  2 +-
 kernel/auditsc.c           |  4 ++--
 security/selinux/exports.c |  4 ++--
 5 files changed, 16 insertions(+), 16 deletions(-)

(limited to 'kernel')

diff --git a/include/linux/selinux.h b/include/linux/selinux.h
index 79e4707ca772..df9098de4c99 100644
--- a/include/linux/selinux.h
+++ b/include/linux/selinux.h
@@ -70,8 +70,8 @@ int selinux_audit_rule_match(u32 ctxid, u32 field, u32 op,
 void selinux_audit_set_callback(int (*callback)(void));
 
 /**
- *     selinux_ctxid_to_string - map a security context ID to a string
- *     @ctxid: security context ID to be converted.
+ *     selinux_sid_to_string - map a security context ID to a string
+ *     @sid: security context ID to be converted.
  *     @ctx: address of context string to be returned
  *     @ctxlen: length of returned context string.
  *
@@ -79,7 +79,7 @@ void selinux_audit_set_callback(int (*callback)(void));
  *     string will be allocated internally, and the caller must call
  *     kfree() on it after use.
  */
-int selinux_ctxid_to_string(u32 ctxid, char **ctx, u32 *ctxlen);
+int selinux_sid_to_string(u32 sid, char **ctx, u32 *ctxlen);
 
 /**
  *     selinux_get_inode_sid - get the inode's security context ID
@@ -156,7 +156,7 @@ static inline void selinux_audit_set_callback(int (*callback)(void))
 	return;
 }
 
-static inline int selinux_ctxid_to_string(u32 ctxid, char **ctx, u32 *ctxlen)
+static inline int selinux_sid_to_string(u32 sid, char **ctx, u32 *ctxlen)
 {
        *ctx = NULL;
        *ctxlen = 0;
diff --git a/kernel/audit.c b/kernel/audit.c
index 963fd15c9621..f9889ee77825 100644
--- a/kernel/audit.c
+++ b/kernel/audit.c
@@ -244,7 +244,7 @@ static int audit_set_rate_limit(int limit, uid_t loginuid, u32 sid)
 		char *ctx = NULL;
 		u32 len;
 		int rc;
-		if ((rc = selinux_ctxid_to_string(sid, &ctx, &len)))
+		if ((rc = selinux_sid_to_string(sid, &ctx, &len)))
 			return rc;
 		else
 			audit_log(NULL, GFP_KERNEL, AUDIT_CONFIG_CHANGE,
@@ -267,7 +267,7 @@ static int audit_set_backlog_limit(int limit, uid_t loginuid, u32 sid)
 		char *ctx = NULL;
 		u32 len;
 		int rc;
-		if ((rc = selinux_ctxid_to_string(sid, &ctx, &len)))
+		if ((rc = selinux_sid_to_string(sid, &ctx, &len)))
 			return rc;
 		else
 			audit_log(NULL, GFP_KERNEL, AUDIT_CONFIG_CHANGE,
@@ -293,7 +293,7 @@ static int audit_set_enabled(int state, uid_t loginuid, u32 sid)
 		char *ctx = NULL;
 		u32 len;
 		int rc;
-		if ((rc = selinux_ctxid_to_string(sid, &ctx, &len)))
+		if ((rc = selinux_sid_to_string(sid, &ctx, &len)))
 			return rc;
 		else
 			audit_log(NULL, GFP_KERNEL, AUDIT_CONFIG_CHANGE,
@@ -321,7 +321,7 @@ static int audit_set_failure(int state, uid_t loginuid, u32 sid)
 		char *ctx = NULL;
 		u32 len;
 		int rc;
-		if ((rc = selinux_ctxid_to_string(sid, &ctx, &len)))
+		if ((rc = selinux_sid_to_string(sid, &ctx, &len)))
 			return rc;
 		else
 			audit_log(NULL, GFP_KERNEL, AUDIT_CONFIG_CHANGE,
@@ -538,7 +538,7 @@ static int audit_receive_msg(struct sk_buff *skb, struct nlmsghdr *nlh)
 		if (status_get->mask & AUDIT_STATUS_PID) {
 			int old   = audit_pid;
 			if (sid) {
-				if ((err = selinux_ctxid_to_string(
+				if ((err = selinux_sid_to_string(
 						sid, &ctx, &len)))
 					return err;
 				else
@@ -576,7 +576,7 @@ static int audit_receive_msg(struct sk_buff *skb, struct nlmsghdr *nlh)
 						 "user pid=%d uid=%u auid=%u",
 						 pid, uid, loginuid);
 				if (sid) {
-					if (selinux_ctxid_to_string(
+					if (selinux_sid_to_string(
 							sid, &ctx, &len)) {
 						audit_log_format(ab, 
 							" ssid=%u", sid);
@@ -614,7 +614,7 @@ static int audit_receive_msg(struct sk_buff *skb, struct nlmsghdr *nlh)
 					   loginuid, sid);
 		break;
 	case AUDIT_SIGNAL_INFO:
-		err = selinux_ctxid_to_string(audit_sig_sid, &ctx, &len);
+		err = selinux_sid_to_string(audit_sig_sid, &ctx, &len);
 		if (err)
 			return err;
 		sig_data = kmalloc(sizeof(*sig_data) + len, GFP_KERNEL);
diff --git a/kernel/auditfilter.c b/kernel/auditfilter.c
index a44879b0c72f..1a58a81fb09d 100644
--- a/kernel/auditfilter.c
+++ b/kernel/auditfilter.c
@@ -1398,7 +1398,7 @@ static void audit_log_rule_change(uid_t loginuid, u32 sid, char *action,
 	if (sid) {
 		char *ctx = NULL;
 		u32 len;
-		if (selinux_ctxid_to_string(sid, &ctx, &len))
+		if (selinux_sid_to_string(sid, &ctx, &len))
 			audit_log_format(ab, " ssid=%u", sid);
 		else
 			audit_log_format(ab, " subj=%s", ctx);
diff --git a/kernel/auditsc.c b/kernel/auditsc.c
index 331e17010393..fb83c5cb8c32 100644
--- a/kernel/auditsc.c
+++ b/kernel/auditsc.c
@@ -898,7 +898,7 @@ static void audit_log_exit(struct audit_context *context, struct task_struct *ts
 			if (axi->osid != 0) {
 				char *ctx = NULL;
 				u32 len;
-				if (selinux_ctxid_to_string(
+				if (selinux_sid_to_string(
 						axi->osid, &ctx, &len)) {
 					audit_log_format(ab, " osid=%u",
 							axi->osid);
@@ -1005,7 +1005,7 @@ static void audit_log_exit(struct audit_context *context, struct task_struct *ts
 		if (n->osid != 0) {
 			char *ctx = NULL;
 			u32 len;
-			if (selinux_ctxid_to_string(
+			if (selinux_sid_to_string(
 				n->osid, &ctx, &len)) {
 				audit_log_format(ab, " osid=%u", n->osid);
 				call_panic = 2;
diff --git a/security/selinux/exports.c b/security/selinux/exports.c
index ee0fb47f81ae..b6f96943be1f 100644
--- a/security/selinux/exports.c
+++ b/security/selinux/exports.c
@@ -21,10 +21,10 @@
 #include "security.h"
 #include "objsec.h"
 
-int selinux_ctxid_to_string(u32 ctxid, char **ctx, u32 *ctxlen)
+int selinux_sid_to_string(u32 sid, char **ctx, u32 *ctxlen)
 {
 	if (selinux_enabled)
-		return security_sid_to_context(ctxid, ctx, ctxlen);
+		return security_sid_to_context(sid, ctx, ctxlen);
 	else {
 		*ctx = NULL;
 		*ctxlen = 0;
-- 
cgit v1.2.3


From af8c65b57aaa4ae321af34dbfc5ca7f5625263fe Mon Sep 17 00:00:00 2001
From: David Howells <dhowells@redhat.com>
Date: Mon, 25 Sep 2006 23:32:07 -0700
Subject: [PATCH] FRV: permit __do_IRQ() to be dispensed with

Permit __do_IRQ() to be dispensed with based on a configuration option.

Signed-off-by: David Howells <dhowells@redhat.com>
Cc: Benjamin Herrenschmidt <benh@kernel.crashing.org>
Cc: Thomas Gleixner <tglx@linutronix.de>
Cc: Ingo Molnar <mingo@elte.hu>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
---
 arch/frv/Kconfig    | 4 ++++
 include/linux/irq.h | 6 ++++++
 kernel/irq/handle.c | 2 ++
 3 files changed, 12 insertions(+)

(limited to 'kernel')

diff --git a/arch/frv/Kconfig b/arch/frv/Kconfig
index 5833808fb823..f7b171b92ea2 100644
--- a/arch/frv/Kconfig
+++ b/arch/frv/Kconfig
@@ -29,6 +29,10 @@ config GENERIC_HARDIRQS
 	bool
 	default y
 
+config GENERIC_HARDIRQS_NO__DO_IRQ
+	bool
+	default y
+
 config GENERIC_TIME
 	bool
 	default y
diff --git a/include/linux/irq.h b/include/linux/irq.h
index fbf6d901e9c2..48d3cb3b6a47 100644
--- a/include/linux/irq.h
+++ b/include/linux/irq.h
@@ -320,7 +320,9 @@ handle_irq_name(void fastcall (*handle)(unsigned int, struct irq_desc *,
  * Monolithic do_IRQ implementation.
  * (is an explicit fastcall, because i386 4KSTACKS calls it from assembly)
  */
+#ifndef CONFIG_GENERIC_HARDIRQS_NO__DO_IRQ
 extern fastcall unsigned int __do_IRQ(unsigned int irq, struct pt_regs *regs);
+#endif
 
 /*
  * Architectures call this to let the generic IRQ layer
@@ -332,10 +334,14 @@ static inline void generic_handle_irq(unsigned int irq, struct pt_regs *regs)
 {
 	struct irq_desc *desc = irq_desc + irq;
 
+#ifdef CONFIG_GENERIC_HARDIRQS_NO__DO_IRQ
+	desc->handle_irq(irq, desc, regs);
+#else
 	if (likely(desc->handle_irq))
 		desc->handle_irq(irq, desc, regs);
 	else
 		__do_IRQ(irq, regs);
+#endif
 }
 
 /* Handling of unhandled and spurious interrupts: */
diff --git a/kernel/irq/handle.c b/kernel/irq/handle.c
index 48a53f68af96..4c6cdbaed661 100644
--- a/kernel/irq/handle.c
+++ b/kernel/irq/handle.c
@@ -154,6 +154,7 @@ irqreturn_t handle_IRQ_event(unsigned int irq, struct pt_regs *regs,
 	return retval;
 }
 
+#ifndef CONFIG_GENERIC_HARDIRQS_NO__DO_IRQ
 /**
  * __do_IRQ - original all in one highlevel IRQ handler
  * @irq:	the interrupt number
@@ -253,6 +254,7 @@ out:
 
 	return 1;
 }
+#endif
 
 #ifdef CONFIG_TRACE_IRQFLAGS
 
-- 
cgit v1.2.3


From 3a4f7577c9ef393ca80c783f02ffbc125de771c7 Mon Sep 17 00:00:00 2001
From: Andrew Morton <akpm@osdl.org>
Date: Mon, 25 Sep 2006 23:32:41 -0700
Subject: [PATCH] swsusp: add write-speed instrumentation

Add some instrumentation to the swsusp writeout code to show what bandwidth
we're achieving.

Cc: Pavel Machek <pavel@ucw.cz>
Cc: "Rafael J. Wysocki" <rjw@sisk.pl>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
---
 kernel/power/swap.c | 32 +++++++++++++++++++++++++++++---
 1 file changed, 29 insertions(+), 3 deletions(-)

(limited to 'kernel')

diff --git a/kernel/power/swap.c b/kernel/power/swap.c
index f1dd146bd64d..79b66e734bdb 100644
--- a/kernel/power/swap.c
+++ b/kernel/power/swap.c
@@ -146,6 +146,26 @@ static void release_swap_writer(struct swap_map_handle *handle)
 	handle->bitmap = NULL;
 }
 
+static void show_speed(struct timeval *start, struct timeval *stop,
+			unsigned nr_pages, char *msg)
+{
+	s64 elapsed_centisecs64;
+	int centisecs;
+	int k;
+	int kps;
+
+	elapsed_centisecs64 = timeval_to_ns(stop) - timeval_to_ns(start);
+	do_div(elapsed_centisecs64, NSEC_PER_SEC / 100);
+	centisecs = elapsed_centisecs64;
+	if (centisecs == 0)
+		centisecs = 1;	/* avoid div-by-zero */
+	k = nr_pages * (PAGE_SIZE / 1024);
+	kps = (k * 100) / centisecs;
+	printk("%s %d kbytes in %d.%02d seconds (%d.%02d MB/s)\n", msg, k,
+			centisecs / 100, centisecs % 100,
+			kps / 1000, (kps % 1000) / 10);
+}
+
 static int get_swap_writer(struct swap_map_handle *handle)
 {
 	handle->cur = (struct swap_map_page *)get_zeroed_page(GFP_KERNEL);
@@ -206,17 +226,21 @@ static int flush_swap_writer(struct swap_map_handle *handle)
 
 static int save_image(struct swap_map_handle *handle,
                       struct snapshot_handle *snapshot,
-                      unsigned int nr_pages)
+                      unsigned int nr_to_write)
 {
 	unsigned int m;
 	int ret;
 	int error = 0;
+	int nr_pages;
+	struct timeval start;
+	struct timeval stop;
 
-	printk("Saving image data pages (%u pages) ...     ", nr_pages);
-	m = nr_pages / 100;
+	printk("Saving image data pages (%u pages) ...     ", nr_to_write);
+	m = nr_to_write / 100;
 	if (!m)
 		m = 1;
 	nr_pages = 0;
+	do_gettimeofday(&start);
 	do {
 		ret = snapshot_read_next(snapshot, PAGE_SIZE);
 		if (ret > 0) {
@@ -228,8 +252,10 @@ static int save_image(struct swap_map_handle *handle,
 			nr_pages++;
 		}
 	} while (ret > 0);
+	do_gettimeofday(&stop);
 	if (!error)
 		printk("\b\b\b\bdone\n");
+	show_speed(&start, &stop, nr_to_write, "Wrote");
 	return error;
 }
 
-- 
cgit v1.2.3


From ab954160350c91c77ae03740ef90458c3ad5412c Mon Sep 17 00:00:00 2001
From: Andrew Morton <akpm@osdl.org>
Date: Mon, 25 Sep 2006 23:32:42 -0700
Subject: [PATCH] swsusp: write speedup

Switch the swsusp writeout code from 4k-at-a-time to 4MB-at-a-time.

Crufty old PIII testbox:
	12.9 MB/s -> 20.9 MB/s

Sony Vaio:
	14.7 MB/s -> 26.5 MB/s

The implementation is crude.  A better one would use larger BIOs, but wouldn't
gain any performance.

The memcpys will be mostly pipelined with the IO and basically come for free.

The ENOMEM path has not been tested.  It should be.

Cc: Pavel Machek <pavel@ucw.cz>
Cc: "Rafael J. Wysocki" <rjw@sisk.pl>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
---
 include/linux/swap.h |  5 ++-
 kernel/power/swap.c  | 93 ++++++++++++++++++++++++++++++++++++++++++----------
 mm/page_io.c         | 25 ++++++++++----
 3 files changed, 98 insertions(+), 25 deletions(-)

(limited to 'kernel')

diff --git a/include/linux/swap.h b/include/linux/swap.h
index a2f5ad7c2d2e..3d434cbffe26 100644
--- a/include/linux/swap.h
+++ b/include/linux/swap.h
@@ -12,6 +12,8 @@
 
 struct notifier_block;
 
+struct bio;
+
 #define SWAP_FLAG_PREFER	0x8000	/* set if swap priority specified */
 #define SWAP_FLAG_PRIO_MASK	0x7fff
 #define SWAP_FLAG_PRIO_SHIFT	0
@@ -216,7 +218,8 @@ extern void swap_unplug_io_fn(struct backing_dev_info *, struct page *);
 /* linux/mm/page_io.c */
 extern int swap_readpage(struct file *, struct page *);
 extern int swap_writepage(struct page *page, struct writeback_control *wbc);
-extern int rw_swap_page_sync(int, swp_entry_t, struct page *);
+extern int rw_swap_page_sync(int rw, swp_entry_t entry, struct page *page,
+				struct bio **bio_chain);
 
 /* linux/mm/swap_state.c */
 extern struct address_space swapper_space;
diff --git a/kernel/power/swap.c b/kernel/power/swap.c
index 79b66e734bdb..2dc883d361d5 100644
--- a/kernel/power/swap.c
+++ b/kernel/power/swap.c
@@ -49,18 +49,16 @@ static int mark_swapfiles(swp_entry_t start)
 {
 	int error;
 
-	rw_swap_page_sync(READ,
-			  swp_entry(root_swap, 0),
-			  virt_to_page((unsigned long)&swsusp_header));
+	rw_swap_page_sync(READ, swp_entry(root_swap, 0),
+			  virt_to_page((unsigned long)&swsusp_header), NULL);
 	if (!memcmp("SWAP-SPACE",swsusp_header.sig, 10) ||
 	    !memcmp("SWAPSPACE2",swsusp_header.sig, 10)) {
 		memcpy(swsusp_header.orig_sig,swsusp_header.sig, 10);
 		memcpy(swsusp_header.sig,SWSUSP_SIG, 10);
 		swsusp_header.image = start;
-		error = rw_swap_page_sync(WRITE,
-					  swp_entry(root_swap, 0),
-					  virt_to_page((unsigned long)
-						       &swsusp_header));
+		error = rw_swap_page_sync(WRITE, swp_entry(root_swap, 0),
+				virt_to_page((unsigned long)&swsusp_header),
+				NULL);
 	} else {
 		pr_debug("swsusp: Partition is not swap space.\n");
 		error = -ENODEV;
@@ -88,16 +86,37 @@ static int swsusp_swap_check(void) /* This is called before saving image */
  *	write_page - Write one page to given swap location.
  *	@buf:		Address we're writing.
  *	@offset:	Offset of the swap page we're writing to.
+ *	@bio_chain:	Link the next write BIO here
  */
 
-static int write_page(void *buf, unsigned long offset)
+static int write_page(void *buf, unsigned long offset, struct bio **bio_chain)
 {
 	swp_entry_t entry;
 	int error = -ENOSPC;
 
 	if (offset) {
+		struct page *page = virt_to_page(buf);
+
+		if (bio_chain) {
+			/*
+			 * Whether or not we successfully allocated a copy page,
+			 * we take a ref on the page here.  It gets undone in
+			 * wait_on_bio_chain().
+			 */
+			struct page *page_copy;
+			page_copy = alloc_page(GFP_ATOMIC);
+			if (page_copy == NULL) {
+				WARN_ON_ONCE(1);
+				bio_chain = NULL;	/* Go synchronous */
+				get_page(page);
+			} else {
+				memcpy(page_address(page_copy),
+					page_address(page), PAGE_SIZE);
+				page = page_copy;
+			}
+		}
 		entry = swp_entry(root_swap, offset);
-		error = rw_swap_page_sync(WRITE, entry, virt_to_page(buf));
+		error = rw_swap_page_sync(WRITE, entry, page, bio_chain);
 	}
 	return error;
 }
@@ -185,37 +204,68 @@ static int get_swap_writer(struct swap_map_handle *handle)
 	return 0;
 }
 
-static int swap_write_page(struct swap_map_handle *handle, void *buf)
+static int wait_on_bio_chain(struct bio **bio_chain)
 {
-	int error;
+	struct bio *bio;
+	struct bio *next_bio;
+	int ret = 0;
+
+	if (bio_chain == NULL)
+		return 0;
+
+	bio = *bio_chain;
+	while (bio) {
+		struct page *page;
+
+		next_bio = bio->bi_private;
+		page = bio->bi_io_vec[0].bv_page;
+		wait_on_page_locked(page);
+		if (!PageUptodate(page) || PageError(page))
+			ret = -EIO;
+		put_page(page);
+		bio_put(bio);
+		bio = next_bio;
+	}
+	*bio_chain = NULL;
+	return ret;
+}
+
+static int swap_write_page(struct swap_map_handle *handle, void *buf,
+				struct bio **bio_chain)
+{
+	int error = 0;
 	unsigned long offset;
 
 	if (!handle->cur)
 		return -EINVAL;
 	offset = alloc_swap_page(root_swap, handle->bitmap);
-	error = write_page(buf, offset);
+	error = write_page(buf, offset, bio_chain);
 	if (error)
 		return error;
 	handle->cur->entries[handle->k++] = offset;
 	if (handle->k >= MAP_PAGE_ENTRIES) {
+		error = wait_on_bio_chain(bio_chain);
+		if (error)
+			goto out;
 		offset = alloc_swap_page(root_swap, handle->bitmap);
 		if (!offset)
 			return -ENOSPC;
 		handle->cur->next_swap = offset;
-		error = write_page(handle->cur, handle->cur_swap);
+		error = write_page(handle->cur, handle->cur_swap, NULL);
 		if (error)
-			return error;
+			goto out;
 		memset(handle->cur, 0, PAGE_SIZE);
 		handle->cur_swap = offset;
 		handle->k = 0;
 	}
-	return 0;
+out:
+	return error;
 }
 
 static int flush_swap_writer(struct swap_map_handle *handle)
 {
 	if (handle->cur && handle->cur_swap)
-		return write_page(handle->cur, handle->cur_swap);
+		return write_page(handle->cur, handle->cur_swap, NULL);
 	else
 		return -EINVAL;
 }
@@ -232,6 +282,8 @@ static int save_image(struct swap_map_handle *handle,
 	int ret;
 	int error = 0;
 	int nr_pages;
+	int err2;
+	struct bio *bio;
 	struct timeval start;
 	struct timeval stop;
 
@@ -240,11 +292,13 @@ static int save_image(struct swap_map_handle *handle,
 	if (!m)
 		m = 1;
 	nr_pages = 0;
+	bio = NULL;
 	do_gettimeofday(&start);
 	do {
 		ret = snapshot_read_next(snapshot, PAGE_SIZE);
 		if (ret > 0) {
-			error = swap_write_page(handle, data_of(*snapshot));
+			error = swap_write_page(handle, data_of(*snapshot),
+						&bio);
 			if (error)
 				break;
 			if (!(nr_pages % m))
@@ -252,7 +306,10 @@ static int save_image(struct swap_map_handle *handle,
 			nr_pages++;
 		}
 	} while (ret > 0);
+	err2 = wait_on_bio_chain(&bio);
 	do_gettimeofday(&stop);
+	if (!error)
+		error = err2;
 	if (!error)
 		printk("\b\b\b\bdone\n");
 	show_speed(&start, &stop, nr_to_write, "Wrote");
@@ -307,7 +364,7 @@ int swsusp_write(void)
 	error = get_swap_writer(&handle);
 	if (!error) {
 		unsigned long start = handle.cur_swap;
-		error = swap_write_page(&handle, header);
+		error = swap_write_page(&handle, header, NULL);
 		if (!error)
 			error = save_image(&handle, &snapshot,
 					header->pages - 1);
diff --git a/mm/page_io.c b/mm/page_io.c
index d2f0a5783370..f46a9862b7ef 100644
--- a/mm/page_io.c
+++ b/mm/page_io.c
@@ -156,10 +156,12 @@ out:
  * We use end_swap_bio_read() even for writes, because it happens to do what
  * we want.
  */
-int rw_swap_page_sync(int rw, swp_entry_t entry, struct page *page)
+int rw_swap_page_sync(int rw, swp_entry_t entry, struct page *page,
+			struct bio **bio_chain)
 {
 	struct bio *bio;
 	int ret = 0;
+	int bio_rw;
 
 	lock_page(page);
 
@@ -170,11 +172,22 @@ int rw_swap_page_sync(int rw, swp_entry_t entry, struct page *page)
 		goto out;
 	}
 
-	submit_bio(rw | (1 << BIO_RW_SYNC), bio);
-	wait_on_page_locked(page);
-
-	if (!PageUptodate(page) || PageError(page))
-		ret = -EIO;
+	bio_rw = rw;
+	if (!bio_chain)
+		bio_rw |= (1 << BIO_RW_SYNC);
+	if (bio_chain)
+		bio_get(bio);
+	submit_bio(bio_rw, bio);
+	if (bio_chain == NULL) {
+		wait_on_page_locked(page);
+
+		if (!PageUptodate(page) || PageError(page))
+			ret = -EIO;
+	}
+	if (bio_chain) {
+		bio->bi_private = *bio_chain;
+		*bio_chain = bio;
+	}
 out:
 	return ret;
 }
-- 
cgit v1.2.3


From 8c002494b55119a3fd1dddee83b4fb75cfda47e5 Mon Sep 17 00:00:00 2001
From: Andrew Morton <akpm@osdl.org>
Date: Mon, 25 Sep 2006 23:32:43 -0700
Subject: [PATCH] swsusp: add read-speed instrumentation

Add some instrumentation to the swsusp readin code to show what bandwidth
we're achieving.

Cc: Pavel Machek <pavel@ucw.cz>
Cc: "Rafael J. Wysocki" <rjw@sisk.pl>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
---
 kernel/power/swap.c | 5 +++++
 1 file changed, 5 insertions(+)

(limited to 'kernel')

diff --git a/kernel/power/swap.c b/kernel/power/swap.c
index 2dc883d361d5..9ab989572164 100644
--- a/kernel/power/swap.c
+++ b/kernel/power/swap.c
@@ -522,12 +522,15 @@ static int load_image(struct swap_map_handle *handle,
 	unsigned int m;
 	int ret;
 	int error = 0;
+	struct timeval start;
+	struct timeval stop;
 
 	printk("Loading image data pages (%u pages) ...     ", nr_pages);
 	m = nr_pages / 100;
 	if (!m)
 		m = 1;
 	nr_pages = 0;
+	do_gettimeofday(&start);
 	do {
 		ret = snapshot_write_next(snapshot, PAGE_SIZE);
 		if (ret > 0) {
@@ -539,11 +542,13 @@ static int load_image(struct swap_map_handle *handle,
 			nr_pages++;
 		}
 	} while (ret > 0);
+	do_gettimeofday(&stop);
 	if (!error) {
 		printk("\b\b\b\bdone\n");
 		if (!snapshot_image_loaded(snapshot))
 			error = -ENODATA;
 	}
+	show_speed(&start, &stop, nr_pages, "Read");
 	return error;
 }
 
-- 
cgit v1.2.3


From 546e0d271941dd1ff6961e2a1f7eac75f1fc277e Mon Sep 17 00:00:00 2001
From: Andrew Morton <akpm@osdl.org>
Date: Mon, 25 Sep 2006 23:32:44 -0700
Subject: [PATCH] swsusp: read speedup

Implement async reads for swsusp resuming.

Crufty old PIII testbox:
	15.7 MB/s -> 20.3 MB/s

Sony Vaio:
	14.6 MB/s -> 33.3 MB/s

I didn't implement the post-resume bio_set_pages_dirty().  I don't really
understand why resume needs to run set_page_dirty() against these pages.

It might be a worry that this code modifies PG_Uptodate, PG_Error and
PG_Locked against the image pages.  Can this possibly affect the resumed-into
kernel?  Hopefully not, if we're atomically restoring its mem_map?

Cc: Pavel Machek <pavel@ucw.cz>
Cc: "Rafael J. Wysocki" <rjw@sisk.pl>
Cc: Jens Axboe <axboe@suse.de>
Cc: Laurent Riffard <laurent.riffard@free.fr>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
---
 include/linux/swap.h    |   1 +
 kernel/power/power.h    |   1 +
 kernel/power/snapshot.c |  11 ++--
 kernel/power/swap.c     | 138 ++++++++++++++++++++++++------------------------
 mm/page_io.c            |   2 +-
 5 files changed, 81 insertions(+), 72 deletions(-)

(limited to 'kernel')

diff --git a/include/linux/swap.h b/include/linux/swap.h
index 3d434cbffe26..e7c36ba2a2db 100644
--- a/include/linux/swap.h
+++ b/include/linux/swap.h
@@ -220,6 +220,7 @@ extern int swap_readpage(struct file *, struct page *);
 extern int swap_writepage(struct page *page, struct writeback_control *wbc);
 extern int rw_swap_page_sync(int rw, swp_entry_t entry, struct page *page,
 				struct bio **bio_chain);
+extern int end_swap_bio_read(struct bio *bio, unsigned int bytes_done, int err);
 
 /* linux/mm/swap_state.c */
 extern struct address_space swapper_space;
diff --git a/kernel/power/power.h b/kernel/power/power.h
index 57a792982fb9..59ce712f082d 100644
--- a/kernel/power/power.h
+++ b/kernel/power/power.h
@@ -58,6 +58,7 @@ struct snapshot_handle {
 	struct pbe	*pbe, *last_pbe;
 	void		*buffer;
 	unsigned int	buf_offset;
+	int		sync_read;
 };
 
 #define data_of(handle)	((handle).buffer + (handle).buf_offset)
diff --git a/kernel/power/snapshot.c b/kernel/power/snapshot.c
index 75d4886e648e..591301ae8b7d 100644
--- a/kernel/power/snapshot.c
+++ b/kernel/power/snapshot.c
@@ -314,7 +314,7 @@ static unsigned int unsafe_pages;
  *	and we count them using unsafe_pages
  */
 
-static inline void *alloc_image_page(gfp_t gfp_mask, int safe_needed)
+static void *alloc_image_page(gfp_t gfp_mask, int safe_needed)
 {
 	void *res;
 
@@ -828,13 +828,16 @@ int snapshot_write_next(struct snapshot_handle *handle, size_t count)
 	}
 	if (!handle->offset)
 		handle->buffer = buffer;
+	handle->sync_read = 1;
 	if (handle->prev < handle->page) {
 		if (!handle->prev) {
-			error = load_header(handle, (struct swsusp_info *)buffer);
+			error = load_header(handle,
+					(struct swsusp_info *)buffer);
 			if (error)
 				return error;
 		} else if (handle->prev <= nr_meta_pages) {
-			handle->pbe = unpack_orig_addresses(buffer, handle->pbe);
+			handle->pbe = unpack_orig_addresses(buffer,
+							handle->pbe);
 			if (!handle->pbe) {
 				error = prepare_image(handle);
 				if (error)
@@ -842,10 +845,12 @@ int snapshot_write_next(struct snapshot_handle *handle, size_t count)
 				handle->pbe = pagedir_nosave;
 				handle->last_pbe = NULL;
 				handle->buffer = get_buffer(handle);
+				handle->sync_read = 0;
 			}
 		} else {
 			handle->pbe = handle->pbe->next;
 			handle->buffer = get_buffer(handle);
+			handle->sync_read = 0;
 		}
 		handle->prev = handle->page;
 	}
diff --git a/kernel/power/swap.c b/kernel/power/swap.c
index 9ab989572164..8309d20b2563 100644
--- a/kernel/power/swap.c
+++ b/kernel/power/swap.c
@@ -22,6 +22,7 @@
 #include <linux/device.h>
 #include <linux/buffer_head.h>
 #include <linux/bio.h>
+#include <linux/blkdev.h>
 #include <linux/swap.h>
 #include <linux/swapops.h>
 #include <linux/pm.h>
@@ -214,6 +215,8 @@ static int wait_on_bio_chain(struct bio **bio_chain)
 		return 0;
 
 	bio = *bio_chain;
+	if (bio == NULL)
+		return 0;
 	while (bio) {
 		struct page *page;
 
@@ -349,7 +352,8 @@ int swsusp_write(void)
 	int error;
 
 	if ((error = swsusp_swap_check())) {
-		printk(KERN_ERR "swsusp: Cannot find swap device, try swapon -a.\n");
+		printk(KERN_ERR "swsusp: Cannot find swap device, try "
+				"swapon -a.\n");
 		return error;
 	}
 	memset(&snapshot, 0, sizeof(struct snapshot_handle));
@@ -381,27 +385,6 @@ int swsusp_write(void)
 	return error;
 }
 
-/*
- *	Using bio to read from swap.
- *	This code requires a bit more work than just using buffer heads
- *	but, it is the recommended way for 2.5/2.6.
- *	The following are to signal the beginning and end of I/O. Bios
- *	finish asynchronously, while we want them to happen synchronously.
- *	A simple atomic_t, and a wait loop take care of this problem.
- */
-
-static atomic_t io_done = ATOMIC_INIT(0);
-
-static int end_io(struct bio *bio, unsigned int num, int err)
-{
-	if (!test_bit(BIO_UPTODATE, &bio->bi_flags)) {
-		printk(KERN_ERR "I/O error reading swsusp image.\n");
-		return -EIO;
-	}
-	atomic_set(&io_done, 0);
-	return 0;
-}
-
 static struct block_device *resume_bdev;
 
 /**
@@ -409,15 +392,15 @@ static struct block_device *resume_bdev;
  *	@rw:	READ or WRITE.
  *	@off	physical offset of page.
  *	@page:	page we're reading or writing.
+ *	@bio_chain: list of pending biod (for async reading)
  *
  *	Straight from the textbook - allocate and initialize the bio.
- *	If we're writing, make sure the page is marked as dirty.
- *	Then submit it and wait.
+ *	If we're reading, make sure the page is marked as dirty.
+ *	Then submit it and, if @bio_chain == NULL, wait.
  */
-
-static int submit(int rw, pgoff_t page_off, void *page)
+static int submit(int rw, pgoff_t page_off, struct page *page,
+			struct bio **bio_chain)
 {
-	int error = 0;
 	struct bio *bio;
 
 	bio = bio_alloc(GFP_ATOMIC, 1);
@@ -425,33 +408,40 @@ static int submit(int rw, pgoff_t page_off, void *page)
 		return -ENOMEM;
 	bio->bi_sector = page_off * (PAGE_SIZE >> 9);
 	bio->bi_bdev = resume_bdev;
-	bio->bi_end_io = end_io;
+	bio->bi_end_io = end_swap_bio_read;
 
-	if (bio_add_page(bio, virt_to_page(page), PAGE_SIZE, 0) < PAGE_SIZE) {
-		printk("swsusp: ERROR: adding page to bio at %ld\n",page_off);
-		error = -EFAULT;
-		goto Done;
+	if (bio_add_page(bio, page, PAGE_SIZE, 0) < PAGE_SIZE) {
+		printk("swsusp: ERROR: adding page to bio at %ld\n", page_off);
+		bio_put(bio);
+		return -EFAULT;
 	}
 
-	atomic_set(&io_done, 1);
-	submit_bio(rw | (1 << BIO_RW_SYNC), bio);
-	while (atomic_read(&io_done))
-		yield();
-	if (rw == READ)
-		bio_set_pages_dirty(bio);
- Done:
-	bio_put(bio);
-	return error;
+	lock_page(page);
+	bio_get(bio);
+
+	if (bio_chain == NULL) {
+		submit_bio(rw | (1 << BIO_RW_SYNC), bio);
+		wait_on_page_locked(page);
+		if (rw == READ)
+			bio_set_pages_dirty(bio);
+		bio_put(bio);
+	} else {
+		get_page(page);
+		bio->bi_private = *bio_chain;
+		*bio_chain = bio;
+		submit_bio(rw | (1 << BIO_RW_SYNC), bio);
+	}
+	return 0;
 }
 
-static int bio_read_page(pgoff_t page_off, void *page)
+static int bio_read_page(pgoff_t page_off, void *addr, struct bio **bio_chain)
 {
-	return submit(READ, page_off, page);
+	return submit(READ, page_off, virt_to_page(addr), bio_chain);
 }
 
-static int bio_write_page(pgoff_t page_off, void *page)
+static int bio_write_page(pgoff_t page_off, void *addr)
 {
-	return submit(WRITE, page_off, page);
+	return submit(WRITE, page_off, virt_to_page(addr), NULL);
 }
 
 /**
@@ -476,7 +466,7 @@ static int get_swap_reader(struct swap_map_handle *handle,
 	handle->cur = (struct swap_map_page *)get_zeroed_page(GFP_ATOMIC);
 	if (!handle->cur)
 		return -ENOMEM;
-	error = bio_read_page(swp_offset(start), handle->cur);
+	error = bio_read_page(swp_offset(start), handle->cur, NULL);
 	if (error) {
 		release_swap_reader(handle);
 		return error;
@@ -485,7 +475,8 @@ static int get_swap_reader(struct swap_map_handle *handle,
 	return 0;
 }
 
-static int swap_read_page(struct swap_map_handle *handle, void *buf)
+static int swap_read_page(struct swap_map_handle *handle, void *buf,
+				struct bio **bio_chain)
 {
 	unsigned long offset;
 	int error;
@@ -495,16 +486,17 @@ static int swap_read_page(struct swap_map_handle *handle, void *buf)
 	offset = handle->cur->entries[handle->k];
 	if (!offset)
 		return -EFAULT;
-	error = bio_read_page(offset, buf);
+	error = bio_read_page(offset, buf, bio_chain);
 	if (error)
 		return error;
 	if (++handle->k >= MAP_PAGE_ENTRIES) {
+		error = wait_on_bio_chain(bio_chain);
 		handle->k = 0;
 		offset = handle->cur->next_swap;
 		if (!offset)
 			release_swap_reader(handle);
-		else
-			error = bio_read_page(offset, handle->cur);
+		else if (!error)
+			error = bio_read_page(offset, handle->cur, NULL);
 	}
 	return error;
 }
@@ -517,38 +509,48 @@ static int swap_read_page(struct swap_map_handle *handle, void *buf)
 
 static int load_image(struct swap_map_handle *handle,
                       struct snapshot_handle *snapshot,
-                      unsigned int nr_pages)
+                      unsigned int nr_to_read)
 {
 	unsigned int m;
-	int ret;
 	int error = 0;
 	struct timeval start;
 	struct timeval stop;
+	struct bio *bio;
+	int err2;
+	unsigned nr_pages;
 
-	printk("Loading image data pages (%u pages) ...     ", nr_pages);
-	m = nr_pages / 100;
+	printk("Loading image data pages (%u pages) ...     ", nr_to_read);
+	m = nr_to_read / 100;
 	if (!m)
 		m = 1;
 	nr_pages = 0;
+	bio = NULL;
 	do_gettimeofday(&start);
-	do {
-		ret = snapshot_write_next(snapshot, PAGE_SIZE);
-		if (ret > 0) {
-			error = swap_read_page(handle, data_of(*snapshot));
-			if (error)
-				break;
-			if (!(nr_pages % m))
-				printk("\b\b\b\b%3d%%", nr_pages / m);
-			nr_pages++;
-		}
-	} while (ret > 0);
+	for ( ; ; ) {
+		error = snapshot_write_next(snapshot, PAGE_SIZE);
+		if (error <= 0)
+			break;
+		error = swap_read_page(handle, data_of(*snapshot), &bio);
+		if (error)
+			break;
+		if (snapshot->sync_read)
+			error = wait_on_bio_chain(&bio);
+		if (error)
+			break;
+		if (!(nr_pages % m))
+			printk("\b\b\b\b%3d%%", nr_pages / m);
+		nr_pages++;
+	}
+	err2 = wait_on_bio_chain(&bio);
 	do_gettimeofday(&stop);
+	if (!error)
+		error = err2;
 	if (!error) {
 		printk("\b\b\b\bdone\n");
 		if (!snapshot_image_loaded(snapshot))
 			error = -ENODATA;
 	}
-	show_speed(&start, &stop, nr_pages, "Read");
+	show_speed(&start, &stop, nr_to_read, "Read");
 	return error;
 }
 
@@ -571,7 +573,7 @@ int swsusp_read(void)
 	header = (struct swsusp_info *)data_of(snapshot);
 	error = get_swap_reader(&handle, swsusp_header.image);
 	if (!error)
-		error = swap_read_page(&handle, header);
+		error = swap_read_page(&handle, header, NULL);
 	if (!error)
 		error = load_image(&handle, &snapshot, header->pages - 1);
 	release_swap_reader(&handle);
@@ -597,7 +599,7 @@ int swsusp_check(void)
 	if (!IS_ERR(resume_bdev)) {
 		set_blocksize(resume_bdev, PAGE_SIZE);
 		memset(&swsusp_header, 0, sizeof(swsusp_header));
-		if ((error = bio_read_page(0, &swsusp_header)))
+		if ((error = bio_read_page(0, &swsusp_header, NULL)))
 			return error;
 		if (!memcmp(SWSUSP_SIG, swsusp_header.sig, 10)) {
 			memcpy(swsusp_header.sig, swsusp_header.orig_sig, 10);
diff --git a/mm/page_io.c b/mm/page_io.c
index f46a9862b7ef..d4840ecbf8f9 100644
--- a/mm/page_io.c
+++ b/mm/page_io.c
@@ -74,7 +74,7 @@ static int end_swap_bio_write(struct bio *bio, unsigned int bytes_done, int err)
 	return 0;
 }
 
-static int end_swap_bio_read(struct bio *bio, unsigned int bytes_done, int err)
+int end_swap_bio_read(struct bio *bio, unsigned int bytes_done, int err)
 {
 	const int uptodate = test_bit(BIO_UPTODATE, &bio->bi_flags);
 	struct page *page = bio->bi_io_vec[0].bv_page;
-- 
cgit v1.2.3


From ae83c5eef59ffe6eb61110b8c8fd1ea3e0881712 Mon Sep 17 00:00:00 2001
From: "Rafael J. Wysocki" <rjw@sisk.pl>
Date: Mon, 25 Sep 2006 23:32:45 -0700
Subject: [PATCH] swsusp: clean up browsing of pfns

Clean up some loops over pfns for each zone in snapshot.c: reduce the
number of additions to perform, rework detection of saveable pages and make
the code a bit less difficult to understand, hopefully.

Signed-off-by: Rafael J. Wysocki <rjw@sisk.pl>
Acked-by: Pavel Machek <pavel@ucw.cz>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
---
 kernel/power/snapshot.c | 62 +++++++++++++++++++++++++------------------------
 1 file changed, 32 insertions(+), 30 deletions(-)

(limited to 'kernel')

diff --git a/kernel/power/snapshot.c b/kernel/power/snapshot.c
index 591301ae8b7d..979096c27773 100644
--- a/kernel/power/snapshot.c
+++ b/kernel/power/snapshot.c
@@ -156,7 +156,7 @@ static inline int save_highmem(void) {return 0;}
 static inline int restore_highmem(void) {return 0;}
 #endif
 
-static int pfn_is_nosave(unsigned long pfn)
+static inline int pfn_is_nosave(unsigned long pfn)
 {
 	unsigned long nosave_begin_pfn = __pa(&__nosave_begin) >> PAGE_SHIFT;
 	unsigned long nosave_end_pfn = PAGE_ALIGN(__pa(&__nosave_end)) >> PAGE_SHIFT;
@@ -167,43 +167,43 @@ static int pfn_is_nosave(unsigned long pfn)
  *	saveable - Determine whether a page should be cloned or not.
  *	@pfn:	The page
  *
- *	We save a page if it's Reserved, and not in the range of pages
- *	statically defined as 'unsaveable', or if it isn't reserved, and
- *	isn't part of a free chunk of pages.
+ *	We save a page if it isn't Nosave, and is not in the range of pages
+ *	statically defined as 'unsaveable', and it
+ *	isn't a part of a free chunk of pages.
  */
 
-static int saveable(struct zone *zone, unsigned long *zone_pfn)
+static struct page *saveable_page(unsigned long pfn)
 {
-	unsigned long pfn = *zone_pfn + zone->zone_start_pfn;
 	struct page *page;
 
 	if (!pfn_valid(pfn))
-		return 0;
+		return NULL;
 
 	page = pfn_to_page(pfn);
-	BUG_ON(PageReserved(page) && PageNosave(page));
+
 	if (PageNosave(page))
-		return 0;
+		return NULL;
 	if (PageReserved(page) && pfn_is_nosave(pfn))
-		return 0;
+		return NULL;
 	if (PageNosaveFree(page))
-		return 0;
+		return NULL;
 
-	return 1;
+	return page;
 }
 
 unsigned int count_data_pages(void)
 {
 	struct zone *zone;
-	unsigned long zone_pfn;
+	unsigned long pfn, max_zone_pfn;
 	unsigned int n = 0;
 
 	for_each_zone (zone) {
 		if (is_highmem(zone))
 			continue;
 		mark_free_pages(zone);
-		for (zone_pfn = 0; zone_pfn < zone->spanned_pages; ++zone_pfn)
-			n += saveable(zone, &zone_pfn);
+		max_zone_pfn = zone->zone_start_pfn + zone->spanned_pages;
+		for (pfn = zone->zone_start_pfn; pfn < max_zone_pfn; pfn++)
+			n += !!saveable_page(pfn);
 	}
 	return n;
 }
@@ -211,7 +211,7 @@ unsigned int count_data_pages(void)
 static void copy_data_pages(struct pbe *pblist)
 {
 	struct zone *zone;
-	unsigned long zone_pfn;
+	unsigned long pfn, max_zone_pfn;
 	struct pbe *pbe, *p;
 
 	pbe = pblist;
@@ -224,13 +224,14 @@ static void copy_data_pages(struct pbe *pblist)
 			SetPageNosaveFree(virt_to_page(p));
 		for_each_pbe (p, pblist)
 			SetPageNosaveFree(virt_to_page(p->address));
-		for (zone_pfn = 0; zone_pfn < zone->spanned_pages; ++zone_pfn) {
-			if (saveable(zone, &zone_pfn)) {
-				struct page *page;
+		max_zone_pfn = zone->zone_start_pfn + zone->spanned_pages;
+		for (pfn = zone->zone_start_pfn; pfn < max_zone_pfn; pfn++) {
+			struct page *page = saveable_page(pfn);
+
+			if (page) {
 				long *src, *dst;
 				int n;
 
-				page = pfn_to_page(zone_pfn + zone->zone_start_pfn);
 				BUG_ON(!pbe);
 				pbe->orig_address = (unsigned long)page_address(page);
 				/* copy_page and memcpy are not usable for copying task structs. */
@@ -383,13 +384,14 @@ static struct pbe *alloc_pagedir(unsigned int nr_pages, gfp_t gfp_mask,
 void swsusp_free(void)
 {
 	struct zone *zone;
-	unsigned long zone_pfn;
+	unsigned long pfn, max_zone_pfn;
 
 	for_each_zone(zone) {
-		for (zone_pfn = 0; zone_pfn < zone->spanned_pages; ++zone_pfn)
-			if (pfn_valid(zone_pfn + zone->zone_start_pfn)) {
-				struct page *page;
-				page = pfn_to_page(zone_pfn + zone->zone_start_pfn);
+		max_zone_pfn = zone->zone_start_pfn + zone->spanned_pages;
+		for (pfn = zone->zone_start_pfn; pfn < max_zone_pfn; pfn++)
+			if (pfn_valid(pfn)) {
+				struct page *page = pfn_to_page(pfn);
+
 				if (PageNosave(page) && PageNosaveFree(page)) {
 					ClearPageNosave(page);
 					ClearPageNosaveFree(page);
@@ -598,7 +600,7 @@ int snapshot_read_next(struct snapshot_handle *handle, size_t count)
 static int mark_unsafe_pages(struct pbe *pblist)
 {
 	struct zone *zone;
-	unsigned long zone_pfn;
+	unsigned long pfn, max_zone_pfn;
 	struct pbe *p;
 
 	if (!pblist) /* a sanity check */
@@ -606,10 +608,10 @@ static int mark_unsafe_pages(struct pbe *pblist)
 
 	/* Clear page flags */
 	for_each_zone (zone) {
-		for (zone_pfn = 0; zone_pfn < zone->spanned_pages; ++zone_pfn)
-			if (pfn_valid(zone_pfn + zone->zone_start_pfn))
-				ClearPageNosaveFree(pfn_to_page(zone_pfn +
-					zone->zone_start_pfn));
+		max_zone_pfn = zone->zone_start_pfn + zone->spanned_pages;
+		for (pfn = zone->zone_start_pfn; pfn < max_zone_pfn; pfn++)
+			if (pfn_valid(pfn))
+				ClearPageNosaveFree(pfn_to_page(pfn));
 	}
 
 	/* Mark orig addresses */
-- 
cgit v1.2.3


From fb13a28b0f5ada60861868c4fa48a12bd0cb8dea Mon Sep 17 00:00:00 2001
From: "Rafael J. Wysocki" <rjw@sisk.pl>
Date: Mon, 25 Sep 2006 23:32:46 -0700
Subject: [PATCH] swsusp: struct snapshot_handle cleanup

Add comments describing struct snapshot_handle and its members, change the
confusing name of its member 'page' to 'cur'.

Signed-off-by: Rafael J. Wysocki <rjw@sisk.pl>
Cc: Pavel Machek <pavel@ucw.cz>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
---
 kernel/power/power.h    | 64 ++++++++++++++++++++++++++++++++++++++++++-------
 kernel/power/snapshot.c | 40 +++++++++++++++----------------
 2 files changed, 76 insertions(+), 28 deletions(-)

(limited to 'kernel')

diff --git a/kernel/power/power.h b/kernel/power/power.h
index 59ce712f082d..1cefcf87a694 100644
--- a/kernel/power/power.h
+++ b/kernel/power/power.h
@@ -50,17 +50,65 @@ extern asmlinkage int swsusp_arch_resume(void);
 
 extern unsigned int count_data_pages(void);
 
+/**
+ *	Auxiliary structure used for reading the snapshot image data and
+ *	metadata from and writing them to the list of page backup entries
+ *	(PBEs) which is the main data structure of swsusp.
+ *
+ *	Using struct snapshot_handle we can transfer the image, including its
+ *	metadata, as a continuous sequence of bytes with the help of
+ *	snapshot_read_next() and snapshot_write_next().
+ *
+ *	The code that writes the image to a storage or transfers it to
+ *	the user land is required to use snapshot_read_next() for this
+ *	purpose and it should not make any assumptions regarding the internal
+ *	structure of the image.  Similarly, the code that reads the image from
+ *	a storage or transfers it from the user land is required to use
+ *	snapshot_write_next().
+ *
+ *	This may allow us to change the internal structure of the image
+ *	in the future with considerably less effort.
+ */
+
 struct snapshot_handle {
-	loff_t		offset;
-	unsigned int	page;
-	unsigned int	page_offset;
-	unsigned int	prev;
-	struct pbe	*pbe, *last_pbe;
-	void		*buffer;
-	unsigned int	buf_offset;
-	int		sync_read;
+	loff_t		offset;	/* number of the last byte ready for reading
+				 * or writing in the sequence
+				 */
+	unsigned int	cur;	/* number of the block of PAGE_SIZE bytes the
+				 * next operation will refer to (ie. current)
+				 */
+	unsigned int	cur_offset;	/* offset with respect to the current
+					 * block (for the next operation)
+					 */
+	unsigned int	prev;	/* number of the block of PAGE_SIZE bytes that
+				 * was the current one previously
+				 */
+	struct pbe	*pbe;	/* PBE that corresponds to 'buffer' */
+	struct pbe	*last_pbe;	/* When the image is restored (eg. read
+					 * from disk) we can store some image
+					 * data directly in the page frames
+					 * in which they were before suspend.
+					 * In such a case the PBEs that
+					 * correspond to them will be unused.
+					 * This is the last PBE, so far, that
+					 * does not correspond to such data.
+					 */
+	void		*buffer;	/* address of the block to read from
+					 * or write to
+					 */
+	unsigned int	buf_offset;	/* location to read from or write to,
+					 * given as a displacement from 'buffer'
+					 */
+	int		sync_read;	/* Set to one to notify the caller of
+					 * snapshot_write_next() that it may
+					 * need to call wait_on_bio_chain()
+					 */
 };
 
+/* This macro returns the address from/to which the caller of
+ * snapshot_read_next()/snapshot_write_next() is allowed to
+ * read/write data after the function returns
+ */
 #define data_of(handle)	((handle).buffer + (handle).buf_offset)
 
 extern int snapshot_read_next(struct snapshot_handle *handle, size_t count);
diff --git a/kernel/power/snapshot.c b/kernel/power/snapshot.c
index 979096c27773..81fe8de9e604 100644
--- a/kernel/power/snapshot.c
+++ b/kernel/power/snapshot.c
@@ -555,7 +555,7 @@ static inline struct pbe *pack_orig_addresses(unsigned long *buf, struct pbe *pb
 
 int snapshot_read_next(struct snapshot_handle *handle, size_t count)
 {
-	if (handle->page > nr_meta_pages + nr_copy_pages)
+	if (handle->cur > nr_meta_pages + nr_copy_pages)
 		return 0;
 	if (!buffer) {
 		/* This makes the buffer be freed by swsusp_free() */
@@ -568,8 +568,8 @@ int snapshot_read_next(struct snapshot_handle *handle, size_t count)
 		handle->buffer = buffer;
 		handle->pbe = pagedir_nosave;
 	}
-	if (handle->prev < handle->page) {
-		if (handle->page <= nr_meta_pages) {
+	if (handle->prev < handle->cur) {
+		if (handle->cur <= nr_meta_pages) {
 			handle->pbe = pack_orig_addresses(buffer, handle->pbe);
 			if (!handle->pbe)
 				handle->pbe = pagedir_nosave;
@@ -577,15 +577,15 @@ int snapshot_read_next(struct snapshot_handle *handle, size_t count)
 			handle->buffer = (void *)handle->pbe->address;
 			handle->pbe = handle->pbe->next;
 		}
-		handle->prev = handle->page;
+		handle->prev = handle->cur;
 	}
-	handle->buf_offset = handle->page_offset;
-	if (handle->page_offset + count >= PAGE_SIZE) {
-		count = PAGE_SIZE - handle->page_offset;
-		handle->page_offset = 0;
-		handle->page++;
+	handle->buf_offset = handle->cur_offset;
+	if (handle->cur_offset + count >= PAGE_SIZE) {
+		count = PAGE_SIZE - handle->cur_offset;
+		handle->cur_offset = 0;
+		handle->cur++;
 	} else {
-		handle->page_offset += count;
+		handle->cur_offset += count;
 	}
 	handle->offset += count;
 	return count;
@@ -820,7 +820,7 @@ int snapshot_write_next(struct snapshot_handle *handle, size_t count)
 {
 	int error = 0;
 
-	if (handle->prev && handle->page > nr_meta_pages + nr_copy_pages)
+	if (handle->prev && handle->cur > nr_meta_pages + nr_copy_pages)
 		return 0;
 	if (!buffer) {
 		/* This makes the buffer be freed by swsusp_free() */
@@ -831,7 +831,7 @@ int snapshot_write_next(struct snapshot_handle *handle, size_t count)
 	if (!handle->offset)
 		handle->buffer = buffer;
 	handle->sync_read = 1;
-	if (handle->prev < handle->page) {
+	if (handle->prev < handle->cur) {
 		if (!handle->prev) {
 			error = load_header(handle,
 					(struct swsusp_info *)buffer);
@@ -854,15 +854,15 @@ int snapshot_write_next(struct snapshot_handle *handle, size_t count)
 			handle->buffer = get_buffer(handle);
 			handle->sync_read = 0;
 		}
-		handle->prev = handle->page;
+		handle->prev = handle->cur;
 	}
-	handle->buf_offset = handle->page_offset;
-	if (handle->page_offset + count >= PAGE_SIZE) {
-		count = PAGE_SIZE - handle->page_offset;
-		handle->page_offset = 0;
-		handle->page++;
+	handle->buf_offset = handle->cur_offset;
+	if (handle->cur_offset + count >= PAGE_SIZE) {
+		count = PAGE_SIZE - handle->cur_offset;
+		handle->cur_offset = 0;
+		handle->cur++;
 	} else {
-		handle->page_offset += count;
+		handle->cur_offset += count;
 	}
 	handle->offset += count;
 	return count;
@@ -871,5 +871,5 @@ int snapshot_write_next(struct snapshot_handle *handle, size_t count)
 int snapshot_image_loaded(struct snapshot_handle *handle)
 {
 	return !(!handle->pbe || handle->pbe->next || !nr_copy_pages ||
-		handle->page <= nr_meta_pages + nr_copy_pages);
+		handle->cur <= nr_meta_pages + nr_copy_pages);
 }
-- 
cgit v1.2.3


From e3920fb42c8ddfe63befb54d95c0e13eabacea9b Mon Sep 17 00:00:00 2001
From: "Rafael J. Wysocki" <rjw@sisk.pl>
Date: Mon, 25 Sep 2006 23:32:48 -0700
Subject: [PATCH] Disable CPU hotplug during suspend

The current suspend code has to be run on one CPU, so we use the CPU
hotplug to take the non-boot CPUs offline on SMP machines.  However, we
should also make sure that these CPUs will not be enabled by someone else
after we have disabled them.

The functions disable_nonboot_cpus() and enable_nonboot_cpus() are moved to
kernel/cpu.c, because they now refer to some stuff in there that should
better be static.  Also it's better if disable_nonboot_cpus() returns an
error instead of panicking if something goes wrong, and
enable_nonboot_cpus() has no reason to panic(), because the CPUs may have
been enabled by the userland before it tries to take them online.

Signed-off-by: Rafael J. Wysocki <rjw@sisk.pl>
Acked-by: Pavel Machek <pavel@ucw.cz>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
---
 include/linux/cpu.h     |   8 +++
 include/linux/suspend.h |   8 ---
 kernel/cpu.c            | 138 +++++++++++++++++++++++++++++++++++++++++-------
 kernel/power/Makefile   |   2 -
 kernel/power/disk.c     |   7 ++-
 kernel/power/main.c     |  10 ++--
 kernel/power/smp.c      |  62 ----------------------
 kernel/power/user.c     |  14 +++--
 8 files changed, 145 insertions(+), 104 deletions(-)
 delete mode 100644 kernel/power/smp.c

(limited to 'kernel')

diff --git a/include/linux/cpu.h b/include/linux/cpu.h
index 8fb344a9abd8..3fef7d67aedc 100644
--- a/include/linux/cpu.h
+++ b/include/linux/cpu.h
@@ -89,4 +89,12 @@ int cpu_down(unsigned int cpu);
 static inline int cpu_is_offline(int cpu) { return 0; }
 #endif
 
+#ifdef CONFIG_SUSPEND_SMP
+extern int disable_nonboot_cpus(void);
+extern void enable_nonboot_cpus(void);
+#else
+static inline int disable_nonboot_cpus(void) { return 0; }
+static inline void enable_nonboot_cpus(void) {}
+#endif
+
 #endif /* _LINUX_CPU_H_ */
diff --git a/include/linux/suspend.h b/include/linux/suspend.h
index 96e31aa64cc7..6e8a06c950f4 100644
--- a/include/linux/suspend.h
+++ b/include/linux/suspend.h
@@ -57,14 +57,6 @@ static inline int software_suspend(void)
 }
 #endif /* CONFIG_PM */
 
-#ifdef CONFIG_SUSPEND_SMP
-extern void disable_nonboot_cpus(void);
-extern void enable_nonboot_cpus(void);
-#else
-static inline void disable_nonboot_cpus(void) {}
-static inline void enable_nonboot_cpus(void) {}
-#endif
-
 void save_processor_state(void);
 void restore_processor_state(void);
 struct saved_context;
diff --git a/kernel/cpu.c b/kernel/cpu.c
index f230f9ae01c2..32c96628463e 100644
--- a/kernel/cpu.c
+++ b/kernel/cpu.c
@@ -21,6 +21,11 @@ static DEFINE_MUTEX(cpu_bitmask_lock);
 
 static __cpuinitdata BLOCKING_NOTIFIER_HEAD(cpu_chain);
 
+/* If set, cpu_up and cpu_down will return -EBUSY and do nothing.
+ * Should always be manipulated under cpu_add_remove_lock
+ */
+static int cpu_hotplug_disabled;
+
 #ifdef CONFIG_HOTPLUG_CPU
 
 /* Crappy recursive lock-takers in cpufreq! Complain loudly about idiots */
@@ -108,30 +113,25 @@ static int take_cpu_down(void *unused)
 	return 0;
 }
 
-int cpu_down(unsigned int cpu)
+/* Requires cpu_add_remove_lock to be held */
+static int _cpu_down(unsigned int cpu)
 {
 	int err;
 	struct task_struct *p;
 	cpumask_t old_allowed, tmp;
 
-	mutex_lock(&cpu_add_remove_lock);
-	if (num_online_cpus() == 1) {
-		err = -EBUSY;
-		goto out;
-	}
+	if (num_online_cpus() == 1)
+		return -EBUSY;
 
-	if (!cpu_online(cpu)) {
-		err = -EINVAL;
-		goto out;
-	}
+	if (!cpu_online(cpu))
+		return -EINVAL;
 
 	err = blocking_notifier_call_chain(&cpu_chain, CPU_DOWN_PREPARE,
 						(void *)(long)cpu);
 	if (err == NOTIFY_BAD) {
 		printk("%s: attempt to take down CPU %u failed\n",
 				__FUNCTION__, cpu);
-		err = -EINVAL;
-		goto out;
+		return -EINVAL;
 	}
 
 	/* Ensure that we are not runnable on dying cpu */
@@ -179,22 +179,32 @@ out_thread:
 	err = kthread_stop(p);
 out_allowed:
 	set_cpus_allowed(current, old_allowed);
-out:
+	return err;
+}
+
+int cpu_down(unsigned int cpu)
+{
+	int err = 0;
+
+	mutex_lock(&cpu_add_remove_lock);
+	if (cpu_hotplug_disabled)
+		err = -EBUSY;
+	else
+		err = _cpu_down(cpu);
+
 	mutex_unlock(&cpu_add_remove_lock);
 	return err;
 }
 #endif /*CONFIG_HOTPLUG_CPU*/
 
-int __devinit cpu_up(unsigned int cpu)
+/* Requires cpu_add_remove_lock to be held */
+static int __devinit _cpu_up(unsigned int cpu)
 {
 	int ret;
 	void *hcpu = (void *)(long)cpu;
 
-	mutex_lock(&cpu_add_remove_lock);
-	if (cpu_online(cpu) || !cpu_present(cpu)) {
-		ret = -EINVAL;
-		goto out;
-	}
+	if (cpu_online(cpu) || !cpu_present(cpu))
+		return -EINVAL;
 
 	ret = blocking_notifier_call_chain(&cpu_chain, CPU_UP_PREPARE, hcpu);
 	if (ret == NOTIFY_BAD) {
@@ -219,7 +229,95 @@ out_notify:
 	if (ret != 0)
 		blocking_notifier_call_chain(&cpu_chain,
 				CPU_UP_CANCELED, hcpu);
+
+	return ret;
+}
+
+int __devinit cpu_up(unsigned int cpu)
+{
+	int err = 0;
+
+	mutex_lock(&cpu_add_remove_lock);
+	if (cpu_hotplug_disabled)
+		err = -EBUSY;
+	else
+		err = _cpu_up(cpu);
+
+	mutex_unlock(&cpu_add_remove_lock);
+	return err;
+}
+
+#ifdef CONFIG_SUSPEND_SMP
+static cpumask_t frozen_cpus;
+
+int disable_nonboot_cpus(void)
+{
+	int cpu, first_cpu, error;
+
+	mutex_lock(&cpu_add_remove_lock);
+	first_cpu = first_cpu(cpu_present_map);
+	if (!cpu_online(first_cpu)) {
+		error = _cpu_up(first_cpu);
+		if (error) {
+			printk(KERN_ERR "Could not bring CPU%d up.\n",
+				first_cpu);
+			goto out;
+		}
+	}
+	error = set_cpus_allowed(current, cpumask_of_cpu(first_cpu));
+	if (error) {
+		printk(KERN_ERR "Could not run on CPU%d\n", first_cpu);
+		goto out;
+	}
+	/* We take down all of the non-boot CPUs in one shot to avoid races
+	 * with the userspace trying to use the CPU hotplug at the same time
+	 */
+	cpus_clear(frozen_cpus);
+	printk("Disabling non-boot CPUs ...\n");
+	for_each_online_cpu(cpu) {
+		if (cpu == first_cpu)
+			continue;
+		error = _cpu_down(cpu);
+		if (!error) {
+			cpu_set(cpu, frozen_cpus);
+			printk("CPU%d is down\n", cpu);
+		} else {
+			printk(KERN_ERR "Error taking CPU%d down: %d\n",
+				cpu, error);
+			break;
+		}
+	}
+	if (!error) {
+		BUG_ON(num_online_cpus() > 1);
+		/* Make sure the CPUs won't be enabled by someone else */
+		cpu_hotplug_disabled = 1;
+	} else {
+		printk(KERN_ERR "Non-boot CPUs are not disabled");
+	}
 out:
 	mutex_unlock(&cpu_add_remove_lock);
-	return ret;
+	return error;
+}
+
+void enable_nonboot_cpus(void)
+{
+	int cpu, error;
+
+	/* Allow everyone to use the CPU hotplug again */
+	mutex_lock(&cpu_add_remove_lock);
+	cpu_hotplug_disabled = 0;
+	mutex_unlock(&cpu_add_remove_lock);
+
+	printk("Enabling non-boot CPUs ...\n");
+	for_each_cpu_mask(cpu, frozen_cpus) {
+		error = cpu_up(cpu);
+		if (!error) {
+			printk("CPU%d is up\n", cpu);
+			continue;
+		}
+		printk(KERN_WARNING "Error taking CPU%d up: %d\n",
+			cpu, error);
+	}
+	cpus_clear(frozen_cpus);
 }
+#endif
diff --git a/kernel/power/Makefile b/kernel/power/Makefile
index 8d0af3d37a4b..38725f526afc 100644
--- a/kernel/power/Makefile
+++ b/kernel/power/Makefile
@@ -7,6 +7,4 @@ obj-y				:= main.o process.o console.o
 obj-$(CONFIG_PM_LEGACY)		+= pm.o
 obj-$(CONFIG_SOFTWARE_SUSPEND)	+= swsusp.o disk.o snapshot.o swap.o user.o
 
-obj-$(CONFIG_SUSPEND_SMP)	+= smp.o
-
 obj-$(CONFIG_MAGIC_SYSRQ)	+= poweroff.o
diff --git a/kernel/power/disk.c b/kernel/power/disk.c
index e13e74067845..7c7b9b65e365 100644
--- a/kernel/power/disk.c
+++ b/kernel/power/disk.c
@@ -18,6 +18,7 @@
 #include <linux/fs.h>
 #include <linux/mount.h>
 #include <linux/pm.h>
+#include <linux/cpu.h>
 
 #include "power.h"
 
@@ -72,7 +73,10 @@ static int prepare_processes(void)
 	int error;
 
 	pm_prepare_console();
-	disable_nonboot_cpus();
+
+	error = disable_nonboot_cpus();
+	if (error)
+		goto enable_cpus;
 
 	if (freeze_processes()) {
 		error = -EBUSY;
@@ -84,6 +88,7 @@ static int prepare_processes(void)
 		return 0;
 thaw:
 	thaw_processes();
+enable_cpus:
 	enable_nonboot_cpus();
 	pm_restore_console();
 	return error;
diff --git a/kernel/power/main.c b/kernel/power/main.c
index 6d295c776794..4d403323a7bb 100644
--- a/kernel/power/main.c
+++ b/kernel/power/main.c
@@ -16,6 +16,7 @@
 #include <linux/init.h>
 #include <linux/pm.h>
 #include <linux/console.h>
+#include <linux/cpu.h>
 
 #include "power.h"
 
@@ -51,7 +52,7 @@ void pm_set_ops(struct pm_ops * ops)
 
 static int suspend_prepare(suspend_state_t state)
 {
-	int error = 0;
+	int error;
 	unsigned int free_pages;
 
 	if (!pm_ops || !pm_ops->enter)
@@ -59,12 +60,9 @@ static int suspend_prepare(suspend_state_t state)
 
 	pm_prepare_console();
 
-	disable_nonboot_cpus();
-
-	if (num_online_cpus() != 1) {
-		error = -EPERM;
+	error = disable_nonboot_cpus();
+	if (error)
 		goto Enable_cpu;
-	}
 
 	if (freeze_processes()) {
 		error = -EAGAIN;
diff --git a/kernel/power/smp.c b/kernel/power/smp.c
deleted file mode 100644
index 5957312b2d68..000000000000
--- a/kernel/power/smp.c
+++ /dev/null
@@ -1,62 +0,0 @@
-/*
- * drivers/power/smp.c - Functions for stopping other CPUs.
- *
- * Copyright 2004 Pavel Machek <pavel@suse.cz>
- * Copyright (C) 2002-2003 Nigel Cunningham <ncunningham@clear.net.nz>
- *
- * This file is released under the GPLv2.
- */
-
-#undef DEBUG
-
-#include <linux/smp_lock.h>
-#include <linux/interrupt.h>
-#include <linux/suspend.h>
-#include <linux/module.h>
-#include <linux/cpu.h>
-#include <asm/atomic.h>
-#include <asm/tlbflush.h>
-
-/* This is protected by pm_sem semaphore */
-static cpumask_t frozen_cpus;
-
-void disable_nonboot_cpus(void)
-{
-	int cpu, error;
-
-	error = 0;
-	cpus_clear(frozen_cpus);
-	printk("Freezing cpus ...\n");
-	for_each_online_cpu(cpu) {
-		if (cpu == 0)
-			continue;
-		error = cpu_down(cpu);
-		if (!error) {
-			cpu_set(cpu, frozen_cpus);
-			printk("CPU%d is down\n", cpu);
-			continue;
-		}
-		printk("Error taking cpu %d down: %d\n", cpu, error);
-	}
-	BUG_ON(raw_smp_processor_id() != 0);
-	if (error)
-		panic("cpus not sleeping");
-}
-
-void enable_nonboot_cpus(void)
-{
-	int cpu, error;
-
-	printk("Thawing cpus ...\n");
-	for_each_cpu_mask(cpu, frozen_cpus) {
-		error = cpu_up(cpu);
-		if (!error) {
-			printk("CPU%d is up\n", cpu);
-			continue;
-		}
-		printk("Error taking cpu %d up: %d\n", cpu, error);
-		panic("Not enough cpus");
-	}
-	cpus_clear(frozen_cpus);
-}
-
diff --git a/kernel/power/user.c b/kernel/power/user.c
index 3f1539fbe48a..0ef5e4ba39e5 100644
--- a/kernel/power/user.c
+++ b/kernel/power/user.c
@@ -19,6 +19,7 @@
 #include <linux/swapops.h>
 #include <linux/pm.h>
 #include <linux/fs.h>
+#include <linux/cpu.h>
 
 #include <asm/uaccess.h>
 
@@ -139,12 +140,15 @@ static int snapshot_ioctl(struct inode *inode, struct file *filp,
 		if (data->frozen)
 			break;
 		down(&pm_sem);
-		disable_nonboot_cpus();
-		if (freeze_processes()) {
-			thaw_processes();
-			enable_nonboot_cpus();
-			error = -EBUSY;
+		error = disable_nonboot_cpus();
+		if (!error) {
+			error = freeze_processes();
+			if (error) {
+				thaw_processes();
+				error = -EBUSY;
+			}
 		}
+		enable_nonboot_cpus();
 		up(&pm_sem);
 		if (!error)
 			data->frozen = 1;
-- 
cgit v1.2.3


From f623f0db8e6aa86a37be86167e4ff478821a9f4f Mon Sep 17 00:00:00 2001
From: "Rafael J. Wysocki" <rjw@sisk.pl>
Date: Mon, 25 Sep 2006 23:32:49 -0700
Subject: [PATCH] swsusp: Fix mark_free_pages

Clean up mm/page_alloc.c#mark_free_pages() and make it avoid clearing
PageNosaveFree for PageNosave pages.  This allows us to get rid of an ugly
hack in kernel/power/snapshot.c#copy_data_pages().

Additionally, the page-copying loop in copy_data_pages() is moved to an
inline function.

Signed-off-by: Rafael J. Wysocki <rjw@sisk.pl>
Cc: Pavel Machek <pavel@ucw.cz>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
---
 kernel/power/snapshot.c | 27 +++++++++++++--------------
 mm/page_alloc.c         | 24 ++++++++++++++++--------
 2 files changed, 29 insertions(+), 22 deletions(-)

(limited to 'kernel')

diff --git a/kernel/power/snapshot.c b/kernel/power/snapshot.c
index 81fe8de9e604..4afb7900002b 100644
--- a/kernel/power/snapshot.c
+++ b/kernel/power/snapshot.c
@@ -208,37 +208,36 @@ unsigned int count_data_pages(void)
 	return n;
 }
 
+static inline void copy_data_page(long *dst, long *src)
+{
+	int n;
+
+	/* copy_page and memcpy are not usable for copying task structs. */
+	for (n = PAGE_SIZE / sizeof(long); n; n--)
+		*dst++ = *src++;
+}
+
 static void copy_data_pages(struct pbe *pblist)
 {
 	struct zone *zone;
 	unsigned long pfn, max_zone_pfn;
-	struct pbe *pbe, *p;
+	struct pbe *pbe;
 
 	pbe = pblist;
 	for_each_zone (zone) {
 		if (is_highmem(zone))
 			continue;
 		mark_free_pages(zone);
-		/* This is necessary for swsusp_free() */
-		for_each_pb_page (p, pblist)
-			SetPageNosaveFree(virt_to_page(p));
-		for_each_pbe (p, pblist)
-			SetPageNosaveFree(virt_to_page(p->address));
 		max_zone_pfn = zone->zone_start_pfn + zone->spanned_pages;
 		for (pfn = zone->zone_start_pfn; pfn < max_zone_pfn; pfn++) {
 			struct page *page = saveable_page(pfn);
 
 			if (page) {
-				long *src, *dst;
-				int n;
+				void *ptr = page_address(page);
 
 				BUG_ON(!pbe);
-				pbe->orig_address = (unsigned long)page_address(page);
-				/* copy_page and memcpy are not usable for copying task structs. */
-				dst = (long *)pbe->address;
-				src = (long *)pbe->orig_address;
-				for (n = PAGE_SIZE / sizeof(long); n; n--)
-					*dst++ = *src++;
+				copy_data_page((void *)pbe->address, ptr);
+				pbe->orig_address = (unsigned long)ptr;
 				pbe = pbe->next;
 			}
 		}
diff --git a/mm/page_alloc.c b/mm/page_alloc.c
index 51070b6d593f..9810f0a60db7 100644
--- a/mm/page_alloc.c
+++ b/mm/page_alloc.c
@@ -694,7 +694,8 @@ static void __drain_pages(unsigned int cpu)
 
 void mark_free_pages(struct zone *zone)
 {
-	unsigned long zone_pfn, flags;
+	unsigned long pfn, max_zone_pfn;
+	unsigned long flags;
 	int order;
 	struct list_head *curr;
 
@@ -702,18 +703,25 @@ void mark_free_pages(struct zone *zone)
 		return;
 
 	spin_lock_irqsave(&zone->lock, flags);
-	for (zone_pfn = 0; zone_pfn < zone->spanned_pages; ++zone_pfn)
-		ClearPageNosaveFree(pfn_to_page(zone_pfn + zone->zone_start_pfn));
+
+	max_zone_pfn = zone->zone_start_pfn + zone->spanned_pages;
+	for (pfn = zone->zone_start_pfn; pfn < max_zone_pfn; pfn++)
+		if (pfn_valid(pfn)) {
+			struct page *page = pfn_to_page(pfn);
+
+			if (!PageNosave(page))
+				ClearPageNosaveFree(page);
+		}
 
 	for (order = MAX_ORDER - 1; order >= 0; --order)
 		list_for_each(curr, &zone->free_area[order].free_list) {
-			unsigned long start_pfn, i;
+			unsigned long i;
 
-			start_pfn = page_to_pfn(list_entry(curr, struct page, lru));
+			pfn = page_to_pfn(list_entry(curr, struct page, lru));
+			for (i = 0; i < (1UL << order); i++)
+				SetPageNosaveFree(pfn_to_page(pfn + i));
+		}
 
-			for (i=0; i < (1<<order); i++)
-				SetPageNosaveFree(pfn_to_page(start_pfn+i));
-	}
 	spin_unlock_irqrestore(&zone->lock, flags);
 }
 
-- 
cgit v1.2.3


From f6143aa60ed71e58578bc92cc64d98158a694d99 Mon Sep 17 00:00:00 2001
From: "Rafael J. Wysocki" <rjw@sisk.pl>
Date: Mon, 25 Sep 2006 23:32:50 -0700
Subject: [PATCH] swsusp: Reorder memory-allocating functions

Move some functions in kernel/power/snapshot.c to a better place (in the
same file) and introduce free_image_page() (will be necessary in the
future).

Signed-off-by: Rafael J. Wysocki <rjw@sisk.pl>
Acked-by: Pavel Machek <pavel@suse.cz>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
---
 kernel/power/snapshot.c | 93 ++++++++++++++++++++++++++++---------------------
 1 file changed, 53 insertions(+), 40 deletions(-)

(limited to 'kernel')

diff --git a/kernel/power/snapshot.c b/kernel/power/snapshot.c
index 4afb7900002b..7ad0c0465524 100644
--- a/kernel/power/snapshot.c
+++ b/kernel/power/snapshot.c
@@ -156,6 +156,58 @@ static inline int save_highmem(void) {return 0;}
 static inline int restore_highmem(void) {return 0;}
 #endif
 
+/**
+ *	@safe_needed - on resume, for storing the PBE list and the image,
+ *	we can only use memory pages that do not conflict with the pages
+ *	used before suspend.
+ *
+ *	The unsafe pages are marked with the PG_nosave_free flag
+ *	and we count them using unsafe_pages
+ */
+
+static unsigned int unsafe_pages;
+
+static void *alloc_image_page(gfp_t gfp_mask, int safe_needed)
+{
+	void *res;
+
+	res = (void *)get_zeroed_page(gfp_mask);
+	if (safe_needed)
+		while (res && PageNosaveFree(virt_to_page(res))) {
+			/* The page is unsafe, mark it for swsusp_free() */
+			SetPageNosave(virt_to_page(res));
+			unsafe_pages++;
+			res = (void *)get_zeroed_page(gfp_mask);
+		}
+	if (res) {
+		SetPageNosave(virt_to_page(res));
+		SetPageNosaveFree(virt_to_page(res));
+	}
+	return res;
+}
+
+unsigned long get_safe_page(gfp_t gfp_mask)
+{
+	return (unsigned long)alloc_image_page(gfp_mask, 1);
+}
+
+/**
+ *	free_image_page - free page represented by @addr, allocated with
+ *	alloc_image_page (page flags set by it must be cleared)
+ */
+
+static inline void free_image_page(void *addr, int clear_nosave_free)
+{
+	ClearPageNosave(virt_to_page(addr));
+	if (clear_nosave_free)
+		ClearPageNosaveFree(virt_to_page(addr));
+	free_page((unsigned long)addr);
+}
+
+/**
+ *	pfn_is_nosave - check if given pfn is in the 'nosave' section
+ */
+
 static inline int pfn_is_nosave(unsigned long pfn)
 {
 	unsigned long nosave_begin_pfn = __pa(&__nosave_begin) >> PAGE_SHIFT;
@@ -245,7 +297,6 @@ static void copy_data_pages(struct pbe *pblist)
 	BUG_ON(pbe);
 }
 
-
 /**
  *	free_pagedir - free pages allocated with alloc_pagedir()
  */
@@ -256,10 +307,7 @@ static void free_pagedir(struct pbe *pblist, int clear_nosave_free)
 
 	while (pblist) {
 		pbe = (pblist + PB_PAGE_SKIP)->next;
-		ClearPageNosave(virt_to_page(pblist));
-		if (clear_nosave_free)
-			ClearPageNosaveFree(virt_to_page(pblist));
-		free_page((unsigned long)pblist);
+		free_image_page(pblist, clear_nosave_free);
 		pblist = pbe;
 	}
 }
@@ -303,41 +351,6 @@ static inline void create_pbe_list(struct pbe *pblist, unsigned int nr_pages)
 	}
 }
 
-static unsigned int unsafe_pages;
-
-/**
- *	@safe_needed - on resume, for storing the PBE list and the image,
- *	we can only use memory pages that do not conflict with the pages
- *	used before suspend.
- *
- *	The unsafe pages are marked with the PG_nosave_free flag
- *	and we count them using unsafe_pages
- */
-
-static void *alloc_image_page(gfp_t gfp_mask, int safe_needed)
-{
-	void *res;
-
-	res = (void *)get_zeroed_page(gfp_mask);
-	if (safe_needed)
-		while (res && PageNosaveFree(virt_to_page(res))) {
-			/* The page is unsafe, mark it for swsusp_free() */
-			SetPageNosave(virt_to_page(res));
-			unsafe_pages++;
-			res = (void *)get_zeroed_page(gfp_mask);
-		}
-	if (res) {
-		SetPageNosave(virt_to_page(res));
-		SetPageNosaveFree(virt_to_page(res));
-	}
-	return res;
-}
-
-unsigned long get_safe_page(gfp_t gfp_mask)
-{
-	return (unsigned long)alloc_image_page(gfp_mask, 1);
-}
-
 /**
  *	alloc_pagedir - Allocate the page directory.
  *
-- 
cgit v1.2.3


From cd560bb2f9e2cd451bb3942af43da19632ba4a8e Mon Sep 17 00:00:00 2001
From: "Rafael J. Wysocki" <rjw@sisk.pl>
Date: Mon, 25 Sep 2006 23:32:50 -0700
Subject: [PATCH] swsusp: Fix alloc_pagedir

Get rid of the FIXME in kernel/power/snapshot.c#alloc_pagedir() and
simplify the functions called by it.

Signed-off-by: Rafael J. Wysocki <rjw@sisk.pl>
Acked-by: Pavel Machek <pavel@suse.cz>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
---
 kernel/power/snapshot.c | 32 +++++++++++++++++---------------
 1 file changed, 17 insertions(+), 15 deletions(-)

(limited to 'kernel')

diff --git a/kernel/power/snapshot.c b/kernel/power/snapshot.c
index 7ad0c0465524..4ca372f2bc14 100644
--- a/kernel/power/snapshot.c
+++ b/kernel/power/snapshot.c
@@ -316,12 +316,12 @@ static void free_pagedir(struct pbe *pblist, int clear_nosave_free)
  *	fill_pb_page - Create a list of PBEs on a given memory page
  */
 
-static inline void fill_pb_page(struct pbe *pbpage)
+static inline void fill_pb_page(struct pbe *pbpage, unsigned int n)
 {
 	struct pbe *p;
 
 	p = pbpage;
-	pbpage += PB_PAGE_SKIP;
+	pbpage += n - 1;
 	do
 		p->next = p + 1;
 	while (++p < pbpage);
@@ -330,24 +330,26 @@ static inline void fill_pb_page(struct pbe *pbpage)
 /**
  *	create_pbe_list - Create a list of PBEs on top of a given chain
  *	of memory pages allocated with alloc_pagedir()
+ *
+ *	This function assumes that pages allocated by alloc_image_page() will
+ *	always be zeroed.
  */
 
 static inline void create_pbe_list(struct pbe *pblist, unsigned int nr_pages)
 {
-	struct pbe *pbpage, *p;
+	struct pbe *pbpage;
 	unsigned int num = PBES_PER_PAGE;
 
 	for_each_pb_page (pbpage, pblist) {
 		if (num >= nr_pages)
 			break;
 
-		fill_pb_page(pbpage);
+		fill_pb_page(pbpage, PBES_PER_PAGE);
 		num += PBES_PER_PAGE;
 	}
 	if (pbpage) {
-		for (num -= PBES_PER_PAGE - 1, p = pbpage; num < nr_pages; p++, num++)
-			p->next = p + 1;
-		p->next = NULL;
+		num -= PBES_PER_PAGE;
+		fill_pb_page(pbpage, nr_pages - num);
 	}
 }
 
@@ -374,17 +376,17 @@ static struct pbe *alloc_pagedir(unsigned int nr_pages, gfp_t gfp_mask,
 		return NULL;
 
 	pblist = alloc_image_page(gfp_mask, safe_needed);
-	/* FIXME: rewrite this ugly loop */
-	for (pbe = pblist, num = PBES_PER_PAGE; pbe && num < nr_pages;
-        		pbe = pbe->next, num += PBES_PER_PAGE) {
+	pbe = pblist;
+	for (num = PBES_PER_PAGE; num < nr_pages; num += PBES_PER_PAGE) {
+		if (!pbe) {
+			free_pagedir(pblist, 1);
+			return NULL;
+		}
 		pbe += PB_PAGE_SKIP;
 		pbe->next = alloc_image_page(gfp_mask, safe_needed);
+		pbe = pbe->next;
 	}
-	if (!pbe) { /* get_zeroed_page() failed */
-		free_pagedir(pblist, 1);
-		pblist = NULL;
-        } else
-		create_pbe_list(pblist, nr_pages);
+	create_pbe_list(pblist, nr_pages);
 	return pblist;
 }
 
-- 
cgit v1.2.3


From 75534b50cc658e951bcb213c2763c81e9f7b0b48 Mon Sep 17 00:00:00 2001
From: "Rafael J. Wysocki" <rjw@sisk.pl>
Date: Mon, 25 Sep 2006 23:32:52 -0700
Subject: [PATCH] Change the name of pagedir_nosave

The name of the pagedir_nosave variable does not make sense any more, so it
seems reasonable to change it to something more meaningful.

Signed-off-by: Rafael J. Wysocki <rjw@sisk.pl>
Acked-by: Pavel Machek <pavel@ucw.cz>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
---
 arch/i386/power/swsusp.S         |  2 +-
 arch/powerpc/kernel/swsusp_32.S  |  4 ++--
 arch/x86_64/kernel/suspend_asm.S |  2 +-
 kernel/power/power.h             |  2 --
 kernel/power/snapshot.c          | 26 ++++++++++++++------------
 5 files changed, 18 insertions(+), 18 deletions(-)

(limited to 'kernel')

diff --git a/arch/i386/power/swsusp.S b/arch/i386/power/swsusp.S
index c893b897217f..8a2b50a0aaad 100644
--- a/arch/i386/power/swsusp.S
+++ b/arch/i386/power/swsusp.S
@@ -32,7 +32,7 @@ ENTRY(swsusp_arch_resume)
 	movl	$swsusp_pg_dir-__PAGE_OFFSET, %ecx
 	movl	%ecx, %cr3
 
-	movl	pagedir_nosave, %edx
+	movl	restore_pblist, %edx
 	.p2align 4,,7
 
 copy_loop:
diff --git a/arch/powerpc/kernel/swsusp_32.S b/arch/powerpc/kernel/swsusp_32.S
index 7369f9a6ad25..69e8f86aa4f8 100644
--- a/arch/powerpc/kernel/swsusp_32.S
+++ b/arch/powerpc/kernel/swsusp_32.S
@@ -159,8 +159,8 @@ END_FTR_SECTION_IFSET(CPU_FTR_ALTIVEC)
 	isync
 
 	/* Load ptr the list of pages to copy in r3 */
-	lis	r11,(pagedir_nosave - KERNELBASE)@h
-	ori	r11,r11,pagedir_nosave@l
+	lis	r11,(restore_pblist - KERNELBASE)@h
+	ori	r11,r11,restore_pblist@l
 	lwz	r10,0(r11)
 
 	/* Copy the pages. This is a very basic implementation, to
diff --git a/arch/x86_64/kernel/suspend_asm.S b/arch/x86_64/kernel/suspend_asm.S
index 320b6fb00cca..bfbe00763c68 100644
--- a/arch/x86_64/kernel/suspend_asm.S
+++ b/arch/x86_64/kernel/suspend_asm.S
@@ -54,7 +54,7 @@ ENTRY(restore_image)
 	movq	%rcx, %cr3;
 	movq	%rax, %cr4;  # turn PGE back on
 
-	movq	pagedir_nosave(%rip), %rdx
+	movq	restore_pblist(%rip), %rdx
 loop:
 	testq	%rdx, %rdx
 	jz	done
diff --git a/kernel/power/power.h b/kernel/power/power.h
index 1cefcf87a694..e18ba207e784 100644
--- a/kernel/power/power.h
+++ b/kernel/power/power.h
@@ -38,8 +38,6 @@ extern struct subsystem power_subsys;
 /* References to section boundaries */
 extern const void __nosave_begin, __nosave_end;
 
-extern struct pbe *pagedir_nosave;
-
 /* Preferred image size in bytes (default 500 MB) */
 extern unsigned long image_size;
 extern int in_suspend;
diff --git a/kernel/power/snapshot.c b/kernel/power/snapshot.c
index 4ca372f2bc14..1d276b3ae152 100644
--- a/kernel/power/snapshot.c
+++ b/kernel/power/snapshot.c
@@ -34,7 +34,9 @@
 
 #include "power.h"
 
-struct pbe *pagedir_nosave;
+/* List of PBEs used for creating and restoring the suspend image */
+struct pbe *restore_pblist;
+
 static unsigned int nr_copy_pages;
 static unsigned int nr_meta_pages;
 static unsigned long *buffer;
@@ -415,7 +417,7 @@ void swsusp_free(void)
 	}
 	nr_copy_pages = 0;
 	nr_meta_pages = 0;
-	pagedir_nosave = NULL;
+	restore_pblist = NULL;
 	buffer = NULL;
 }
 
@@ -490,15 +492,15 @@ asmlinkage int swsusp_save(void)
 		return -ENOMEM;
 	}
 
-	pagedir_nosave = swsusp_alloc(nr_pages);
-	if (!pagedir_nosave)
+	restore_pblist = swsusp_alloc(nr_pages);
+	if (!restore_pblist)
 		return -ENOMEM;
 
 	/* During allocating of suspend pagedir, new cold pages may appear.
 	 * Kill them.
 	 */
 	drain_local_pages();
-	copy_data_pages(pagedir_nosave);
+	copy_data_pages(restore_pblist);
 
 	/*
 	 * End of critical section. From now on, we can write to memory,
@@ -580,13 +582,13 @@ int snapshot_read_next(struct snapshot_handle *handle, size_t count)
 	if (!handle->offset) {
 		init_header((struct swsusp_info *)buffer);
 		handle->buffer = buffer;
-		handle->pbe = pagedir_nosave;
+		handle->pbe = restore_pblist;
 	}
 	if (handle->prev < handle->cur) {
 		if (handle->cur <= nr_meta_pages) {
 			handle->pbe = pack_orig_addresses(buffer, handle->pbe);
 			if (!handle->pbe)
-				handle->pbe = pagedir_nosave;
+				handle->pbe = restore_pblist;
 		} else {
 			handle->buffer = (void *)handle->pbe->address;
 			handle->pbe = handle->pbe->next;
@@ -689,7 +691,7 @@ static int load_header(struct snapshot_handle *handle,
 		pblist = alloc_pagedir(info->image_pages, GFP_ATOMIC, 0);
 		if (!pblist)
 			return -ENOMEM;
-		pagedir_nosave = pblist;
+		restore_pblist = pblist;
 		handle->pbe = pblist;
 		nr_copy_pages = info->image_pages;
 		nr_meta_pages = info->pages - info->image_pages - 1;
@@ -716,7 +718,7 @@ static inline struct pbe *unpack_orig_addresses(unsigned long *buf,
 
 /**
  *	prepare_image - use metadata contained in the PBE list
- *	pointed to by pagedir_nosave to mark the pages that will
+ *	pointed to by restore_pblist to mark the pages that will
  *	be overwritten in the process of restoring the system
  *	memory state from the image ("unsafe" pages) and allocate
  *	memory for the image
@@ -741,7 +743,7 @@ static int prepare_image(struct snapshot_handle *handle)
 	unsigned int nr_pages = nr_copy_pages;
 	struct pbe *p, *pblist = NULL;
 
-	p = pagedir_nosave;
+	p = restore_pblist;
 	error = mark_unsafe_pages(p);
 	if (!error) {
 		pblist = alloc_pagedir(nr_pages, GFP_ATOMIC, 1);
@@ -773,7 +775,7 @@ static int prepare_image(struct snapshot_handle *handle)
 		}
 	}
 	if (!error) {
-		pagedir_nosave = pblist;
+		restore_pblist = pblist;
 	} else {
 		handle->pbe = NULL;
 		swsusp_free();
@@ -858,7 +860,7 @@ int snapshot_write_next(struct snapshot_handle *handle, size_t count)
 				error = prepare_image(handle);
 				if (error)
 					return error;
-				handle->pbe = pagedir_nosave;
+				handle->pbe = restore_pblist;
 				handle->last_pbe = NULL;
 				handle->buffer = get_buffer(handle);
 				handle->sync_read = 0;
-- 
cgit v1.2.3


From 0bcd888d64684f896ffa70c1d16a42b00753c184 Mon Sep 17 00:00:00 2001
From: "Rafael J. Wysocki" <rjw@sisk.pl>
Date: Mon, 25 Sep 2006 23:32:52 -0700
Subject: [PATCH] swsusp: Introduce some helpful constants

Introduce some constants that hopefully will help improve the readability of
code in kernel/power/snapshot.c.

Signed-off-by: Rafael J. Wysocki <rjw@sisk.pl>
Acked-by: Pavel Machek <pavel@ucw.cz>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
---
 kernel/power/snapshot.c | 24 +++++++++++++++---------
 1 file changed, 15 insertions(+), 9 deletions(-)

(limited to 'kernel')

diff --git a/kernel/power/snapshot.c b/kernel/power/snapshot.c
index 1d276b3ae152..d0d691f976d8 100644
--- a/kernel/power/snapshot.c
+++ b/kernel/power/snapshot.c
@@ -167,6 +167,11 @@ static inline int restore_highmem(void) {return 0;}
  *	and we count them using unsafe_pages
  */
 
+#define PG_ANY		0
+#define PG_SAFE		1
+#define PG_UNSAFE_CLEAR	1
+#define PG_UNSAFE_KEEP	0
+
 static unsigned int unsafe_pages;
 
 static void *alloc_image_page(gfp_t gfp_mask, int safe_needed)
@@ -190,7 +195,7 @@ static void *alloc_image_page(gfp_t gfp_mask, int safe_needed)
 
 unsigned long get_safe_page(gfp_t gfp_mask)
 {
-	return (unsigned long)alloc_image_page(gfp_mask, 1);
+	return (unsigned long)alloc_image_page(gfp_mask, PG_SAFE);
 }
 
 /**
@@ -381,7 +386,7 @@ static struct pbe *alloc_pagedir(unsigned int nr_pages, gfp_t gfp_mask,
 	pbe = pblist;
 	for (num = PBES_PER_PAGE; num < nr_pages; num += PBES_PER_PAGE) {
 		if (!pbe) {
-			free_pagedir(pblist, 1);
+			free_pagedir(pblist, PG_UNSAFE_CLEAR);
 			return NULL;
 		}
 		pbe += PB_PAGE_SKIP;
@@ -458,12 +463,13 @@ static struct pbe *swsusp_alloc(unsigned int nr_pages)
 {
 	struct pbe *pblist;
 
-	if (!(pblist = alloc_pagedir(nr_pages, GFP_ATOMIC | __GFP_COLD, 0))) {
+	pblist = alloc_pagedir(nr_pages, GFP_ATOMIC | __GFP_COLD, PG_ANY);
+	if (!pblist) {
 		printk(KERN_ERR "suspend: Allocating pagedir failed.\n");
 		return NULL;
 	}
 
-	if (alloc_data_pages(pblist, GFP_ATOMIC | __GFP_COLD, 0)) {
+	if (alloc_data_pages(pblist, GFP_ATOMIC | __GFP_COLD, PG_ANY)) {
 		printk(KERN_ERR "suspend: Allocating image pages failed.\n");
 		swsusp_free();
 		return NULL;
@@ -575,7 +581,7 @@ int snapshot_read_next(struct snapshot_handle *handle, size_t count)
 		return 0;
 	if (!buffer) {
 		/* This makes the buffer be freed by swsusp_free() */
-		buffer = alloc_image_page(GFP_ATOMIC, 0);
+		buffer = alloc_image_page(GFP_ATOMIC, PG_ANY);
 		if (!buffer)
 			return -ENOMEM;
 	}
@@ -688,7 +694,7 @@ static int load_header(struct snapshot_handle *handle,
 
 	error = check_header(info);
 	if (!error) {
-		pblist = alloc_pagedir(info->image_pages, GFP_ATOMIC, 0);
+		pblist = alloc_pagedir(info->image_pages, GFP_ATOMIC, PG_ANY);
 		if (!pblist)
 			return -ENOMEM;
 		restore_pblist = pblist;
@@ -746,10 +752,10 @@ static int prepare_image(struct snapshot_handle *handle)
 	p = restore_pblist;
 	error = mark_unsafe_pages(p);
 	if (!error) {
-		pblist = alloc_pagedir(nr_pages, GFP_ATOMIC, 1);
+		pblist = alloc_pagedir(nr_pages, GFP_ATOMIC, PG_SAFE);
 		if (pblist)
 			copy_page_backup_list(pblist, p);
-		free_pagedir(p, 0);
+		free_pagedir(p, PG_UNSAFE_KEEP);
 		if (!pblist)
 			error = -ENOMEM;
 	}
@@ -840,7 +846,7 @@ int snapshot_write_next(struct snapshot_handle *handle, size_t count)
 		return 0;
 	if (!buffer) {
 		/* This makes the buffer be freed by swsusp_free() */
-		buffer = alloc_image_page(GFP_ATOMIC, 0);
+		buffer = alloc_image_page(GFP_ATOMIC, PG_ANY);
 		if (!buffer)
 			return -ENOMEM;
 	}
-- 
cgit v1.2.3


From b788db79896ef2a5817b9395ad63573b254a6d93 Mon Sep 17 00:00:00 2001
From: "Rafael J. Wysocki" <rjw@sisk.pl>
Date: Mon, 25 Sep 2006 23:32:54 -0700
Subject: [PATCH] swsusp: Introduce memory bitmaps

Introduce the memory bitmap data structure and make swsusp use in the suspend
phase.

The current swsusp's internal data structure is not very efficient from the
memory usage point of view, so it seems reasonable to replace it with a data
structure that will require less memory, such as a pair of bitmaps.

The idea is to use bitmaps that may be allocated as sets of individual pages,
so that we can avoid making allocations of order greater than 0.  For this
reason the memory bitmap structure consists of several linked lists of objects
that contain pointers to memory pages with the actual bitmap data.  Still, for
a typical system all of these lists fit in a single page, so it's reasonable
to introduce an additional mechanism allowing us to allocate all of them
efficiently without sacrificing the generality of the design.  This is done
with the help of the chain_allocator structure and associated functions.

We need to use two memory bitmaps during the suspend phase of the
suspend-resume cycle.  One of them is necessary for marking the saveable
pages, and the second is used to mark the pages in which to store the copies
of them (aka image pages).

First, the bitmaps are created and we allocate as many image pages as needed
(the corresponding bits in the second bitmap are set as soon as the pages are
allocated).  Second, the bits corresponding to the saveable pages are set in
the first bitmap and the saveable pages are copied to the image pages.
Finally, the first bitmap is used to save the kernel virtual addresses of the
saveable pages and the second one is used to save the contents of the image
pages.

Signed-off-by: Rafael J. Wysocki <rjw@sisk.pl>
Acked-by: Pavel Machek <pavel@ucw.cz>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
---
 kernel/power/power.h    |   3 +-
 kernel/power/snapshot.c | 602 ++++++++++++++++++++++++++++++++++++++++++------
 kernel/power/swsusp.c   |   5 +-
 3 files changed, 540 insertions(+), 70 deletions(-)

(limited to 'kernel')

diff --git a/kernel/power/power.h b/kernel/power/power.h
index e18ba207e784..6e9e2acc34f8 100644
--- a/kernel/power/power.h
+++ b/kernel/power/power.h
@@ -109,9 +109,10 @@ struct snapshot_handle {
  */
 #define data_of(handle)	((handle).buffer + (handle).buf_offset)
 
+extern unsigned int snapshot_additional_pages(struct zone *zone);
 extern int snapshot_read_next(struct snapshot_handle *handle, size_t count);
 extern int snapshot_write_next(struct snapshot_handle *handle, size_t count);
-int snapshot_image_loaded(struct snapshot_handle *handle);
+extern int snapshot_image_loaded(struct snapshot_handle *handle);
 
 #define SNAPSHOT_IOC_MAGIC	'3'
 #define SNAPSHOT_FREEZE			_IO(SNAPSHOT_IOC_MAGIC, 1)
diff --git a/kernel/power/snapshot.c b/kernel/power/snapshot.c
index d0d691f976d8..852e0df41719 100644
--- a/kernel/power/snapshot.c
+++ b/kernel/power/snapshot.c
@@ -211,6 +211,467 @@ static inline void free_image_page(void *addr, int clear_nosave_free)
 	free_page((unsigned long)addr);
 }
 
+/* struct linked_page is used to build chains of pages */
+
+#define LINKED_PAGE_DATA_SIZE	(PAGE_SIZE - sizeof(void *))
+
+struct linked_page {
+	struct linked_page *next;
+	char data[LINKED_PAGE_DATA_SIZE];
+} __attribute__((packed));
+
+static inline void
+free_list_of_pages(struct linked_page *list, int clear_page_nosave)
+{
+	while (list) {
+		struct linked_page *lp = list->next;
+
+		free_image_page(list, clear_page_nosave);
+		list = lp;
+	}
+}
+
+/**
+  *	struct chain_allocator is used for allocating small objects out of
+  *	a linked list of pages called 'the chain'.
+  *
+  *	The chain grows each time when there is no room for a new object in
+  *	the current page.  The allocated objects cannot be freed individually.
+  *	It is only possible to free them all at once, by freeing the entire
+  *	chain.
+  *
+  *	NOTE: The chain allocator may be inefficient if the allocated objects
+  *	are not much smaller than PAGE_SIZE.
+  */
+
+struct chain_allocator {
+	struct linked_page *chain;	/* the chain */
+	unsigned int used_space;	/* total size of objects allocated out
+					 * of the current page
+					 */
+	gfp_t gfp_mask;		/* mask for allocating pages */
+	int safe_needed;	/* if set, only "safe" pages are allocated */
+};
+
+static void
+chain_init(struct chain_allocator *ca, gfp_t gfp_mask, int safe_needed)
+{
+	ca->chain = NULL;
+	ca->used_space = LINKED_PAGE_DATA_SIZE;
+	ca->gfp_mask = gfp_mask;
+	ca->safe_needed = safe_needed;
+}
+
+static void *chain_alloc(struct chain_allocator *ca, unsigned int size)
+{
+	void *ret;
+
+	if (LINKED_PAGE_DATA_SIZE - ca->used_space < size) {
+		struct linked_page *lp;
+
+		lp = alloc_image_page(ca->gfp_mask, ca->safe_needed);
+		if (!lp)
+			return NULL;
+
+		lp->next = ca->chain;
+		ca->chain = lp;
+		ca->used_space = 0;
+	}
+	ret = ca->chain->data + ca->used_space;
+	ca->used_space += size;
+	return ret;
+}
+
+static void chain_free(struct chain_allocator *ca, int clear_page_nosave)
+{
+	free_list_of_pages(ca->chain, clear_page_nosave);
+	memset(ca, 0, sizeof(struct chain_allocator));
+}
+
+/**
+ *	Data types related to memory bitmaps.
+ *
+ *	Memory bitmap is a structure consiting of many linked lists of
+ *	objects.  The main list's elements are of type struct zone_bitmap
+ *	and each of them corresonds to one zone.  For each zone bitmap
+ *	object there is a list of objects of type struct bm_block that
+ *	represent each blocks of bit chunks in which information is
+ *	stored.
+ *
+ *	struct memory_bitmap contains a pointer to the main list of zone
+ *	bitmap objects, a struct bm_position used for browsing the bitmap,
+ *	and a pointer to the list of pages used for allocating all of the
+ *	zone bitmap objects and bitmap block objects.
+ *
+ *	NOTE: It has to be possible to lay out the bitmap in memory
+ *	using only allocations of order 0.  Additionally, the bitmap is
+ *	designed to work with arbitrary number of zones (this is over the
+ *	top for now, but let's avoid making unnecessary assumptions ;-).
+ *
+ *	struct zone_bitmap contains a pointer to a list of bitmap block
+ *	objects and a pointer to the bitmap block object that has been
+ *	most recently used for setting bits.  Additionally, it contains the
+ *	pfns that correspond to the start and end of the represented zone.
+ *
+ *	struct bm_block contains a pointer to the memory page in which
+ *	information is stored (in the form of a block of bit chunks
+ *	of type unsigned long each).  It also contains the pfns that
+ *	correspond to the start and end of the represented memory area and
+ *	the number of bit chunks in the block.
+ *
+ *	NOTE: Memory bitmaps are used for two types of operations only:
+ *	"set a bit" and "find the next bit set".  Moreover, the searching
+ *	is always carried out after all of the "set a bit" operations
+ *	on given bitmap.
+ */
+
+#define BM_END_OF_MAP	(~0UL)
+
+#define BM_CHUNKS_PER_BLOCK	(PAGE_SIZE / sizeof(long))
+#define BM_BITS_PER_CHUNK	(sizeof(long) << 3)
+#define BM_BITS_PER_BLOCK	(PAGE_SIZE << 3)
+
+struct bm_block {
+	struct bm_block *next;		/* next element of the list */
+	unsigned long start_pfn;	/* pfn represented by the first bit */
+	unsigned long end_pfn;	/* pfn represented by the last bit plus 1 */
+	unsigned int size;	/* number of bit chunks */
+	unsigned long *data;	/* chunks of bits representing pages */
+};
+
+struct zone_bitmap {
+	struct zone_bitmap *next;	/* next element of the list */
+	unsigned long start_pfn;	/* minimal pfn in this zone */
+	unsigned long end_pfn;		/* maximal pfn in this zone plus 1 */
+	struct bm_block *bm_blocks;	/* list of bitmap blocks */
+	struct bm_block *cur_block;	/* recently used bitmap block */
+};
+
+/* strcut bm_position is used for browsing memory bitmaps */
+
+struct bm_position {
+	struct zone_bitmap *zone_bm;
+	struct bm_block *block;
+	int chunk;
+	int bit;
+};
+
+struct memory_bitmap {
+	struct zone_bitmap *zone_bm_list;	/* list of zone bitmaps */
+	struct linked_page *p_list;	/* list of pages used to store zone
+					 * bitmap objects and bitmap block
+					 * objects
+					 */
+	struct bm_position cur;	/* most recently used bit position */
+};
+
+/* Functions that operate on memory bitmaps */
+
+static inline void memory_bm_reset_chunk(struct memory_bitmap *bm)
+{
+	bm->cur.chunk = 0;
+	bm->cur.bit = -1;
+}
+
+static void memory_bm_position_reset(struct memory_bitmap *bm)
+{
+	struct zone_bitmap *zone_bm;
+
+	zone_bm = bm->zone_bm_list;
+	bm->cur.zone_bm = zone_bm;
+	bm->cur.block = zone_bm->bm_blocks;
+	memory_bm_reset_chunk(bm);
+}
+
+static void memory_bm_free(struct memory_bitmap *bm, int clear_nosave_free);
+
+/**
+ *	create_bm_block_list - create a list of block bitmap objects
+ */
+
+static inline struct bm_block *
+create_bm_block_list(unsigned int nr_blocks, struct chain_allocator *ca)
+{
+	struct bm_block *bblist = NULL;
+
+	while (nr_blocks-- > 0) {
+		struct bm_block *bb;
+
+		bb = chain_alloc(ca, sizeof(struct bm_block));
+		if (!bb)
+			return NULL;
+
+		bb->next = bblist;
+		bblist = bb;
+	}
+	return bblist;
+}
+
+/**
+ *	create_zone_bm_list - create a list of zone bitmap objects
+ */
+
+static inline struct zone_bitmap *
+create_zone_bm_list(unsigned int nr_zones, struct chain_allocator *ca)
+{
+	struct zone_bitmap *zbmlist = NULL;
+
+	while (nr_zones-- > 0) {
+		struct zone_bitmap *zbm;
+
+		zbm = chain_alloc(ca, sizeof(struct zone_bitmap));
+		if (!zbm)
+			return NULL;
+
+		zbm->next = zbmlist;
+		zbmlist = zbm;
+	}
+	return zbmlist;
+}
+
+/**
+  *	memory_bm_create - allocate memory for a memory bitmap
+  */
+
+static int
+memory_bm_create(struct memory_bitmap *bm, gfp_t gfp_mask, int safe_needed)
+{
+	struct chain_allocator ca;
+	struct zone *zone;
+	struct zone_bitmap *zone_bm;
+	struct bm_block *bb;
+	unsigned int nr;
+
+	chain_init(&ca, gfp_mask, safe_needed);
+
+	/* Compute the number of zones */
+	nr = 0;
+	for_each_zone (zone)
+		if (populated_zone(zone) && !is_highmem(zone))
+			nr++;
+
+	/* Allocate the list of zones bitmap objects */
+	zone_bm = create_zone_bm_list(nr, &ca);
+	bm->zone_bm_list = zone_bm;
+	if (!zone_bm) {
+		chain_free(&ca, PG_UNSAFE_CLEAR);
+		return -ENOMEM;
+	}
+
+	/* Initialize the zone bitmap objects */
+	for_each_zone (zone) {
+		unsigned long pfn;
+
+		if (!populated_zone(zone) || is_highmem(zone))
+			continue;
+
+		zone_bm->start_pfn = zone->zone_start_pfn;
+		zone_bm->end_pfn = zone->zone_start_pfn + zone->spanned_pages;
+		/* Allocate the list of bitmap block objects */
+		nr = DIV_ROUND_UP(zone->spanned_pages, BM_BITS_PER_BLOCK);
+		bb = create_bm_block_list(nr, &ca);
+		zone_bm->bm_blocks = bb;
+		zone_bm->cur_block = bb;
+		if (!bb)
+			goto Free;
+
+		nr = zone->spanned_pages;
+		pfn = zone->zone_start_pfn;
+		/* Initialize the bitmap block objects */
+		while (bb) {
+			unsigned long *ptr;
+
+			ptr = alloc_image_page(gfp_mask, safe_needed);
+			bb->data = ptr;
+			if (!ptr)
+				goto Free;
+
+			bb->start_pfn = pfn;
+			if (nr >= BM_BITS_PER_BLOCK) {
+				pfn += BM_BITS_PER_BLOCK;
+				bb->size = BM_CHUNKS_PER_BLOCK;
+				nr -= BM_BITS_PER_BLOCK;
+			} else {
+				/* This is executed only once in the loop */
+				pfn += nr;
+				bb->size = DIV_ROUND_UP(nr, BM_BITS_PER_CHUNK);
+			}
+			bb->end_pfn = pfn;
+			bb = bb->next;
+		}
+		zone_bm = zone_bm->next;
+	}
+	bm->p_list = ca.chain;
+	memory_bm_position_reset(bm);
+	return 0;
+
+Free:
+	bm->p_list = ca.chain;
+	memory_bm_free(bm, PG_UNSAFE_CLEAR);
+	return -ENOMEM;
+}
+
+/**
+  *	memory_bm_free - free memory occupied by the memory bitmap @bm
+  */
+
+static void memory_bm_free(struct memory_bitmap *bm, int clear_nosave_free)
+{
+	struct zone_bitmap *zone_bm;
+
+	/* Free the list of bit blocks for each zone_bitmap object */
+	zone_bm = bm->zone_bm_list;
+	while (zone_bm) {
+		struct bm_block *bb;
+
+		bb = zone_bm->bm_blocks;
+		while (bb) {
+			if (bb->data)
+				free_image_page(bb->data, clear_nosave_free);
+			bb = bb->next;
+		}
+		zone_bm = zone_bm->next;
+	}
+	free_list_of_pages(bm->p_list, clear_nosave_free);
+	bm->zone_bm_list = NULL;
+}
+
+/**
+ *	memory_bm_set_bit - set the bit in the bitmap @bm that corresponds
+ *	to given pfn.  The cur_zone_bm member of @bm and the cur_block member
+ *	of @bm->cur_zone_bm are updated.
+ *
+ *	If the bit cannot be set, the function returns -EINVAL .
+ */
+
+static int
+memory_bm_set_bit(struct memory_bitmap *bm, unsigned long pfn)
+{
+	struct zone_bitmap *zone_bm;
+	struct bm_block *bb;
+
+	/* Check if the pfn is from the current zone */
+	zone_bm = bm->cur.zone_bm;
+	if (pfn < zone_bm->start_pfn || pfn >= zone_bm->end_pfn) {
+		zone_bm = bm->zone_bm_list;
+		/* We don't assume that the zones are sorted by pfns */
+		while (pfn < zone_bm->start_pfn || pfn >= zone_bm->end_pfn) {
+			zone_bm = zone_bm->next;
+			if (unlikely(!zone_bm))
+				return -EINVAL;
+		}
+		bm->cur.zone_bm = zone_bm;
+	}
+	/* Check if the pfn corresponds to the current bitmap block */
+	bb = zone_bm->cur_block;
+	if (pfn < bb->start_pfn)
+		bb = zone_bm->bm_blocks;
+
+	while (pfn >= bb->end_pfn) {
+		bb = bb->next;
+		if (unlikely(!bb))
+			return -EINVAL;
+	}
+	zone_bm->cur_block = bb;
+	pfn -= bb->start_pfn;
+	set_bit(pfn % BM_BITS_PER_CHUNK, bb->data + pfn / BM_BITS_PER_CHUNK);
+	return 0;
+}
+
+/* Two auxiliary functions for memory_bm_next_pfn */
+
+/* Find the first set bit in the given chunk, if there is one */
+
+static inline int next_bit_in_chunk(int bit, unsigned long *chunk_p)
+{
+	bit++;
+	while (bit < BM_BITS_PER_CHUNK) {
+		if (test_bit(bit, chunk_p))
+			return bit;
+
+		bit++;
+	}
+	return -1;
+}
+
+/* Find a chunk containing some bits set in given block of bits */
+
+static inline int next_chunk_in_block(int n, struct bm_block *bb)
+{
+	n++;
+	while (n < bb->size) {
+		if (bb->data[n])
+			return n;
+
+		n++;
+	}
+	return -1;
+}
+
+/**
+ *	memory_bm_next_pfn - find the pfn that corresponds to the next set bit
+ *	in the bitmap @bm.  If the pfn cannot be found, BM_END_OF_MAP is
+ *	returned.
+ *
+ *	It is required to run memory_bm_position_reset() before the first call to
+ *	this function.
+ */
+
+static unsigned long memory_bm_next_pfn(struct memory_bitmap *bm)
+{
+	struct zone_bitmap *zone_bm;
+	struct bm_block *bb;
+	int chunk;
+	int bit;
+
+	do {
+		bb = bm->cur.block;
+		do {
+			chunk = bm->cur.chunk;
+			bit = bm->cur.bit;
+			do {
+				bit = next_bit_in_chunk(bit, bb->data + chunk);
+				if (bit >= 0)
+					goto Return_pfn;
+
+				chunk = next_chunk_in_block(chunk, bb);
+				bit = -1;
+			} while (chunk >= 0);
+			bb = bb->next;
+			bm->cur.block = bb;
+			memory_bm_reset_chunk(bm);
+		} while (bb);
+		zone_bm = bm->cur.zone_bm->next;
+		if (zone_bm) {
+			bm->cur.zone_bm = zone_bm;
+			bm->cur.block = zone_bm->bm_blocks;
+			memory_bm_reset_chunk(bm);
+		}
+	} while (zone_bm);
+	memory_bm_position_reset(bm);
+	return BM_END_OF_MAP;
+
+Return_pfn:
+	bm->cur.chunk = chunk;
+	bm->cur.bit = bit;
+	return bb->start_pfn + chunk * BM_BITS_PER_CHUNK + bit;
+}
+
+/**
+ *	snapshot_additional_pages - estimate the number of additional pages
+ *	be needed for setting up the suspend image data structures for given
+ *	zone (usually the returned value is greater than the exact number)
+ */
+
+unsigned int snapshot_additional_pages(struct zone *zone)
+{
+	unsigned int res;
+
+	res = DIV_ROUND_UP(zone->spanned_pages, BM_BITS_PER_BLOCK);
+	res += DIV_ROUND_UP(res * sizeof(struct bm_block), PAGE_SIZE);
+	return res;
+}
+
 /**
  *	pfn_is_nosave - check if given pfn is in the 'nosave' section
  */
@@ -276,32 +737,38 @@ static inline void copy_data_page(long *dst, long *src)
 		*dst++ = *src++;
 }
 
-static void copy_data_pages(struct pbe *pblist)
+static void
+copy_data_pages(struct memory_bitmap *copy_bm, struct memory_bitmap *orig_bm)
 {
 	struct zone *zone;
-	unsigned long pfn, max_zone_pfn;
-	struct pbe *pbe;
+	unsigned long pfn;
 
-	pbe = pblist;
 	for_each_zone (zone) {
+		unsigned long max_zone_pfn;
+
 		if (is_highmem(zone))
 			continue;
+
 		mark_free_pages(zone);
 		max_zone_pfn = zone->zone_start_pfn + zone->spanned_pages;
-		for (pfn = zone->zone_start_pfn; pfn < max_zone_pfn; pfn++) {
-			struct page *page = saveable_page(pfn);
-
-			if (page) {
-				void *ptr = page_address(page);
-
-				BUG_ON(!pbe);
-				copy_data_page((void *)pbe->address, ptr);
-				pbe->orig_address = (unsigned long)ptr;
-				pbe = pbe->next;
-			}
-		}
+		for (pfn = zone->zone_start_pfn; pfn < max_zone_pfn; pfn++)
+			if (saveable_page(pfn))
+				memory_bm_set_bit(orig_bm, pfn);
 	}
-	BUG_ON(pbe);
+	memory_bm_position_reset(orig_bm);
+	memory_bm_position_reset(copy_bm);
+	do {
+		pfn = memory_bm_next_pfn(orig_bm);
+		if (likely(pfn != BM_END_OF_MAP)) {
+			struct page *page;
+			void *src;
+
+			page = pfn_to_page(pfn);
+			src = page_address(page);
+			page = pfn_to_page(memory_bm_next_pfn(copy_bm));
+			copy_data_page(page_address(page), src);
+		}
+	} while (pfn != BM_END_OF_MAP);
 }
 
 /**
@@ -447,37 +914,43 @@ static int enough_free_mem(unsigned int nr_pages)
 		(nr_pages + PBES_PER_PAGE - 1) / PBES_PER_PAGE);
 }
 
-static int alloc_data_pages(struct pbe *pblist, gfp_t gfp_mask, int safe_needed)
+static int
+swsusp_alloc(struct memory_bitmap *orig_bm, struct memory_bitmap *copy_bm,
+		unsigned int nr_pages)
 {
-	struct pbe *p;
+	int error;
 
-	for_each_pbe (p, pblist) {
-		p->address = (unsigned long)alloc_image_page(gfp_mask, safe_needed);
-		if (!p->address)
-			return -ENOMEM;
-	}
-	return 0;
-}
+	error = memory_bm_create(orig_bm, GFP_ATOMIC | __GFP_COLD, PG_ANY);
+	if (error)
+		goto Free;
 
-static struct pbe *swsusp_alloc(unsigned int nr_pages)
-{
-	struct pbe *pblist;
+	error = memory_bm_create(copy_bm, GFP_ATOMIC | __GFP_COLD, PG_ANY);
+	if (error)
+		goto Free;
 
-	pblist = alloc_pagedir(nr_pages, GFP_ATOMIC | __GFP_COLD, PG_ANY);
-	if (!pblist) {
-		printk(KERN_ERR "suspend: Allocating pagedir failed.\n");
-		return NULL;
-	}
+	while (nr_pages-- > 0) {
+		struct page *page = alloc_page(GFP_ATOMIC | __GFP_COLD);
+		if (!page)
+			goto Free;
 
-	if (alloc_data_pages(pblist, GFP_ATOMIC | __GFP_COLD, PG_ANY)) {
-		printk(KERN_ERR "suspend: Allocating image pages failed.\n");
-		swsusp_free();
-		return NULL;
+		SetPageNosave(page);
+		SetPageNosaveFree(page);
+		memory_bm_set_bit(copy_bm, page_to_pfn(page));
 	}
+	return 0;
 
-	return pblist;
+Free:
+	swsusp_free();
+	return -ENOMEM;
 }
 
+/* Memory bitmap used for marking saveable pages */
+static struct memory_bitmap orig_bm;
+/* Memory bitmap used for marking allocated pages that will contain the copies
+ * of saveable pages
+ */
+static struct memory_bitmap copy_bm;
+
 asmlinkage int swsusp_save(void)
 {
 	unsigned int nr_pages;
@@ -498,15 +971,14 @@ asmlinkage int swsusp_save(void)
 		return -ENOMEM;
 	}
 
-	restore_pblist = swsusp_alloc(nr_pages);
-	if (!restore_pblist)
+	if (swsusp_alloc(&orig_bm, &copy_bm, nr_pages))
 		return -ENOMEM;
 
 	/* During allocating of suspend pagedir, new cold pages may appear.
 	 * Kill them.
 	 */
 	drain_local_pages();
-	copy_data_pages(restore_pblist);
+	copy_data_pages(&copy_bm, &orig_bm);
 
 	/*
 	 * End of critical section. From now on, we can write to memory,
@@ -535,22 +1007,23 @@ static void init_header(struct swsusp_info *info)
 }
 
 /**
- *	pack_orig_addresses - the .orig_address fields of the PBEs from the
- *	list starting at @pbe are stored in the array @buf[] (1 page)
+ *	pack_addresses - the addresses corresponding to pfns found in the
+ *	bitmap @bm are stored in the array @buf[] (1 page)
  */
 
-static inline struct pbe *pack_orig_addresses(unsigned long *buf, struct pbe *pbe)
+static inline void
+pack_addresses(unsigned long *buf, struct memory_bitmap *bm)
 {
 	int j;
 
-	for (j = 0; j < PAGE_SIZE / sizeof(long) && pbe; j++) {
-		buf[j] = pbe->orig_address;
-		pbe = pbe->next;
+	for (j = 0; j < PAGE_SIZE / sizeof(long); j++) {
+		unsigned long pfn = memory_bm_next_pfn(bm);
+
+		if (unlikely(pfn == BM_END_OF_MAP))
+			break;
+
+		buf[j] = (unsigned long)page_address(pfn_to_page(pfn));
 	}
-	if (!pbe)
-		for (; j < PAGE_SIZE / sizeof(long); j++)
-			buf[j] = 0;
-	return pbe;
 }
 
 /**
@@ -579,6 +1052,7 @@ int snapshot_read_next(struct snapshot_handle *handle, size_t count)
 {
 	if (handle->cur > nr_meta_pages + nr_copy_pages)
 		return 0;
+
 	if (!buffer) {
 		/* This makes the buffer be freed by swsusp_free() */
 		buffer = alloc_image_page(GFP_ATOMIC, PG_ANY);
@@ -588,16 +1062,17 @@ int snapshot_read_next(struct snapshot_handle *handle, size_t count)
 	if (!handle->offset) {
 		init_header((struct swsusp_info *)buffer);
 		handle->buffer = buffer;
-		handle->pbe = restore_pblist;
+		memory_bm_position_reset(&orig_bm);
+		memory_bm_position_reset(&copy_bm);
 	}
 	if (handle->prev < handle->cur) {
 		if (handle->cur <= nr_meta_pages) {
-			handle->pbe = pack_orig_addresses(buffer, handle->pbe);
-			if (!handle->pbe)
-				handle->pbe = restore_pblist;
+			memset(buffer, 0, PAGE_SIZE);
+			pack_addresses(buffer, &orig_bm);
 		} else {
-			handle->buffer = (void *)handle->pbe->address;
-			handle->pbe = handle->pbe->next;
+			unsigned long pfn = memory_bm_next_pfn(&copy_bm);
+
+			handle->buffer = page_address(pfn_to_page(pfn));
 		}
 		handle->prev = handle->cur;
 	}
@@ -736,12 +1211,7 @@ static inline struct pbe *unpack_orig_addresses(unsigned long *buf,
  *	of "safe" which will be used later
  */
 
-struct safe_page {
-	struct safe_page *next;
-	char padding[PAGE_SIZE - sizeof(void *)];
-};
-
-static struct safe_page *safe_pages;
+static struct linked_page *safe_pages;
 
 static int prepare_image(struct snapshot_handle *handle)
 {
@@ -763,9 +1233,9 @@ static int prepare_image(struct snapshot_handle *handle)
 	if (!error && nr_pages > unsafe_pages) {
 		nr_pages -= unsafe_pages;
 		while (nr_pages--) {
-			struct safe_page *ptr;
+			struct linked_page *ptr;
 
-			ptr = (struct safe_page *)get_zeroed_page(GFP_ATOMIC);
+			ptr = (void *)get_zeroed_page(GFP_ATOMIC);
 			if (!ptr) {
 				error = -ENOMEM;
 				break;
diff --git a/kernel/power/swsusp.c b/kernel/power/swsusp.c
index 17f669c83012..8ef677ea0cea 100644
--- a/kernel/power/swsusp.c
+++ b/kernel/power/swsusp.c
@@ -193,14 +193,13 @@ int swsusp_shrink_memory(void)
 	printk("Shrinking memory...  ");
 	do {
 		size = 2 * count_highmem_pages();
-		size += size / 50 + count_data_pages();
-		size += (size + PBES_PER_PAGE - 1) / PBES_PER_PAGE +
-			PAGES_FOR_IO;
+		size += size / 50 + count_data_pages() + PAGES_FOR_IO;
 		tmp = size;
 		for_each_zone (zone)
 			if (!is_highmem(zone) && populated_zone(zone)) {
 				tmp -= zone->free_pages;
 				tmp += zone->lowmem_reserve[ZONE_NORMAL];
+				tmp += snapshot_additional_pages(zone);
 			}
 		if (tmp > 0) {
 			tmp = __shrink_memory(tmp);
-- 
cgit v1.2.3


From 940864ddabdb180e02041c4dcd46ba6f9eee732f Mon Sep 17 00:00:00 2001
From: "Rafael J. Wysocki" <rjw@sisk.pl>
Date: Mon, 25 Sep 2006 23:32:55 -0700
Subject: [PATCH] swsusp: Use memory bitmaps during resume

Make swsusp use memory bitmaps to store its internal information during the
resume phase of the suspend-resume cycle.

If the pfns of saveable pages are saved during the suspend phase instead of
the kernel virtual addresses of these pages, we can use them during the resume
phase directly to set the corresponding bits in a memory bitmap.  Then, this
bitmap is used to mark the page frames corresponding to the pages that were
saveable before the suspend (aka "unsafe" page frames).

Next, we allocate as many page frames as needed to store the entire suspend
image and make sure that there will be some extra free "safe" page frames for
the list of PBEs constructed later.  Subsequently, the image is loaded and, if
possible, the data loaded from it are written into their "original" page
frames (ie.  the ones they had occupied before the suspend).

The image data that cannot be written into their "original" page frames are
loaded into "safe" page frames and their "original" kernel virtual addresses,
as well as the addresses of the "safe" pages containing their copies, are
stored in a list of PBEs.  Finally, the list of PBEs is used to copy the
remaining image data into their "original" page frames (this is done
atomically, by the architecture-dependent parts of swsusp).

Signed-off-by: Rafael J. Wysocki <rjw@sisk.pl>
Acked-by: Pavel Machek <pavel@ucw.cz>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
---
 include/linux/suspend.h |   9 --
 kernel/power/power.h    |  11 +-
 kernel/power/snapshot.c | 416 +++++++++++++++++++++---------------------------
 kernel/power/swap.c     |   4 +-
 kernel/power/user.c     |   1 +
 5 files changed, 186 insertions(+), 255 deletions(-)

(limited to 'kernel')

diff --git a/include/linux/suspend.h b/include/linux/suspend.h
index c11cacf1a13b..b1237f16ecde 100644
--- a/include/linux/suspend.h
+++ b/include/linux/suspend.h
@@ -16,15 +16,6 @@ struct pbe {
 	struct pbe *next;
 };
 
-#define for_each_pbe(pbe, pblist) \
-	for (pbe = pblist ; pbe ; pbe = pbe->next)
-
-#define PBES_PER_PAGE      (PAGE_SIZE/sizeof(struct pbe))
-#define PB_PAGE_SKIP       (PBES_PER_PAGE-1)
-
-#define for_each_pb_page(pbe, pblist) \
-	for (pbe = pblist ; pbe ; pbe = (pbe+PB_PAGE_SKIP)->next)
-
 /* mm/page_alloc.c */
 extern void drain_local_pages(void);
 extern void mark_free_pages(struct zone *zone);
diff --git a/kernel/power/power.h b/kernel/power/power.h
index 6e9e2acc34f8..bfe999f7b272 100644
--- a/kernel/power/power.h
+++ b/kernel/power/power.h
@@ -81,16 +81,6 @@ struct snapshot_handle {
 	unsigned int	prev;	/* number of the block of PAGE_SIZE bytes that
 				 * was the current one previously
 				 */
-	struct pbe	*pbe;	/* PBE that corresponds to 'buffer' */
-	struct pbe	*last_pbe;	/* When the image is restored (eg. read
-					 * from disk) we can store some image
-					 * data directly in the page frames
-					 * in which they were before suspend.
-					 * In such a case the PBEs that
-					 * correspond to them will be unused.
-					 * This is the last PBE, so far, that
-					 * does not correspond to such data.
-					 */
 	void		*buffer;	/* address of the block to read from
 					 * or write to
 					 */
@@ -113,6 +103,7 @@ extern unsigned int snapshot_additional_pages(struct zone *zone);
 extern int snapshot_read_next(struct snapshot_handle *handle, size_t count);
 extern int snapshot_write_next(struct snapshot_handle *handle, size_t count);
 extern int snapshot_image_loaded(struct snapshot_handle *handle);
+extern void snapshot_free_unused_memory(struct snapshot_handle *handle);
 
 #define SNAPSHOT_IOC_MAGIC	'3'
 #define SNAPSHOT_FREEZE			_IO(SNAPSHOT_IOC_MAGIC, 1)
diff --git a/kernel/power/snapshot.c b/kernel/power/snapshot.c
index 852e0df41719..1b84313cbab5 100644
--- a/kernel/power/snapshot.c
+++ b/kernel/power/snapshot.c
@@ -39,7 +39,7 @@ struct pbe *restore_pblist;
 
 static unsigned int nr_copy_pages;
 static unsigned int nr_meta_pages;
-static unsigned long *buffer;
+static void *buffer;
 
 #ifdef CONFIG_HIGHMEM
 unsigned int count_highmem_pages(void)
@@ -172,7 +172,7 @@ static inline int restore_highmem(void) {return 0;}
 #define PG_UNSAFE_CLEAR	1
 #define PG_UNSAFE_KEEP	0
 
-static unsigned int unsafe_pages;
+static unsigned int allocated_unsafe_pages;
 
 static void *alloc_image_page(gfp_t gfp_mask, int safe_needed)
 {
@@ -183,7 +183,7 @@ static void *alloc_image_page(gfp_t gfp_mask, int safe_needed)
 		while (res && PageNosaveFree(virt_to_page(res))) {
 			/* The page is unsafe, mark it for swsusp_free() */
 			SetPageNosave(virt_to_page(res));
-			unsafe_pages++;
+			allocated_unsafe_pages++;
 			res = (void *)get_zeroed_page(gfp_mask);
 		}
 	if (res) {
@@ -772,101 +772,10 @@ copy_data_pages(struct memory_bitmap *copy_bm, struct memory_bitmap *orig_bm)
 }
 
 /**
- *	free_pagedir - free pages allocated with alloc_pagedir()
- */
-
-static void free_pagedir(struct pbe *pblist, int clear_nosave_free)
-{
-	struct pbe *pbe;
-
-	while (pblist) {
-		pbe = (pblist + PB_PAGE_SKIP)->next;
-		free_image_page(pblist, clear_nosave_free);
-		pblist = pbe;
-	}
-}
-
-/**
- *	fill_pb_page - Create a list of PBEs on a given memory page
- */
-
-static inline void fill_pb_page(struct pbe *pbpage, unsigned int n)
-{
-	struct pbe *p;
-
-	p = pbpage;
-	pbpage += n - 1;
-	do
-		p->next = p + 1;
-	while (++p < pbpage);
-}
-
-/**
- *	create_pbe_list - Create a list of PBEs on top of a given chain
- *	of memory pages allocated with alloc_pagedir()
+ *	swsusp_free - free pages allocated for the suspend.
  *
- *	This function assumes that pages allocated by alloc_image_page() will
- *	always be zeroed.
- */
-
-static inline void create_pbe_list(struct pbe *pblist, unsigned int nr_pages)
-{
-	struct pbe *pbpage;
-	unsigned int num = PBES_PER_PAGE;
-
-	for_each_pb_page (pbpage, pblist) {
-		if (num >= nr_pages)
-			break;
-
-		fill_pb_page(pbpage, PBES_PER_PAGE);
-		num += PBES_PER_PAGE;
-	}
-	if (pbpage) {
-		num -= PBES_PER_PAGE;
-		fill_pb_page(pbpage, nr_pages - num);
-	}
-}
-
-/**
- *	alloc_pagedir - Allocate the page directory.
- *
- *	First, determine exactly how many pages we need and
- *	allocate them.
- *
- *	We arrange the pages in a chain: each page is an array of PBES_PER_PAGE
- *	struct pbe elements (pbes) and the last element in the page points
- *	to the next page.
- *
- *	On each page we set up a list of struct_pbe elements.
- */
-
-static struct pbe *alloc_pagedir(unsigned int nr_pages, gfp_t gfp_mask,
-				 int safe_needed)
-{
-	unsigned int num;
-	struct pbe *pblist, *pbe;
-
-	if (!nr_pages)
-		return NULL;
-
-	pblist = alloc_image_page(gfp_mask, safe_needed);
-	pbe = pblist;
-	for (num = PBES_PER_PAGE; num < nr_pages; num += PBES_PER_PAGE) {
-		if (!pbe) {
-			free_pagedir(pblist, PG_UNSAFE_CLEAR);
-			return NULL;
-		}
-		pbe += PB_PAGE_SKIP;
-		pbe->next = alloc_image_page(gfp_mask, safe_needed);
-		pbe = pbe->next;
-	}
-	create_pbe_list(pblist, nr_pages);
-	return pblist;
-}
-
-/**
- * Free pages we allocated for suspend. Suspend pages are alocated
- * before atomic copy, so we need to free them after resume.
+ *	Suspend pages are alocated before the atomic copy is made, so we
+ *	need to release them after the resume.
  */
 
 void swsusp_free(void)
@@ -904,14 +813,18 @@ void swsusp_free(void)
 static int enough_free_mem(unsigned int nr_pages)
 {
 	struct zone *zone;
-	unsigned int n = 0;
+	unsigned int free = 0, meta = 0;
 
 	for_each_zone (zone)
-		if (!is_highmem(zone))
-			n += zone->free_pages;
-	pr_debug("swsusp: available memory: %u pages\n", n);
-	return n > (nr_pages + PAGES_FOR_IO +
-		(nr_pages + PBES_PER_PAGE - 1) / PBES_PER_PAGE);
+		if (!is_highmem(zone)) {
+			free += zone->free_pages;
+			meta += snapshot_additional_pages(zone);
+		}
+
+	pr_debug("swsusp: pages needed: %u + %u + %u, available pages: %u\n",
+		nr_pages, PAGES_FOR_IO, meta, free);
+
+	return free > nr_pages + PAGES_FOR_IO + meta;
 }
 
 static int
@@ -961,11 +874,6 @@ asmlinkage int swsusp_save(void)
 	nr_pages = count_data_pages();
 	printk("swsusp: Need to copy %u pages\n", nr_pages);
 
-	pr_debug("swsusp: pages needed: %u + %lu + %u, free: %u\n",
-		 nr_pages,
-		 (nr_pages + PBES_PER_PAGE - 1) / PBES_PER_PAGE,
-		 PAGES_FOR_IO, nr_free_pages());
-
 	if (!enough_free_mem(nr_pages)) {
 		printk(KERN_ERR "swsusp: Not enough free memory\n");
 		return -ENOMEM;
@@ -1007,22 +915,19 @@ static void init_header(struct swsusp_info *info)
 }
 
 /**
- *	pack_addresses - the addresses corresponding to pfns found in the
- *	bitmap @bm are stored in the array @buf[] (1 page)
+ *	pack_pfns - pfns corresponding to the set bits found in the bitmap @bm
+ *	are stored in the array @buf[] (1 page at a time)
  */
 
 static inline void
-pack_addresses(unsigned long *buf, struct memory_bitmap *bm)
+pack_pfns(unsigned long *buf, struct memory_bitmap *bm)
 {
 	int j;
 
 	for (j = 0; j < PAGE_SIZE / sizeof(long); j++) {
-		unsigned long pfn = memory_bm_next_pfn(bm);
-
-		if (unlikely(pfn == BM_END_OF_MAP))
+		buf[j] = memory_bm_next_pfn(bm);
+		if (unlikely(buf[j] == BM_END_OF_MAP))
 			break;
-
-		buf[j] = (unsigned long)page_address(pfn_to_page(pfn));
 	}
 }
 
@@ -1068,7 +973,7 @@ int snapshot_read_next(struct snapshot_handle *handle, size_t count)
 	if (handle->prev < handle->cur) {
 		if (handle->cur <= nr_meta_pages) {
 			memset(buffer, 0, PAGE_SIZE);
-			pack_addresses(buffer, &orig_bm);
+			pack_pfns(buffer, &orig_bm);
 		} else {
 			unsigned long pfn = memory_bm_next_pfn(&copy_bm);
 
@@ -1094,14 +999,10 @@ int snapshot_read_next(struct snapshot_handle *handle, size_t count)
  *	had been used before suspend
  */
 
-static int mark_unsafe_pages(struct pbe *pblist)
+static int mark_unsafe_pages(struct memory_bitmap *bm)
 {
 	struct zone *zone;
 	unsigned long pfn, max_zone_pfn;
-	struct pbe *p;
-
-	if (!pblist) /* a sanity check */
-		return -EINVAL;
 
 	/* Clear page flags */
 	for_each_zone (zone) {
@@ -1111,30 +1012,37 @@ static int mark_unsafe_pages(struct pbe *pblist)
 				ClearPageNosaveFree(pfn_to_page(pfn));
 	}
 
-	/* Mark orig addresses */
-	for_each_pbe (p, pblist) {
-		if (virt_addr_valid(p->orig_address))
-			SetPageNosaveFree(virt_to_page(p->orig_address));
-		else
-			return -EFAULT;
-	}
+	/* Mark pages that correspond to the "original" pfns as "unsafe" */
+	memory_bm_position_reset(bm);
+	do {
+		pfn = memory_bm_next_pfn(bm);
+		if (likely(pfn != BM_END_OF_MAP)) {
+			if (likely(pfn_valid(pfn)))
+				SetPageNosaveFree(pfn_to_page(pfn));
+			else
+				return -EFAULT;
+		}
+	} while (pfn != BM_END_OF_MAP);
 
-	unsafe_pages = 0;
+	allocated_unsafe_pages = 0;
 
 	return 0;
 }
 
-static void copy_page_backup_list(struct pbe *dst, struct pbe *src)
+static void
+duplicate_memory_bitmap(struct memory_bitmap *dst, struct memory_bitmap *src)
 {
-	/* We assume both lists contain the same number of elements */
-	while (src) {
-		dst->orig_address = src->orig_address;
-		dst = dst->next;
-		src = src->next;
+	unsigned long pfn;
+
+	memory_bm_position_reset(src);
+	pfn = memory_bm_next_pfn(src);
+	while (pfn != BM_END_OF_MAP) {
+		memory_bm_set_bit(dst, pfn);
+		pfn = memory_bm_next_pfn(src);
 	}
 }
 
-static int check_header(struct swsusp_info *info)
+static inline int check_header(struct swsusp_info *info)
 {
 	char *reason = NULL;
 
@@ -1161,19 +1069,14 @@ static int check_header(struct swsusp_info *info)
  *	load header - check the image header and copy data from it
  */
 
-static int load_header(struct snapshot_handle *handle,
-                              struct swsusp_info *info)
+static int
+load_header(struct swsusp_info *info)
 {
 	int error;
-	struct pbe *pblist;
 
+	restore_pblist = NULL;
 	error = check_header(info);
 	if (!error) {
-		pblist = alloc_pagedir(info->image_pages, GFP_ATOMIC, PG_ANY);
-		if (!pblist)
-			return -ENOMEM;
-		restore_pblist = pblist;
-		handle->pbe = pblist;
 		nr_copy_pages = info->image_pages;
 		nr_meta_pages = info->pages - info->image_pages - 1;
 	}
@@ -1181,108 +1084,137 @@ static int load_header(struct snapshot_handle *handle,
 }
 
 /**
- *	unpack_orig_addresses - copy the elements of @buf[] (1 page) to
- *	the PBEs in the list starting at @pbe
+ *	unpack_orig_pfns - for each element of @buf[] (1 page at a time) set
+ *	the corresponding bit in the memory bitmap @bm
  */
 
-static inline struct pbe *unpack_orig_addresses(unsigned long *buf,
-                                                struct pbe *pbe)
+static inline void
+unpack_orig_pfns(unsigned long *buf, struct memory_bitmap *bm)
 {
 	int j;
 
-	for (j = 0; j < PAGE_SIZE / sizeof(long) && pbe; j++) {
-		pbe->orig_address = buf[j];
-		pbe = pbe->next;
+	for (j = 0; j < PAGE_SIZE / sizeof(long); j++) {
+		if (unlikely(buf[j] == BM_END_OF_MAP))
+			break;
+
+		memory_bm_set_bit(bm, buf[j]);
 	}
-	return pbe;
 }
 
 /**
- *	prepare_image - use metadata contained in the PBE list
- *	pointed to by restore_pblist to mark the pages that will
- *	be overwritten in the process of restoring the system
- *	memory state from the image ("unsafe" pages) and allocate
- *	memory for the image
+ *	prepare_image - use the memory bitmap @bm to mark the pages that will
+ *	be overwritten in the process of restoring the system memory state
+ *	from the suspend image ("unsafe" pages) and allocate memory for the
+ *	image.
  *
- *	The idea is to allocate the PBE list first and then
- *	allocate as many pages as it's needed for the image data,
- *	but not to assign these pages to the PBEs initially.
- *	Instead, we just mark them as allocated and create a list
- *	of "safe" which will be used later
+ *	The idea is to allocate a new memory bitmap first and then allocate
+ *	as many pages as needed for the image data, but not to assign these
+ *	pages to specific tasks initially.  Instead, we just mark them as
+ *	allocated and create a list of "safe" pages that will be used later.
  */
 
-static struct linked_page *safe_pages;
+#define PBES_PER_LINKED_PAGE	(LINKED_PAGE_DATA_SIZE / sizeof(struct pbe))
+
+static struct linked_page *safe_pages_list;
 
-static int prepare_image(struct snapshot_handle *handle)
+static int
+prepare_image(struct memory_bitmap *new_bm, struct memory_bitmap *bm)
 {
-	int error = 0;
-	unsigned int nr_pages = nr_copy_pages;
-	struct pbe *p, *pblist = NULL;
+	unsigned int nr_pages;
+	struct linked_page *sp_list, *lp;
+	int error;
 
-	p = restore_pblist;
-	error = mark_unsafe_pages(p);
-	if (!error) {
-		pblist = alloc_pagedir(nr_pages, GFP_ATOMIC, PG_SAFE);
-		if (pblist)
-			copy_page_backup_list(pblist, p);
-		free_pagedir(p, PG_UNSAFE_KEEP);
-		if (!pblist)
+	error = mark_unsafe_pages(bm);
+	if (error)
+		goto Free;
+
+	error = memory_bm_create(new_bm, GFP_ATOMIC, PG_SAFE);
+	if (error)
+		goto Free;
+
+	duplicate_memory_bitmap(new_bm, bm);
+	memory_bm_free(bm, PG_UNSAFE_KEEP);
+	/* Reserve some safe pages for potential later use.
+	 *
+	 * NOTE: This way we make sure there will be enough safe pages for the
+	 * chain_alloc() in get_buffer().  It is a bit wasteful, but
+	 * nr_copy_pages cannot be greater than 50% of the memory anyway.
+	 */
+	sp_list = NULL;
+	/* nr_copy_pages cannot be lesser than allocated_unsafe_pages */
+	nr_pages = nr_copy_pages - allocated_unsafe_pages;
+	nr_pages = DIV_ROUND_UP(nr_pages, PBES_PER_LINKED_PAGE);
+	while (nr_pages > 0) {
+		lp = alloc_image_page(GFP_ATOMIC, PG_SAFE);
+		if (!lp) {
 			error = -ENOMEM;
+			goto Free;
+		}
+		lp->next = sp_list;
+		sp_list = lp;
+		nr_pages--;
 	}
-	safe_pages = NULL;
-	if (!error && nr_pages > unsafe_pages) {
-		nr_pages -= unsafe_pages;
-		while (nr_pages--) {
-			struct linked_page *ptr;
-
-			ptr = (void *)get_zeroed_page(GFP_ATOMIC);
-			if (!ptr) {
-				error = -ENOMEM;
-				break;
-			}
-			if (!PageNosaveFree(virt_to_page(ptr))) {
-				/* The page is "safe", add it to the list */
-				ptr->next = safe_pages;
-				safe_pages = ptr;
-			}
-			/* Mark the page as allocated */
-			SetPageNosave(virt_to_page(ptr));
-			SetPageNosaveFree(virt_to_page(ptr));
+	/* Preallocate memory for the image */
+	safe_pages_list = NULL;
+	nr_pages = nr_copy_pages - allocated_unsafe_pages;
+	while (nr_pages > 0) {
+		lp = (struct linked_page *)get_zeroed_page(GFP_ATOMIC);
+		if (!lp) {
+			error = -ENOMEM;
+			goto Free;
+		}
+		if (!PageNosaveFree(virt_to_page(lp))) {
+			/* The page is "safe", add it to the list */
+			lp->next = safe_pages_list;
+			safe_pages_list = lp;
 		}
+		/* Mark the page as allocated */
+		SetPageNosave(virt_to_page(lp));
+		SetPageNosaveFree(virt_to_page(lp));
+		nr_pages--;
 	}
-	if (!error) {
-		restore_pblist = pblist;
-	} else {
-		handle->pbe = NULL;
-		swsusp_free();
+	/* Free the reserved safe pages so that chain_alloc() can use them */
+	while (sp_list) {
+		lp = sp_list->next;
+		free_image_page(sp_list, PG_UNSAFE_CLEAR);
+		sp_list = lp;
 	}
+	return 0;
+
+Free:
+	swsusp_free();
 	return error;
 }
 
-static void *get_buffer(struct snapshot_handle *handle)
+/**
+ *	get_buffer - compute the address that snapshot_write_next() should
+ *	set for its caller to write to.
+ */
+
+static void *get_buffer(struct memory_bitmap *bm, struct chain_allocator *ca)
 {
-	struct pbe *pbe = handle->pbe, *last = handle->last_pbe;
-	struct page *page = virt_to_page(pbe->orig_address);
+	struct pbe *pbe;
+	struct page *page = pfn_to_page(memory_bm_next_pfn(bm));
 
-	if (PageNosave(page) && PageNosaveFree(page)) {
-		/*
-		 * We have allocated the "original" page frame and we can
-		 * use it directly to store the read page
+	if (PageNosave(page) && PageNosaveFree(page))
+		/* We have allocated the "original" page frame and we can
+		 * use it directly to store the loaded page.
 		 */
-		pbe->address = 0;
-		if (last && last->next)
-			last->next = NULL;
-		return (void *)pbe->orig_address;
-	}
-	/*
-	 * The "original" page frame has not been allocated and we have to
-	 * use a "safe" page frame to store the read page
+		return page_address(page);
+
+	/* The "original" page frame has not been allocated and we have to
+	 * use a "safe" page frame to store the loaded page.
 	 */
-	pbe->address = (unsigned long)safe_pages;
-	safe_pages = safe_pages->next;
-	if (last)
-		last->next = pbe;
-	handle->last_pbe = pbe;
+	pbe = chain_alloc(ca, sizeof(struct pbe));
+	if (!pbe) {
+		swsusp_free();
+		return NULL;
+	}
+	pbe->orig_address = (unsigned long)page_address(page);
+	pbe->address = (unsigned long)safe_pages_list;
+	safe_pages_list = safe_pages_list->next;
+	pbe->next = restore_pblist;
+	restore_pblist = pbe;
 	return (void *)pbe->address;
 }
 
@@ -1310,10 +1242,13 @@ static void *get_buffer(struct snapshot_handle *handle)
 
 int snapshot_write_next(struct snapshot_handle *handle, size_t count)
 {
+	static struct chain_allocator ca;
 	int error = 0;
 
+	/* Check if we have already loaded the entire image */
 	if (handle->prev && handle->cur > nr_meta_pages + nr_copy_pages)
 		return 0;
+
 	if (!buffer) {
 		/* This makes the buffer be freed by swsusp_free() */
 		buffer = alloc_image_page(GFP_ATOMIC, PG_ANY);
@@ -1324,26 +1259,32 @@ int snapshot_write_next(struct snapshot_handle *handle, size_t count)
 		handle->buffer = buffer;
 	handle->sync_read = 1;
 	if (handle->prev < handle->cur) {
-		if (!handle->prev) {
-			error = load_header(handle,
-					(struct swsusp_info *)buffer);
+		if (handle->prev == 0) {
+			error = load_header(buffer);
+			if (error)
+				return error;
+
+			error = memory_bm_create(&copy_bm, GFP_ATOMIC, PG_ANY);
 			if (error)
 				return error;
+
 		} else if (handle->prev <= nr_meta_pages) {
-			handle->pbe = unpack_orig_addresses(buffer,
-							handle->pbe);
-			if (!handle->pbe) {
-				error = prepare_image(handle);
+			unpack_orig_pfns(buffer, &copy_bm);
+			if (handle->prev == nr_meta_pages) {
+				error = prepare_image(&orig_bm, &copy_bm);
 				if (error)
 					return error;
-				handle->pbe = restore_pblist;
-				handle->last_pbe = NULL;
-				handle->buffer = get_buffer(handle);
+
+				chain_init(&ca, GFP_ATOMIC, PG_SAFE);
+				memory_bm_position_reset(&orig_bm);
+				restore_pblist = NULL;
+				handle->buffer = get_buffer(&orig_bm, &ca);
 				handle->sync_read = 0;
+				if (!handle->buffer)
+					return -ENOMEM;
 			}
 		} else {
-			handle->pbe = handle->pbe->next;
-			handle->buffer = get_buffer(handle);
+			handle->buffer = get_buffer(&orig_bm, &ca);
 			handle->sync_read = 0;
 		}
 		handle->prev = handle->cur;
@@ -1362,6 +1303,13 @@ int snapshot_write_next(struct snapshot_handle *handle, size_t count)
 
 int snapshot_image_loaded(struct snapshot_handle *handle)
 {
-	return !(!handle->pbe || handle->pbe->next || !nr_copy_pages ||
-		handle->cur <= nr_meta_pages + nr_copy_pages);
+	return !(!nr_copy_pages ||
+			handle->cur <= nr_meta_pages + nr_copy_pages);
+}
+
+void snapshot_free_unused_memory(struct snapshot_handle *handle)
+{
+	/* Free only if we have loaded the image entirely */
+	if (handle->prev && handle->cur > nr_meta_pages + nr_copy_pages)
+		memory_bm_free(&orig_bm, PG_UNSAFE_CLEAR);
 }
diff --git a/kernel/power/swap.c b/kernel/power/swap.c
index 8309d20b2563..9b2ee5344dee 100644
--- a/kernel/power/swap.c
+++ b/kernel/power/swap.c
@@ -331,8 +331,7 @@ static int enough_swap(unsigned int nr_pages)
 	unsigned int free_swap = count_swap_pages(root_swap, 1);
 
 	pr_debug("swsusp: free swap pages: %u\n", free_swap);
-	return free_swap > (nr_pages + PAGES_FOR_IO +
-		(nr_pages + PBES_PER_PAGE - 1) / PBES_PER_PAGE);
+	return free_swap > nr_pages + PAGES_FOR_IO;
 }
 
 /**
@@ -547,6 +546,7 @@ static int load_image(struct swap_map_handle *handle,
 		error = err2;
 	if (!error) {
 		printk("\b\b\b\bdone\n");
+		snapshot_free_unused_memory(snapshot);
 		if (!snapshot_image_loaded(snapshot))
 			error = -ENODATA;
 	}
diff --git a/kernel/power/user.c b/kernel/power/user.c
index 0ef5e4ba39e5..2e4499f3e4d9 100644
--- a/kernel/power/user.c
+++ b/kernel/power/user.c
@@ -193,6 +193,7 @@ static int snapshot_ioctl(struct inode *inode, struct file *filp,
 			error = -EPERM;
 			break;
 		}
+		snapshot_free_unused_memory(&data->handle);
 		down(&pm_sem);
 		pm_prepare_console();
 		error = device_suspend(PMSG_FREEZE);
-- 
cgit v1.2.3


From c8eb8b4025175f967af0ba8e933f23aa9954dc35 Mon Sep 17 00:00:00 2001
From: "Rafael J. Wysocki" <rjw@sisk.pl>
Date: Mon, 25 Sep 2006 23:32:56 -0700
Subject: [PATCH] PM: make it possible to disable console suspending

Change suspend_console() so that it waits for all consoles to flush the
remaining messages and make it possible to switch the console suspending off
with the help of a Kconfig option.

Signed-off-by: Rafael J. Wysocki <rjw@sisk.pl>
Acked-by: Pavel Machek <pavel@ucw.cz>
Cc: Stefan Seyfried <seife@suse.de>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
---
 include/linux/console.h |  5 +++++
 kernel/power/Kconfig    | 11 +++++++++++
 kernel/printk.c         |  3 +++
 3 files changed, 19 insertions(+)

(limited to 'kernel')

diff --git a/include/linux/console.h b/include/linux/console.h
index 3bdf2155e565..76a1807726eb 100644
--- a/include/linux/console.h
+++ b/include/linux/console.h
@@ -120,9 +120,14 @@ extern void console_stop(struct console *);
 extern void console_start(struct console *);
 extern int is_console_locked(void);
 
+#ifndef CONFIG_DISABLE_CONSOLE_SUSPEND
 /* Suspend and resume console messages over PM events */
 extern void suspend_console(void);
 extern void resume_console(void);
+#else
+static inline void suspend_console(void) {}
+static inline void resume_console(void) {}
+#endif /* CONFIG_DISABLE_CONSOLE_SUSPEND */
 
 /* Some debug stub to catch some of the obvious races in the VT code */
 #if 1
diff --git a/kernel/power/Kconfig b/kernel/power/Kconfig
index 619ecabf7c58..4b6e2f18e056 100644
--- a/kernel/power/Kconfig
+++ b/kernel/power/Kconfig
@@ -36,6 +36,17 @@ config PM_DEBUG
 	code. This is helpful when debugging and reporting various PM bugs, 
 	like suspend support.
 
+config DISABLE_CONSOLE_SUSPEND
+	bool "Keep console(s) enabled during suspend/resume (DANGEROUS)"
+	depends on PM && PM_DEBUG
+	default n
+	---help---
+	This option turns off the console suspend mechanism that prevents
+	debug messages from reaching the console during the suspend/resume
+	operations.  This may be helpful when debugging device drivers'
+	suspend/resume routines, but may itself lead to problems, for example
+	if netconsole is used.
+
 config PM_TRACE
 	bool "Suspend/resume event tracing"
 	depends on PM && PM_DEBUG && X86_32 && EXPERIMENTAL
diff --git a/kernel/printk.c b/kernel/printk.c
index 1149365e989e..771f5e861bcd 100644
--- a/kernel/printk.c
+++ b/kernel/printk.c
@@ -721,6 +721,7 @@ int __init add_preferred_console(char *name, int idx, char *options)
 	return 0;
 }
 
+#ifndef CONFIG_DISABLE_CONSOLE_SUSPEND
 /**
  * suspend_console - suspend the console subsystem
  *
@@ -728,6 +729,7 @@ int __init add_preferred_console(char *name, int idx, char *options)
  */
 void suspend_console(void)
 {
+	printk("Suspending console(s)\n");
 	acquire_console_sem();
 	console_suspended = 1;
 }
@@ -737,6 +739,7 @@ void resume_console(void)
 	console_suspended = 0;
 	release_console_sem();
 }
+#endif /* CONFIG_DISABLE_CONSOLE_SUSPEND */
 
 /**
  * acquire_console_sem - lock the console system for exclusive use.
-- 
cgit v1.2.3


From c5c6ba4e08ab9c9e390a0f3a7d9a5c332f5cc6ef Mon Sep 17 00:00:00 2001
From: "Rafael J. Wysocki" <rjw@sisk.pl>
Date: Mon, 25 Sep 2006 23:32:58 -0700
Subject: [PATCH] PM: Add pm_trace switch

Add the pm_trace attribute in /sys/power which has to be explicitly set to
one to really enable the "PM tracing" code compiled in when CONFIG_PM_TRACE
is set (which modifies the machine's CMOS clock in unpredictable ways).

Signed-off-by: Rafael J. Wysocki <rjw@sisk.pl>
Acked-by: Pavel Machek <pavel@ucw.cz>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
---
 Documentation/power/interface.txt | 15 +++++++++++++++
 include/linux/resume-trace.h      | 24 ++++++++++++++----------
 kernel/power/main.c               | 30 ++++++++++++++++++++++++++++++
 3 files changed, 59 insertions(+), 10 deletions(-)

(limited to 'kernel')

diff --git a/Documentation/power/interface.txt b/Documentation/power/interface.txt
index 4117802af0f8..a66bec222b16 100644
--- a/Documentation/power/interface.txt
+++ b/Documentation/power/interface.txt
@@ -52,3 +52,18 @@ suspend image will be as small as possible.
 
 Reading from this file will display the current image size limit, which
 is set to 500 MB by default.
+
+/sys/power/pm_trace controls the code which saves the last PM event point in
+the RTC across reboots, so that you can debug a machine that just hangs
+during suspend (or more commonly, during resume).  Namely, the RTC is only
+used to save the last PM event point if this file contains '1'.  Initially it
+contains '0' which may be changed to '1' by writing a string representing a
+nonzero integer into it.
+
+To use this debugging feature you should attempt to suspend the machine, then
+reboot it and run
+
+	dmesg -s 1000000 | grep 'hash matches'
+
+CAUTION: Using it will cause your machine's real-time (CMOS) clock to be
+set to a random invalid time after a resume.
diff --git a/include/linux/resume-trace.h b/include/linux/resume-trace.h
index a376bd4ade39..81e9299ca148 100644
--- a/include/linux/resume-trace.h
+++ b/include/linux/resume-trace.h
@@ -3,21 +3,25 @@
 
 #ifdef CONFIG_PM_TRACE
 
+extern int pm_trace_enabled;
+
 struct device;
 extern void set_trace_device(struct device *);
 extern void generate_resume_trace(void *tracedata, unsigned int user);
 
 #define TRACE_DEVICE(dev) set_trace_device(dev)
-#define TRACE_RESUME(user) do {				\
-	void *tracedata;				\
-	asm volatile("movl $1f,%0\n"			\
-		".section .tracedata,\"a\"\n"		\
-		"1:\t.word %c1\n"			\
-		"\t.long %c2\n"				\
-		".previous"				\
-		:"=r" (tracedata)			\
-		: "i" (__LINE__), "i" (__FILE__));	\
-	generate_resume_trace(tracedata, user);		\
+#define TRACE_RESUME(user) do {					\
+	if (pm_trace_enabled) {					\
+		void *tracedata;				\
+		asm volatile("movl $1f,%0\n"			\
+			".section .tracedata,\"a\"\n"		\
+			"1:\t.word %c1\n"			\
+			"\t.long %c2\n"				\
+			".previous"				\
+			:"=r" (tracedata)			\
+			: "i" (__LINE__), "i" (__FILE__));	\
+		generate_resume_trace(tracedata, user);		\
+	}							\
 } while (0)
 
 #else
diff --git a/kernel/power/main.c b/kernel/power/main.c
index 4d403323a7bb..873228c71dab 100644
--- a/kernel/power/main.c
+++ b/kernel/power/main.c
@@ -17,6 +17,7 @@
 #include <linux/pm.h>
 #include <linux/console.h>
 #include <linux/cpu.h>
+#include <linux/resume-trace.h>
 
 #include "power.h"
 
@@ -281,10 +282,39 @@ static ssize_t state_store(struct subsystem * subsys, const char * buf, size_t n
 
 power_attr(state);
 
+#ifdef CONFIG_PM_TRACE
+int pm_trace_enabled;
+
+static ssize_t pm_trace_show(struct subsystem * subsys, char * buf)
+{
+	return sprintf(buf, "%d\n", pm_trace_enabled);
+}
+
+static ssize_t
+pm_trace_store(struct subsystem * subsys, const char * buf, size_t n)
+{
+	int val;
+
+	if (sscanf(buf, "%d", &val) == 1) {
+		pm_trace_enabled = !!val;
+		return n;
+	}
+	return -EINVAL;
+}
+
+power_attr(pm_trace);
+
+static struct attribute * g[] = {
+	&state_attr.attr,
+	&pm_trace_attr.attr,
+	NULL,
+};
+#else
 static struct attribute * g[] = {
 	&state_attr.attr,
 	NULL,
 };
+#endif /* CONFIG_PM_TRACE */
 
 static struct attribute_group attr_group = {
 	.attrs = g,
-- 
cgit v1.2.3


From d33b6fba2c4350651f3f61ff2ab858a2f116e9a4 Mon Sep 17 00:00:00 2001
From: Matthew Wilcox <matthew@wil.cx>
Date: Fri, 30 Jun 2006 02:31:24 -0700
Subject: Resources: insert identical resources above existing resources

If you have two resources which aree exactly the same size,
insert_resource() currently inserts the new one below the existing one.
This is wrong because there's no way to insert a resource of the same size
above an existing one.

I took this opportunity to rewrite the initial loop to be a for-loop
instead of a goto-loop and fix the documentation.

Signed-off-by: Matthew Wilcox <matthew@wil.cx>
Cc: Ivan Kokshaysky <ink@jurassic.park.msu.ru>
Cc: Dominik Brodowski <linux@dominikbrodowski.net>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Greg Kroah-Hartman <gregkh@suse.de>
---
 kernel/resource.c | 32 ++++++++++++++++----------------
 1 file changed, 16 insertions(+), 16 deletions(-)

(limited to 'kernel')

diff --git a/kernel/resource.c b/kernel/resource.c
index 46286434af80..9db38a1a7520 100644
--- a/kernel/resource.c
+++ b/kernel/resource.c
@@ -344,12 +344,11 @@ EXPORT_SYMBOL(allocate_resource);
  *
  * Returns 0 on success, -EBUSY if the resource can't be inserted.
  *
- * This function is equivalent of request_resource when no conflict
+ * This function is equivalent to request_resource when no conflict
  * happens. If a conflict happens, and the conflicting resources
  * entirely fit within the range of the new resource, then the new
- * resource is inserted and the conflicting resources become childs of
- * the new resource.  Otherwise the new resource becomes the child of
- * the conflicting resource
+ * resource is inserted and the conflicting resources become children of
+ * the new resource.
  */
 int insert_resource(struct resource *parent, struct resource *new)
 {
@@ -357,20 +356,21 @@ int insert_resource(struct resource *parent, struct resource *new)
 	struct resource *first, *next;
 
 	write_lock(&resource_lock);
- begin:
- 	result = 0;
-	first = __request_resource(parent, new);
-	if (!first)
-		goto out;
 
-	result = -EBUSY;
-	if (first == parent)
-		goto out;
+	for (;; parent = first) {
+	 	result = 0;
+		first = __request_resource(parent, new);
+		if (!first)
+			goto out;
 
-	/* Resource fully contained by the clashing resource? Recurse into it */
-	if (first->start <= new->start && first->end >= new->end) {
-		parent = first;
-		goto begin;
+		result = -EBUSY;
+		if (first == parent)
+			goto out;
+
+		if ((first->start > new->start) || (first->end < new->end))
+			break;
+		if ((first->start == new->start) && (first->end == new->end))
+			break;
 	}
 
 	for (next = first; ; next = next->sibling) {
-- 
cgit v1.2.3


From 0ec76a110f432e98277e464b82ace8dd66571689 Mon Sep 17 00:00:00 2001
From: David Howells <dhowells@redhat.com>
Date: Wed, 27 Sep 2006 01:50:15 -0700
Subject: [PATCH] NOMMU: Check that access_process_vm() has a valid target

Check that access_process_vm() is accessing a valid mapping in the target
process.

This limits ptrace() accesses and accesses through /proc/<pid>/maps to only
those regions actually mapped by a program.

Signed-off-by: David Howells <dhowells@redhat.com>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
---
 kernel/ptrace.c | 54 ------------------------------------------------------
 mm/memory.c     | 53 +++++++++++++++++++++++++++++++++++++++++++++++++++++
 mm/nommu.c      | 47 +++++++++++++++++++++++++++++++++++++++++++++++
 3 files changed, 100 insertions(+), 54 deletions(-)

(limited to 'kernel')

diff --git a/kernel/ptrace.c b/kernel/ptrace.c
index 9a111f70145c..8aad0331d82e 100644
--- a/kernel/ptrace.c
+++ b/kernel/ptrace.c
@@ -241,60 +241,6 @@ int ptrace_detach(struct task_struct *child, unsigned int data)
 	return 0;
 }
 
-/*
- * Access another process' address space.
- * Source/target buffer must be kernel space, 
- * Do not walk the page table directly, use get_user_pages
- */
-
-int access_process_vm(struct task_struct *tsk, unsigned long addr, void *buf, int len, int write)
-{
-	struct mm_struct *mm;
-	struct vm_area_struct *vma;
-	struct page *page;
-	void *old_buf = buf;
-
-	mm = get_task_mm(tsk);
-	if (!mm)
-		return 0;
-
-	down_read(&mm->mmap_sem);
-	/* ignore errors, just check how much was sucessfully transfered */
-	while (len) {
-		int bytes, ret, offset;
-		void *maddr;
-
-		ret = get_user_pages(tsk, mm, addr, 1,
-				write, 1, &page, &vma);
-		if (ret <= 0)
-			break;
-
-		bytes = len;
-		offset = addr & (PAGE_SIZE-1);
-		if (bytes > PAGE_SIZE-offset)
-			bytes = PAGE_SIZE-offset;
-
-		maddr = kmap(page);
-		if (write) {
-			copy_to_user_page(vma, page, addr,
-					  maddr + offset, buf, bytes);
-			set_page_dirty_lock(page);
-		} else {
-			copy_from_user_page(vma, page, addr,
-					    buf, maddr + offset, bytes);
-		}
-		kunmap(page);
-		page_cache_release(page);
-		len -= bytes;
-		buf += bytes;
-		addr += bytes;
-	}
-	up_read(&mm->mmap_sem);
-	mmput(mm);
-	
-	return buf - old_buf;
-}
-
 int ptrace_readdata(struct task_struct *tsk, unsigned long src, char __user *dst, int len)
 {
 	int copied = 0;
diff --git a/mm/memory.c b/mm/memory.c
index f2ef1dcfff77..601159a46ab6 100644
--- a/mm/memory.c
+++ b/mm/memory.c
@@ -2604,3 +2604,56 @@ int in_gate_area_no_task(unsigned long addr)
 }
 
 #endif	/* __HAVE_ARCH_GATE_AREA */
+
+/*
+ * Access another process' address space.
+ * Source/target buffer must be kernel space,
+ * Do not walk the page table directly, use get_user_pages
+ */
+int access_process_vm(struct task_struct *tsk, unsigned long addr, void *buf, int len, int write)
+{
+	struct mm_struct *mm;
+	struct vm_area_struct *vma;
+	struct page *page;
+	void *old_buf = buf;
+
+	mm = get_task_mm(tsk);
+	if (!mm)
+		return 0;
+
+	down_read(&mm->mmap_sem);
+	/* ignore errors, just check how much was sucessfully transfered */
+	while (len) {
+		int bytes, ret, offset;
+		void *maddr;
+
+		ret = get_user_pages(tsk, mm, addr, 1,
+				write, 1, &page, &vma);
+		if (ret <= 0)
+			break;
+
+		bytes = len;
+		offset = addr & (PAGE_SIZE-1);
+		if (bytes > PAGE_SIZE-offset)
+			bytes = PAGE_SIZE-offset;
+
+		maddr = kmap(page);
+		if (write) {
+			copy_to_user_page(vma, page, addr,
+					  maddr + offset, buf, bytes);
+			set_page_dirty_lock(page);
+		} else {
+			copy_from_user_page(vma, page, addr,
+					    buf, maddr + offset, bytes);
+		}
+		kunmap(page);
+		page_cache_release(page);
+		len -= bytes;
+		buf += bytes;
+		addr += bytes;
+	}
+	up_read(&mm->mmap_sem);
+	mmput(mm);
+
+	return buf - old_buf;
+}
diff --git a/mm/nommu.c b/mm/nommu.c
index d99dea31e443..d08acdae0036 100644
--- a/mm/nommu.c
+++ b/mm/nommu.c
@@ -1206,3 +1206,50 @@ struct page *filemap_nopage(struct vm_area_struct *area,
 	BUG();
 	return NULL;
 }
+
+/*
+ * Access another process' address space.
+ * - source/target buffer must be kernel space
+ */
+int access_process_vm(struct task_struct *tsk, unsigned long addr, void *buf, int len, int write)
+{
+	struct vm_list_struct *vml;
+	struct vm_area_struct *vma;
+	struct mm_struct *mm;
+
+	if (addr + len < addr)
+		return 0;
+
+	mm = get_task_mm(tsk);
+	if (!mm)
+		return 0;
+
+	down_read(&mm->mmap_sem);
+
+	/* the access must start within one of the target process's mappings */
+	for (vml = mm->context.vmlist; vml; vml = vml->next)
+		if (addr >= vml->vma->vm_start && addr < vml->vma->vm_end)
+			break;
+
+	if (vml) {
+		vma = vml->vma;
+
+		/* don't overrun this mapping */
+		if (addr + len >= vma->vm_end)
+			len = vma->vm_end - addr;
+
+		/* only read or write mappings where it is permitted */
+		if (write && vma->vm_flags & VM_WRITE)
+			len -= copy_to_user((void *) addr, buf, len);
+		else if (!write && vma->vm_flags & VM_READ)
+			len -= copy_from_user(buf, (void *) addr, len);
+		else
+			len = 0;
+	} else {
+		len = 0;
+	}
+
+	up_read(&mm->mmap_sem);
+	mmput(mm);
+	return len;
+}
-- 
cgit v1.2.3


From f269fdd1829acc5e53bf57b145003e5733133f2b Mon Sep 17 00:00:00 2001
From: David Howells <dhowells@redhat.com>
Date: Wed, 27 Sep 2006 01:50:23 -0700
Subject: [PATCH] NOMMU: move the fallback arch_vma_name() to a sensible place

Move the fallback arch_vma_name() to a sensible place (kernel/signal.c).

Currently it's in fs/proc/task_mmu.c, a file that is dependent on both
CONFIG_PROC_FS and CONFIG_MMU being enabled, but it's used from
kernel/signal.c from where it is called unconditionally.

[akpm@osdl.org: build fix]
Signed-off-by: David Howells <dhowells@redhat.com>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
---
 fs/proc/task_mmu.c | 5 -----
 include/linux/mm.h | 2 +-
 kernel/signal.c    | 5 +++++
 3 files changed, 6 insertions(+), 6 deletions(-)

(limited to 'kernel')

diff --git a/fs/proc/task_mmu.c b/fs/proc/task_mmu.c
index 0a163a4f7764..6b769afac55a 100644
--- a/fs/proc/task_mmu.c
+++ b/fs/proc/task_mmu.c
@@ -122,11 +122,6 @@ struct mem_size_stats
 	unsigned long private_dirty;
 };
 
-__attribute__((weak)) const char *arch_vma_name(struct vm_area_struct *vma)
-{
-	return NULL;
-}
-
 static int show_map_internal(struct seq_file *m, void *v, struct mem_size_stats *mss)
 {
 	struct proc_maps_private *priv = m->private;
diff --git a/include/linux/mm.h b/include/linux/mm.h
index 22165cb18906..7b703b6d4358 100644
--- a/include/linux/mm.h
+++ b/include/linux/mm.h
@@ -1131,7 +1131,7 @@ void drop_slab(void);
 extern int randomize_va_space;
 #endif
 
-const char *arch_vma_name(struct vm_area_struct *vma);
+__attribute__((weak)) const char *arch_vma_name(struct vm_area_struct *vma);
 
 #endif /* __KERNEL__ */
 #endif /* _LINUX_MM_H */
diff --git a/kernel/signal.c b/kernel/signal.c
index bfdb5686fa3e..05853a7337e3 100644
--- a/kernel/signal.c
+++ b/kernel/signal.c
@@ -2577,6 +2577,11 @@ asmlinkage long sys_rt_sigsuspend(sigset_t __user *unewset, size_t sigsetsize)
 }
 #endif /* __ARCH_WANT_SYS_RT_SIGSUSPEND */
 
+__attribute__((weak)) const char *arch_vma_name(struct vm_area_struct *vma)
+{
+	return NULL;
+}
+
 void __init signals_init(void)
 {
 	sigqueue_cachep =
-- 
cgit v1.2.3


From 8e18e2941c53416aa219708e7dcad21fb4bd6794 Mon Sep 17 00:00:00 2001
From: Theodore Ts'o <tytso@mit.edu>
Date: Wed, 27 Sep 2006 01:50:46 -0700
Subject: [PATCH] inode_diet: Replace inode.u.generic_ip with inode.i_private

The following patches reduce the size of the VFS inode structure by 28 bytes
on a UP x86.  (It would be more on an x86_64 system).  This is a 10% reduction
in the inode size on a UP kernel that is configured in a production mode
(i.e., with no spinlock or other debugging functions enabled; if you want to
save memory taken up by in-core inodes, the first thing you should do is
disable the debugging options; they are responsible for a huge amount of bloat
in the VFS inode structure).

This patch:

The filesystem or device-specific pointer in the inode is inside a union,
which is pretty pointless given that all 30+ users of this field have been
using the void pointer.  Get rid of the union and rename it to i_private, with
a comment to explain who is allowed to use the void pointer.  This is just a
cleanup, but it allows us to reuse the union 'u' for something something where
the union will actually be used.

[judith@osdl.org: powerpc build fix]
Signed-off-by: "Theodore Ts'o" <tytso@mit.edu>
Signed-off-by: Judith Lebzelter <judith@osdl.org>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
---
 arch/powerpc/platforms/cell/spufs/inode.c      |  2 +-
 arch/powerpc/platforms/pseries/hvCall_inst.c   |  2 +-
 arch/s390/hypfs/inode.c                        |  6 ++---
 arch/s390/kernel/debug.c                       |  2 +-
 block/blktrace.c                               |  2 +-
 drivers/i2c/chips/tps65010.c                   |  2 +-
 drivers/infiniband/hw/ipath/ipath_fs.c         | 12 ++++-----
 drivers/infiniband/ulp/ipoib/ipoib_fs.c        |  4 +--
 drivers/misc/ibmasm/ibmasmfs.c                 | 16 ++++++------
 drivers/net/irda/vlsi_ir.h                     |  2 +-
 drivers/net/wireless/bcm43xx/bcm43xx_debugfs.c |  2 +-
 drivers/oprofile/oprofilefs.c                  | 10 ++++----
 drivers/pci/hotplug/cpqphp_sysfs.c             |  2 +-
 drivers/usb/core/devio.c                       |  2 +-
 drivers/usb/core/inode.c                       |  6 ++---
 drivers/usb/gadget/inode.c                     |  6 ++---
 drivers/usb/host/isp116x-hcd.c                 |  2 +-
 drivers/usb/host/uhci-debug.c                  |  2 +-
 drivers/usb/mon/mon_stat.c                     |  2 +-
 drivers/usb/mon/mon_text.c                     |  4 +--
 fs/autofs/inode.c                              |  2 +-
 fs/autofs/symlink.c                            |  2 +-
 fs/binfmt_misc.c                               |  8 +++---
 fs/debugfs/file.c                              |  4 +--
 fs/debugfs/inode.c                             |  4 +--
 fs/devpts/inode.c                              |  4 +--
 fs/freevxfs/vxfs.h                             |  2 +-
 fs/freevxfs/vxfs_inode.c                       |  4 +--
 fs/fuse/control.c                              |  6 ++---
 fs/inode.c                                     |  2 +-
 fs/jffs/inode-v23.c                            | 34 +++++++++++++-------------
 fs/libfs.c                                     |  2 +-
 fs/ocfs2/dlmglue.c                             |  2 +-
 include/linux/fs.h                             |  4 +--
 kernel/relay.c                                 |  2 +-
 security/inode.c                               |  8 +++---
 36 files changed, 88 insertions(+), 90 deletions(-)

(limited to 'kernel')

diff --git a/arch/powerpc/platforms/cell/spufs/inode.c b/arch/powerpc/platforms/cell/spufs/inode.c
index 7b4572805db9..b837f12a84ae 100644
--- a/arch/powerpc/platforms/cell/spufs/inode.c
+++ b/arch/powerpc/platforms/cell/spufs/inode.c
@@ -120,7 +120,7 @@ spufs_new_file(struct super_block *sb, struct dentry *dentry,
 	ret = 0;
 	inode->i_op = &spufs_file_iops;
 	inode->i_fop = fops;
-	inode->u.generic_ip = SPUFS_I(inode)->i_ctx = get_spu_context(ctx);
+	inode->i_private = SPUFS_I(inode)->i_ctx = get_spu_context(ctx);
 	d_add(dentry, inode);
 out:
 	return ret;
diff --git a/arch/powerpc/platforms/pseries/hvCall_inst.c b/arch/powerpc/platforms/pseries/hvCall_inst.c
index 641e6511cf06..446e17d162a5 100644
--- a/arch/powerpc/platforms/pseries/hvCall_inst.c
+++ b/arch/powerpc/platforms/pseries/hvCall_inst.c
@@ -85,7 +85,7 @@ static int hcall_inst_seq_open(struct inode *inode, struct file *file)
 
 	rc = seq_open(file, &hcall_inst_seq_ops);
 	seq = file->private_data;
-	seq->private = file->f_dentry->d_inode->u.generic_ip;
+	seq->private = file->f_dentry->d_inode->i_private;
 
 	return rc;
 }
diff --git a/arch/s390/hypfs/inode.c b/arch/s390/hypfs/inode.c
index bdade5f2e325..8735024d235b 100644
--- a/arch/s390/hypfs/inode.c
+++ b/arch/s390/hypfs/inode.c
@@ -104,13 +104,13 @@ static struct inode *hypfs_make_inode(struct super_block *sb, int mode)
 
 static void hypfs_drop_inode(struct inode *inode)
 {
-	kfree(inode->u.generic_ip);
+	kfree(inode->i_private);
 	generic_delete_inode(inode);
 }
 
 static int hypfs_open(struct inode *inode, struct file *filp)
 {
-	char *data = filp->f_dentry->d_inode->u.generic_ip;
+	char *data = filp->f_dentry->d_inode->i_private;
 	struct hypfs_sb_info *fs_info;
 
 	if (filp->f_mode & FMODE_WRITE) {
@@ -352,7 +352,7 @@ static struct dentry *hypfs_create_file(struct super_block *sb,
 		parent->d_inode->i_nlink++;
 	} else
 		BUG();
-	inode->u.generic_ip = data;
+	inode->i_private = data;
 	d_instantiate(dentry, inode);
 	dget(dentry);
 	return dentry;
diff --git a/arch/s390/kernel/debug.c b/arch/s390/kernel/debug.c
index 7ba20922a535..43f3d0c7e132 100644
--- a/arch/s390/kernel/debug.c
+++ b/arch/s390/kernel/debug.c
@@ -603,7 +603,7 @@ debug_open(struct inode *inode, struct file *file)
 	debug_info_t *debug_info, *debug_info_snapshot;
 
 	down(&debug_lock);
-	debug_info = (struct debug_info*)file->f_dentry->d_inode->u.generic_ip;
+	debug_info = file->f_dentry->d_inode->i_private;
 	/* find debug view */
 	for (i = 0; i < DEBUG_MAX_VIEWS; i++) {
 		if (!debug_info->views[i])
diff --git a/block/blktrace.c b/block/blktrace.c
index 265f7a830619..2b4ef2b89b8d 100644
--- a/block/blktrace.c
+++ b/block/blktrace.c
@@ -217,7 +217,7 @@ static int blk_trace_remove(request_queue_t *q)
 
 static int blk_dropped_open(struct inode *inode, struct file *filp)
 {
-	filp->private_data = inode->u.generic_ip;
+	filp->private_data = inode->i_private;
 
 	return 0;
 }
diff --git a/drivers/i2c/chips/tps65010.c b/drivers/i2c/chips/tps65010.c
index 0be6fd6a267d..6a7578217177 100644
--- a/drivers/i2c/chips/tps65010.c
+++ b/drivers/i2c/chips/tps65010.c
@@ -305,7 +305,7 @@ static int dbg_show(struct seq_file *s, void *_)
 
 static int dbg_tps_open(struct inode *inode, struct file *file)
 {
-	return single_open(file, dbg_show, inode->u.generic_ip);
+	return single_open(file, dbg_show, inode->i_private);
 }
 
 static struct file_operations debug_fops = {
diff --git a/drivers/infiniband/hw/ipath/ipath_fs.c b/drivers/infiniband/hw/ipath/ipath_fs.c
index a5eb30a06a5c..055cdd089b28 100644
--- a/drivers/infiniband/hw/ipath/ipath_fs.c
+++ b/drivers/infiniband/hw/ipath/ipath_fs.c
@@ -64,7 +64,7 @@ static int ipathfs_mknod(struct inode *dir, struct dentry *dentry,
 	inode->i_blksize = PAGE_CACHE_SIZE;
 	inode->i_blocks = 0;
 	inode->i_atime = inode->i_mtime = inode->i_ctime = CURRENT_TIME;
-	inode->u.generic_ip = data;
+	inode->i_private = data;
 	if ((mode & S_IFMT) == S_IFDIR) {
 		inode->i_op = &simple_dir_inode_operations;
 		inode->i_nlink++;
@@ -119,7 +119,7 @@ static ssize_t atomic_counters_read(struct file *file, char __user *buf,
 	u16 i;
 	struct ipath_devdata *dd;
 
-	dd = file->f_dentry->d_inode->u.generic_ip;
+	dd = file->f_dentry->d_inode->i_private;
 
 	for (i = 0; i < NUM_COUNTERS; i++)
 		counters[i] = ipath_snap_cntr(dd, i);
@@ -139,7 +139,7 @@ static ssize_t atomic_node_info_read(struct file *file, char __user *buf,
 	struct ipath_devdata *dd;
 	u64 guid;
 
-	dd = file->f_dentry->d_inode->u.generic_ip;
+	dd = file->f_dentry->d_inode->i_private;
 
 	guid = be64_to_cpu(dd->ipath_guid);
 
@@ -178,7 +178,7 @@ static ssize_t atomic_port_info_read(struct file *file, char __user *buf,
 	u32 tmp, tmp2;
 	struct ipath_devdata *dd;
 
-	dd = file->f_dentry->d_inode->u.generic_ip;
+	dd = file->f_dentry->d_inode->i_private;
 
 	/* so we only initialize non-zero fields. */
 	memset(portinfo, 0, sizeof portinfo);
@@ -325,7 +325,7 @@ static ssize_t flash_read(struct file *file, char __user *buf,
 		goto bail;
 	}
 
-	dd = file->f_dentry->d_inode->u.generic_ip;
+	dd = file->f_dentry->d_inode->i_private;
 	if (ipath_eeprom_read(dd, pos, tmp, count)) {
 		ipath_dev_err(dd, "failed to read from flash\n");
 		ret = -ENXIO;
@@ -381,7 +381,7 @@ static ssize_t flash_write(struct file *file, const char __user *buf,
 		goto bail_tmp;
 	}
 
-	dd = file->f_dentry->d_inode->u.generic_ip;
+	dd = file->f_dentry->d_inode->i_private;
 	if (ipath_eeprom_write(dd, pos, tmp, count)) {
 		ret = -ENXIO;
 		ipath_dev_err(dd, "failed to write to flash\n");
diff --git a/drivers/infiniband/ulp/ipoib/ipoib_fs.c b/drivers/infiniband/ulp/ipoib/ipoib_fs.c
index 5dde380e8dbe..f1cb83688b31 100644
--- a/drivers/infiniband/ulp/ipoib/ipoib_fs.c
+++ b/drivers/infiniband/ulp/ipoib/ipoib_fs.c
@@ -141,7 +141,7 @@ static int ipoib_mcg_open(struct inode *inode, struct file *file)
 		return ret;
 
 	seq = file->private_data;
-	seq->private = inode->u.generic_ip;
+	seq->private = inode->i_private;
 
 	return 0;
 }
@@ -247,7 +247,7 @@ static int ipoib_path_open(struct inode *inode, struct file *file)
 		return ret;
 
 	seq = file->private_data;
-	seq->private = inode->u.generic_ip;
+	seq->private = inode->i_private;
 
 	return 0;
 }
diff --git a/drivers/misc/ibmasm/ibmasmfs.c b/drivers/misc/ibmasm/ibmasmfs.c
index 4a35caff5d02..0e909b617226 100644
--- a/drivers/misc/ibmasm/ibmasmfs.c
+++ b/drivers/misc/ibmasm/ibmasmfs.c
@@ -175,7 +175,7 @@ static struct dentry *ibmasmfs_create_file (struct super_block *sb,
 	}
 
 	inode->i_fop = fops;
-	inode->u.generic_ip = data;
+	inode->i_private = data;
 
 	d_add(dentry, inode);
 	return dentry;
@@ -244,7 +244,7 @@ static int command_file_open(struct inode *inode, struct file *file)
 {
 	struct ibmasmfs_command_data *command_data;
 
-	if (!inode->u.generic_ip)
+	if (!inode->i_private)
 		return -ENODEV;
 
 	command_data = kmalloc(sizeof(struct ibmasmfs_command_data), GFP_KERNEL);
@@ -252,7 +252,7 @@ static int command_file_open(struct inode *inode, struct file *file)
 		return -ENOMEM;
 
 	command_data->command = NULL;
-	command_data->sp = inode->u.generic_ip;
+	command_data->sp = inode->i_private;
 	file->private_data = command_data;
 	return 0;
 }
@@ -351,10 +351,10 @@ static int event_file_open(struct inode *inode, struct file *file)
 	struct ibmasmfs_event_data *event_data;
 	struct service_processor *sp; 
 
-	if (!inode->u.generic_ip)
+	if (!inode->i_private)
 		return -ENODEV;
 
-	sp = inode->u.generic_ip;
+	sp = inode->i_private;
 
 	event_data = kmalloc(sizeof(struct ibmasmfs_event_data), GFP_KERNEL);
 	if (!event_data)
@@ -439,14 +439,14 @@ static int r_heartbeat_file_open(struct inode *inode, struct file *file)
 {
 	struct ibmasmfs_heartbeat_data *rhbeat;
 
-	if (!inode->u.generic_ip)
+	if (!inode->i_private)
 		return -ENODEV;
 
 	rhbeat = kmalloc(sizeof(struct ibmasmfs_heartbeat_data), GFP_KERNEL);
 	if (!rhbeat)
 		return -ENOMEM;
 
-	rhbeat->sp = (struct service_processor *)inode->u.generic_ip;
+	rhbeat->sp = inode->i_private;
 	rhbeat->active = 0;
 	ibmasm_init_reverse_heartbeat(rhbeat->sp, &rhbeat->heartbeat);
 	file->private_data = rhbeat;
@@ -508,7 +508,7 @@ static ssize_t r_heartbeat_file_write(struct file *file, const char __user *buf,
 
 static int remote_settings_file_open(struct inode *inode, struct file *file)
 {
-	file->private_data = inode->u.generic_ip;
+	file->private_data = inode->i_private;
 	return 0;
 }
 
diff --git a/drivers/net/irda/vlsi_ir.h b/drivers/net/irda/vlsi_ir.h
index a82a4ba8de4f..c37f0bc4c7f9 100644
--- a/drivers/net/irda/vlsi_ir.h
+++ b/drivers/net/irda/vlsi_ir.h
@@ -58,7 +58,7 @@ typedef void irqreturn_t;
 
 /* PDE() introduced in 2.5.4 */
 #ifdef CONFIG_PROC_FS
-#define PDE(inode) ((inode)->u.generic_ip)
+#define PDE(inode) ((inode)->i_private)
 #endif
 
 /* irda crc16 calculation exported in 2.5.42 */
diff --git a/drivers/net/wireless/bcm43xx/bcm43xx_debugfs.c b/drivers/net/wireless/bcm43xx/bcm43xx_debugfs.c
index 923275ea0789..b9df06a06ea9 100644
--- a/drivers/net/wireless/bcm43xx/bcm43xx_debugfs.c
+++ b/drivers/net/wireless/bcm43xx/bcm43xx_debugfs.c
@@ -54,7 +54,7 @@ static ssize_t write_file_dummy(struct file *file, const char __user *buf,
 
 static int open_file_generic(struct inode *inode, struct file *file)
 {
-	file->private_data = inode->u.generic_ip;
+	file->private_data = inode->i_private;
 	return 0;
 }
 
diff --git a/drivers/oprofile/oprofilefs.c b/drivers/oprofile/oprofilefs.c
index 71c2da277d6e..deb37354785b 100644
--- a/drivers/oprofile/oprofilefs.c
+++ b/drivers/oprofile/oprofilefs.c
@@ -110,8 +110,8 @@ static ssize_t ulong_write_file(struct file * file, char const __user * buf, siz
 
 static int default_open(struct inode * inode, struct file * filp)
 {
-	if (inode->u.generic_ip)
-		filp->private_data = inode->u.generic_ip;
+	if (inode->i_private)
+		filp->private_data = inode->i_private;
 	return 0;
 }
 
@@ -158,7 +158,7 @@ int oprofilefs_create_ulong(struct super_block * sb, struct dentry * root,
 	if (!d)
 		return -EFAULT;
 
-	d->d_inode->u.generic_ip = val;
+	d->d_inode->i_private = val;
 	return 0;
 }
 
@@ -171,7 +171,7 @@ int oprofilefs_create_ro_ulong(struct super_block * sb, struct dentry * root,
 	if (!d)
 		return -EFAULT;
 
-	d->d_inode->u.generic_ip = val;
+	d->d_inode->i_private = val;
 	return 0;
 }
 
@@ -197,7 +197,7 @@ int oprofilefs_create_ro_atomic(struct super_block * sb, struct dentry * root,
 	if (!d)
 		return -EFAULT;
 
-	d->d_inode->u.generic_ip = val;
+	d->d_inode->i_private = val;
 	return 0;
 }
 
diff --git a/drivers/pci/hotplug/cpqphp_sysfs.c b/drivers/pci/hotplug/cpqphp_sysfs.c
index 8b3da007e859..5bab666cd67e 100644
--- a/drivers/pci/hotplug/cpqphp_sysfs.c
+++ b/drivers/pci/hotplug/cpqphp_sysfs.c
@@ -140,7 +140,7 @@ struct ctrl_dbg {
 
 static int open(struct inode *inode, struct file *file)
 {
-	struct controller *ctrl = inode->u.generic_ip;
+	struct controller *ctrl = inode->i_private;
 	struct ctrl_dbg *dbg;
 	int retval = -ENOMEM;
 
diff --git a/drivers/usb/core/devio.c b/drivers/usb/core/devio.c
index 218621b9958e..32e03000420c 100644
--- a/drivers/usb/core/devio.c
+++ b/drivers/usb/core/devio.c
@@ -555,7 +555,7 @@ static int usbdev_open(struct inode *inode, struct file *file)
 	if (imajor(inode) == USB_DEVICE_MAJOR)
 		dev = usbdev_lookup_minor(iminor(inode));
 	if (!dev)
-		dev = inode->u.generic_ip;
+		dev = inode->i_private;
 	if (!dev) {
 		kfree(ps);
 		goto out;
diff --git a/drivers/usb/core/inode.c b/drivers/usb/core/inode.c
index 3182c2224ba2..482f253085e5 100644
--- a/drivers/usb/core/inode.c
+++ b/drivers/usb/core/inode.c
@@ -402,8 +402,8 @@ static loff_t default_file_lseek (struct file *file, loff_t offset, int orig)
 
 static int default_open (struct inode *inode, struct file *file)
 {
-	if (inode->u.generic_ip)
-		file->private_data = inode->u.generic_ip;
+	if (inode->i_private)
+		file->private_data = inode->i_private;
 
 	return 0;
 }
@@ -509,7 +509,7 @@ static struct dentry *fs_create_file (const char *name, mode_t mode,
 	} else {
 		if (dentry->d_inode) {
 			if (data)
-				dentry->d_inode->u.generic_ip = data;
+				dentry->d_inode->i_private = data;
 			if (fops)
 				dentry->d_inode->i_fop = fops;
 			dentry->d_inode->i_uid = uid;
diff --git a/drivers/usb/gadget/inode.c b/drivers/usb/gadget/inode.c
index 3bdc5e3ba234..ffaa8c1afad8 100644
--- a/drivers/usb/gadget/inode.c
+++ b/drivers/usb/gadget/inode.c
@@ -844,7 +844,7 @@ fail1:
 static int
 ep_open (struct inode *inode, struct file *fd)
 {
-	struct ep_data		*data = inode->u.generic_ip;
+	struct ep_data		*data = inode->i_private;
 	int			value = -EBUSY;
 
 	if (down_interruptible (&data->lock) != 0)
@@ -1909,7 +1909,7 @@ fail:
 static int
 dev_open (struct inode *inode, struct file *fd)
 {
-	struct dev_data		*dev = inode->u.generic_ip;
+	struct dev_data		*dev = inode->i_private;
 	int			value = -EBUSY;
 
 	if (dev->state == STATE_DEV_DISABLED) {
@@ -1970,7 +1970,7 @@ gadgetfs_make_inode (struct super_block *sb,
 		inode->i_blocks = 0;
 		inode->i_atime = inode->i_mtime = inode->i_ctime
 				= CURRENT_TIME;
-		inode->u.generic_ip = data;
+		inode->i_private = data;
 		inode->i_fop = fops;
 	}
 	return inode;
diff --git a/drivers/usb/host/isp116x-hcd.c b/drivers/usb/host/isp116x-hcd.c
index 5147ed4a6662..8c6b38a0b5bb 100644
--- a/drivers/usb/host/isp116x-hcd.c
+++ b/drivers/usb/host/isp116x-hcd.c
@@ -1204,7 +1204,7 @@ static int isp116x_show_dbg(struct seq_file *s, void *unused)
 
 static int isp116x_open_seq(struct inode *inode, struct file *file)
 {
-	return single_open(file, isp116x_show_dbg, inode->u.generic_ip);
+	return single_open(file, isp116x_show_dbg, inode->i_private);
 }
 
 static struct file_operations isp116x_debug_fops = {
diff --git a/drivers/usb/host/uhci-debug.c b/drivers/usb/host/uhci-debug.c
index dc286a48cafd..d1372cb27f33 100644
--- a/drivers/usb/host/uhci-debug.c
+++ b/drivers/usb/host/uhci-debug.c
@@ -428,7 +428,7 @@ struct uhci_debug {
 
 static int uhci_debug_open(struct inode *inode, struct file *file)
 {
-	struct uhci_hcd *uhci = inode->u.generic_ip;
+	struct uhci_hcd *uhci = inode->i_private;
 	struct uhci_debug *up;
 	int ret = -ENOMEM;
 	unsigned long flags;
diff --git a/drivers/usb/mon/mon_stat.c b/drivers/usb/mon/mon_stat.c
index 1fe01d994a79..86ad2b381c4b 100644
--- a/drivers/usb/mon/mon_stat.c
+++ b/drivers/usb/mon/mon_stat.c
@@ -28,7 +28,7 @@ static int mon_stat_open(struct inode *inode, struct file *file)
 	if ((sp = kmalloc(sizeof(struct snap), GFP_KERNEL)) == NULL)
 		return -ENOMEM;
 
-	mbus = inode->u.generic_ip;
+	mbus = inode->i_private;
 
 	sp->slen = snprintf(sp->str, STAT_BUF_SIZE,
 	    "nreaders %d events %u text_lost %u\n",
diff --git a/drivers/usb/mon/mon_text.c b/drivers/usb/mon/mon_text.c
index f961a770cee2..2fd39b4fa166 100644
--- a/drivers/usb/mon/mon_text.c
+++ b/drivers/usb/mon/mon_text.c
@@ -238,7 +238,7 @@ static int mon_text_open(struct inode *inode, struct file *file)
 	int rc;
 
 	mutex_lock(&mon_lock);
-	mbus = inode->u.generic_ip;
+	mbus = inode->i_private;
 	ubus = mbus->u_bus;
 
 	rp = kzalloc(sizeof(struct mon_reader_text), GFP_KERNEL);
@@ -401,7 +401,7 @@ static int mon_text_release(struct inode *inode, struct file *file)
 	struct mon_event_text *ep;
 
 	mutex_lock(&mon_lock);
-	mbus = inode->u.generic_ip;
+	mbus = inode->i_private;
 
 	if (mbus->nreaders <= 0) {
 		printk(KERN_ERR TAG ": consistency error on close\n");
diff --git a/fs/autofs/inode.c b/fs/autofs/inode.c
index c81d6b8c2828..d39d2acd9b38 100644
--- a/fs/autofs/inode.c
+++ b/fs/autofs/inode.c
@@ -241,7 +241,7 @@ static void autofs_read_inode(struct inode *inode)
 		
 		inode->i_op = &autofs_symlink_inode_operations;
 		sl = &sbi->symlink[n];
-		inode->u.generic_ip = sl;
+		inode->i_private = sl;
 		inode->i_mode = S_IFLNK | S_IRWXUGO;
 		inode->i_mtime.tv_sec = inode->i_ctime.tv_sec = sl->mtime;
 		inode->i_mtime.tv_nsec = inode->i_ctime.tv_nsec = 0;
diff --git a/fs/autofs/symlink.c b/fs/autofs/symlink.c
index 52e8772b066e..c74f2eb65775 100644
--- a/fs/autofs/symlink.c
+++ b/fs/autofs/symlink.c
@@ -15,7 +15,7 @@
 /* Nothing to release.. */
 static void *autofs_follow_link(struct dentry *dentry, struct nameidata *nd)
 {
-	char *s=((struct autofs_symlink *)dentry->d_inode->u.generic_ip)->data;
+	char *s=((struct autofs_symlink *)dentry->d_inode->i_private)->data;
 	nd_set_link(nd, s);
 	return NULL;
 }
diff --git a/fs/binfmt_misc.c b/fs/binfmt_misc.c
index 34ebbc191e46..6759b9839ce8 100644
--- a/fs/binfmt_misc.c
+++ b/fs/binfmt_misc.c
@@ -517,7 +517,7 @@ static struct inode *bm_get_inode(struct super_block *sb, int mode)
 
 static void bm_clear_inode(struct inode *inode)
 {
-	kfree(inode->u.generic_ip);
+	kfree(inode->i_private);
 }
 
 static void kill_node(Node *e)
@@ -545,7 +545,7 @@ static void kill_node(Node *e)
 static ssize_t
 bm_entry_read(struct file * file, char __user * buf, size_t nbytes, loff_t *ppos)
 {
-	Node *e = file->f_dentry->d_inode->u.generic_ip;
+	Node *e = file->f_dentry->d_inode->i_private;
 	loff_t pos = *ppos;
 	ssize_t res;
 	char *page;
@@ -579,7 +579,7 @@ static ssize_t bm_entry_write(struct file *file, const char __user *buffer,
 				size_t count, loff_t *ppos)
 {
 	struct dentry *root;
-	Node *e = file->f_dentry->d_inode->u.generic_ip;
+	Node *e = file->f_dentry->d_inode->i_private;
 	int res = parse_command(buffer, count);
 
 	switch (res) {
@@ -646,7 +646,7 @@ static ssize_t bm_register_write(struct file *file, const char __user *buffer,
 	}
 
 	e->dentry = dget(dentry);
-	inode->u.generic_ip = e;
+	inode->i_private = e;
 	inode->i_fop = &bm_entry_operations;
 
 	d_instantiate(dentry, inode);
diff --git a/fs/debugfs/file.c b/fs/debugfs/file.c
index e4b430552c88..bf3901ab1744 100644
--- a/fs/debugfs/file.c
+++ b/fs/debugfs/file.c
@@ -32,8 +32,8 @@ static ssize_t default_write_file(struct file *file, const char __user *buf,
 
 static int default_open(struct inode *inode, struct file *file)
 {
-	if (inode->u.generic_ip)
-		file->private_data = inode->u.generic_ip;
+	if (inode->i_private)
+		file->private_data = inode->i_private;
 
 	return 0;
 }
diff --git a/fs/debugfs/inode.c b/fs/debugfs/inode.c
index 3ca268d2e5a2..717f4821ed02 100644
--- a/fs/debugfs/inode.c
+++ b/fs/debugfs/inode.c
@@ -168,7 +168,7 @@ static int debugfs_create_by_name(const char *name, mode_t mode,
  *          directory dentry if set.  If this paramater is NULL, then the
  *          file will be created in the root of the debugfs filesystem.
  * @data: a pointer to something that the caller will want to get to later
- *        on.  The inode.u.generic_ip pointer will point to this value on
+ *        on.  The inode.i_private pointer will point to this value on
  *        the open() call.
  * @fops: a pointer to a struct file_operations that should be used for
  *        this file.
@@ -209,7 +209,7 @@ struct dentry *debugfs_create_file(const char *name, mode_t mode,
 
 	if (dentry->d_inode) {
 		if (data)
-			dentry->d_inode->u.generic_ip = data;
+			dentry->d_inode->i_private = data;
 		if (fops)
 			dentry->d_inode->i_fop = fops;
 	}
diff --git a/fs/devpts/inode.c b/fs/devpts/inode.c
index f7aef5bb584a..5bf06a10dddf 100644
--- a/fs/devpts/inode.c
+++ b/fs/devpts/inode.c
@@ -177,7 +177,7 @@ int devpts_pty_new(struct tty_struct *tty)
 	inode->i_gid = config.setgid ? config.gid : current->fsgid;
 	inode->i_mtime = inode->i_atime = inode->i_ctime = CURRENT_TIME;
 	init_special_inode(inode, S_IFCHR|config.mode, device);
-	inode->u.generic_ip = tty;
+	inode->i_private = tty;
 
 	dentry = get_node(number);
 	if (!IS_ERR(dentry) && !dentry->d_inode)
@@ -196,7 +196,7 @@ struct tty_struct *devpts_get_tty(int number)
 	tty = NULL;
 	if (!IS_ERR(dentry)) {
 		if (dentry->d_inode)
-			tty = dentry->d_inode->u.generic_ip;
+			tty = dentry->d_inode->i_private;
 		dput(dentry);
 	}
 
diff --git a/fs/freevxfs/vxfs.h b/fs/freevxfs/vxfs.h
index d35979a58743..c8a92652612a 100644
--- a/fs/freevxfs/vxfs.h
+++ b/fs/freevxfs/vxfs.h
@@ -252,7 +252,7 @@ enum {
  * Get filesystem private data from VFS inode.
  */
 #define VXFS_INO(ip) \
-	((struct vxfs_inode_info *)(ip)->u.generic_ip)
+	((struct vxfs_inode_info *)(ip)->i_private)
 
 /*
  * Get filesystem private data from VFS superblock.
diff --git a/fs/freevxfs/vxfs_inode.c b/fs/freevxfs/vxfs_inode.c
index ca6a39714771..32a82ed108e4 100644
--- a/fs/freevxfs/vxfs_inode.c
+++ b/fs/freevxfs/vxfs_inode.c
@@ -243,7 +243,7 @@ vxfs_iinit(struct inode *ip, struct vxfs_inode_info *vip)
 	ip->i_blocks = vip->vii_blocks;
 	ip->i_generation = vip->vii_gen;
 
-	ip->u.generic_ip = (void *)vip;
+	ip->i_private = vip;
 	
 }
 
@@ -338,5 +338,5 @@ vxfs_read_inode(struct inode *ip)
 void
 vxfs_clear_inode(struct inode *ip)
 {
-	kmem_cache_free(vxfs_inode_cachep, ip->u.generic_ip);
+	kmem_cache_free(vxfs_inode_cachep, ip->i_private);
 }
diff --git a/fs/fuse/control.c b/fs/fuse/control.c
index 46fe60b2da23..79ec1f23d4d2 100644
--- a/fs/fuse/control.c
+++ b/fs/fuse/control.c
@@ -23,7 +23,7 @@ static struct fuse_conn *fuse_ctl_file_conn_get(struct file *file)
 {
 	struct fuse_conn *fc;
 	mutex_lock(&fuse_mutex);
-	fc = file->f_dentry->d_inode->u.generic_ip;
+	fc = file->f_dentry->d_inode->i_private;
 	if (fc)
 		fc = fuse_conn_get(fc);
 	mutex_unlock(&fuse_mutex);
@@ -98,7 +98,7 @@ static struct dentry *fuse_ctl_add_dentry(struct dentry *parent,
 		inode->i_op = iop;
 	inode->i_fop = fop;
 	inode->i_nlink = nlink;
-	inode->u.generic_ip = fc;
+	inode->i_private = fc;
 	d_add(dentry, inode);
 	return dentry;
 }
@@ -150,7 +150,7 @@ void fuse_ctl_remove_conn(struct fuse_conn *fc)
 
 	for (i = fc->ctl_ndents - 1; i >= 0; i--) {
 		struct dentry *dentry = fc->ctl_dentry[i];
-		dentry->d_inode->u.generic_ip = NULL;
+		dentry->d_inode->i_private = NULL;
 		d_drop(dentry);
 		dput(dentry);
 	}
diff --git a/fs/inode.c b/fs/inode.c
index 0bf9f0444a96..77e254792025 100644
--- a/fs/inode.c
+++ b/fs/inode.c
@@ -163,7 +163,7 @@ static struct inode *alloc_inode(struct super_block *sb)
 				bdi = sb->s_bdev->bd_inode->i_mapping->backing_dev_info;
 			mapping->backing_dev_info = bdi;
 		}
-		memset(&inode->u, 0, sizeof(inode->u));
+		inode->i_private = 0;
 		inode->i_mapping = mapping;
 	}
 	return inode;
diff --git a/fs/jffs/inode-v23.c b/fs/jffs/inode-v23.c
index b59553d28d13..7358ef87f16b 100644
--- a/fs/jffs/inode-v23.c
+++ b/fs/jffs/inode-v23.c
@@ -369,7 +369,7 @@ jffs_new_inode(const struct inode * dir, struct jffs_raw_inode *raw_inode,
 
 	f = jffs_find_file(c, raw_inode->ino);
 
-	inode->u.generic_ip = (void *)f;
+	inode->i_private = (void *)f;
 	insert_inode_hash(inode);
 
 	return inode;
@@ -442,7 +442,7 @@ jffs_rename(struct inode *old_dir, struct dentry *old_dentry,
 	});
 
 	result = -ENOTDIR;
-	if (!(old_dir_f = (struct jffs_file *)old_dir->u.generic_ip)) {
+	if (!(old_dir_f = old_dir->i_private)) {
 		D(printk("jffs_rename(): Old dir invalid.\n"));
 		goto jffs_rename_end;
 	}
@@ -456,7 +456,7 @@ jffs_rename(struct inode *old_dir, struct dentry *old_dentry,
 
 	/* Find the new directory.  */
 	result = -ENOTDIR;
-	if (!(new_dir_f = (struct jffs_file *)new_dir->u.generic_ip)) {
+	if (!(new_dir_f = new_dir->i_private)) {
 		D(printk("jffs_rename(): New dir invalid.\n"));
 		goto jffs_rename_end;
 	}
@@ -593,7 +593,7 @@ jffs_readdir(struct file *filp, void *dirent, filldir_t filldir)
 		}
 		else {
 			ddino = ((struct jffs_file *)
-				 inode->u.generic_ip)->pino;
+				 inode->i_private)->pino;
 		}
 		D3(printk("jffs_readdir(): \"..\" %u\n", ddino));
 		if (filldir(dirent, "..", 2, filp->f_pos, ddino, DT_DIR) < 0) {
@@ -604,7 +604,7 @@ jffs_readdir(struct file *filp, void *dirent, filldir_t filldir)
 		}
 		filp->f_pos++;
 	}
-	f = ((struct jffs_file *)inode->u.generic_ip)->children;
+	f = ((struct jffs_file *)inode->i_private)->children;
 
 	j = 2;
 	while(f && (f->deleted || j++ < filp->f_pos )) {
@@ -668,7 +668,7 @@ jffs_lookup(struct inode *dir, struct dentry *dentry, struct nameidata *nd)
 	}
 
 	r = -EACCES;
-	if (!(d = (struct jffs_file *)dir->u.generic_ip)) {
+	if (!(d = (struct jffs_file *)dir->i_private)) {
 		D(printk("jffs_lookup(): No such inode! (%lu)\n",
 			 dir->i_ino));
 		goto jffs_lookup_end;
@@ -739,7 +739,7 @@ jffs_do_readpage_nolock(struct file *file, struct page *page)
 	unsigned long read_len;
 	int result;
 	struct inode *inode = (struct inode*)page->mapping->host;
-	struct jffs_file *f = (struct jffs_file *)inode->u.generic_ip;
+	struct jffs_file *f = (struct jffs_file *)inode->i_private;
 	struct jffs_control *c = (struct jffs_control *)inode->i_sb->s_fs_info;
 	int r;
 	loff_t offset;
@@ -828,7 +828,7 @@ jffs_mkdir(struct inode *dir, struct dentry *dentry, int mode)
 	});
 
 	lock_kernel();
-	dir_f = (struct jffs_file *)dir->u.generic_ip;
+	dir_f = dir->i_private;
 
 	ASSERT(if (!dir_f) {
 		printk(KERN_ERR "jffs_mkdir(): No reference to a "
@@ -972,7 +972,7 @@ jffs_remove(struct inode *dir, struct dentry *dentry, int type)
 		kfree(_name);
 	});
 
-	dir_f = (struct jffs_file *) dir->u.generic_ip;
+	dir_f = dir->i_private;
 	c = dir_f->c;
 
 	result = -ENOENT;
@@ -1082,7 +1082,7 @@ jffs_mknod(struct inode *dir, struct dentry *dentry, int mode, dev_t rdev)
 	if (!old_valid_dev(rdev))
 		return -EINVAL;
 	lock_kernel();
-	dir_f = (struct jffs_file *)dir->u.generic_ip;
+	dir_f = dir->i_private;
 	c = dir_f->c;
 
 	D3(printk (KERN_NOTICE "mknod(): down biglock\n"));
@@ -1186,7 +1186,7 @@ jffs_symlink(struct inode *dir, struct dentry *dentry, const char *symname)
 		kfree(_symname);
 	});
 
-	dir_f = (struct jffs_file *)dir->u.generic_ip;
+	dir_f = dir->i_private;
 	ASSERT(if (!dir_f) {
 		printk(KERN_ERR "jffs_symlink(): No reference to a "
 		       "jffs_file struct in inode.\n");
@@ -1289,7 +1289,7 @@ jffs_create(struct inode *dir, struct dentry *dentry, int mode,
 		kfree(s);
 	});
 
-	dir_f = (struct jffs_file *)dir->u.generic_ip;
+	dir_f = dir->i_private;
 	ASSERT(if (!dir_f) {
 		printk(KERN_ERR "jffs_create(): No reference to a "
 		       "jffs_file struct in inode.\n");
@@ -1403,9 +1403,9 @@ jffs_file_write(struct file *filp, const char *buf, size_t count,
 		goto out_isem;
 	}
 
-	if (!(f = (struct jffs_file *)inode->u.generic_ip)) {
-		D(printk("jffs_file_write(): inode->u.generic_ip = 0x%p\n",
-				inode->u.generic_ip));
+	if (!(f = inode->i_private)) {
+		D(printk("jffs_file_write(): inode->i_private = 0x%p\n",
+				inode->i_private));
 		goto out_isem;
 	}
 
@@ -1693,7 +1693,7 @@ jffs_read_inode(struct inode *inode)
 		mutex_unlock(&c->fmc->biglock);
 		return;
 	}
-	inode->u.generic_ip = (void *)f;
+	inode->i_private = f;
 	inode->i_mode = f->mode;
 	inode->i_nlink = f->nlink;
 	inode->i_uid = f->uid;
@@ -1748,7 +1748,7 @@ jffs_delete_inode(struct inode *inode)
 	lock_kernel();
 	inode->i_size = 0;
 	inode->i_blocks = 0;
-	inode->u.generic_ip = NULL;
+	inode->i_private = NULL;
 	clear_inode(inode);
 	if (inode->i_nlink == 0) {
 		c = (struct jffs_control *) inode->i_sb->s_fs_info;
diff --git a/fs/libfs.c b/fs/libfs.c
index ac02ea602c3d..2751793beeaa 100644
--- a/fs/libfs.c
+++ b/fs/libfs.c
@@ -547,7 +547,7 @@ int simple_attr_open(struct inode *inode, struct file *file,
 
 	attr->get = get;
 	attr->set = set;
-	attr->data = inode->u.generic_ip;
+	attr->data = inode->i_private;
 	attr->fmt = fmt;
 	mutex_init(&attr->mutex);
 
diff --git a/fs/ocfs2/dlmglue.c b/fs/ocfs2/dlmglue.c
index de887063dcfc..8801e41afe80 100644
--- a/fs/ocfs2/dlmglue.c
+++ b/fs/ocfs2/dlmglue.c
@@ -2052,7 +2052,7 @@ static int ocfs2_dlm_debug_open(struct inode *inode, struct file *file)
 		mlog_errno(ret);
 		goto out;
 	}
-	osb = (struct ocfs2_super *) inode->u.generic_ip;
+	osb = inode->i_private;
 	ocfs2_get_dlm_debug(osb->osb_dlm_debug);
 	priv->p_dlm_debug = osb->osb_dlm_debug;
 	INIT_LIST_HEAD(&priv->p_iter_res.l_debug_list);
diff --git a/include/linux/fs.h b/include/linux/fs.h
index 1d3e601ece73..4f77ec9c3353 100644
--- a/include/linux/fs.h
+++ b/include/linux/fs.h
@@ -554,9 +554,7 @@ struct inode {
 
 	atomic_t		i_writecount;
 	void			*i_security;
-	union {
-		void		*generic_ip;
-	} u;
+	void			*i_private; /* fs or device private pointer */
 #ifdef __NEED_I_SIZE_ORDERED
 	seqcount_t		i_size_seqcount;
 #endif
diff --git a/kernel/relay.c b/kernel/relay.c
index 33345e73485c..85786ff2a4f9 100644
--- a/kernel/relay.c
+++ b/kernel/relay.c
@@ -669,7 +669,7 @@ EXPORT_SYMBOL_GPL(relay_flush);
  */
 static int relay_file_open(struct inode *inode, struct file *filp)
 {
-	struct rchan_buf *buf = inode->u.generic_ip;
+	struct rchan_buf *buf = inode->i_private;
 	kref_get(&buf->kref);
 	filp->private_data = buf;
 
diff --git a/security/inode.c b/security/inode.c
index 47eb63480dac..176aacea8ca4 100644
--- a/security/inode.c
+++ b/security/inode.c
@@ -44,8 +44,8 @@ static ssize_t default_write_file(struct file *file, const char __user *buf,
 
 static int default_open(struct inode *inode, struct file *file)
 {
-	if (inode->u.generic_ip)
-		file->private_data = inode->u.generic_ip;
+	if (inode->i_private)
+		file->private_data = inode->i_private;
 
 	return 0;
 }
@@ -194,7 +194,7 @@ static int create_by_name(const char *name, mode_t mode,
  *          directory dentry if set.  If this paramater is NULL, then the
  *          file will be created in the root of the securityfs filesystem.
  * @data: a pointer to something that the caller will want to get to later
- *        on.  The inode.u.generic_ip pointer will point to this value on
+ *        on.  The inode.i_private pointer will point to this value on
  *        the open() call.
  * @fops: a pointer to a struct file_operations that should be used for
  *        this file.
@@ -240,7 +240,7 @@ struct dentry *securityfs_create_file(const char *name, mode_t mode,
 		if (fops)
 			dentry->d_inode->i_fop = fops;
 		if (data)
-			dentry->d_inode->u.generic_ip = data;
+			dentry->d_inode->i_private = data;
 	}
 exit:
 	return dentry;
-- 
cgit v1.2.3


From ba52de123d454b57369f291348266d86f4b35070 Mon Sep 17 00:00:00 2001
From: Theodore Ts'o <tytso@mit.edu>
Date: Wed, 27 Sep 2006 01:50:49 -0700
Subject: [PATCH] inode-diet: Eliminate i_blksize from the inode structure

This eliminates the i_blksize field from struct inode.  Filesystems that want
to provide a per-inode st_blksize can do so by providing their own getattr
routine instead of using the generic_fillattr() function.

Note that some filesystems were providing pretty much random (and incorrect)
values for i_blksize.

[bunk@stusta.de: cleanup]
[akpm@osdl.org: generic_fillattr() fix]
Signed-off-by: "Theodore Ts'o" <tytso@mit.edu>
Signed-off-by: Adrian Bunk <bunk@stusta.de>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
---
 arch/powerpc/platforms/cell/spufs/inode.c |  1 -
 arch/s390/hypfs/inode.c                   |  1 -
 drivers/block/loop.c                      |  7 +++++--
 drivers/infiniband/hw/ipath/ipath_fs.c    |  1 -
 drivers/isdn/capi/capifs.c                |  2 --
 drivers/misc/ibmasm/ibmasmfs.c            |  1 -
 drivers/oprofile/oprofilefs.c             |  1 -
 drivers/usb/core/inode.c                  |  1 -
 drivers/usb/gadget/inode.c                |  1 -
 fs/9p/vfs_inode.c                         |  4 +---
 fs/adfs/inode.c                           |  1 -
 fs/afs/inode.c                            |  1 -
 fs/autofs/inode.c                         |  1 -
 fs/autofs4/inode.c                        |  1 -
 fs/befs/linuxvfs.c                        |  1 -
 fs/bfs/dir.c                              |  2 +-
 fs/bfs/inode.c                            |  1 -
 fs/binfmt_misc.c                          |  1 -
 fs/cifs/cifsfs.c                          |  1 -
 fs/cifs/readdir.c                         |  5 ++---
 fs/coda/coda_linux.c                      |  2 --
 fs/configfs/inode.c                       |  1 -
 fs/cramfs/inode.c                         |  1 -
 fs/debugfs/inode.c                        |  1 -
 fs/devpts/inode.c                         |  2 --
 fs/eventpoll.c                            |  1 -
 fs/ext2/ialloc.c                          |  1 -
 fs/ext2/inode.c                           |  1 -
 fs/ext3/ialloc.c                          |  1 -
 fs/ext3/inode.c                           |  3 ---
 fs/fat/inode.c                            |  3 ---
 fs/freevxfs/vxfs_inode.c                  |  1 -
 fs/fuse/inode.c                           |  1 -
 fs/hfs/inode.c                            |  2 --
 fs/hfsplus/inode.c                        |  2 --
 fs/hostfs/hostfs_kern.c                   |  1 -
 fs/hpfs/inode.c                           |  1 -
 fs/hppfs/hppfs_kern.c                     |  1 -
 fs/hugetlbfs/inode.c                      |  1 -
 fs/isofs/inode.c                          |  3 +--
 fs/jffs/inode-v23.c                       |  2 --
 fs/jffs2/fs.c                             |  2 --
 fs/jfs/jfs_extent.c                       |  2 +-
 fs/jfs/jfs_imap.c                         |  1 -
 fs/jfs/jfs_inode.c                        |  1 -
 fs/jfs/jfs_metapage.c                     |  2 +-
 fs/libfs.c                                |  2 --
 fs/minix/bitmap.c                         |  2 +-
 fs/minix/inode.c                          |  4 ++--
 fs/ncpfs/inode.c                          |  1 -
 fs/nfs/inode.c                            |  4 ----
 fs/ntfs/inode.c                           |  4 ----
 fs/ntfs/mft.c                             |  5 -----
 fs/ocfs2/dlm/dlmfs.c                      |  2 --
 fs/ocfs2/inode.c                          |  4 ----
 fs/pipe.c                                 |  1 -
 fs/qnx4/inode.c                           |  1 -
 fs/ramfs/inode.c                          |  1 -
 fs/reiserfs/inode.c                       |  4 ----
 fs/smbfs/inode.c                          |  2 --
 fs/smbfs/proc.c                           |  1 -
 fs/stat.c                                 |  3 ++-
 fs/sysfs/inode.c                          |  1 -
 fs/sysv/ialloc.c                          |  2 +-
 fs/sysv/inode.c                           |  2 +-
 fs/udf/ialloc.c                           |  1 -
 fs/udf/inode.c                            |  2 --
 fs/ufs/ialloc.c                           |  1 -
 fs/ufs/inode.c                            |  1 -
 fs/xfs/linux-2.6/xfs_super.c              |  1 -
 fs/xfs/linux-2.6/xfs_vnode.c              |  1 -
 include/linux/fs.h                        |  1 -
 include/linux/nfsd/nfsfh.h                | 10 ++--------
 include/linux/smb.h                       |  1 -
 ipc/mqueue.c                              |  1 -
 kernel/cpuset.c                           |  1 -
 mm/shmem.c                                |  1 -
 net/sunrpc/rpc_pipe.c                     |  1 -
 security/inode.c                          |  1 -
 security/selinux/selinuxfs.c              |  1 -
 80 files changed, 21 insertions(+), 125 deletions(-)

(limited to 'kernel')

diff --git a/arch/powerpc/platforms/cell/spufs/inode.c b/arch/powerpc/platforms/cell/spufs/inode.c
index b837f12a84ae..3950ddccb2c8 100644
--- a/arch/powerpc/platforms/cell/spufs/inode.c
+++ b/arch/powerpc/platforms/cell/spufs/inode.c
@@ -82,7 +82,6 @@ spufs_new_inode(struct super_block *sb, int mode)
 	inode->i_mode = mode;
 	inode->i_uid = current->fsuid;
 	inode->i_gid = current->fsgid;
-	inode->i_blksize = PAGE_CACHE_SIZE;
 	inode->i_blocks = 0;
 	inode->i_atime = inode->i_mtime = inode->i_ctime = CURRENT_TIME;
 out:
diff --git a/arch/s390/hypfs/inode.c b/arch/s390/hypfs/inode.c
index 8735024d235b..813fc21358f9 100644
--- a/arch/s390/hypfs/inode.c
+++ b/arch/s390/hypfs/inode.c
@@ -91,7 +91,6 @@ static struct inode *hypfs_make_inode(struct super_block *sb, int mode)
 		ret->i_mode = mode;
 		ret->i_uid = hypfs_info->uid;
 		ret->i_gid = hypfs_info->gid;
-		ret->i_blksize = PAGE_CACHE_SIZE;
 		ret->i_blocks = 0;
 		ret->i_atime = ret->i_mtime = ret->i_ctime = CURRENT_TIME;
 		if (mode & S_IFDIR)
diff --git a/drivers/block/loop.c b/drivers/block/loop.c
index 7b3b94ddddcc..c774121684d7 100644
--- a/drivers/block/loop.c
+++ b/drivers/block/loop.c
@@ -662,7 +662,8 @@ static void do_loop_switch(struct loop_device *lo, struct switch_request *p)
 
 	mapping_set_gfp_mask(old_file->f_mapping, lo->old_gfp_mask);
 	lo->lo_backing_file = file;
-	lo->lo_blocksize = mapping->host->i_blksize;
+	lo->lo_blocksize = S_ISBLK(mapping->host->i_mode) ?
+		mapping->host->i_bdev->bd_block_size : PAGE_SIZE;
 	lo->old_gfp_mask = mapping_gfp_mask(mapping);
 	mapping_set_gfp_mask(mapping, lo->old_gfp_mask & ~(__GFP_IO|__GFP_FS));
 	complete(&p->wait);
@@ -794,7 +795,9 @@ static int loop_set_fd(struct loop_device *lo, struct file *lo_file,
 		if (!(lo_flags & LO_FLAGS_USE_AOPS) && !file->f_op->write)
 			lo_flags |= LO_FLAGS_READ_ONLY;
 
-		lo_blocksize = inode->i_blksize;
+		lo_blocksize = S_ISBLK(inode->i_mode) ?
+			inode->i_bdev->bd_block_size : PAGE_SIZE;
+
 		error = 0;
 	} else {
 		goto out_putf;
diff --git a/drivers/infiniband/hw/ipath/ipath_fs.c b/drivers/infiniband/hw/ipath/ipath_fs.c
index 055cdd089b28..c8a8af0fe471 100644
--- a/drivers/infiniband/hw/ipath/ipath_fs.c
+++ b/drivers/infiniband/hw/ipath/ipath_fs.c
@@ -61,7 +61,6 @@ static int ipathfs_mknod(struct inode *dir, struct dentry *dentry,
 	inode->i_mode = mode;
 	inode->i_uid = 0;
 	inode->i_gid = 0;
-	inode->i_blksize = PAGE_CACHE_SIZE;
 	inode->i_blocks = 0;
 	inode->i_atime = inode->i_mtime = inode->i_ctime = CURRENT_TIME;
 	inode->i_private = data;
diff --git a/drivers/isdn/capi/capifs.c b/drivers/isdn/capi/capifs.c
index 9ea6bd0ddc35..2dd1b57b0ba4 100644
--- a/drivers/isdn/capi/capifs.c
+++ b/drivers/isdn/capi/capifs.c
@@ -104,7 +104,6 @@ capifs_fill_super(struct super_block *s, void *data, int silent)
 	inode->i_ino = 1;
 	inode->i_mtime = inode->i_atime = inode->i_ctime = CURRENT_TIME;
 	inode->i_blocks = 0;
-	inode->i_blksize = 1024;
 	inode->i_uid = inode->i_gid = 0;
 	inode->i_mode = S_IFDIR | S_IRUGO | S_IXUGO | S_IWUSR;
 	inode->i_op = &simple_dir_inode_operations;
@@ -149,7 +148,6 @@ void capifs_new_ncci(unsigned int number, dev_t device)
 	if (!inode)
 		return;
 	inode->i_ino = number+2;
-	inode->i_blksize = 1024;
 	inode->i_uid = config.setuid ? config.uid : current->fsuid;
 	inode->i_gid = config.setgid ? config.gid : current->fsgid;
 	inode->i_mtime = inode->i_atime = inode->i_ctime = CURRENT_TIME;
diff --git a/drivers/misc/ibmasm/ibmasmfs.c b/drivers/misc/ibmasm/ibmasmfs.c
index 0e909b617226..b99dc507de2e 100644
--- a/drivers/misc/ibmasm/ibmasmfs.c
+++ b/drivers/misc/ibmasm/ibmasmfs.c
@@ -147,7 +147,6 @@ static struct inode *ibmasmfs_make_inode(struct super_block *sb, int mode)
 	if (ret) {
 		ret->i_mode = mode;
 		ret->i_uid = ret->i_gid = 0;
-		ret->i_blksize = PAGE_CACHE_SIZE;
 		ret->i_blocks = 0;
 		ret->i_atime = ret->i_mtime = ret->i_ctime = CURRENT_TIME;
 	}
diff --git a/drivers/oprofile/oprofilefs.c b/drivers/oprofile/oprofilefs.c
index deb37354785b..5756401fb15b 100644
--- a/drivers/oprofile/oprofilefs.c
+++ b/drivers/oprofile/oprofilefs.c
@@ -31,7 +31,6 @@ static struct inode * oprofilefs_get_inode(struct super_block * sb, int mode)
 		inode->i_mode = mode;
 		inode->i_uid = 0;
 		inode->i_gid = 0;
-		inode->i_blksize = PAGE_CACHE_SIZE;
 		inode->i_blocks = 0;
 		inode->i_atime = inode->i_mtime = inode->i_ctime = CURRENT_TIME;
 	}
diff --git a/drivers/usb/core/inode.c b/drivers/usb/core/inode.c
index 482f253085e5..58b4b1012120 100644
--- a/drivers/usb/core/inode.c
+++ b/drivers/usb/core/inode.c
@@ -249,7 +249,6 @@ static struct inode *usbfs_get_inode (struct super_block *sb, int mode, dev_t de
 		inode->i_mode = mode;
 		inode->i_uid = current->fsuid;
 		inode->i_gid = current->fsgid;
-		inode->i_blksize = PAGE_CACHE_SIZE;
 		inode->i_blocks = 0;
 		inode->i_atime = inode->i_mtime = inode->i_ctime = CURRENT_TIME;
 		switch (mode & S_IFMT) {
diff --git a/drivers/usb/gadget/inode.c b/drivers/usb/gadget/inode.c
index ffaa8c1afad8..2a7162d89799 100644
--- a/drivers/usb/gadget/inode.c
+++ b/drivers/usb/gadget/inode.c
@@ -1966,7 +1966,6 @@ gadgetfs_make_inode (struct super_block *sb,
 		inode->i_mode = mode;
 		inode->i_uid = default_uid;
 		inode->i_gid = default_gid;
-		inode->i_blksize = PAGE_CACHE_SIZE;
 		inode->i_blocks = 0;
 		inode->i_atime = inode->i_mtime = inode->i_ctime
 				= CURRENT_TIME;
diff --git a/fs/9p/vfs_inode.c b/fs/9p/vfs_inode.c
index eae50c9d6dc4..7a7ec2d1d2f4 100644
--- a/fs/9p/vfs_inode.c
+++ b/fs/9p/vfs_inode.c
@@ -204,7 +204,6 @@ struct inode *v9fs_get_inode(struct super_block *sb, int mode)
 		inode->i_mode = mode;
 		inode->i_uid = current->fsuid;
 		inode->i_gid = current->fsgid;
-		inode->i_blksize = sb->s_blocksize;
 		inode->i_blocks = 0;
 		inode->i_rdev = 0;
 		inode->i_atime = inode->i_mtime = inode->i_ctime = CURRENT_TIME;
@@ -950,9 +949,8 @@ v9fs_stat2inode(struct v9fs_stat *stat, struct inode *inode,
 
 	inode->i_size = stat->length;
 
-	inode->i_blksize = sb->s_blocksize;
 	inode->i_blocks =
-	    (inode->i_size + inode->i_blksize - 1) >> sb->s_blocksize_bits;
+	    (inode->i_size + sb->s_blocksize - 1) >> sb->s_blocksize_bits;
 }
 
 /**
diff --git a/fs/adfs/inode.c b/fs/adfs/inode.c
index 534f3eecc985..7e7a04be1278 100644
--- a/fs/adfs/inode.c
+++ b/fs/adfs/inode.c
@@ -269,7 +269,6 @@ adfs_iget(struct super_block *sb, struct object_info *obj)
 	inode->i_ino	 = obj->file_id;
 	inode->i_size	 = obj->size;
 	inode->i_nlink	 = 2;
-	inode->i_blksize = PAGE_SIZE;
 	inode->i_blocks	 = (inode->i_size + sb->s_blocksize - 1) >>
 			    sb->s_blocksize_bits;
 
diff --git a/fs/afs/inode.c b/fs/afs/inode.c
index 4ebb30a50ed5..6f37754906c2 100644
--- a/fs/afs/inode.c
+++ b/fs/afs/inode.c
@@ -72,7 +72,6 @@ static int afs_inode_map_status(struct afs_vnode *vnode)
 	inode->i_ctime.tv_sec	= vnode->status.mtime_server;
 	inode->i_ctime.tv_nsec	= 0;
 	inode->i_atime		= inode->i_mtime = inode->i_ctime;
-	inode->i_blksize	= PAGE_CACHE_SIZE;
 	inode->i_blocks		= 0;
 	inode->i_version	= vnode->fid.unique;
 	inode->i_mapping->a_ops	= &afs_fs_aops;
diff --git a/fs/autofs/inode.c b/fs/autofs/inode.c
index d39d2acd9b38..2c9759baad61 100644
--- a/fs/autofs/inode.c
+++ b/fs/autofs/inode.c
@@ -216,7 +216,6 @@ static void autofs_read_inode(struct inode *inode)
 	inode->i_nlink = 2;
 	inode->i_mtime = inode->i_atime = inode->i_ctime = CURRENT_TIME;
 	inode->i_blocks = 0;
-	inode->i_blksize = 1024;
 
 	if ( ino == AUTOFS_ROOT_INO ) {
 		inode->i_mode = S_IFDIR | S_IRUGO | S_IXUGO | S_IWUSR;
diff --git a/fs/autofs4/inode.c b/fs/autofs4/inode.c
index 11a6a9ae51b7..800ce876caec 100644
--- a/fs/autofs4/inode.c
+++ b/fs/autofs4/inode.c
@@ -447,7 +447,6 @@ struct inode *autofs4_get_inode(struct super_block *sb,
 		inode->i_uid = 0;
 		inode->i_gid = 0;
 	}
-	inode->i_blksize = PAGE_CACHE_SIZE;
 	inode->i_blocks = 0;
 	inode->i_atime = inode->i_mtime = inode->i_ctime = CURRENT_TIME;
 
diff --git a/fs/befs/linuxvfs.c b/fs/befs/linuxvfs.c
index f6676fbe9484..57020c7a7e65 100644
--- a/fs/befs/linuxvfs.c
+++ b/fs/befs/linuxvfs.c
@@ -365,7 +365,6 @@ befs_read_inode(struct inode *inode)
 	inode->i_mtime.tv_nsec = 0;   /* lower 16 bits are not a time */	
 	inode->i_ctime = inode->i_mtime;
 	inode->i_atime = inode->i_mtime;
-	inode->i_blksize = befs_sb->block_size;
 
 	befs_ino->i_inode_num = fsrun_to_cpu(sb, raw_inode->inode_num);
 	befs_ino->i_parent = fsrun_to_cpu(sb, raw_inode->parent);
diff --git a/fs/bfs/dir.c b/fs/bfs/dir.c
index 26fad9621738..dcf04cb13283 100644
--- a/fs/bfs/dir.c
+++ b/fs/bfs/dir.c
@@ -102,7 +102,7 @@ static int bfs_create(struct inode * dir, struct dentry * dentry, int mode,
 	inode->i_uid = current->fsuid;
 	inode->i_gid = (dir->i_mode & S_ISGID) ? dir->i_gid : current->fsgid;
 	inode->i_mtime = inode->i_atime = inode->i_ctime = CURRENT_TIME_SEC;
-	inode->i_blocks = inode->i_blksize = 0;
+	inode->i_blocks = 0;
 	inode->i_op = &bfs_file_inops;
 	inode->i_fop = &bfs_file_operations;
 	inode->i_mapping->a_ops = &bfs_aops;
diff --git a/fs/bfs/inode.c b/fs/bfs/inode.c
index 8fc2e8e49dbe..ed27ffb3459e 100644
--- a/fs/bfs/inode.c
+++ b/fs/bfs/inode.c
@@ -76,7 +76,6 @@ static void bfs_read_inode(struct inode * inode)
 	inode->i_size = BFS_FILESIZE(di);
 	inode->i_blocks = BFS_FILEBLOCKS(di);
         if (inode->i_size || inode->i_blocks) dprintf("Registered inode with %lld size, %ld blocks\n", inode->i_size, inode->i_blocks);
-	inode->i_blksize = PAGE_SIZE;
 	inode->i_atime.tv_sec =  le32_to_cpu(di->i_atime);
 	inode->i_mtime.tv_sec =  le32_to_cpu(di->i_mtime);
 	inode->i_ctime.tv_sec =  le32_to_cpu(di->i_ctime);
diff --git a/fs/binfmt_misc.c b/fs/binfmt_misc.c
index 6759b9839ce8..66ba137f8661 100644
--- a/fs/binfmt_misc.c
+++ b/fs/binfmt_misc.c
@@ -507,7 +507,6 @@ static struct inode *bm_get_inode(struct super_block *sb, int mode)
 		inode->i_mode = mode;
 		inode->i_uid = 0;
 		inode->i_gid = 0;
-		inode->i_blksize = PAGE_CACHE_SIZE;
 		inode->i_blocks = 0;
 		inode->i_atime = inode->i_mtime = inode->i_ctime =
 			current_fs_time(inode->i_sb);
diff --git a/fs/cifs/cifsfs.c b/fs/cifs/cifsfs.c
index 4197a5043f13..22bcf4d7e7ae 100644
--- a/fs/cifs/cifsfs.c
+++ b/fs/cifs/cifsfs.c
@@ -253,7 +253,6 @@ cifs_alloc_inode(struct super_block *sb)
 	file data or metadata */
 	cifs_inode->clientCanCacheRead = FALSE;
 	cifs_inode->clientCanCacheAll = FALSE;
-	cifs_inode->vfs_inode.i_blksize = CIFS_MAX_MSGSIZE;
 	cifs_inode->vfs_inode.i_blkbits = 14;  /* 2**14 = CIFS_MAX_MSGSIZE */
 	cifs_inode->vfs_inode.i_flags = S_NOATIME | S_NOCMTIME;
 	INIT_LIST_HEAD(&cifs_inode->openFileList);
diff --git a/fs/cifs/readdir.c b/fs/cifs/readdir.c
index 9aeb58a7d369..b27b34537bf2 100644
--- a/fs/cifs/readdir.c
+++ b/fs/cifs/readdir.c
@@ -216,10 +216,9 @@ static void fill_in_inode(struct inode *tmp_inode, int new_buf_type,
 
 	if (allocation_size < end_of_file)
 		cFYI(1, ("May be sparse file, allocation less than file size"));
-	cFYI(1, ("File Size %ld and blocks %llu and blocksize %ld",
+	cFYI(1, ("File Size %ld and blocks %llu",
 		(unsigned long)tmp_inode->i_size,
-		(unsigned long long)tmp_inode->i_blocks,
-		tmp_inode->i_blksize));
+		(unsigned long long)tmp_inode->i_blocks));
 	if (S_ISREG(tmp_inode->i_mode)) {
 		cFYI(1, ("File inode"));
 		tmp_inode->i_op = &cifs_file_inode_ops;
diff --git a/fs/coda/coda_linux.c b/fs/coda/coda_linux.c
index 5597080cb811..95a54253c047 100644
--- a/fs/coda/coda_linux.c
+++ b/fs/coda/coda_linux.c
@@ -110,8 +110,6 @@ void coda_vattr_to_iattr(struct inode *inode, struct coda_vattr *attr)
 	        inode->i_nlink = attr->va_nlink;
 	if (attr->va_size != -1)
 	        inode->i_size = attr->va_size;
-	if (attr->va_blocksize != -1)
-		inode->i_blksize = attr->va_blocksize;
 	if (attr->va_size != -1)
 		inode->i_blocks = (attr->va_size + 511) >> 9;
 	if (attr->va_atime.tv_sec != -1) 
diff --git a/fs/configfs/inode.c b/fs/configfs/inode.c
index 5047e6a7581e..fb18917954a9 100644
--- a/fs/configfs/inode.c
+++ b/fs/configfs/inode.c
@@ -135,7 +135,6 @@ struct inode * configfs_new_inode(mode_t mode, struct configfs_dirent * sd)
 {
 	struct inode * inode = new_inode(configfs_sb);
 	if (inode) {
-		inode->i_blksize = PAGE_CACHE_SIZE;
 		inode->i_blocks = 0;
 		inode->i_mapping->a_ops = &configfs_aops;
 		inode->i_mapping->backing_dev_info = &configfs_backing_dev_info;
diff --git a/fs/cramfs/inode.c b/fs/cramfs/inode.c
index d09b6777c41a..ad96b6990715 100644
--- a/fs/cramfs/inode.c
+++ b/fs/cramfs/inode.c
@@ -73,7 +73,6 @@ static int cramfs_iget5_set(struct inode *inode, void *opaque)
 	inode->i_uid = cramfs_inode->uid;
 	inode->i_size = cramfs_inode->size;
 	inode->i_blocks = (cramfs_inode->size - 1) / 512 + 1;
-	inode->i_blksize = PAGE_CACHE_SIZE;
 	inode->i_gid = cramfs_inode->gid;
 	/* Struct copy intentional */
 	inode->i_mtime = inode->i_atime = inode->i_ctime = zerotime;
diff --git a/fs/debugfs/inode.c b/fs/debugfs/inode.c
index 717f4821ed02..269e649e6dc6 100644
--- a/fs/debugfs/inode.c
+++ b/fs/debugfs/inode.c
@@ -40,7 +40,6 @@ static struct inode *debugfs_get_inode(struct super_block *sb, int mode, dev_t d
 		inode->i_mode = mode;
 		inode->i_uid = 0;
 		inode->i_gid = 0;
-		inode->i_blksize = PAGE_CACHE_SIZE;
 		inode->i_blocks = 0;
 		inode->i_atime = inode->i_mtime = inode->i_ctime = CURRENT_TIME;
 		switch (mode & S_IFMT) {
diff --git a/fs/devpts/inode.c b/fs/devpts/inode.c
index 5bf06a10dddf..5f7b5a6025bf 100644
--- a/fs/devpts/inode.c
+++ b/fs/devpts/inode.c
@@ -113,7 +113,6 @@ devpts_fill_super(struct super_block *s, void *data, int silent)
 	inode->i_ino = 1;
 	inode->i_mtime = inode->i_atime = inode->i_ctime = CURRENT_TIME;
 	inode->i_blocks = 0;
-	inode->i_blksize = 1024;
 	inode->i_uid = inode->i_gid = 0;
 	inode->i_mode = S_IFDIR | S_IRUGO | S_IXUGO | S_IWUSR;
 	inode->i_op = &simple_dir_inode_operations;
@@ -172,7 +171,6 @@ int devpts_pty_new(struct tty_struct *tty)
 		return -ENOMEM;
 
 	inode->i_ino = number+2;
-	inode->i_blksize = 1024;
 	inode->i_uid = config.setuid ? config.uid : current->fsuid;
 	inode->i_gid = config.setgid ? config.gid : current->fsgid;
 	inode->i_mtime = inode->i_atime = inode->i_ctime = CURRENT_TIME;
diff --git a/fs/eventpoll.c b/fs/eventpoll.c
index 3a3567433b92..8d544334bcd2 100644
--- a/fs/eventpoll.c
+++ b/fs/eventpoll.c
@@ -1590,7 +1590,6 @@ static struct inode *ep_eventpoll_inode(void)
 	inode->i_uid = current->fsuid;
 	inode->i_gid = current->fsgid;
 	inode->i_atime = inode->i_mtime = inode->i_ctime = CURRENT_TIME;
-	inode->i_blksize = PAGE_SIZE;
 	return inode;
 
 eexit_1:
diff --git a/fs/ext2/ialloc.c b/fs/ext2/ialloc.c
index 695f69ccf908..2cb545bf0f3c 100644
--- a/fs/ext2/ialloc.c
+++ b/fs/ext2/ialloc.c
@@ -574,7 +574,6 @@ got:
 	inode->i_mode = mode;
 
 	inode->i_ino = ino;
-	inode->i_blksize = PAGE_SIZE;	/* This is the optimal IO size (for stat), not the fs block size */
 	inode->i_blocks = 0;
 	inode->i_mtime = inode->i_atime = inode->i_ctime = CURRENT_TIME_SEC;
 	memset(ei->i_data, 0, sizeof(ei->i_data));
diff --git a/fs/ext2/inode.c b/fs/ext2/inode.c
index fb4d3220eb8d..dd4e14c221e0 100644
--- a/fs/ext2/inode.c
+++ b/fs/ext2/inode.c
@@ -1094,7 +1094,6 @@ void ext2_read_inode (struct inode * inode)
 		brelse (bh);
 		goto bad_inode;
 	}
-	inode->i_blksize = PAGE_SIZE;	/* This is the optimal IO size (for stat), not the fs block size */
 	inode->i_blocks = le32_to_cpu(raw_inode->i_blocks);
 	ei->i_flags = le32_to_cpu(raw_inode->i_flags);
 	ei->i_faddr = le32_to_cpu(raw_inode->i_faddr);
diff --git a/fs/ext3/ialloc.c b/fs/ext3/ialloc.c
index 1e4ded8aa3cd..e45dbd651736 100644
--- a/fs/ext3/ialloc.c
+++ b/fs/ext3/ialloc.c
@@ -559,7 +559,6 @@ got:
 
 	inode->i_ino = ino;
 	/* This is the optimal IO size (for stat), not the fs block size */
-	inode->i_blksize = PAGE_SIZE;
 	inode->i_blocks = 0;
 	inode->i_mtime = inode->i_atime = inode->i_ctime = CURRENT_TIME_SEC;
 
diff --git a/fs/ext3/inode.c b/fs/ext3/inode.c
index c9fc15f8b609..dcf4f1dd108b 100644
--- a/fs/ext3/inode.c
+++ b/fs/ext3/inode.c
@@ -2632,9 +2632,6 @@ void ext3_read_inode(struct inode * inode)
 		 * recovery code: that's fine, we're about to complete
 		 * the process of deleting those. */
 	}
-	inode->i_blksize = PAGE_SIZE;	/* This is the optimal IO size
-					 * (for stat), not the fs block
-					 * size */  
 	inode->i_blocks = le32_to_cpu(raw_inode->i_blocks);
 	ei->i_flags = le32_to_cpu(raw_inode->i_flags);
 #ifdef EXT3_FRAGMENTS
diff --git a/fs/fat/inode.c b/fs/fat/inode.c
index e1035a590664..ab96ae823753 100644
--- a/fs/fat/inode.c
+++ b/fs/fat/inode.c
@@ -370,8 +370,6 @@ static int fat_fill_inode(struct inode *inode, struct msdos_dir_entry *de)
 			inode->i_flags |= S_IMMUTABLE;
 	}
 	MSDOS_I(inode)->i_attrs = de->attr & ATTR_UNUSED;
-	/* this is as close to the truth as we can get ... */
-	inode->i_blksize = sbi->cluster_size;
 	inode->i_blocks = ((inode->i_size + (sbi->cluster_size - 1))
 			   & ~((loff_t)sbi->cluster_size - 1)) >> 9;
 	inode->i_mtime.tv_sec =
@@ -1131,7 +1129,6 @@ static int fat_read_root(struct inode *inode)
 		MSDOS_I(inode)->i_start = 0;
 		inode->i_size = sbi->dir_entries * sizeof(struct msdos_dir_entry);
 	}
-	inode->i_blksize = sbi->cluster_size;
 	inode->i_blocks = ((inode->i_size + (sbi->cluster_size - 1))
 			   & ~((loff_t)sbi->cluster_size - 1)) >> 9;
 	MSDOS_I(inode)->i_logstart = 0;
diff --git a/fs/freevxfs/vxfs_inode.c b/fs/freevxfs/vxfs_inode.c
index 32a82ed108e4..4786d51ad3bd 100644
--- a/fs/freevxfs/vxfs_inode.c
+++ b/fs/freevxfs/vxfs_inode.c
@@ -239,7 +239,6 @@ vxfs_iinit(struct inode *ip, struct vxfs_inode_info *vip)
 	ip->i_ctime.tv_nsec = 0;
 	ip->i_mtime.tv_nsec = 0;
 
-	ip->i_blksize = PAGE_SIZE;
 	ip->i_blocks = vip->vii_blocks;
 	ip->i_generation = vip->vii_gen;
 
diff --git a/fs/fuse/inode.c b/fs/fuse/inode.c
index 7d25092262ae..cb7cadb0b790 100644
--- a/fs/fuse/inode.c
+++ b/fs/fuse/inode.c
@@ -118,7 +118,6 @@ void fuse_change_attributes(struct inode *inode, struct fuse_attr *attr)
 	inode->i_uid     = attr->uid;
 	inode->i_gid     = attr->gid;
 	i_size_write(inode, attr->size);
-	inode->i_blksize = PAGE_CACHE_SIZE;
 	inode->i_blocks  = attr->blocks;
 	inode->i_atime.tv_sec   = attr->atime;
 	inode->i_atime.tv_nsec  = attr->atimensec;
diff --git a/fs/hfs/inode.c b/fs/hfs/inode.c
index 315cf44a90b2..d05641c35fc9 100644
--- a/fs/hfs/inode.c
+++ b/fs/hfs/inode.c
@@ -154,7 +154,6 @@ struct inode *hfs_new_inode(struct inode *dir, struct qstr *name, int mode)
 	inode->i_gid = current->fsgid;
 	inode->i_nlink = 1;
 	inode->i_mtime = inode->i_atime = inode->i_ctime = CURRENT_TIME_SEC;
-	inode->i_blksize = HFS_SB(sb)->alloc_blksz;
 	HFS_I(inode)->flags = 0;
 	HFS_I(inode)->rsrc_inode = NULL;
 	HFS_I(inode)->fs_blocks = 0;
@@ -284,7 +283,6 @@ static int hfs_read_inode(struct inode *inode, void *data)
 	inode->i_uid = hsb->s_uid;
 	inode->i_gid = hsb->s_gid;
 	inode->i_nlink = 1;
-	inode->i_blksize = HFS_SB(inode->i_sb)->alloc_blksz;
 
 	if (idata->key)
 		HFS_I(inode)->cat_key = *idata->key;
diff --git a/fs/hfsplus/inode.c b/fs/hfsplus/inode.c
index 924ecdef8091..0eb1a6092668 100644
--- a/fs/hfsplus/inode.c
+++ b/fs/hfsplus/inode.c
@@ -304,7 +304,6 @@ struct inode *hfsplus_new_inode(struct super_block *sb, int mode)
 	inode->i_gid = current->fsgid;
 	inode->i_nlink = 1;
 	inode->i_mtime = inode->i_atime = inode->i_ctime = CURRENT_TIME_SEC;
-	inode->i_blksize = HFSPLUS_SB(sb).alloc_blksz;
 	INIT_LIST_HEAD(&HFSPLUS_I(inode).open_dir_list);
 	init_MUTEX(&HFSPLUS_I(inode).extents_lock);
 	atomic_set(&HFSPLUS_I(inode).opencnt, 0);
@@ -407,7 +406,6 @@ int hfsplus_cat_read_inode(struct inode *inode, struct hfs_find_data *fd)
 	type = hfs_bnode_read_u16(fd->bnode, fd->entryoffset);
 
 	HFSPLUS_I(inode).dev = 0;
-	inode->i_blksize = HFSPLUS_SB(inode->i_sb).alloc_blksz;
 	if (type == HFSPLUS_FOLDER) {
 		struct hfsplus_cat_folder *folder = &entry.folder;
 
diff --git a/fs/hostfs/hostfs_kern.c b/fs/hostfs/hostfs_kern.c
index b82e3d9c8790..322e876c35ed 100644
--- a/fs/hostfs/hostfs_kern.c
+++ b/fs/hostfs/hostfs_kern.c
@@ -156,7 +156,6 @@ static int read_name(struct inode *ino, char *name)
 	ino->i_mode = i_mode;
 	ino->i_nlink = i_nlink;
 	ino->i_size = i_size;
-	ino->i_blksize = i_blksize;
 	ino->i_blocks = i_blocks;
 	return(0);
 }
diff --git a/fs/hpfs/inode.c b/fs/hpfs/inode.c
index 56f2c338c4d9..bcf6ee36e065 100644
--- a/fs/hpfs/inode.c
+++ b/fs/hpfs/inode.c
@@ -17,7 +17,6 @@ void hpfs_init_inode(struct inode *i)
 	i->i_gid = hpfs_sb(sb)->sb_gid;
 	i->i_mode = hpfs_sb(sb)->sb_mode;
 	hpfs_inode->i_conv = hpfs_sb(sb)->sb_conv;
-	i->i_blksize = 512;
 	i->i_size = -1;
 	i->i_blocks = -1;
 	
diff --git a/fs/hppfs/hppfs_kern.c b/fs/hppfs/hppfs_kern.c
index 3a9bdf58166f..dcb6d2e988b8 100644
--- a/fs/hppfs/hppfs_kern.c
+++ b/fs/hppfs/hppfs_kern.c
@@ -152,7 +152,6 @@ static void hppfs_read_inode(struct inode *ino)
 	ino->i_mode = proc_ino->i_mode;
 	ino->i_nlink = proc_ino->i_nlink;
 	ino->i_size = proc_ino->i_size;
-	ino->i_blksize = proc_ino->i_blksize;
 	ino->i_blocks = proc_ino->i_blocks;
 }
 
diff --git a/fs/hugetlbfs/inode.c b/fs/hugetlbfs/inode.c
index c3920c96dadf..e025a31b4c64 100644
--- a/fs/hugetlbfs/inode.c
+++ b/fs/hugetlbfs/inode.c
@@ -357,7 +357,6 @@ static struct inode *hugetlbfs_get_inode(struct super_block *sb, uid_t uid,
 		inode->i_mode = mode;
 		inode->i_uid = uid;
 		inode->i_gid = gid;
-		inode->i_blksize = HPAGE_SIZE;
 		inode->i_blocks = 0;
 		inode->i_mapping->a_ops = &hugetlbfs_aops;
 		inode->i_mapping->backing_dev_info =&hugetlbfs_backing_dev_info;
diff --git a/fs/isofs/inode.c b/fs/isofs/inode.c
index 10e47897bac7..4527692f432b 100644
--- a/fs/isofs/inode.c
+++ b/fs/isofs/inode.c
@@ -1235,7 +1235,7 @@ static void isofs_read_inode(struct inode *inode)
 	}
 	inode->i_uid = sbi->s_uid;
 	inode->i_gid = sbi->s_gid;
-	inode->i_blocks = inode->i_blksize = 0;
+	inode->i_blocks = 0;
 
 	ei->i_format_parm[0] = 0;
 	ei->i_format_parm[1] = 0;
@@ -1291,7 +1291,6 @@ static void isofs_read_inode(struct inode *inode)
 			      isonum_711 (de->ext_attr_length));
 
 	/* Set the number of blocks for stat() - should be done before RR */
-	inode->i_blksize = PAGE_CACHE_SIZE; /* For stat() only */
 	inode->i_blocks  = (inode->i_size + 511) >> 9;
 
 	/*
diff --git a/fs/jffs/inode-v23.c b/fs/jffs/inode-v23.c
index 7358ef87f16b..f5cf9c93e243 100644
--- a/fs/jffs/inode-v23.c
+++ b/fs/jffs/inode-v23.c
@@ -364,7 +364,6 @@ jffs_new_inode(const struct inode * dir, struct jffs_raw_inode *raw_inode,
 	inode->i_ctime.tv_nsec = 0;
 	inode->i_mtime.tv_nsec = 0;
 	inode->i_atime.tv_nsec = 0;
-	inode->i_blksize = PAGE_SIZE;
 	inode->i_blocks = (inode->i_size + 511) >> 9;
 
 	f = jffs_find_file(c, raw_inode->ino);
@@ -1706,7 +1705,6 @@ jffs_read_inode(struct inode *inode)
 	inode->i_mtime.tv_nsec = 
 	inode->i_ctime.tv_nsec = 0;
 
-	inode->i_blksize = PAGE_SIZE;
 	inode->i_blocks = (inode->i_size + 511) >> 9;
 	if (S_ISREG(inode->i_mode)) {
 		inode->i_op = &jffs_file_inode_operations;
diff --git a/fs/jffs2/fs.c b/fs/jffs2/fs.c
index 4780f82825d6..72d9909d95ff 100644
--- a/fs/jffs2/fs.c
+++ b/fs/jffs2/fs.c
@@ -263,7 +263,6 @@ void jffs2_read_inode (struct inode *inode)
 
 	inode->i_nlink = f->inocache->nlink;
 
-	inode->i_blksize = PAGE_SIZE;
 	inode->i_blocks = (inode->i_size + 511) >> 9;
 
 	switch (inode->i_mode & S_IFMT) {
@@ -449,7 +448,6 @@ struct inode *jffs2_new_inode (struct inode *dir_i, int mode, struct jffs2_raw_i
 	inode->i_atime = inode->i_ctime = inode->i_mtime = CURRENT_TIME_SEC;
 	ri->atime = ri->mtime = ri->ctime = cpu_to_je32(I_SEC(inode->i_mtime));
 
-	inode->i_blksize = PAGE_SIZE;
 	inode->i_blocks = 0;
 	inode->i_size = 0;
 
diff --git a/fs/jfs/jfs_extent.c b/fs/jfs/jfs_extent.c
index 4d52593a5fc6..4c74f0944f7e 100644
--- a/fs/jfs/jfs_extent.c
+++ b/fs/jfs/jfs_extent.c
@@ -468,7 +468,7 @@ int extRecord(struct inode *ip, xad_t * xp)
 int extFill(struct inode *ip, xad_t * xp)
 {
 	int rc, nbperpage = JFS_SBI(ip->i_sb)->nbperpage;
-	s64 blkno = offsetXAD(xp) >> ip->i_blksize;
+	s64 blkno = offsetXAD(xp) >> ip->i_blkbits;
 
 //      assert(ISSPARSE(ip));
 
diff --git a/fs/jfs/jfs_imap.c b/fs/jfs/jfs_imap.c
index ccbe60aff83d..369d7f39c040 100644
--- a/fs/jfs/jfs_imap.c
+++ b/fs/jfs/jfs_imap.c
@@ -3115,7 +3115,6 @@ static int copy_from_dinode(struct dinode * dip, struct inode *ip)
 	ip->i_mtime.tv_nsec = le32_to_cpu(dip->di_mtime.tv_nsec);
 	ip->i_ctime.tv_sec = le32_to_cpu(dip->di_ctime.tv_sec);
 	ip->i_ctime.tv_nsec = le32_to_cpu(dip->di_ctime.tv_nsec);
-	ip->i_blksize = ip->i_sb->s_blocksize;
 	ip->i_blocks = LBLK2PBLK(ip->i_sb, le64_to_cpu(dip->di_nblocks));
 	ip->i_generation = le32_to_cpu(dip->di_gen);
 
diff --git a/fs/jfs/jfs_inode.c b/fs/jfs/jfs_inode.c
index 495df402916d..bffaca9ae3a2 100644
--- a/fs/jfs/jfs_inode.c
+++ b/fs/jfs/jfs_inode.c
@@ -115,7 +115,6 @@ struct inode *ialloc(struct inode *parent, umode_t mode)
 	}
 	jfs_inode->mode2 |= mode;
 
-	inode->i_blksize = sb->s_blocksize;
 	inode->i_blocks = 0;
 	inode->i_mtime = inode->i_atime = inode->i_ctime = CURRENT_TIME;
 	jfs_inode->otime = inode->i_ctime.tv_sec;
diff --git a/fs/jfs/jfs_metapage.c b/fs/jfs/jfs_metapage.c
index e1e0a6e6ebdf..f5afc129d6b1 100644
--- a/fs/jfs/jfs_metapage.c
+++ b/fs/jfs/jfs_metapage.c
@@ -257,7 +257,7 @@ static sector_t metapage_get_blocks(struct inode *inode, sector_t lblock,
 	int rc = 0;
 	int xflag;
 	s64 xaddr;
-	sector_t file_blocks = (inode->i_size + inode->i_blksize - 1) >>
+	sector_t file_blocks = (inode->i_size + inode->i_sb->s_blocksize - 1) >>
 			       inode->i_blkbits;
 
 	if (lblock >= file_blocks)
diff --git a/fs/libfs.c b/fs/libfs.c
index 2751793beeaa..8db5afb7b0a7 100644
--- a/fs/libfs.c
+++ b/fs/libfs.c
@@ -383,7 +383,6 @@ int simple_fill_super(struct super_block *s, int magic, struct tree_descr *files
 		return -ENOMEM;
 	inode->i_mode = S_IFDIR | 0755;
 	inode->i_uid = inode->i_gid = 0;
-	inode->i_blksize = PAGE_CACHE_SIZE;
 	inode->i_blocks = 0;
 	inode->i_atime = inode->i_mtime = inode->i_ctime = CURRENT_TIME;
 	inode->i_op = &simple_dir_inode_operations;
@@ -405,7 +404,6 @@ int simple_fill_super(struct super_block *s, int magic, struct tree_descr *files
 			goto out;
 		inode->i_mode = S_IFREG | files->mode;
 		inode->i_uid = inode->i_gid = 0;
-		inode->i_blksize = PAGE_CACHE_SIZE;
 		inode->i_blocks = 0;
 		inode->i_atime = inode->i_mtime = inode->i_ctime = CURRENT_TIME;
 		inode->i_fop = files->ops;
diff --git a/fs/minix/bitmap.c b/fs/minix/bitmap.c
index 4a6abc49418e..df6b1075b549 100644
--- a/fs/minix/bitmap.c
+++ b/fs/minix/bitmap.c
@@ -254,7 +254,7 @@ struct inode * minix_new_inode(const struct inode * dir, int * error)
 	inode->i_gid = (dir->i_mode & S_ISGID) ? dir->i_gid : current->fsgid;
 	inode->i_ino = j;
 	inode->i_mtime = inode->i_atime = inode->i_ctime = CURRENT_TIME_SEC;
-	inode->i_blocks = inode->i_blksize = 0;
+	inode->i_blocks = 0;
 	memset(&minix_i(inode)->u, 0, sizeof(minix_i(inode)->u));
 	insert_inode_hash(inode);
 	mark_inode_dirty(inode);
diff --git a/fs/minix/inode.c b/fs/minix/inode.c
index 826b9d830650..c11a4b9fb863 100644
--- a/fs/minix/inode.c
+++ b/fs/minix/inode.c
@@ -396,7 +396,7 @@ static void V1_minix_read_inode(struct inode * inode)
 	inode->i_mtime.tv_nsec = 0;
 	inode->i_atime.tv_nsec = 0;
 	inode->i_ctime.tv_nsec = 0;
-	inode->i_blocks = inode->i_blksize = 0;
+	inode->i_blocks = 0;
 	for (i = 0; i < 9; i++)
 		minix_inode->u.i1_data[i] = raw_inode->i_zone[i];
 	minix_set_inode(inode, old_decode_dev(raw_inode->i_zone[0]));
@@ -429,7 +429,7 @@ static void V2_minix_read_inode(struct inode * inode)
 	inode->i_mtime.tv_nsec = 0;
 	inode->i_atime.tv_nsec = 0;
 	inode->i_ctime.tv_nsec = 0;
-	inode->i_blocks = inode->i_blksize = 0;
+	inode->i_blocks = 0;
 	for (i = 0; i < 10; i++)
 		minix_inode->u.i2_data[i] = raw_inode->i_zone[i];
 	minix_set_inode(inode, old_decode_dev(raw_inode->i_zone[0]));
diff --git a/fs/ncpfs/inode.c b/fs/ncpfs/inode.c
index 8244710e97dd..42e3bef270c9 100644
--- a/fs/ncpfs/inode.c
+++ b/fs/ncpfs/inode.c
@@ -223,7 +223,6 @@ static void ncp_set_attr(struct inode *inode, struct ncp_entry_info *nwinfo)
 	inode->i_nlink = 1;
 	inode->i_uid = server->m.uid;
 	inode->i_gid = server->m.gid;
-	inode->i_blksize = NCP_BLOCK_SIZE;
 
 	ncp_update_dates(inode, &nwinfo->i);
 	ncp_update_inode(inode, nwinfo);
diff --git a/fs/nfs/inode.c b/fs/nfs/inode.c
index 931f52a19579..bc9376ca86cd 100644
--- a/fs/nfs/inode.c
+++ b/fs/nfs/inode.c
@@ -277,10 +277,8 @@ nfs_fhget(struct super_block *sb, struct nfs_fh *fh, struct nfs_fattr *fattr)
 			 * report the blocks in 512byte units
 			 */
 			inode->i_blocks = nfs_calc_block_size(fattr->du.nfs3.used);
-			inode->i_blksize = inode->i_sb->s_blocksize;
 		} else {
 			inode->i_blocks = fattr->du.nfs2.blocks;
-			inode->i_blksize = fattr->du.nfs2.blocksize;
 		}
 		nfsi->attrtimeo = NFS_MINATTRTIMEO(inode);
 		nfsi->attrtimeo_timestamp = jiffies;
@@ -969,10 +967,8 @@ static int nfs_update_inode(struct inode *inode, struct nfs_fattr *fattr)
 		 * report the blocks in 512byte units
 		 */
 		inode->i_blocks = nfs_calc_block_size(fattr->du.nfs3.used);
-		inode->i_blksize = inode->i_sb->s_blocksize;
  	} else {
  		inode->i_blocks = fattr->du.nfs2.blocks;
- 		inode->i_blksize = fattr->du.nfs2.blocksize;
  	}
 
 	if ((fattr->valid & NFS_ATTR_FATTR_V4) != 0 &&
diff --git a/fs/ntfs/inode.c b/fs/ntfs/inode.c
index 31852121b3f5..933dbd89c2a4 100644
--- a/fs/ntfs/inode.c
+++ b/fs/ntfs/inode.c
@@ -556,8 +556,6 @@ static int ntfs_read_locked_inode(struct inode *vi)
 
 	/* Setup the generic vfs inode parts now. */
 
-	/* This is the optimal IO size (for stat), not the fs block size. */
-	vi->i_blksize = PAGE_CACHE_SIZE;
 	/*
 	 * This is for checking whether an inode has changed w.r.t. a file so
 	 * that the file can be updated if necessary (compare with f_version).
@@ -1234,7 +1232,6 @@ static int ntfs_read_locked_attr_inode(struct inode *base_vi, struct inode *vi)
 	base_ni = NTFS_I(base_vi);
 
 	/* Just mirror the values from the base inode. */
-	vi->i_blksize	= base_vi->i_blksize;
 	vi->i_version	= base_vi->i_version;
 	vi->i_uid	= base_vi->i_uid;
 	vi->i_gid	= base_vi->i_gid;
@@ -1504,7 +1501,6 @@ static int ntfs_read_locked_index_inode(struct inode *base_vi, struct inode *vi)
 	ni	= NTFS_I(vi);
 	base_ni = NTFS_I(base_vi);
 	/* Just mirror the values from the base inode. */
-	vi->i_blksize	= base_vi->i_blksize;
 	vi->i_version	= base_vi->i_version;
 	vi->i_uid	= base_vi->i_uid;
 	vi->i_gid	= base_vi->i_gid;
diff --git a/fs/ntfs/mft.c b/fs/ntfs/mft.c
index 578fb3d5e803..584260fd6848 100644
--- a/fs/ntfs/mft.c
+++ b/fs/ntfs/mft.c
@@ -2637,11 +2637,6 @@ mft_rec_already_initialized:
 			goto undo_mftbmp_alloc;
 		}
 		vi->i_ino = bit;
-		/*
-		 * This is the optimal IO size (for stat), not the fs block
-		 * size.
-		 */
-		vi->i_blksize = PAGE_CACHE_SIZE;
 		/*
 		 * This is for checking whether an inode has changed w.r.t. a
 		 * file so that the file can be updated if necessary (compare
diff --git a/fs/ocfs2/dlm/dlmfs.c b/fs/ocfs2/dlm/dlmfs.c
index 0ff0898a0b9c..0368c6402182 100644
--- a/fs/ocfs2/dlm/dlmfs.c
+++ b/fs/ocfs2/dlm/dlmfs.c
@@ -335,7 +335,6 @@ static struct inode *dlmfs_get_root_inode(struct super_block *sb)
 		inode->i_mode = mode;
 		inode->i_uid = current->fsuid;
 		inode->i_gid = current->fsgid;
-		inode->i_blksize = PAGE_CACHE_SIZE;
 		inode->i_blocks = 0;
 		inode->i_mapping->backing_dev_info = &dlmfs_backing_dev_info;
 		inode->i_atime = inode->i_mtime = inode->i_ctime = CURRENT_TIME;
@@ -362,7 +361,6 @@ static struct inode *dlmfs_get_inode(struct inode *parent,
 	inode->i_mode = mode;
 	inode->i_uid = current->fsuid;
 	inode->i_gid = current->fsgid;
-	inode->i_blksize = PAGE_CACHE_SIZE;
 	inode->i_blocks = 0;
 	inode->i_mapping->backing_dev_info = &dlmfs_backing_dev_info;
 	inode->i_atime = inode->i_mtime = inode->i_ctime = CURRENT_TIME;
diff --git a/fs/ocfs2/inode.c b/fs/ocfs2/inode.c
index 69d3db569166..16e8e74dc966 100644
--- a/fs/ocfs2/inode.c
+++ b/fs/ocfs2/inode.c
@@ -269,7 +269,6 @@ int ocfs2_populate_inode(struct inode *inode, struct ocfs2_dinode *fe,
 	inode->i_mode = le16_to_cpu(fe->i_mode);
 	inode->i_uid = le32_to_cpu(fe->i_uid);
 	inode->i_gid = le32_to_cpu(fe->i_gid);
-	inode->i_blksize = (u32)osb->s_clustersize;
 
 	/* Fast symlinks will have i_size but no allocated clusters. */
 	if (S_ISLNK(inode->i_mode) && !fe->i_clusters)
@@ -1258,8 +1257,6 @@ leave:
 void ocfs2_refresh_inode(struct inode *inode,
 			 struct ocfs2_dinode *fe)
 {
-	struct ocfs2_super *osb = OCFS2_SB(inode->i_sb);
-
 	spin_lock(&OCFS2_I(inode)->ip_lock);
 
 	OCFS2_I(inode)->ip_clusters = le32_to_cpu(fe->i_clusters);
@@ -1270,7 +1267,6 @@ void ocfs2_refresh_inode(struct inode *inode,
 	inode->i_uid = le32_to_cpu(fe->i_uid);
 	inode->i_gid = le32_to_cpu(fe->i_gid);
 	inode->i_mode = le16_to_cpu(fe->i_mode);
-	inode->i_blksize = (u32) osb->s_clustersize;
 	if (S_ISLNK(inode->i_mode) && le32_to_cpu(fe->i_clusters) == 0)
 		inode->i_blocks = 0;
 	else
diff --git a/fs/pipe.c b/fs/pipe.c
index 20352573e025..f3b6f71e9d0b 100644
--- a/fs/pipe.c
+++ b/fs/pipe.c
@@ -879,7 +879,6 @@ static struct inode * get_pipe_inode(void)
 	inode->i_uid = current->fsuid;
 	inode->i_gid = current->fsgid;
 	inode->i_atime = inode->i_mtime = inode->i_ctime = CURRENT_TIME;
-	inode->i_blksize = PAGE_SIZE;
 
 	return inode;
 
diff --git a/fs/qnx4/inode.c b/fs/qnx4/inode.c
index fddbd61c68d0..5a41db2a218d 100644
--- a/fs/qnx4/inode.c
+++ b/fs/qnx4/inode.c
@@ -496,7 +496,6 @@ static void qnx4_read_inode(struct inode *inode)
 	inode->i_ctime.tv_sec   = le32_to_cpu(raw_inode->di_ctime);
 	inode->i_ctime.tv_nsec = 0;
 	inode->i_blocks  = le32_to_cpu(raw_inode->di_first_xtnt.xtnt_size);
-	inode->i_blksize = QNX4_DIR_ENTRY_SIZE;
 
 	memcpy(qnx4_inode, raw_inode, QNX4_DIR_ENTRY_SIZE);
 	if (S_ISREG(inode->i_mode)) {
diff --git a/fs/ramfs/inode.c b/fs/ramfs/inode.c
index b9677335cc8d..bc0e51662424 100644
--- a/fs/ramfs/inode.c
+++ b/fs/ramfs/inode.c
@@ -58,7 +58,6 @@ struct inode *ramfs_get_inode(struct super_block *sb, int mode, dev_t dev)
 		inode->i_mode = mode;
 		inode->i_uid = current->fsuid;
 		inode->i_gid = current->fsgid;
-		inode->i_blksize = PAGE_CACHE_SIZE;
 		inode->i_blocks = 0;
 		inode->i_mapping->a_ops = &ramfs_aops;
 		inode->i_mapping->backing_dev_info = &ramfs_backing_dev_info;
diff --git a/fs/reiserfs/inode.c b/fs/reiserfs/inode.c
index 52f1e2136546..8810fda0da46 100644
--- a/fs/reiserfs/inode.c
+++ b/fs/reiserfs/inode.c
@@ -17,8 +17,6 @@
 #include <linux/writeback.h>
 #include <linux/quotaops.h>
 
-extern int reiserfs_default_io_size;	/* default io size devuned in super.c */
-
 static int reiserfs_commit_write(struct file *f, struct page *page,
 				 unsigned from, unsigned to);
 static int reiserfs_prepare_write(struct file *f, struct page *page,
@@ -1122,7 +1120,6 @@ static void init_inode(struct inode *inode, struct path *path)
 	ih = PATH_PITEM_HEAD(path);
 
 	copy_key(INODE_PKEY(inode), &(ih->ih_key));
-	inode->i_blksize = reiserfs_default_io_size;
 
 	INIT_LIST_HEAD(&(REISERFS_I(inode)->i_prealloc_list));
 	REISERFS_I(inode)->i_flags = 0;
@@ -1877,7 +1874,6 @@ int reiserfs_new_inode(struct reiserfs_transaction_handle *th,
 	}
 	// these do not go to on-disk stat data
 	inode->i_ino = le32_to_cpu(ih.ih_key.k_objectid);
-	inode->i_blksize = reiserfs_default_io_size;
 
 	// store in in-core inode the key of stat data and version all
 	// object items will have (directory items will have old offset
diff --git a/fs/smbfs/inode.c b/fs/smbfs/inode.c
index 92cf60aa6121..2c122ee83adb 100644
--- a/fs/smbfs/inode.c
+++ b/fs/smbfs/inode.c
@@ -166,7 +166,6 @@ smb_get_inode_attr(struct inode *inode, struct smb_fattr *fattr)
 	fattr->f_mtime	= inode->i_mtime;
 	fattr->f_ctime	= inode->i_ctime;
 	fattr->f_atime	= inode->i_atime;
-	fattr->f_blksize= inode->i_blksize;
 	fattr->f_blocks	= inode->i_blocks;
 
 	fattr->attr	= SMB_I(inode)->attr;
@@ -200,7 +199,6 @@ smb_set_inode_attr(struct inode *inode, struct smb_fattr *fattr)
 	inode->i_uid	= fattr->f_uid;
 	inode->i_gid	= fattr->f_gid;
 	inode->i_ctime	= fattr->f_ctime;
-	inode->i_blksize= fattr->f_blksize;
 	inode->i_blocks = fattr->f_blocks;
 	inode->i_size	= fattr->f_size;
 	inode->i_mtime	= fattr->f_mtime;
diff --git a/fs/smbfs/proc.c b/fs/smbfs/proc.c
index c3495059889d..40e174db9872 100644
--- a/fs/smbfs/proc.c
+++ b/fs/smbfs/proc.c
@@ -1826,7 +1826,6 @@ smb_init_dirent(struct smb_sb_info *server, struct smb_fattr *fattr)
 	fattr->f_nlink = 1;
 	fattr->f_uid = server->mnt->uid;
 	fattr->f_gid = server->mnt->gid;
-	fattr->f_blksize = SMB_ST_BLKSIZE;
 	fattr->f_unix = 0;
 }
 
diff --git a/fs/stat.c b/fs/stat.c
index 3a44dcf97da2..60a31d5e5966 100644
--- a/fs/stat.c
+++ b/fs/stat.c
@@ -14,6 +14,7 @@
 #include <linux/namei.h>
 #include <linux/security.h>
 #include <linux/syscalls.h>
+#include <linux/pagemap.h>
 
 #include <asm/uaccess.h>
 #include <asm/unistd.h>
@@ -32,7 +33,7 @@ void generic_fillattr(struct inode *inode, struct kstat *stat)
 	stat->ctime = inode->i_ctime;
 	stat->size = i_size_read(inode);
 	stat->blocks = inode->i_blocks;
-	stat->blksize = inode->i_blksize;
+	stat->blksize = (1 << inode->i_blkbits);
 }
 
 EXPORT_SYMBOL(generic_fillattr);
diff --git a/fs/sysfs/inode.c b/fs/sysfs/inode.c
index fd7cd5f843d2..e79e38d52c00 100644
--- a/fs/sysfs/inode.c
+++ b/fs/sysfs/inode.c
@@ -125,7 +125,6 @@ struct inode * sysfs_new_inode(mode_t mode, struct sysfs_dirent * sd)
 {
 	struct inode * inode = new_inode(sysfs_sb);
 	if (inode) {
-		inode->i_blksize = PAGE_CACHE_SIZE;
 		inode->i_blocks = 0;
 		inode->i_mapping->a_ops = &sysfs_aops;
 		inode->i_mapping->backing_dev_info = &sysfs_backing_dev_info;
diff --git a/fs/sysv/ialloc.c b/fs/sysv/ialloc.c
index 9b585d1081c0..115ab0d6f4bc 100644
--- a/fs/sysv/ialloc.c
+++ b/fs/sysv/ialloc.c
@@ -170,7 +170,7 @@ struct inode * sysv_new_inode(const struct inode * dir, mode_t mode)
 	inode->i_uid = current->fsuid;
 	inode->i_ino = fs16_to_cpu(sbi, ino);
 	inode->i_mtime = inode->i_atime = inode->i_ctime = CURRENT_TIME_SEC;
-	inode->i_blocks = inode->i_blksize = 0;
+	inode->i_blocks = 0;
 	memset(SYSV_I(inode)->i_data, 0, sizeof(SYSV_I(inode)->i_data));
 	SYSV_I(inode)->i_dir_start_lookup = 0;
 	insert_inode_hash(inode);
diff --git a/fs/sysv/inode.c b/fs/sysv/inode.c
index 58b2d22142ba..d63c5e48b050 100644
--- a/fs/sysv/inode.c
+++ b/fs/sysv/inode.c
@@ -201,7 +201,7 @@ static void sysv_read_inode(struct inode *inode)
 	inode->i_ctime.tv_nsec = 0;
 	inode->i_atime.tv_nsec = 0;
 	inode->i_mtime.tv_nsec = 0;
-	inode->i_blocks = inode->i_blksize = 0;
+	inode->i_blocks = 0;
 
 	si = SYSV_I(inode);
 	for (block = 0; block < 10+1+1+1; block++)
diff --git a/fs/udf/ialloc.c b/fs/udf/ialloc.c
index d1d38238ce65..8206983f2ebf 100644
--- a/fs/udf/ialloc.c
+++ b/fs/udf/ialloc.c
@@ -121,7 +121,6 @@ struct inode * udf_new_inode (struct inode *dir, int mode, int * err)
 	UDF_I_LOCATION(inode).logicalBlockNum = block;
 	UDF_I_LOCATION(inode).partitionReferenceNum = UDF_I_LOCATION(dir).partitionReferenceNum;
 	inode->i_ino = udf_get_lb_pblock(sb, UDF_I_LOCATION(inode), 0);
-	inode->i_blksize = PAGE_SIZE;
 	inode->i_blocks = 0;
 	UDF_I_LENEATTR(inode) = 0;
 	UDF_I_LENALLOC(inode) = 0;
diff --git a/fs/udf/inode.c b/fs/udf/inode.c
index 605f5111b6d8..b223b32db991 100644
--- a/fs/udf/inode.c
+++ b/fs/udf/inode.c
@@ -916,8 +916,6 @@ __udf_read_inode(struct inode *inode)
 	 *      i_nlink = 1
 	 *      i_op = NULL;
 	 */
-	inode->i_blksize = PAGE_SIZE;
-
 	bh = udf_read_ptagged(inode->i_sb, UDF_I_LOCATION(inode), 0, &ident);
 
 	if (!bh)
diff --git a/fs/ufs/ialloc.c b/fs/ufs/ialloc.c
index 9501dcd3b213..2ad1259c6eca 100644
--- a/fs/ufs/ialloc.c
+++ b/fs/ufs/ialloc.c
@@ -255,7 +255,6 @@ cg_found:
 		inode->i_gid = current->fsgid;
 
 	inode->i_ino = cg * uspi->s_ipg + bit;
-	inode->i_blksize = PAGE_SIZE;	/* This is the optimal IO size (for stat), not the fs block size */
 	inode->i_blocks = 0;
 	inode->i_mtime = inode->i_atime = inode->i_ctime = CURRENT_TIME_SEC;
 	ufsi->i_flags = UFS_I(dir)->i_flags;
diff --git a/fs/ufs/inode.c b/fs/ufs/inode.c
index 30c6e8a9446c..ee1eaa6f4ec2 100644
--- a/fs/ufs/inode.c
+++ b/fs/ufs/inode.c
@@ -741,7 +741,6 @@ void ufs_read_inode(struct inode * inode)
 		ufs1_read_inode(inode, ufs_inode + ufs_inotofsbo(inode->i_ino));
 	}
 
-	inode->i_blksize = PAGE_SIZE;/*This is the optimal IO size (for stat)*/
 	inode->i_version++;
 	ufsi->i_lastfrag =
 		(inode->i_size + uspi->s_fsize - 1) >> uspi->s_fshift;
diff --git a/fs/xfs/linux-2.6/xfs_super.c b/fs/xfs/linux-2.6/xfs_super.c
index 4754f342a5d3..9df9ed37d219 100644
--- a/fs/xfs/linux-2.6/xfs_super.c
+++ b/fs/xfs/linux-2.6/xfs_super.c
@@ -171,7 +171,6 @@ xfs_revalidate_inode(
 		break;
 	}
 
-	inode->i_blksize = xfs_preferred_iosize(mp);
 	inode->i_generation = ip->i_d.di_gen;
 	i_size_write(inode, ip->i_d.di_size);
 	inode->i_blocks =
diff --git a/fs/xfs/linux-2.6/xfs_vnode.c b/fs/xfs/linux-2.6/xfs_vnode.c
index 6628d96b6fd6..553fa731ade5 100644
--- a/fs/xfs/linux-2.6/xfs_vnode.c
+++ b/fs/xfs/linux-2.6/xfs_vnode.c
@@ -122,7 +122,6 @@ vn_revalidate_core(
 	inode->i_blocks	    = vap->va_nblocks;
 	inode->i_mtime	    = vap->va_mtime;
 	inode->i_ctime	    = vap->va_ctime;
-	inode->i_blksize    = vap->va_blocksize;
 	if (vap->va_xflags & XFS_XFLAG_IMMUTABLE)
 		inode->i_flags |= S_IMMUTABLE;
 	else
diff --git a/include/linux/fs.h b/include/linux/fs.h
index 192e69bb55b5..8f74dfbb2edd 100644
--- a/include/linux/fs.h
+++ b/include/linux/fs.h
@@ -512,7 +512,6 @@ struct inode {
 	struct timespec		i_mtime;
 	struct timespec		i_ctime;
 	unsigned int		i_blkbits;
-	unsigned long		i_blksize;
 	unsigned long		i_version;
 	blkcnt_t		i_blocks;
 	unsigned short          i_bytes;
diff --git a/include/linux/nfsd/nfsfh.h b/include/linux/nfsd/nfsfh.h
index f9edcd2ff3c8..31a3cb617ce0 100644
--- a/include/linux/nfsd/nfsfh.h
+++ b/include/linux/nfsd/nfsfh.h
@@ -269,14 +269,8 @@ fill_post_wcc(struct svc_fh *fhp)
 	fhp->fh_post_uid	= inode->i_uid;
 	fhp->fh_post_gid	= inode->i_gid;
 	fhp->fh_post_size       = inode->i_size;
-	if (inode->i_blksize) {
-		fhp->fh_post_blksize    = inode->i_blksize;
-		fhp->fh_post_blocks     = inode->i_blocks;
-	} else {
-		fhp->fh_post_blksize    = BLOCK_SIZE;
-		/* how much do we care for accuracy with MinixFS? */
-		fhp->fh_post_blocks     = (inode->i_size+511) >> 9;
-	}
+	fhp->fh_post_blksize    = BLOCK_SIZE;
+	fhp->fh_post_blocks     = inode->i_blocks;
 	fhp->fh_post_rdev[0]    = htonl((u32)imajor(inode));
 	fhp->fh_post_rdev[1]    = htonl((u32)iminor(inode));
 	fhp->fh_post_atime      = inode->i_atime;
diff --git a/include/linux/smb.h b/include/linux/smb.h
index 6df3b1501559..f098dff93f6b 100644
--- a/include/linux/smb.h
+++ b/include/linux/smb.h
@@ -89,7 +89,6 @@ struct smb_fattr {
 	struct timespec	f_atime;
 	struct timespec f_mtime;
 	struct timespec f_ctime;
-	unsigned long	f_blksize;
 	unsigned long	f_blocks;
 	int		f_unix;
 };
diff --git a/ipc/mqueue.c b/ipc/mqueue.c
index f8381f0660b2..840f8a6fb85f 100644
--- a/ipc/mqueue.c
+++ b/ipc/mqueue.c
@@ -115,7 +115,6 @@ static struct inode *mqueue_get_inode(struct super_block *sb, int mode,
 		inode->i_mode = mode;
 		inode->i_uid = current->fsuid;
 		inode->i_gid = current->fsgid;
-		inode->i_blksize = PAGE_CACHE_SIZE;
 		inode->i_blocks = 0;
 		inode->i_mtime = inode->i_ctime = inode->i_atime =
 				CURRENT_TIME;
diff --git a/kernel/cpuset.c b/kernel/cpuset.c
index cff41511269f..1b32c2c04c15 100644
--- a/kernel/cpuset.c
+++ b/kernel/cpuset.c
@@ -289,7 +289,6 @@ static struct inode *cpuset_new_inode(mode_t mode)
 		inode->i_mode = mode;
 		inode->i_uid = current->fsuid;
 		inode->i_gid = current->fsgid;
-		inode->i_blksize = PAGE_CACHE_SIZE;
 		inode->i_blocks = 0;
 		inode->i_atime = inode->i_mtime = inode->i_ctime = CURRENT_TIME;
 		inode->i_mapping->backing_dev_info = &cpuset_backing_dev_info;
diff --git a/mm/shmem.c b/mm/shmem.c
index 0a8e29cf87e0..eda907c3a86a 100644
--- a/mm/shmem.c
+++ b/mm/shmem.c
@@ -1351,7 +1351,6 @@ shmem_get_inode(struct super_block *sb, int mode, dev_t dev)
 		inode->i_mode = mode;
 		inode->i_uid = current->fsuid;
 		inode->i_gid = current->fsgid;
-		inode->i_blksize = PAGE_CACHE_SIZE;
 		inode->i_blocks = 0;
 		inode->i_mapping->a_ops = &shmem_aops;
 		inode->i_mapping->backing_dev_info = &shmem_backing_dev_info;
diff --git a/net/sunrpc/rpc_pipe.c b/net/sunrpc/rpc_pipe.c
index f65bea74734d..700c6e061a04 100644
--- a/net/sunrpc/rpc_pipe.c
+++ b/net/sunrpc/rpc_pipe.c
@@ -488,7 +488,6 @@ rpc_get_inode(struct super_block *sb, int mode)
 		return NULL;
 	inode->i_mode = mode;
 	inode->i_uid = inode->i_gid = 0;
-	inode->i_blksize = PAGE_CACHE_SIZE;
 	inode->i_blocks = 0;
 	inode->i_atime = inode->i_mtime = inode->i_ctime = CURRENT_TIME;
 	switch(mode & S_IFMT) {
diff --git a/security/inode.c b/security/inode.c
index 176aacea8ca4..49ee51529396 100644
--- a/security/inode.c
+++ b/security/inode.c
@@ -64,7 +64,6 @@ static struct inode *get_inode(struct super_block *sb, int mode, dev_t dev)
 		inode->i_mode = mode;
 		inode->i_uid = 0;
 		inode->i_gid = 0;
-		inode->i_blksize = PAGE_CACHE_SIZE;
 		inode->i_blocks = 0;
 		inode->i_atime = inode->i_mtime = inode->i_ctime = CURRENT_TIME;
 		switch (mode & S_IFMT) {
diff --git a/security/selinux/selinuxfs.c b/security/selinux/selinuxfs.c
index 00534c302ba2..bab7b386cb8d 100644
--- a/security/selinux/selinuxfs.c
+++ b/security/selinux/selinuxfs.c
@@ -771,7 +771,6 @@ static struct inode *sel_make_inode(struct super_block *sb, int mode)
 	if (ret) {
 		ret->i_mode = mode;
 		ret->i_uid = ret->i_gid = 0;
-		ret->i_blksize = PAGE_CACHE_SIZE;
 		ret->i_blocks = 0;
 		ret->i_atime = ret->i_mtime = ret->i_ctime = CURRENT_TIME;
 	}
-- 
cgit v1.2.3


From b89a81712f486e4f7a606987413e387605fdeaf4 Mon Sep 17 00:00:00 2001
From: "Eric W. Biederman" <ebiederm@xmission.com>
Date: Wed, 27 Sep 2006 01:51:04 -0700
Subject: [PATCH] sysctl: Allow /proc/sys without sys_sysctl

Since sys_sysctl is deprecated start allow it to be compiled out.  This
should catch any remaining user space code that cares, and paves the way
for further sysctl cleanups.

[akpm@osdl.org: If sys_sysctl() is not compiled-in, emit a warning]
Signed-off-by: Eric W. Biederman <ebiederm@xmission.com>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
---
 arch/ia64/ia32/sys_ia32.c         |   2 +-
 arch/mips/kernel/linux32.c        |   4 +-
 arch/powerpc/kernel/sys_ppc32.c   |   2 +-
 arch/s390/kernel/compat_linux.c   |   2 +-
 arch/sparc64/kernel/sys_sparc32.c |   2 +-
 arch/x86_64/ia32/sys_ia32.c       |   2 +-
 fs/Kconfig                        |  19 +++++++
 init/Kconfig                      |  31 ++++++-----
 kernel/sysctl.c                   | 113 +++++++++++++-------------------------
 9 files changed, 81 insertions(+), 96 deletions(-)

(limited to 'kernel')

diff --git a/arch/ia64/ia32/sys_ia32.c b/arch/ia64/ia32/sys_ia32.c
index 6aa3c51619ca..bddbd22706ed 100644
--- a/arch/ia64/ia32/sys_ia32.c
+++ b/arch/ia64/ia32/sys_ia32.c
@@ -1942,7 +1942,7 @@ struct sysctl32 {
 	unsigned int	__unused[4];
 };
 
-#ifdef CONFIG_SYSCTL
+#ifdef CONFIG_SYSCTL_SYSCALL
 asmlinkage long
 sys32_sysctl (struct sysctl32 __user *args)
 {
diff --git a/arch/mips/kernel/linux32.c b/arch/mips/kernel/linux32.c
index dc500e20cf14..43b1162d714f 100644
--- a/arch/mips/kernel/linux32.c
+++ b/arch/mips/kernel/linux32.c
@@ -991,7 +991,7 @@ struct sysctl_args32
 	unsigned int __unused[4];
 };
 
-#ifdef CONFIG_SYSCTL
+#ifdef CONFIG_SYSCTL_SYSCALL
 
 asmlinkage long sys32_sysctl(struct sysctl_args32 __user *args)
 {
@@ -1032,7 +1032,7 @@ asmlinkage long sys32_sysctl(struct sysctl_args32 __user *args)
 	return error;
 }
 
-#endif /* CONFIG_SYSCTL */
+#endif /* CONFIG_SYSCTL_SYSCALL */
 
 asmlinkage long sys32_newuname(struct new_utsname __user * name)
 {
diff --git a/arch/powerpc/kernel/sys_ppc32.c b/arch/powerpc/kernel/sys_ppc32.c
index 2e292863e982..5e391fc25340 100644
--- a/arch/powerpc/kernel/sys_ppc32.c
+++ b/arch/powerpc/kernel/sys_ppc32.c
@@ -740,7 +740,7 @@ asmlinkage long compat_sys_umask(u32 mask)
 	return sys_umask((int)mask);
 }
 
-#ifdef CONFIG_SYSCTL
+#ifdef CONFIG_SYSCTL_SYSCALL
 struct __sysctl_args32 {
 	u32 name;
 	int nlen;
diff --git a/arch/s390/kernel/compat_linux.c b/arch/s390/kernel/compat_linux.c
index 785c9f70ac98..91b2884fa5c4 100644
--- a/arch/s390/kernel/compat_linux.c
+++ b/arch/s390/kernel/compat_linux.c
@@ -708,7 +708,7 @@ asmlinkage long sys32_sendfile64(int out_fd, int in_fd,
 	return ret;
 }
 
-#ifdef CONFIG_SYSCTL
+#ifdef CONFIG_SYSCTL_SYSCALL
 struct __sysctl_args32 {
 	u32 name;
 	int nlen;
diff --git a/arch/sparc64/kernel/sys_sparc32.c b/arch/sparc64/kernel/sys_sparc32.c
index c88ae23ce812..69444f266e2d 100644
--- a/arch/sparc64/kernel/sys_sparc32.c
+++ b/arch/sparc64/kernel/sys_sparc32.c
@@ -1016,7 +1016,7 @@ struct __sysctl_args32 {
 
 asmlinkage long sys32_sysctl(struct __sysctl_args32 __user *args)
 {
-#ifndef CONFIG_SYSCTL
+#ifndef CONFIG_SYSCTL_SYSCALL
 	return -ENOSYS;
 #else
 	struct __sysctl_args32 tmp;
diff --git a/arch/x86_64/ia32/sys_ia32.c b/arch/x86_64/ia32/sys_ia32.c
index b0e82c7947dc..f280d3665f4b 100644
--- a/arch/x86_64/ia32/sys_ia32.c
+++ b/arch/x86_64/ia32/sys_ia32.c
@@ -648,7 +648,7 @@ sys32_pause(void)
 }
 
 
-#ifdef CONFIG_SYSCTL
+#ifdef CONFIG_SYSCTL_SYSCALL
 struct sysctl_ia32 {
 	unsigned int	name;
 	int		nlen;
diff --git a/fs/Kconfig b/fs/Kconfig
index a27002668bd3..d311198bba43 100644
--- a/fs/Kconfig
+++ b/fs/Kconfig
@@ -826,6 +826,25 @@ config PROC_VMCORE
         help
         Exports the dump image of crashed kernel in ELF format.
 
+config PROC_SYSCTL
+	bool "Sysctl support (/proc/sys)" if EMBEDDED
+	depends on PROC_FS
+	select SYSCTL
+	default y
+	---help---
+	  The sysctl interface provides a means of dynamically changing
+	  certain kernel parameters and variables on the fly without requiring
+	  a recompile of the kernel or reboot of the system.  The primary
+	  interface is through /proc/sys.  If you say Y here a tree of
+	  modifiable sysctl entries will be generated beneath the
+          /proc/sys directory. They are explained in the files
+	  in <file:Documentation/sysctl/>.  Note that enabling this
+	  option will enlarge the kernel by at least 8 KB.
+
+	  As it is generally a good thing, you should say Y here unless
+	  building a kernel for install/rescue disks or your system is very
+	  limited in memory.
+
 config SYSFS
 	bool "sysfs file system support" if EMBEDDED
 	default y
diff --git a/init/Kconfig b/init/Kconfig
index 9a7656f0b5ec..4381006dd666 100644
--- a/init/Kconfig
+++ b/init/Kconfig
@@ -273,21 +273,24 @@ config UID16
 	  This enables the legacy 16-bit UID syscall wrappers.
 
 config SYSCTL
-	bool "Sysctl support" if EMBEDDED
-	default y
+	bool
+
+config SYSCTL_SYSCALL
+	bool "Sysctl syscall support"
+	default n
+	select SYSCTL
 	---help---
-	  The sysctl interface provides a means of dynamically changing
-	  certain kernel parameters and variables on the fly without requiring
-	  a recompile of the kernel or reboot of the system.  The primary
-	  interface consists of a system call, but if you say Y to "/proc
-	  file system support", a tree of modifiable sysctl entries will be
-	  generated beneath the /proc/sys directory. They are explained in the
-	  files in <file:Documentation/sysctl/>.  Note that enabling this
-	  option will enlarge the kernel by at least 8 KB.
-
-	  As it is generally a good thing, you should say Y here unless
-	  building a kernel for install/rescue disks or your system is very
-	  limited in memory.
+	  Enable the deprecated sysctl system call.  sys_sysctl uses
+	  binary paths that have been found to be a major pain to maintain
+	  and use.  The interface in /proc/sys is now the primary and what
+	  everyone uses.
+
+	  Nothing has been using the binary sysctl interface for some time
+	  time now so nothing should break if you disable sysctl syscall
+	  support, and you kernel will get marginally smaller.
+
+	  Unless you have an application that uses the sys_syscall interface
+ 	  you should probably say N here.
 
 config KALLSYMS
 	 bool "Load all symbols for debugging/kksymoops" if EMBEDDED
diff --git a/kernel/sysctl.c b/kernel/sysctl.c
index bcb3a181dbb2..8bfa7d117c54 100644
--- a/kernel/sysctl.c
+++ b/kernel/sysctl.c
@@ -137,8 +137,11 @@ extern int no_unaligned_warning;
 extern int max_lock_depth;
 #endif
 
-static int parse_table(int __user *, int, void __user *, size_t __user *, void __user *, size_t,
-		       ctl_table *, void **);
+#ifdef CONFIG_SYSCTL_SYSCALL
+static int parse_table(int __user *, int, void __user *, size_t __user *,
+		void __user *, size_t, ctl_table *, void **);
+#endif
+
 static int proc_doutsstring(ctl_table *table, int write, struct file *filp,
 		  void __user *buffer, size_t *lenp, loff_t *ppos);
 
@@ -165,7 +168,7 @@ int sysctl_legacy_va_layout;
 
 /* /proc declarations: */
 
-#ifdef CONFIG_PROC_FS
+#ifdef CONFIG_PROC_SYSCTL
 
 static ssize_t proc_readsys(struct file *, char __user *, size_t, loff_t *);
 static ssize_t proc_writesys(struct file *, const char __user *, size_t, loff_t *);
@@ -1166,12 +1169,13 @@ static void start_unregistering(struct ctl_table_header *p)
 
 void __init sysctl_init(void)
 {
-#ifdef CONFIG_PROC_FS
+#ifdef CONFIG_PROC_SYSCTL
 	register_proc_table(root_table, proc_sys_root, &root_table_header);
 	init_irq_proc();
 #endif
 }
 
+#ifdef CONFIG_SYSCTL_SYSCALL
 int do_sysctl(int __user *name, int nlen, void __user *oldval, size_t __user *oldlenp,
 	       void __user *newval, size_t newlen)
 {
@@ -1225,6 +1229,7 @@ asmlinkage long sys_sysctl(struct __sysctl_args __user *args)
 	unlock_kernel();
 	return error;
 }
+#endif /* CONFIG_SYSCTL_SYSCALL */
 
 /*
  * ctl_perm does NOT grant the superuser all rights automatically, because
@@ -1251,6 +1256,7 @@ static inline int ctl_perm(ctl_table *table, int op)
 	return test_perm(table->mode, op);
 }
 
+#ifdef CONFIG_SYSCTL_SYSCALL
 static int parse_table(int __user *name, int nlen,
 		       void __user *oldval, size_t __user *oldlenp,
 		       void __user *newval, size_t newlen,
@@ -1340,6 +1346,7 @@ int do_sysctl_strategy (ctl_table *table,
 	}
 	return 0;
 }
+#endif /* CONFIG_SYSCTL_SYSCALL */
 
 /**
  * register_sysctl_table - register a sysctl hierarchy
@@ -1427,7 +1434,7 @@ struct ctl_table_header *register_sysctl_table(ctl_table * table,
 	else
 		list_add_tail(&tmp->ctl_entry, &root_table_header.ctl_entry);
 	spin_unlock(&sysctl_lock);
-#ifdef CONFIG_PROC_FS
+#ifdef CONFIG_PROC_SYSCTL
 	register_proc_table(table, proc_sys_root, tmp);
 #endif
 	return tmp;
@@ -1445,18 +1452,31 @@ void unregister_sysctl_table(struct ctl_table_header * header)
 	might_sleep();
 	spin_lock(&sysctl_lock);
 	start_unregistering(header);
-#ifdef CONFIG_PROC_FS
+#ifdef CONFIG_PROC_SYSCTL
 	unregister_proc_table(header->ctl_table, proc_sys_root);
 #endif
 	spin_unlock(&sysctl_lock);
 	kfree(header);
 }
 
+#else /* !CONFIG_SYSCTL */
+struct ctl_table_header * register_sysctl_table(ctl_table * table,
+						int insert_at_head)
+{
+	return NULL;
+}
+
+void unregister_sysctl_table(struct ctl_table_header * table)
+{
+}
+
+#endif /* CONFIG_SYSCTL */
+
 /*
  * /proc/sys support
  */
 
-#ifdef CONFIG_PROC_FS
+#ifdef CONFIG_PROC_SYSCTL
 
 /* Scan the sysctl entries in table and add them all into /proc */
 static void register_proc_table(ctl_table * table, struct proc_dir_entry *root, void *set)
@@ -2318,6 +2338,7 @@ int proc_doulongvec_ms_jiffies_minmax(ctl_table *table, int write,
 #endif /* CONFIG_PROC_FS */
 
 
+#ifdef CONFIG_SYSCTL_SYSCALL
 /*
  * General sysctl support routines 
  */
@@ -2460,11 +2481,19 @@ int sysctl_ms_jiffies(ctl_table *table, int __user *name, int nlen,
 	return 1;
 }
 
-#else /* CONFIG_SYSCTL */
+#else /* CONFIG_SYSCTL_SYSCALL */
 
 
 asmlinkage long sys_sysctl(struct __sysctl_args __user *args)
 {
+	static int msg_count;
+
+	if (msg_count < 5) {
+		msg_count++;
+		printk(KERN_INFO
+			"warning: process `%s' used the removed sysctl "
+			"system call\n", current->comm);
+	}
 	return -ENOSYS;
 }
 
@@ -2496,73 +2525,7 @@ int sysctl_ms_jiffies(ctl_table *table, int __user *name, int nlen,
 	return -ENOSYS;
 }
 
-int proc_dostring(ctl_table *table, int write, struct file *filp,
-		  void __user *buffer, size_t *lenp, loff_t *ppos)
-{
-	return -ENOSYS;
-}
-
-int proc_dointvec(ctl_table *table, int write, struct file *filp,
-		  void __user *buffer, size_t *lenp, loff_t *ppos)
-{
-	return -ENOSYS;
-}
-
-int proc_dointvec_bset(ctl_table *table, int write, struct file *filp,
-			void __user *buffer, size_t *lenp, loff_t *ppos)
-{
-	return -ENOSYS;
-}
-
-int proc_dointvec_minmax(ctl_table *table, int write, struct file *filp,
-		    void __user *buffer, size_t *lenp, loff_t *ppos)
-{
-	return -ENOSYS;
-}
-
-int proc_dointvec_jiffies(ctl_table *table, int write, struct file *filp,
-			  void __user *buffer, size_t *lenp, loff_t *ppos)
-{
-	return -ENOSYS;
-}
-
-int proc_dointvec_userhz_jiffies(ctl_table *table, int write, struct file *filp,
-			  void __user *buffer, size_t *lenp, loff_t *ppos)
-{
-	return -ENOSYS;
-}
-
-int proc_dointvec_ms_jiffies(ctl_table *table, int write, struct file *filp,
-			     void __user *buffer, size_t *lenp, loff_t *ppos)
-{
-	return -ENOSYS;
-}
-
-int proc_doulongvec_minmax(ctl_table *table, int write, struct file *filp,
-		    void __user *buffer, size_t *lenp, loff_t *ppos)
-{
-	return -ENOSYS;
-}
-
-int proc_doulongvec_ms_jiffies_minmax(ctl_table *table, int write,
-				      struct file *filp,
-				      void __user *buffer,
-				      size_t *lenp, loff_t *ppos)
-{
-    return -ENOSYS;
-}
-
-struct ctl_table_header * register_sysctl_table(ctl_table * table, 
-						int insert_at_head)
-{
-	return NULL;
-}
-
-void unregister_sysctl_table(struct ctl_table_header * table)
-{
-}
-
-#endif /* CONFIG_SYSCTL */
+#endif /* CONFIG_SYSCTL_SYSCALL */
 
 /*
  * No sense putting this after each symbol definition, twice,
-- 
cgit v1.2.3


From c18258c6f0848f97e85287f6271c511a092bb784 Mon Sep 17 00:00:00 2001
From: "Eric W. Biederman" <ebiederm@xmission.com>
Date: Wed, 27 Sep 2006 01:51:06 -0700
Subject: [PATCH] pid: Implement transfer_pid and use it to simplify de_thread

In de_thread we move pids from one process to another, a rather ugly case.
The function transfer_pid makes it clear what we are doing, and makes the
action atomic.  This is useful we ever want to atomically traverse the
process group and session lists, in a rcu safe manner.

Even if the atomic properties this change should be a win as transfer_pid
should be less code to execute than executing both attach_pid and
detach_pid, and this should make de_thread slightly smaller as only a
single function call needs to be emitted.  The only downside is that the
code might be slower to execute as the odds are against transfer_pid being
in cache.

Signed-off-by: Eric W. Biederman <ebiederm@xmission.com>
Cc: Oleg Nesterov <oleg@tv-sign.ru>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
---
 fs/exec.c           | 11 ++++-------
 include/linux/pid.h |  2 ++
 kernel/pid.c        |  9 +++++++++
 3 files changed, 15 insertions(+), 7 deletions(-)

(limited to 'kernel')

diff --git a/fs/exec.c b/fs/exec.c
index 54135df2a966..b7aa3d6422d6 100644
--- a/fs/exec.c
+++ b/fs/exec.c
@@ -696,23 +696,20 @@ static int de_thread(struct task_struct *tsk)
 		 */
 
 		/* Become a process group leader with the old leader's pid.
-		 * Note: The old leader also uses thispid until release_task
+		 * The old leader becomes a thread of the this thread group.
+		 * Note: The old leader also uses this pid until release_task
 		 *       is called.  Odd but simple and correct.
 		 */
 		detach_pid(current, PIDTYPE_PID);
 		current->pid = leader->pid;
 		attach_pid(current, PIDTYPE_PID,  current->pid);
-		attach_pid(current, PIDTYPE_PGID, current->signal->pgrp);
-		attach_pid(current, PIDTYPE_SID,  current->signal->session);
+		transfer_pid(leader, current, PIDTYPE_PGID);
+		transfer_pid(leader, current, PIDTYPE_SID);
 		list_replace_rcu(&leader->tasks, &current->tasks);
 
 		current->group_leader = current;
 		leader->group_leader = current;
 
-		/* Reduce leader to a thread */
-		detach_pid(leader, PIDTYPE_PGID);
-		detach_pid(leader, PIDTYPE_SID);
-
 		current->exit_signal = SIGCHLD;
 
 		BUG_ON(leader->exit_state != EXIT_ZOMBIE);
diff --git a/include/linux/pid.h b/include/linux/pid.h
index 29960b03bef7..93da7e2d9f30 100644
--- a/include/linux/pid.h
+++ b/include/linux/pid.h
@@ -76,6 +76,8 @@ extern int FASTCALL(attach_pid(struct task_struct *task,
 				enum pid_type type, int nr));
 
 extern void FASTCALL(detach_pid(struct task_struct *task, enum pid_type));
+extern void FASTCALL(transfer_pid(struct task_struct *old,
+				  struct task_struct *new, enum pid_type));
 
 /*
  * look up a PID in the hash table. Must be called with the tasklist_lock
diff --git a/kernel/pid.c b/kernel/pid.c
index 93e212f20671..6db82b68e2f8 100644
--- a/kernel/pid.c
+++ b/kernel/pid.c
@@ -252,6 +252,15 @@ void fastcall detach_pid(struct task_struct *task, enum pid_type type)
 	free_pid(pid);
 }
 
+/* transfer_pid is an optimization of attach_pid(new), detach_pid(old) */
+void fastcall transfer_pid(struct task_struct *old, struct task_struct *new,
+			   enum pid_type type)
+{
+	new->pids[type].pid = old->pids[type].pid;
+	hlist_replace_rcu(&old->pids[type].node, &new->pids[type].node);
+	old->pids[type].pid = NULL;
+}
+
 struct task_struct * fastcall pid_task(struct pid *pid, enum pid_type type)
 {
 	struct task_struct *result = NULL;
-- 
cgit v1.2.3


From 65800ac77e080cf159d6c1207b6886e18f22bc08 Mon Sep 17 00:00:00 2001
From: "Eric W. Biederman" <ebiederm@xmission.com>
Date: Wed, 27 Sep 2006 01:51:11 -0700
Subject: [PATCH] pid: remove temporary debug code in attach_pid

With the patches flying between Oleg and myself somehow this temporary
debug code got left in pid.c.  It was never intended to make it to the
stable kernel.

Signed-off-by: Eric W. Biederman <ebiederm@xmission.com>
Cc: Oleg Nesterov <oleg@tv-sign.ru>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
---
 kernel/pid.c | 3 ---
 1 file changed, 3 deletions(-)

(limited to 'kernel')

diff --git a/kernel/pid.c b/kernel/pid.c
index 6db82b68e2f8..8387e8c68193 100644
--- a/kernel/pid.c
+++ b/kernel/pid.c
@@ -223,9 +223,6 @@ int fastcall attach_pid(struct task_struct *task, enum pid_type type, int nr)
 	struct pid_link *link;
 	struct pid *pid;
 
-	WARN_ON(!task->pid); /* to be removed soon */
-	WARN_ON(!nr); /* to be removed soon */
-
 	link = &task->pids[type];
 	link->pid = pid = find_pid(nr);
 	hlist_add_head_rcu(&link->node, &pid->tasks[type]);
-- 
cgit v1.2.3


From 910067d188d56d80801b71b0ca1f73aa400c7b8c Mon Sep 17 00:00:00 2001
From: Matthew Wilcox <matthew@wil.cx>
Date: Fri, 29 Sep 2006 01:58:36 -0700
Subject: [PATCH] remove generic__raw_read_trylock()

If the cpu has the lock held for write, is interrupted, and the interrupt
handler calls read_trylock(), it's an instant deadlock.

Now, Dave Miller has subsequently pointed out that we don't have any
situations where this can occur.  Nevertheless, we should delete
generic__raw_read_lock (and its associated EXPORT to make Arjan happy) so that
nobody thinks they can use it.

Acked-by: "David S. Miller" <davem@davemloft.net>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
---
 kernel/spinlock.c | 11 -----------
 1 file changed, 11 deletions(-)

(limited to 'kernel')

diff --git a/kernel/spinlock.c b/kernel/spinlock.c
index 9644a41e0bef..d48143eafbfd 100644
--- a/kernel/spinlock.c
+++ b/kernel/spinlock.c
@@ -21,17 +21,6 @@
 #include <linux/debug_locks.h>
 #include <linux/module.h>
 
-/*
- * Generic declaration of the raw read_trylock() function,
- * architectures are supposed to optimize this:
- */
-int __lockfunc generic__raw_read_trylock(raw_rwlock_t *lock)
-{
-	__raw_read_lock(lock);
-	return 1;
-}
-EXPORT_SYMBOL(generic__raw_read_trylock);
-
 int __lockfunc _spin_trylock(spinlock_t *lock)
 {
 	preempt_disable();
-- 
cgit v1.2.3


From d8c7649e99e4b081b624aefe1e77caa30b53cb18 Mon Sep 17 00:00:00 2001
From: Randy Dunlap <rdunlap@xenotime.net>
Date: Fri, 29 Sep 2006 01:58:55 -0700
Subject: [PATCH] kernel/params: driver layer error checking

Check driver layer return values in kernel/params.c

Signed-off-by: Randy Dunlap <rdunlap@xenotime.net>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
---
 kernel/params.c | 13 +++++++++++--
 1 file changed, 11 insertions(+), 2 deletions(-)

(limited to 'kernel')

diff --git a/kernel/params.c b/kernel/params.c
index 91aea7aa532e..63d432d0ebc0 100644
--- a/kernel/params.c
+++ b/kernel/params.c
@@ -547,6 +547,7 @@ static void __init kernel_param_sysfs_setup(const char *name,
 					    unsigned int name_skip)
 {
 	struct module_kobject *mk;
+	int ret;
 
 	mk = kzalloc(sizeof(struct module_kobject), GFP_KERNEL);
 	BUG_ON(!mk);
@@ -554,7 +555,8 @@ static void __init kernel_param_sysfs_setup(const char *name,
 	mk->mod = THIS_MODULE;
 	kobj_set_kset_s(mk, module_subsys);
 	kobject_set_name(&mk->kobj, name);
-	kobject_register(&mk->kobj);
+	ret = kobject_register(&mk->kobj);
+	BUG_ON(ret < 0);
 
 	/* no need to keep the kobject if no parameter is exported */
 	if (!param_sysfs_setup(mk, kparam, num_params, name_skip)) {
@@ -684,7 +686,14 @@ decl_subsys(module, &module_ktype, NULL);
  */
 static int __init param_sysfs_init(void)
 {
-	subsystem_register(&module_subsys);
+	int ret;
+
+	ret = subsystem_register(&module_subsys);
+	if (ret < 0) {
+		printk(KERN_WARNING "%s (%d): subsystem_register error: %d\n",
+			__FILE__, __LINE__, ret);
+		return ret;
+	}
 
 	param_sysfs_builtin();
 
-- 
cgit v1.2.3


From 4c78a6639386f9e7399fbc0d0a173d4cc1a3e9bf Mon Sep 17 00:00:00 2001
From: Randy Dunlap <rdunlap@xenotime.net>
Date: Fri, 29 Sep 2006 01:59:10 -0700
Subject: [PATCH] kernel-doc for relay interface

Add relay interface support to DocBook/kernel-api.tmpl.  Fix typos etc.  in
relay.c and relayfs.txt.

Signed-off-by: Randy Dunlap <rdunlap@xenotime.net>
Acked-by: Tom Zanussi <zanussi@us.ibm.com>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
---
 Documentation/DocBook/kernel-api.tmpl | 16 ++++++++++++++++
 kernel/relay.c                        | 36 +++++++++++++++++++++--------------
 2 files changed, 38 insertions(+), 14 deletions(-)

(limited to 'kernel')

diff --git a/Documentation/DocBook/kernel-api.tmpl b/Documentation/DocBook/kernel-api.tmpl
index f8fe882e33dc..ea6f375475b8 100644
--- a/Documentation/DocBook/kernel-api.tmpl
+++ b/Documentation/DocBook/kernel-api.tmpl
@@ -202,6 +202,22 @@ X!Ilib/string.c
      </sect1>
   </chapter>
 
+  <chapter id="relayfs">
+     <title>relay interface support</title>
+
+     <para>
+	Relay interface support
+	is designed to provide an efficient mechanism for tools and
+	facilities to relay large amounts of data from kernel space to
+	user space.
+     </para>
+
+     <sect1><title>relay interface</title>
+!Ekernel/relay.c
+!Ikernel/relay.c
+     </sect1>
+  </chapter>
+
   <chapter id="vfs">
      <title>The Linux VFS</title>
      <sect1><title>The Filesystem types</title>
diff --git a/kernel/relay.c b/kernel/relay.c
index 85786ff2a4f9..1d63ecddfa70 100644
--- a/kernel/relay.c
+++ b/kernel/relay.c
@@ -95,7 +95,7 @@ int relay_mmap_buf(struct rchan_buf *buf, struct vm_area_struct *vma)
  *	@buf: the buffer struct
  *	@size: total size of the buffer
  *
- *	Returns a pointer to the resulting buffer, NULL if unsuccessful. The
+ *	Returns a pointer to the resulting buffer, %NULL if unsuccessful. The
  *	passed in size will get page aligned, if it isn't already.
  */
 static void *relay_alloc_buf(struct rchan_buf *buf, size_t *size)
@@ -132,10 +132,9 @@ depopulate:
 
 /**
  *	relay_create_buf - allocate and initialize a channel buffer
- *	@alloc_size: size of the buffer to allocate
- *	@n_subbufs: number of sub-buffers in the channel
+ *	@chan: the relay channel
  *
- *	Returns channel buffer if successful, NULL otherwise
+ *	Returns channel buffer if successful, %NULL otherwise.
  */
 struct rchan_buf *relay_create_buf(struct rchan *chan)
 {
@@ -163,6 +162,7 @@ free_buf:
 
 /**
  *	relay_destroy_channel - free the channel struct
+ *	@kref: target kernel reference that contains the relay channel
  *
  *	Should only be called from kref_put().
  */
@@ -194,6 +194,7 @@ void relay_destroy_buf(struct rchan_buf *buf)
 
 /**
  *	relay_remove_buf - remove a channel buffer
+ *	@kref: target kernel reference that contains the relay buffer
  *
  *	Removes the file from the fileystem, which also frees the
  *	rchan_buf_struct and the channel buffer.  Should only be called from
@@ -374,7 +375,7 @@ void relay_reset(struct rchan *chan)
 }
 EXPORT_SYMBOL_GPL(relay_reset);
 
-/**
+/*
  *	relay_open_buf - create a new relay channel buffer
  *
  *	Internal - used by relay_open().
@@ -448,12 +449,12 @@ static inline void setup_callbacks(struct rchan *chan,
 /**
  *	relay_open - create a new relay channel
  *	@base_filename: base name of files to create
- *	@parent: dentry of parent directory, NULL for root directory
+ *	@parent: dentry of parent directory, %NULL for root directory
  *	@subbuf_size: size of sub-buffers
  *	@n_subbufs: number of sub-buffers
  *	@cb: client callback functions
  *
- *	Returns channel pointer if successful, NULL otherwise.
+ *	Returns channel pointer if successful, %NULL otherwise.
  *
  *	Creates a channel buffer for each cpu using the sizes and
  *	attributes specified.  The created channel buffer files
@@ -585,7 +586,7 @@ EXPORT_SYMBOL_GPL(relay_switch_subbuf);
  *	subbufs_consumed should be the number of sub-buffers newly consumed,
  *	not the total consumed.
  *
- *	NOTE: kernel clients don't need to call this function if the channel
+ *	NOTE: Kernel clients don't need to call this function if the channel
  *	mode is 'overwrite'.
  */
 void relay_subbufs_consumed(struct rchan *chan,
@@ -641,7 +642,7 @@ EXPORT_SYMBOL_GPL(relay_close);
  *	relay_flush - close the channel
  *	@chan: the channel
  *
- *	Flushes all channel buffers i.e. forces buffer switch.
+ *	Flushes all channel buffers, i.e. forces buffer switch.
  */
 void relay_flush(struct rchan *chan)
 {
@@ -729,7 +730,7 @@ static int relay_file_release(struct inode *inode, struct file *filp)
 	return 0;
 }
 
-/**
+/*
  *	relay_file_read_consume - update the consumed count for the buffer
  */
 static void relay_file_read_consume(struct rchan_buf *buf,
@@ -756,7 +757,7 @@ static void relay_file_read_consume(struct rchan_buf *buf,
 	}
 }
 
-/**
+/*
  *	relay_file_read_avail - boolean, are there unconsumed bytes available?
  */
 static int relay_file_read_avail(struct rchan_buf *buf, size_t read_pos)
@@ -793,6 +794,8 @@ static int relay_file_read_avail(struct rchan_buf *buf, size_t read_pos)
 
 /**
  *	relay_file_read_subbuf_avail - return bytes available in sub-buffer
+ *	@read_pos: file read position
+ *	@buf: relay channel buffer
  */
 static size_t relay_file_read_subbuf_avail(size_t read_pos,
 					   struct rchan_buf *buf)
@@ -818,6 +821,8 @@ static size_t relay_file_read_subbuf_avail(size_t read_pos,
 
 /**
  *	relay_file_read_start_pos - find the first available byte to read
+ *	@read_pos: file read position
+ *	@buf: relay channel buffer
  *
  *	If the read_pos is in the middle of padding, return the
  *	position of the first actually available byte, otherwise
@@ -844,6 +849,9 @@ static size_t relay_file_read_start_pos(size_t read_pos,
 
 /**
  *	relay_file_read_end_pos - return the new read position
+ *	@read_pos: file read position
+ *	@buf: relay channel buffer
+ *	@count: number of bytes to be read
  */
 static size_t relay_file_read_end_pos(struct rchan_buf *buf,
 				      size_t read_pos,
@@ -865,7 +873,7 @@ static size_t relay_file_read_end_pos(struct rchan_buf *buf,
 	return end_pos;
 }
 
-/**
+/*
  *	subbuf_read_actor - read up to one subbuf's worth of data
  */
 static int subbuf_read_actor(size_t read_start,
@@ -890,7 +898,7 @@ static int subbuf_read_actor(size_t read_start,
 	return ret;
 }
 
-/**
+/*
  *	subbuf_send_actor - send up to one subbuf's worth of data
  */
 static int subbuf_send_actor(size_t read_start,
@@ -933,7 +941,7 @@ typedef int (*subbuf_actor_t) (size_t read_start,
 			       read_descriptor_t *desc,
 			       read_actor_t actor);
 
-/**
+/*
  *	relay_file_read_subbufs - read count bytes, bridging subbuf boundaries
  */
 static inline ssize_t relay_file_read_subbufs(struct file *filp,
-- 
cgit v1.2.3


From 2ff6fd8f4a40b72ff35dbb1e08eb9ed6b64b6028 Mon Sep 17 00:00:00 2001
From: Yoichi Yuasa <yoichi_yuasa@tripeaks.co.jp>
Date: Fri, 29 Sep 2006 01:59:21 -0700
Subject: [PATCH] irq: fixed coding style

Signed-off-by: Yoichi Yuasa <yoichi_yuasa@tripeaks.co.jp>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
---
 kernel/irq/chip.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'kernel')

diff --git a/kernel/irq/chip.c b/kernel/irq/chip.c
index ac1f850d4937..3c3fec55f6d1 100644
--- a/kernel/irq/chip.c
+++ b/kernel/irq/chip.c
@@ -146,7 +146,7 @@ static void default_disable(unsigned int irq)
 	struct irq_desc *desc = irq_desc + irq;
 
 	if (!(desc->status & IRQ_DELAYED_DISABLE))
-		irq_desc[irq].chip->mask(irq);
+		desc->chip->mask(irq);
 }
 
 /*
-- 
cgit v1.2.3


From 538d9d532b0e0320c9dd326a560b5a72d73f910d Mon Sep 17 00:00:00 2001
From: Yoichi Yuasa <yoichi_yuasa@tripeaks.co.jp>
Date: Fri, 29 Sep 2006 01:59:22 -0700
Subject: [PATCH] irq: remove a extra line

Signed-off-by: Yoichi Yuasa <yoichi_yuasa@tripeaks.co.jp>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
---
 kernel/irq/chip.c | 4 ----
 1 file changed, 4 deletions(-)

(limited to 'kernel')

diff --git a/kernel/irq/chip.c b/kernel/irq/chip.c
index 3c3fec55f6d1..736cb0bd498f 100644
--- a/kernel/irq/chip.c
+++ b/kernel/irq/chip.c
@@ -40,10 +40,6 @@ int set_irq_chip(unsigned int irq, struct irq_chip *chip)
 	spin_lock_irqsave(&desc->lock, flags);
 	irq_chip_set_defaults(chip);
 	desc->chip = chip;
-	/*
-	 * For compatibility only:
-	 */
-	desc->chip = chip;
 	spin_unlock_irqrestore(&desc->lock, flags);
 
 	return 0;
-- 
cgit v1.2.3


From a49a4af759c0193d42aeaeefb4df7de8973dd713 Mon Sep 17 00:00:00 2001
From: Josh Triplett <josht@us.ibm.com>
Date: Fri, 29 Sep 2006 01:59:30 -0700
Subject: [PATCH] rcu: add lock annotations to
 rcu{,_bh}_torture_read_{lock,unlock}

rcu_torture_read_lock and rcu_bh_torture_read_lock acquire locks without
releasing them, and the matching functions rcu_torture_read_unlock and
rcu_bh_torture_read_unlock get called with the corresponding locks held and
release them.  Add lock annotations to these four functions so that sparse
can check callers for lock pairing, and so that sparse will not complain
about these functions since they intentionally use locks in this manner.

Signed-off-by: Josh Triplett <josh@freedesktop.org>
Acked-by: Paul McKenney <paulmck@us.ibm.com>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
---
 kernel/rcutorture.c | 8 ++++----
 1 file changed, 4 insertions(+), 4 deletions(-)

(limited to 'kernel')

diff --git a/kernel/rcutorture.c b/kernel/rcutorture.c
index 4d1c3d247127..4f2c4272d59c 100644
--- a/kernel/rcutorture.c
+++ b/kernel/rcutorture.c
@@ -192,13 +192,13 @@ static struct rcu_torture_ops *cur_ops = NULL;
  * Definitions for rcu torture testing.
  */
 
-static int rcu_torture_read_lock(void)
+static int rcu_torture_read_lock(void) __acquires(RCU)
 {
 	rcu_read_lock();
 	return 0;
 }
 
-static void rcu_torture_read_unlock(int idx)
+static void rcu_torture_read_unlock(int idx) __releases(RCU)
 {
 	rcu_read_unlock();
 }
@@ -250,13 +250,13 @@ static struct rcu_torture_ops rcu_ops = {
  * Definitions for rcu_bh torture testing.
  */
 
-static int rcu_bh_torture_read_lock(void)
+static int rcu_bh_torture_read_lock(void) __acquires(RCU_BH)
 {
 	rcu_read_lock_bh();
 	return 0;
 }
 
-static void rcu_bh_torture_read_unlock(int idx)
+static void rcu_bh_torture_read_unlock(int idx) __releases(RCU_BH)
 {
 	rcu_read_unlock_bh();
 }
-- 
cgit v1.2.3


From d10be6d1bdb0c901b78244872de3cc1c1b6c3fb2 Mon Sep 17 00:00:00 2001
From: Mark Huang <mlhuang@CS.Princeton.EDU>
Date: Fri, 29 Sep 2006 01:59:34 -0700
Subject: [PATCH] module_subsys: initialize earlier

Initialize module_subsys earlier (or at least earlier than devices) since
it could be used very early in the boot process if kmod loads a module
before the device initcalls.  Otherwise, kmod will crash in
kernel/module.c:mod_sysfs_setup() since the kset in module_subsys is not
initialized yet.

I only noticed this problem because occasionally, kmod loads the modules
for my SCSI and Ethernet adapters very early, during the boot process
itself.  I don't quite understand why it loads them sometimes and doesn't
load them other times.  Or who is telling kmod to do so.  Can someone
explain?

Signed-off-by: Mark Huang <mlhuang@cs.princeton.edu>
Cc: Greg KH <greg@kroah.com>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
---
 kernel/params.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'kernel')

diff --git a/kernel/params.c b/kernel/params.c
index 63d432d0ebc0..f406655d6653 100644
--- a/kernel/params.c
+++ b/kernel/params.c
@@ -699,7 +699,7 @@ static int __init param_sysfs_init(void)
 
 	return 0;
 }
-__initcall(param_sysfs_init);
+subsys_initcall(param_sysfs_init);
 
 EXPORT_SYMBOL(param_set_byte);
 EXPORT_SYMBOL(param_get_byte);
-- 
cgit v1.2.3


From 89e7e374dde1015d69d2d70797ae4053b14fa9db Mon Sep 17 00:00:00 2001
From: Josh Triplett <josht@us.ibm.com>
Date: Fri, 29 Sep 2006 01:59:36 -0700
Subject: [PATCH] timer: add lock annotation to lock_timer_base

lock_timer_base acquires a lock and returns with that lock held.  Add a
lock annotation to this function so that sparse can check callers for lock
pairing, and so that sparse will not complain about this function since it
intentionally uses the lock in this manner.

Signed-off-by: Josh Triplett <josh@freedesktop.org>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
---
 kernel/timer.c | 1 +
 1 file changed, 1 insertion(+)

(limited to 'kernel')

diff --git a/kernel/timer.c b/kernel/timer.c
index 1d7dd6267c2d..6c9fa80088ed 100644
--- a/kernel/timer.c
+++ b/kernel/timer.c
@@ -175,6 +175,7 @@ static inline void detach_timer(struct timer_list *timer,
  */
 static tvec_base_t *lock_timer_base(struct timer_list *timer,
 					unsigned long *flags)
+	__acquires(timer->base->lock)
 {
 	tvec_base_t *base;
 
-- 
cgit v1.2.3


From 6c5c934153513dc72e2d6464f39e8ef1f27c0a3e Mon Sep 17 00:00:00 2001
From: Alexey Dobriyan <adobriyan@gmail.com>
Date: Fri, 29 Sep 2006 01:59:40 -0700
Subject: [PATCH] ifdef blktrace debugging fields

Signed-off-by: Alexey Dobriyan <adobriyan@gmail.com>
Acked-by: Jens Axboe <axboe@suse.de>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
---
 block/blktrace.c       | 6 ++++--
 block/ll_rw_blk.c      | 3 +--
 include/linux/blkdev.h | 4 ++--
 include/linux/sched.h  | 3 ++-
 kernel/fork.c          | 2 ++
 5 files changed, 11 insertions(+), 7 deletions(-)

(limited to 'kernel')

diff --git a/block/blktrace.c b/block/blktrace.c
index 2b4ef2b89b8d..8ff33441d8a2 100644
--- a/block/blktrace.c
+++ b/block/blktrace.c
@@ -450,8 +450,10 @@ int blk_trace_ioctl(struct block_device *bdev, unsigned cmd, char __user *arg)
  **/
 void blk_trace_shutdown(request_queue_t *q)
 {
-	blk_trace_startstop(q, 0);
-	blk_trace_remove(q);
+	if (q->blk_trace) {
+		blk_trace_startstop(q, 0);
+		blk_trace_remove(q);
+	}
 }
 
 /*
diff --git a/block/ll_rw_blk.c b/block/ll_rw_blk.c
index 9c3a06bcb7ba..51dc0edf76e0 100644
--- a/block/ll_rw_blk.c
+++ b/block/ll_rw_blk.c
@@ -1847,8 +1847,7 @@ static void blk_release_queue(struct kobject *kobj)
 	if (q->queue_tags)
 		__blk_queue_free_tags(q);
 
-	if (q->blk_trace)
-		blk_trace_shutdown(q);
+	blk_trace_shutdown(q);
 
 	kmem_cache_free(requestq_cachep, q);
 }
diff --git a/include/linux/blkdev.h b/include/linux/blkdev.h
index c773ee545ebd..cfde8b3ee919 100644
--- a/include/linux/blkdev.h
+++ b/include/linux/blkdev.h
@@ -417,9 +417,9 @@ struct request_queue
 	unsigned int		sg_timeout;
 	unsigned int		sg_reserved_size;
 	int			node;
-
+#ifdef CONFIG_BLK_DEV_IO_TRACE
 	struct blk_trace	*blk_trace;
-
+#endif
 	/*
 	 * reserved for flush operations
 	 */
diff --git a/include/linux/sched.h b/include/linux/sched.h
index 27122575d902..3696f2f7126d 100644
--- a/include/linux/sched.h
+++ b/include/linux/sched.h
@@ -784,8 +784,9 @@ struct task_struct {
 	struct prio_array *array;
 
 	unsigned short ioprio;
+#ifdef CONFIG_BLK_DEV_IO_TRACE
 	unsigned int btrace_seq;
-
+#endif
 	unsigned long sleep_avg;
 	unsigned long long timestamp, last_ran;
 	unsigned long long sched_time; /* sched_clock time spent running */
diff --git a/kernel/fork.c b/kernel/fork.c
index 802b1cf0e63f..bca6ce6d3ded 100644
--- a/kernel/fork.c
+++ b/kernel/fork.c
@@ -183,7 +183,9 @@ static struct task_struct *dup_task_struct(struct task_struct *orig)
 	/* One for us, one for whoever does the "release_task()" (usually parent) */
 	atomic_set(&tsk->usage,2);
 	atomic_set(&tsk->fs_excl, 0);
+#ifdef CONFIG_BLK_DEV_IO_TRACE
 	tsk->btrace_seq = 0;
+#endif
 	tsk->splice_pipe = NULL;
 	return tsk;
 }
-- 
cgit v1.2.3


From db630637b2f192bea2ba1c000e9cbe4e542a03ea Mon Sep 17 00:00:00 2001
From: Steven Rostedt <rostedt@goodmis.org>
Date: Fri, 29 Sep 2006 01:59:44 -0700
Subject: [PATCH] clean up and remove some extra spinlocks from rtmutex

Oleg brought up some interesting points about grabbing the pi_lock for some
protections.  In this discussion, I realized that there are some places
that the pi_lock is being grabbed when it really wasn't necessary.  Also
this patch does a little bit of clean up.

This patch basically does three things:

1) renames the "boost" variable to "chain_walk".  Since it is used in
   the debugging case when it isn't going to be boosted.  It better
   describes what the test is going to do if it succeeds.

2) moves get_task_struct to just before the unlocking of the wait_lock.
   This removes duplicate code, and makes it a little easier to read.  The
   owner wont go away while either the pi_lock or the wait_lock are held.

3) removes the pi_locking and owner blocked checking completely from the
   debugging case.  This is because the grabbing the lock and doing the
   check, then releasing the lock is just so full of races.  It's just as
   good to go ahead and call the pi_chain_walk function, since after
   releasing the lock the owner can then block anyway, and we would have
   missed that.  For the debug case, we really do want to do the chain walk
   to test for deadlocks anyway.

[oleg@tv-sign.ru: more of the same]
Signed-of-by: Steven Rostedt <rostedt@goodmis.org>
Cc: Ingo Molnar <mingo@elte.hu>
Cc: Thomas Gleixner <tglx@linutronix.de>
Cc: Oleg Nesterov <oleg@tv-sign.ru>
Cc: Esben Nielsen <nielsen.esben@googlemail.com>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
---
 kernel/rtmutex.c | 51 +++++++++++++++++++++++++--------------------------
 1 file changed, 25 insertions(+), 26 deletions(-)

(limited to 'kernel')

diff --git a/kernel/rtmutex.c b/kernel/rtmutex.c
index 3e13a1e5856f..4ab17da46fd8 100644
--- a/kernel/rtmutex.c
+++ b/kernel/rtmutex.c
@@ -251,6 +251,7 @@ static int rt_mutex_adjust_prio_chain(struct task_struct *task,
 
 	/* Grab the next task */
 	task = rt_mutex_owner(lock);
+	get_task_struct(task);
 	spin_lock_irqsave(&task->pi_lock, flags);
 
 	if (waiter == rt_mutex_top_waiter(lock)) {
@@ -269,7 +270,6 @@ static int rt_mutex_adjust_prio_chain(struct task_struct *task,
 		__rt_mutex_adjust_prio(task);
 	}
 
-	get_task_struct(task);
 	spin_unlock_irqrestore(&task->pi_lock, flags);
 
 	top_waiter = rt_mutex_top_waiter(lock);
@@ -409,7 +409,7 @@ static int task_blocks_on_rt_mutex(struct rt_mutex *lock,
 	struct task_struct *owner = rt_mutex_owner(lock);
 	struct rt_mutex_waiter *top_waiter = waiter;
 	unsigned long flags;
-	int boost = 0, res;
+	int chain_walk = 0, res;
 
 	spin_lock_irqsave(&current->pi_lock, flags);
 	__rt_mutex_adjust_prio(current);
@@ -433,25 +433,23 @@ static int task_blocks_on_rt_mutex(struct rt_mutex *lock,
 		plist_add(&waiter->pi_list_entry, &owner->pi_waiters);
 
 		__rt_mutex_adjust_prio(owner);
-		if (owner->pi_blocked_on) {
-			boost = 1;
-			/* gets dropped in rt_mutex_adjust_prio_chain()! */
-			get_task_struct(owner);
-		}
-		spin_unlock_irqrestore(&owner->pi_lock, flags);
-	}
-	else if (debug_rt_mutex_detect_deadlock(waiter, detect_deadlock)) {
-		spin_lock_irqsave(&owner->pi_lock, flags);
-		if (owner->pi_blocked_on) {
-			boost = 1;
-			/* gets dropped in rt_mutex_adjust_prio_chain()! */
-			get_task_struct(owner);
-		}
+		if (owner->pi_blocked_on)
+			chain_walk = 1;
 		spin_unlock_irqrestore(&owner->pi_lock, flags);
 	}
-	if (!boost)
+	else if (debug_rt_mutex_detect_deadlock(waiter, detect_deadlock))
+		chain_walk = 1;
+
+	if (!chain_walk)
 		return 0;
 
+	/*
+	 * The owner can't disappear while holding a lock,
+	 * so the owner struct is protected by wait_lock.
+	 * Gets dropped in rt_mutex_adjust_prio_chain()!
+	 */
+	get_task_struct(owner);
+
 	spin_unlock(&lock->wait_lock);
 
 	res = rt_mutex_adjust_prio_chain(owner, detect_deadlock, lock, waiter,
@@ -532,7 +530,7 @@ static void remove_waiter(struct rt_mutex *lock,
 	int first = (waiter == rt_mutex_top_waiter(lock));
 	struct task_struct *owner = rt_mutex_owner(lock);
 	unsigned long flags;
-	int boost = 0;
+	int chain_walk = 0;
 
 	spin_lock_irqsave(&current->pi_lock, flags);
 	plist_del(&waiter->list_entry, &lock->wait_list);
@@ -554,19 +552,20 @@ static void remove_waiter(struct rt_mutex *lock,
 		}
 		__rt_mutex_adjust_prio(owner);
 
-		if (owner->pi_blocked_on) {
-			boost = 1;
-			/* gets dropped in rt_mutex_adjust_prio_chain()! */
-			get_task_struct(owner);
-		}
+		if (owner->pi_blocked_on)
+			chain_walk = 1;
+
 		spin_unlock_irqrestore(&owner->pi_lock, flags);
 	}
 
 	WARN_ON(!plist_node_empty(&waiter->pi_list_entry));
 
-	if (!boost)
+	if (!chain_walk)
 		return;
 
+	/* gets dropped in rt_mutex_adjust_prio_chain()! */
+	get_task_struct(owner);
+
 	spin_unlock(&lock->wait_lock);
 
 	rt_mutex_adjust_prio_chain(owner, 0, lock, NULL, current);
@@ -592,10 +591,10 @@ void rt_mutex_adjust_pi(struct task_struct *task)
 		return;
 	}
 
-	/* gets dropped in rt_mutex_adjust_prio_chain()! */
-	get_task_struct(task);
 	spin_unlock_irqrestore(&task->pi_lock, flags);
 
+	/* gets dropped in rt_mutex_adjust_prio_chain()! */
+	get_task_struct(task);
 	rt_mutex_adjust_prio_chain(task, 0, NULL, NULL, task);
 }
 
-- 
cgit v1.2.3


From 2aae4a108dab8b8bc92270335f6e4b5c146b32ae Mon Sep 17 00:00:00 2001
From: Rolf Eike Beer <eike-kernel@sf-tec.de>
Date: Fri, 29 Sep 2006 01:59:46 -0700
Subject: [PATCH] Fix kerneldoc comments in kernel/timer.c

Some of the kerneldoc comments in this file are ignored since the lead-in
is malformed, using either "/*" or "/***" instead of "/**".

[rdunlap@xenotime.net: kerneldoc fixes]
Signed-off-by: Rolf Eike Beer <eike-kernel@sf-tec.de>
Acked-by: Alan Cox <alan@redhat.com>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
---
 kernel/timer.c | 29 +++++++++++++++++------------
 1 file changed, 17 insertions(+), 12 deletions(-)

(limited to 'kernel')

diff --git a/kernel/timer.c b/kernel/timer.c
index 6c9fa80088ed..d644f4e9ca0c 100644
--- a/kernel/timer.c
+++ b/kernel/timer.c
@@ -136,7 +136,7 @@ static void internal_add_timer(tvec_base_t *base, struct timer_list *timer)
 	list_add_tail(&timer->entry, vec);
 }
 
-/***
+/**
  * init_timer - initialize a timer.
  * @timer: the timer to be initialized
  *
@@ -236,7 +236,7 @@ int __mod_timer(struct timer_list *timer, unsigned long expires)
 
 EXPORT_SYMBOL(__mod_timer);
 
-/***
+/**
  * add_timer_on - start a timer on a particular CPU
  * @timer: the timer to be added
  * @cpu: the CPU to start it on
@@ -256,9 +256,10 @@ void add_timer_on(struct timer_list *timer, int cpu)
 }
 
 
-/***
+/**
  * mod_timer - modify a timer's timeout
  * @timer: the timer to be modified
+ * @expires: new timeout in jiffies
  *
  * mod_timer is a more efficient way to update the expire field of an
  * active timer (if the timer is inactive it will be activated)
@@ -292,7 +293,7 @@ int mod_timer(struct timer_list *timer, unsigned long expires)
 
 EXPORT_SYMBOL(mod_timer);
 
-/***
+/**
  * del_timer - deactive a timer.
  * @timer: the timer to be deactivated
  *
@@ -324,7 +325,10 @@ int del_timer(struct timer_list *timer)
 EXPORT_SYMBOL(del_timer);
 
 #ifdef CONFIG_SMP
-/*
+/**
+ * try_to_del_timer_sync - Try to deactivate a timer
+ * @timer: timer do del
+ *
  * This function tries to deactivate a timer. Upon successful (ret >= 0)
  * exit the timer is not queued and the handler is not running on any CPU.
  *
@@ -352,7 +356,7 @@ out:
 	return ret;
 }
 
-/***
+/**
  * del_timer_sync - deactivate a timer and wait for the handler to finish.
  * @timer: the timer to be deactivated
  *
@@ -402,15 +406,15 @@ static int cascade(tvec_base_t *base, tvec_t *tv, int index)
 	return index;
 }
 
-/***
+#define INDEX(N) ((base->timer_jiffies >> (TVR_BITS + (N) * TVN_BITS)) & TVN_MASK)
+
+/**
  * __run_timers - run all expired timers (if any) on this CPU.
  * @base: the timer vector to be processed.
  *
  * This function cascades all vectors and executes all expired timer
  * vectors.
  */
-#define INDEX(N) ((base->timer_jiffies >> (TVR_BITS + (N) * TVN_BITS)) & TVN_MASK)
-
 static inline void __run_timers(tvec_base_t *base)
 {
 	struct timer_list *timer;
@@ -971,7 +975,7 @@ void __init timekeeping_init(void)
 
 
 static int timekeeping_suspended;
-/*
+/**
  * timekeeping_resume - Resumes the generic timekeeping subsystem.
  * @dev:	unused
  *
@@ -1107,7 +1111,7 @@ static void clocksource_adjust(struct clocksource *clock, s64 offset)
 	clock->error -= (interval - offset) << (TICK_LENGTH_SHIFT - clock->shift);
 }
 
-/*
+/**
  * update_wall_time - Uses the current clocksource to increment the wall time
  *
  * Called from the timer interrupt, must hold a write on xtime_lock.
@@ -1471,8 +1475,9 @@ asmlinkage long sys_gettid(void)
 	return current->pid;
 }
 
-/*
+/**
  * sys_sysinfo - fill in sysinfo struct
+ * @info: pointer to buffer to fill
  */ 
 asmlinkage long sys_sysinfo(struct sysinfo __user *info)
 {
-- 
cgit v1.2.3


From e6cab99bb478e067b1a7a120333ff326954a2412 Mon Sep 17 00:00:00 2001
From: Chuck Ebbert <76306.1226@compuserve.com>
Date: Fri, 29 Sep 2006 01:59:57 -0700
Subject: [PATCH] unwind: fix unused variable warning when !CONFIG_MODULES

Fix "variable defined but not used" compiler warning in unwind.c when
CONFIG_MODULES is not set.

Signed-off-by: Chuck Ebbert <76306.1226@compuserve.com>
Cc: Jan Beulich <jbeulich@novell.com>
Cc: Andi Kleen <ak@muc.de>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
---
 kernel/unwind.c | 4 +++-
 1 file changed, 3 insertions(+), 1 deletion(-)

(limited to 'kernel')

diff --git a/kernel/unwind.c b/kernel/unwind.c
index 3430475fcd88..2e2368607aab 100644
--- a/kernel/unwind.c
+++ b/kernel/unwind.c
@@ -102,7 +102,7 @@ static struct unwind_table {
 	unsigned long size;
 	struct unwind_table *link;
 	const char *name;
-} root_table, *last_table;
+} root_table;
 
 struct unwind_item {
 	enum item_location {
@@ -174,6 +174,8 @@ void __init unwind_init(void)
 
 #ifdef CONFIG_MODULES
 
+static struct unwind_table *last_table;
+
 /* Must be called with module_mutex held. */
 void *unwind_add_table(struct module *module,
                        const void *table_start,
-- 
cgit v1.2.3


From 3b9b8ab65d8eed784b9164d03807cb2bda7b5cd6 Mon Sep 17 00:00:00 2001
From: Kirill Korotaev <dev@sw.ru>
Date: Fri, 29 Sep 2006 02:00:05 -0700
Subject: [PATCH] Fix unserialized task->files changing

Fixed race on put_files_struct on exec with proc.  Restoring files on
current on error path may lead to proc having a pointer to already kfree-d
files_struct.

->files changing at exit.c and khtread.c are safe as exit_files() makes all
things under lock.

Found during OpenVZ stress testing.

[akpm@osdl.org: add export]
Signed-off-by: Pavel Emelianov <xemul@openvz.org>
Signed-off-by: Kirill Korotaev <dev@openvz.org>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
---
 fs/binfmt_elf.c      |  6 ++----
 fs/binfmt_misc.c     |  6 ++----
 fs/exec.c            |  3 +--
 include/linux/file.h |  1 +
 kernel/exit.c        | 12 ++++++++++++
 5 files changed, 18 insertions(+), 10 deletions(-)

(limited to 'kernel')

diff --git a/fs/binfmt_elf.c b/fs/binfmt_elf.c
index dfd8cfb7fb5d..bb43da5cde5c 100644
--- a/fs/binfmt_elf.c
+++ b/fs/binfmt_elf.c
@@ -1038,10 +1038,8 @@ out_free_interp:
 out_free_file:
 	sys_close(elf_exec_fileno);
 out_free_fh:
-	if (files) {
-		put_files_struct(current->files);
-		current->files = files;
-	}
+	if (files)
+		reset_files_struct(current, files);
 out_free_ph:
 	kfree(elf_phdata);
 	goto out;
diff --git a/fs/binfmt_misc.c b/fs/binfmt_misc.c
index 66ba137f8661..1713c48fef54 100644
--- a/fs/binfmt_misc.c
+++ b/fs/binfmt_misc.c
@@ -215,10 +215,8 @@ _error:
 	bprm->interp_flags = 0;
 	bprm->interp_data = 0;
 _unshare:
-	if (files) {
-		put_files_struct(current->files);
-		current->files = files;
-	}
+	if (files)
+		reset_files_struct(current, files);
 	goto _ret;
 }
 
diff --git a/fs/exec.c b/fs/exec.c
index 97df6e0aeaee..a8efe35176b0 100644
--- a/fs/exec.c
+++ b/fs/exec.c
@@ -898,8 +898,7 @@ int flush_old_exec(struct linux_binprm * bprm)
 	return 0;
 
 mmap_failed:
-	put_files_struct(current->files);
-	current->files = files;
+	reset_files_struct(current, files);
 out:
 	return retval;
 }
diff --git a/include/linux/file.h b/include/linux/file.h
index 9f7c2513866f..74183e6f7f45 100644
--- a/include/linux/file.h
+++ b/include/linux/file.h
@@ -112,5 +112,6 @@ struct task_struct;
 
 struct files_struct *get_files_struct(struct task_struct *);
 void FASTCALL(put_files_struct(struct files_struct *fs));
+void reset_files_struct(struct task_struct *, struct files_struct *);
 
 #endif /* __LINUX_FILE_H */
diff --git a/kernel/exit.c b/kernel/exit.c
index d891883420f7..4b6fb054b25d 100644
--- a/kernel/exit.c
+++ b/kernel/exit.c
@@ -487,6 +487,18 @@ void fastcall put_files_struct(struct files_struct *files)
 
 EXPORT_SYMBOL(put_files_struct);
 
+void reset_files_struct(struct task_struct *tsk, struct files_struct *files)
+{
+	struct files_struct *old;
+
+	old = tsk->files;
+	task_lock(tsk);
+	tsk->files = files;
+	task_unlock(tsk);
+	put_files_struct(old);
+}
+EXPORT_SYMBOL(reset_files_struct);
+
 static inline void __exit_files(struct task_struct *tsk)
 {
 	struct files_struct * files = tsk->files;
-- 
cgit v1.2.3


From f400e198b2ed26ce55b22a1412ded0896e7516ac Mon Sep 17 00:00:00 2001
From: Sukadev Bhattiprolu <sukadev@us.ibm.com>
Date: Fri, 29 Sep 2006 02:00:07 -0700
Subject: [PATCH] pidspace: is_init()

This is an updated version of Eric Biederman's is_init() patch.
(http://lkml.org/lkml/2006/2/6/280).  It applies cleanly to 2.6.18-rc3 and
replaces a few more instances of ->pid == 1 with is_init().

Further, is_init() checks pid and thus removes dependency on Eric's other
patches for now.

Eric's original description:

	There are a lot of places in the kernel where we test for init
	because we give it special properties.  Most  significantly init
	must not die.  This results in code all over the kernel test
	->pid == 1.

	Introduce is_init to capture this case.

	With multiple pid spaces for all of the cases affected we are
	looking for only the first process on the system, not some other
	process that has pid == 1.

Signed-off-by: Eric W. Biederman <ebiederm@xmission.com>
Signed-off-by: Sukadev Bhattiprolu <sukadev@us.ibm.com>
Cc: Dave Hansen <haveblue@us.ibm.com>
Cc: Serge Hallyn <serue@us.ibm.com>
Cc: Cedric Le Goater <clg@fr.ibm.com>
Cc: <lxc-devel@lists.sourceforge.net>
Acked-by: Paul Mackerras <paulus@samba.org>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
---
 arch/alpha/mm/fault.c                |  2 +-
 arch/arm/mm/fault.c                  |  2 +-
 arch/arm26/mm/fault.c                |  2 +-
 arch/i386/lib/usercopy.c             |  2 +-
 arch/i386/mm/fault.c                 |  2 +-
 arch/ia64/mm/fault.c                 |  2 +-
 arch/m32r/mm/fault.c                 |  2 +-
 arch/m68k/mm/fault.c                 |  2 +-
 arch/mips/mm/fault.c                 |  2 +-
 arch/powerpc/mm/fault.c              |  2 +-
 arch/powerpc/platforms/pseries/ras.c |  2 +-
 arch/ppc/kernel/traps.c              |  2 +-
 arch/ppc/mm/fault.c                  |  2 +-
 arch/s390/mm/fault.c                 |  2 +-
 arch/sh/mm/fault.c                   |  2 +-
 arch/sh64/mm/fault.c                 |  6 +++---
 arch/um/kernel/trap.c                |  2 +-
 arch/x86_64/mm/fault.c               |  4 ++--
 arch/xtensa/mm/fault.c               |  2 +-
 drivers/char/sysrq.c                 |  2 +-
 include/linux/sched.h                | 10 ++++++++++
 kernel/capability.c                  |  2 +-
 kernel/cpuset.c                      |  2 +-
 kernel/exit.c                        |  2 +-
 kernel/kexec.c                       |  2 +-
 kernel/ptrace.c                      |  1 +
 kernel/sysctl.c                      |  2 +-
 mm/oom_kill.c                        |  2 +-
 security/commoncap.c                 |  2 +-
 29 files changed, 41 insertions(+), 30 deletions(-)

(limited to 'kernel')

diff --git a/arch/alpha/mm/fault.c b/arch/alpha/mm/fault.c
index 622dabd84680..8871529a34e2 100644
--- a/arch/alpha/mm/fault.c
+++ b/arch/alpha/mm/fault.c
@@ -193,7 +193,7 @@ do_page_fault(unsigned long address, unsigned long mmcsr,
 	/* We ran out of memory, or some other thing happened to us that
 	   made us unable to handle the page fault gracefully.  */
  out_of_memory:
-	if (current->pid == 1) {
+	if (is_init(current)) {
 		yield();
 		down_read(&mm->mmap_sem);
 		goto survive;
diff --git a/arch/arm/mm/fault.c b/arch/arm/mm/fault.c
index a5b33ff3924e..5e658a874498 100644
--- a/arch/arm/mm/fault.c
+++ b/arch/arm/mm/fault.c
@@ -198,7 +198,7 @@ survive:
 		return fault;
 	}
 
-	if (tsk->pid != 1)
+	if (!is_init(tsk))
 		goto out;
 
 	/*
diff --git a/arch/arm26/mm/fault.c b/arch/arm26/mm/fault.c
index a7c4cc922095..a1f6d8a9cc32 100644
--- a/arch/arm26/mm/fault.c
+++ b/arch/arm26/mm/fault.c
@@ -185,7 +185,7 @@ survive:
 	}
 
 	fault = -3; /* out of memory */
-	if (tsk->pid != 1)
+	if (!is_init(tsk))
 		goto out;
 
 	/*
diff --git a/arch/i386/lib/usercopy.c b/arch/i386/lib/usercopy.c
index efc7e7d5f4d0..08502fc6d0cb 100644
--- a/arch/i386/lib/usercopy.c
+++ b/arch/i386/lib/usercopy.c
@@ -739,7 +739,7 @@ survive:
 			retval = get_user_pages(current, current->mm,
 					(unsigned long )to, 1, 1, 0, &pg, NULL);
 
-			if (retval == -ENOMEM && current->pid == 1) {
+			if (retval == -ENOMEM && is_init(current)) {
 				up_read(&current->mm->mmap_sem);
 				blk_congestion_wait(WRITE, HZ/50);
 				goto survive;
diff --git a/arch/i386/mm/fault.c b/arch/i386/mm/fault.c
index 50d8617391dd..2581575786c1 100644
--- a/arch/i386/mm/fault.c
+++ b/arch/i386/mm/fault.c
@@ -589,7 +589,7 @@ no_context:
  */
 out_of_memory:
 	up_read(&mm->mmap_sem);
-	if (tsk->pid == 1) {
+	if (is_init(tsk)) {
 		yield();
 		down_read(&mm->mmap_sem);
 		goto survive;
diff --git a/arch/ia64/mm/fault.c b/arch/ia64/mm/fault.c
index d8b1b4ac7f26..59f3ab937615 100644
--- a/arch/ia64/mm/fault.c
+++ b/arch/ia64/mm/fault.c
@@ -280,7 +280,7 @@ ia64_do_page_fault (unsigned long address, unsigned long isr, struct pt_regs *re
 
   out_of_memory:
 	up_read(&mm->mmap_sem);
-	if (current->pid == 1) {
+	if (is_init(current)) {
 		yield();
 		down_read(&mm->mmap_sem);
 		goto survive;
diff --git a/arch/m32r/mm/fault.c b/arch/m32r/mm/fault.c
index dc18a33eefef..8d5f551b5754 100644
--- a/arch/m32r/mm/fault.c
+++ b/arch/m32r/mm/fault.c
@@ -299,7 +299,7 @@ no_context:
  */
 out_of_memory:
 	up_read(&mm->mmap_sem);
-	if (tsk->pid == 1) {
+	if (is_init(tsk)) {
 		yield();
 		down_read(&mm->mmap_sem);
 		goto survive;
diff --git a/arch/m68k/mm/fault.c b/arch/m68k/mm/fault.c
index 5e2d87c10c87..911f2ce3f53e 100644
--- a/arch/m68k/mm/fault.c
+++ b/arch/m68k/mm/fault.c
@@ -181,7 +181,7 @@ good_area:
  */
 out_of_memory:
 	up_read(&mm->mmap_sem);
-	if (current->pid == 1) {
+	if (is_init(current)) {
 		yield();
 		down_read(&mm->mmap_sem);
 		goto survive;
diff --git a/arch/mips/mm/fault.c b/arch/mips/mm/fault.c
index a4f8c45c4e8e..8423d8590779 100644
--- a/arch/mips/mm/fault.c
+++ b/arch/mips/mm/fault.c
@@ -171,7 +171,7 @@ no_context:
  */
 out_of_memory:
 	up_read(&mm->mmap_sem);
-	if (tsk->pid == 1) {
+	if (is_init(tsk)) {
 		yield();
 		down_read(&mm->mmap_sem);
 		goto survive;
diff --git a/arch/powerpc/mm/fault.c b/arch/powerpc/mm/fault.c
index 77953f41d754..e8fa50624b70 100644
--- a/arch/powerpc/mm/fault.c
+++ b/arch/powerpc/mm/fault.c
@@ -386,7 +386,7 @@ bad_area_nosemaphore:
  */
 out_of_memory:
 	up_read(&mm->mmap_sem);
-	if (current->pid == 1) {
+	if (is_init(current)) {
 		yield();
 		down_read(&mm->mmap_sem);
 		goto survive;
diff --git a/arch/powerpc/platforms/pseries/ras.c b/arch/powerpc/platforms/pseries/ras.c
index 903115d67fdc..311ed1993fc0 100644
--- a/arch/powerpc/platforms/pseries/ras.c
+++ b/arch/powerpc/platforms/pseries/ras.c
@@ -337,7 +337,7 @@ static int recover_mce(struct pt_regs *regs, struct rtas_error_log * err)
 		   err->disposition == RTAS_DISP_NOT_RECOVERED &&
 		   err->target == RTAS_TARGET_MEMORY &&
 		   err->type == RTAS_TYPE_ECC_UNCORR &&
-		   !(current->pid == 0 || current->pid == 1)) {
+		   !(current->pid == 0 || is_init(current))) {
 		/* Kill off a user process with an ECC error */
 		printk(KERN_ERR "MCE: uncorrectable ecc error for pid %d\n",
 		       current->pid);
diff --git a/arch/ppc/kernel/traps.c b/arch/ppc/kernel/traps.c
index d7a433049b48..aafc8e8893d1 100644
--- a/arch/ppc/kernel/traps.c
+++ b/arch/ppc/kernel/traps.c
@@ -119,7 +119,7 @@ void _exception(int signr, struct pt_regs *regs, int code, unsigned long addr)
 	 * generate the same exception over and over again and we get
 	 * nowhere.  Better to kill it and let the kernel panic.
 	 */
-	if (current->pid == 1) {
+	if (is_init(current)) {
 		__sighandler_t handler;
 
 		spin_lock_irq(&current->sighand->siglock);
diff --git a/arch/ppc/mm/fault.c b/arch/ppc/mm/fault.c
index bc776beb3136..465f451f3bc3 100644
--- a/arch/ppc/mm/fault.c
+++ b/arch/ppc/mm/fault.c
@@ -291,7 +291,7 @@ bad_area:
  */
 out_of_memory:
 	up_read(&mm->mmap_sem);
-	if (current->pid == 1) {
+	if (is_init(current)) {
 		yield();
 		down_read(&mm->mmap_sem);
 		goto survive;
diff --git a/arch/s390/mm/fault.c b/arch/s390/mm/fault.c
index f2b9a84dc2bf..9c3c19fe62fc 100644
--- a/arch/s390/mm/fault.c
+++ b/arch/s390/mm/fault.c
@@ -353,7 +353,7 @@ no_context:
 */
 out_of_memory:
 	up_read(&mm->mmap_sem);
-	if (tsk->pid == 1) {
+	if (is_init(tsk)) {
 		yield();
 		down_read(&mm->mmap_sem);
 		goto survive;
diff --git a/arch/sh/mm/fault.c b/arch/sh/mm/fault.c
index 507f28914706..68663b8f99ae 100644
--- a/arch/sh/mm/fault.c
+++ b/arch/sh/mm/fault.c
@@ -149,7 +149,7 @@ no_context:
  */
 out_of_memory:
 	up_read(&mm->mmap_sem);
-	if (current->pid == 1) {
+	if (is_init(current)) {
 		yield();
 		down_read(&mm->mmap_sem);
 		goto survive;
diff --git a/arch/sh64/mm/fault.c b/arch/sh64/mm/fault.c
index f08d0eaf6497..8e2f6c28b739 100644
--- a/arch/sh64/mm/fault.c
+++ b/arch/sh64/mm/fault.c
@@ -277,7 +277,7 @@ bad_area:
 			show_regs(regs);
 #endif
 		}
-		if (tsk->pid == 1) {
+		if (is_init(tsk)) {
 			panic("INIT had user mode bad_area\n");
 		}
 		tsk->thread.address = address;
@@ -319,14 +319,14 @@ no_context:
  * us unable to handle the page fault gracefully.
  */
 out_of_memory:
-	if (current->pid == 1) {
+	if (is_init(current)) {
 		panic("INIT out of memory\n");
 		yield();
 		goto survive;
 	}
 	printk("fault:Out of memory\n");
 	up_read(&mm->mmap_sem);
-	if (current->pid == 1) {
+	if (is_init(current)) {
 		yield();
 		down_read(&mm->mmap_sem);
 		goto survive;
diff --git a/arch/um/kernel/trap.c b/arch/um/kernel/trap.c
index 61a23fff4395..c7b195c7e51f 100644
--- a/arch/um/kernel/trap.c
+++ b/arch/um/kernel/trap.c
@@ -120,7 +120,7 @@ out_nosemaphore:
  * us unable to handle the page fault gracefully.
  */
 out_of_memory:
-	if (current->pid == 1) {
+	if (is_init(current)) {
 		up_read(&mm->mmap_sem);
 		yield();
 		down_read(&mm->mmap_sem);
diff --git a/arch/x86_64/mm/fault.c b/arch/x86_64/mm/fault.c
index 9ba54cc2b5f6..3751b4788e28 100644
--- a/arch/x86_64/mm/fault.c
+++ b/arch/x86_64/mm/fault.c
@@ -244,7 +244,7 @@ static int is_errata93(struct pt_regs *regs, unsigned long address)
 
 int unhandled_signal(struct task_struct *tsk, int sig)
 {
-	if (tsk->pid == 1)
+	if (is_init(tsk))
 		return 1;
 	if (tsk->ptrace & PT_PTRACED)
 		return 0;
@@ -580,7 +580,7 @@ no_context:
  */
 out_of_memory:
 	up_read(&mm->mmap_sem);
-	if (current->pid == 1) { 
+	if (is_init(current)) {
 		yield();
 		goto again;
 	}
diff --git a/arch/xtensa/mm/fault.c b/arch/xtensa/mm/fault.c
index a945a33e85a1..dd0dbec2e57e 100644
--- a/arch/xtensa/mm/fault.c
+++ b/arch/xtensa/mm/fault.c
@@ -144,7 +144,7 @@ bad_area:
 	 */
 out_of_memory:
 	up_read(&mm->mmap_sem);
-	if (current->pid == 1) {
+	if (is_init(current)) {
 		yield();
 		down_read(&mm->mmap_sem);
 		goto survive;
diff --git a/drivers/char/sysrq.c b/drivers/char/sysrq.c
index ee3ca8f1768e..0ad6cb081db4 100644
--- a/drivers/char/sysrq.c
+++ b/drivers/char/sysrq.c
@@ -208,7 +208,7 @@ static void send_sig_all(int sig)
 	struct task_struct *p;
 
 	for_each_process(p) {
-		if (p->mm && p->pid != 1)
+		if (p->mm && !is_init(p))
 			/* Not swapper, init nor kernel thread */
 			force_sig(sig, p);
 	}
diff --git a/include/linux/sched.h b/include/linux/sched.h
index 3696f2f7126d..ed2af8671589 100644
--- a/include/linux/sched.h
+++ b/include/linux/sched.h
@@ -1033,6 +1033,16 @@ static inline int pid_alive(struct task_struct *p)
 	return p->pids[PIDTYPE_PID].pid != NULL;
 }
 
+/**
+ * is_init - check if a task structure is the first user space
+ *	     task the kernel created.
+ * @p: Task structure to be checked.
+ */
+static inline int is_init(struct task_struct *tsk)
+{
+	return tsk->pid == 1;
+}
+
 extern void free_task(struct task_struct *tsk);
 #define get_task_struct(tsk) do { atomic_inc(&(tsk)->usage); } while(0)
 
diff --git a/kernel/capability.c b/kernel/capability.c
index c7685ad00a97..edb845a6e84a 100644
--- a/kernel/capability.c
+++ b/kernel/capability.c
@@ -133,7 +133,7 @@ static inline int cap_set_all(kernel_cap_t *effective,
      int found = 0;
 
      do_each_thread(g, target) {
-             if (target == current || target->pid == 1)
+             if (target == current || is_init(target))
                      continue;
              found = 1;
 	     if (security_capset_check(target, effective, inheritable,
diff --git a/kernel/cpuset.c b/kernel/cpuset.c
index 1b32c2c04c15..584bb4e6c042 100644
--- a/kernel/cpuset.c
+++ b/kernel/cpuset.c
@@ -240,7 +240,7 @@ static struct super_block *cpuset_sb;
  * A cpuset can only be deleted if both its 'count' of using tasks
  * is zero, and its list of 'children' cpusets is empty.  Since all
  * tasks in the system use _some_ cpuset, and since there is always at
- * least one task in the system (init, pid == 1), therefore, top_cpuset
+ * least one task in the system (init), therefore, top_cpuset
  * always has either children cpusets and/or using tasks.  So we don't
  * need a special hack to ensure that top_cpuset cannot be deleted.
  *
diff --git a/kernel/exit.c b/kernel/exit.c
index 4b6fb054b25d..9961192d6055 100644
--- a/kernel/exit.c
+++ b/kernel/exit.c
@@ -219,7 +219,7 @@ static int will_become_orphaned_pgrp(int pgrp, struct task_struct *ignored_task)
 	do_each_task_pid(pgrp, PIDTYPE_PGID, p) {
 		if (p == ignored_task
 				|| p->exit_state
-				|| p->real_parent->pid == 1)
+				|| is_init(p->real_parent))
 			continue;
 		if (process_group(p->real_parent) != pgrp
 			    && p->real_parent->signal->session == p->signal->session) {
diff --git a/kernel/kexec.c b/kernel/kexec.c
index 50087ecf337e..37cad75cf494 100644
--- a/kernel/kexec.c
+++ b/kernel/kexec.c
@@ -40,7 +40,7 @@ struct resource crashk_res = {
 
 int kexec_should_crash(struct task_struct *p)
 {
-	if (in_interrupt() || !p->pid || p->pid == 1 || panic_on_oops)
+	if (in_interrupt() || !p->pid || is_init(p) || panic_on_oops)
 		return 1;
 	return 0;
 }
diff --git a/kernel/ptrace.c b/kernel/ptrace.c
index 8aad0331d82e..4d50e06fd745 100644
--- a/kernel/ptrace.c
+++ b/kernel/ptrace.c
@@ -440,6 +440,7 @@ struct task_struct *ptrace_get_task_struct(pid_t pid)
 	child = find_task_by_pid(pid);
 	if (child)
 		get_task_struct(child);
+
 	read_unlock(&tasklist_lock);
 	if (!child)
 		return ERR_PTR(-ESRCH);
diff --git a/kernel/sysctl.c b/kernel/sysctl.c
index 8bfa7d117c54..9535a3839930 100644
--- a/kernel/sysctl.c
+++ b/kernel/sysctl.c
@@ -1915,7 +1915,7 @@ int proc_dointvec_bset(ctl_table *table, int write, struct file *filp,
 		return -EPERM;
 	}
 
-	op = (current->pid == 1) ? OP_SET : OP_AND;
+	op = is_init(current) ? OP_SET : OP_AND;
 	return do_proc_dointvec(table,write,filp,buffer,lenp,ppos,
 				do_proc_dointvec_bset_conv,&op);
 }
diff --git a/mm/oom_kill.c b/mm/oom_kill.c
index bada3d03119f..f3dd79c1c367 100644
--- a/mm/oom_kill.c
+++ b/mm/oom_kill.c
@@ -255,7 +255,7 @@ static struct task_struct *select_bad_process(unsigned long *ppoints)
  */
 static void __oom_kill_task(struct task_struct *p, const char *message)
 {
-	if (p->pid == 1) {
+	if (is_init(p)) {
 		WARN_ON(1);
 		printk(KERN_WARNING "tried to kill init!\n");
 		return;
diff --git a/security/commoncap.c b/security/commoncap.c
index f50fc298cf80..5a5ef5ca7ea9 100644
--- a/security/commoncap.c
+++ b/security/commoncap.c
@@ -169,7 +169,7 @@ void cap_bprm_apply_creds (struct linux_binprm *bprm, int unsafe)
 	/* For init, we want to retain the capabilities set
 	 * in the init_task struct. Thus we skip the usual
 	 * capability rules */
-	if (current->pid != 1) {
+	if (!is_init(current)) {
 		current->cap_permitted = new_permitted;
 		current->cap_effective =
 		    cap_intersect (new_permitted, bprm->cap_effective);
-- 
cgit v1.2.3


From 99de055ac065e19ed69de961e97c6336a261b34e Mon Sep 17 00:00:00 2001
From: Dave Jones <davej@redhat.com>
Date: Fri, 29 Sep 2006 02:00:10 -0700
Subject: [PATCH] lockdep: print kernel version

Lets do the same thing we do for oopses - print out the version in the
report.  It's an extra line of output though.  We could tack it on the end
of the INFO: lines, but that screws up Ingo's pretty output.

Signed-off-by: Dave Jones <davej@redhat.com>
Cc: Ingo Molnar <mingo@elte.hu>
Cc: Arjan van de Ven <arjan@linux.intel.com>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
---
 kernel/lockdep.c | 13 +++++++++++++
 1 file changed, 13 insertions(+)

(limited to 'kernel')

diff --git a/kernel/lockdep.c b/kernel/lockdep.c
index c088e5542e84..df1c3594de31 100644
--- a/kernel/lockdep.c
+++ b/kernel/lockdep.c
@@ -36,6 +36,7 @@
 #include <linux/stacktrace.h>
 #include <linux/debug_locks.h>
 #include <linux/irqflags.h>
+#include <linux/utsname.h>
 
 #include <asm/sections.h>
 
@@ -515,6 +516,13 @@ print_circular_bug_entry(struct lock_list *target, unsigned int depth)
 	return 0;
 }
 
+static void print_kernel_version(void)
+{
+	printk("%s %.*s\n", system_utsname.release,
+		(int)strcspn(system_utsname.version, " "),
+		system_utsname.version);
+}
+
 /*
  * When a circular dependency is detected, print the
  * header first:
@@ -531,6 +539,7 @@ print_circular_bug_header(struct lock_list *entry, unsigned int depth)
 
 	printk("\n=======================================================\n");
 	printk(  "[ INFO: possible circular locking dependency detected ]\n");
+	print_kernel_version();
 	printk(  "-------------------------------------------------------\n");
 	printk("%s/%d is trying to acquire lock:\n",
 		curr->comm, curr->pid);
@@ -712,6 +721,7 @@ print_bad_irq_dependency(struct task_struct *curr,
 	printk("\n======================================================\n");
 	printk(  "[ INFO: %s-safe -> %s-unsafe lock order detected ]\n",
 		irqclass, irqclass);
+	print_kernel_version();
 	printk(  "------------------------------------------------------\n");
 	printk("%s/%d [HC%u[%lu]:SC%u[%lu]:HE%u:SE%u] is trying to acquire:\n",
 		curr->comm, curr->pid,
@@ -793,6 +803,7 @@ print_deadlock_bug(struct task_struct *curr, struct held_lock *prev,
 
 	printk("\n=============================================\n");
 	printk(  "[ INFO: possible recursive locking detected ]\n");
+	print_kernel_version();
 	printk(  "---------------------------------------------\n");
 	printk("%s/%d is trying to acquire lock:\n",
 		curr->comm, curr->pid);
@@ -1375,6 +1386,7 @@ print_irq_inversion_bug(struct task_struct *curr, struct lock_class *other,
 
 	printk("\n=========================================================\n");
 	printk(  "[ INFO: possible irq lock inversion dependency detected ]\n");
+	print_kernel_version();
 	printk(  "---------------------------------------------------------\n");
 	printk("%s/%d just changed the state of lock:\n",
 		curr->comm, curr->pid);
@@ -1469,6 +1481,7 @@ print_usage_bug(struct task_struct *curr, struct held_lock *this,
 
 	printk("\n=================================\n");
 	printk(  "[ INFO: inconsistent lock state ]\n");
+	print_kernel_version();
 	printk(  "---------------------------------\n");
 
 	printk("inconsistent {%s} -> {%s} usage.\n",
-- 
cgit v1.2.3


From a45bce49545739a940f8bd4ca85c3b7435564893 Mon Sep 17 00:00:00 2001
From: "Paul E. McKenney" <paulmck@us.ibm.com>
Date: Fri, 29 Sep 2006 02:00:11 -0700
Subject: [PATCH] memory ordering in __kfifo primitives

Both __kfifo_put() and __kfifo_get() have header comments stating that if
there is but one concurrent reader and one concurrent writer, locking is not
necessary.  This is almost the case, but a couple of memory barriers are
needed.  Another option would be to change the header comments to remove the
bit about locking not being needed, and to change the those callers who
currently don't use locking to add the required locking.  The attachment
analyzes this approach, but the patch below seems simpler.

Signed-off-by: Paul E. McKenney <paulmck@us.ibm.com>
Cc: Stelian Pop <stelian@popies.net>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
---
 kernel/kfifo.c | 28 ++++++++++++++++++++++++++++
 1 file changed, 28 insertions(+)

(limited to 'kernel')

diff --git a/kernel/kfifo.c b/kernel/kfifo.c
index 64ab045c3d9d..5d1d907378a2 100644
--- a/kernel/kfifo.c
+++ b/kernel/kfifo.c
@@ -122,6 +122,13 @@ unsigned int __kfifo_put(struct kfifo *fifo,
 
 	len = min(len, fifo->size - fifo->in + fifo->out);
 
+	/*
+	 * Ensure that we sample the fifo->out index -before- we
+	 * start putting bytes into the kfifo.
+	 */
+
+	smp_mb();
+
 	/* first put the data starting from fifo->in to buffer end */
 	l = min(len, fifo->size - (fifo->in & (fifo->size - 1)));
 	memcpy(fifo->buffer + (fifo->in & (fifo->size - 1)), buffer, l);
@@ -129,6 +136,13 @@ unsigned int __kfifo_put(struct kfifo *fifo,
 	/* then put the rest (if any) at the beginning of the buffer */
 	memcpy(fifo->buffer, buffer + l, len - l);
 
+	/*
+	 * Ensure that we add the bytes to the kfifo -before-
+	 * we update the fifo->in index.
+	 */
+
+	smp_wmb();
+
 	fifo->in += len;
 
 	return len;
@@ -154,6 +168,13 @@ unsigned int __kfifo_get(struct kfifo *fifo,
 
 	len = min(len, fifo->in - fifo->out);
 
+	/*
+	 * Ensure that we sample the fifo->in index -before- we
+	 * start removing bytes from the kfifo.
+	 */
+
+	smp_rmb();
+
 	/* first get the data from fifo->out until the end of the buffer */
 	l = min(len, fifo->size - (fifo->out & (fifo->size - 1)));
 	memcpy(buffer, fifo->buffer + (fifo->out & (fifo->size - 1)), l);
@@ -161,6 +182,13 @@ unsigned int __kfifo_get(struct kfifo *fifo,
 	/* then get the rest (if any) from the beginning of the buffer */
 	memcpy(buffer + l, fifo->buffer, len - l);
 
+	/*
+	 * Ensure that we remove the bytes from the kfifo -before-
+	 * we update the fifo->out index.
+	 */
+
+	smp_mb();
+
 	fifo->out += len;
 
 	return len;
-- 
cgit v1.2.3


From 07dccf3344010f9b9df7fe725da7e73bca2992df Mon Sep 17 00:00:00 2001
From: Akinobu Mita <mita@miraclelinux.com>
Date: Fri, 29 Sep 2006 02:00:22 -0700
Subject: [PATCH] check return value of cpu_callback

Spawing ksoftirqd, migration, or watchdog, and calling init_timers_cpu()
may fail with small memory.  If it happens in initcalls, kernel NULL
pointer dereference happens later.  This patch makes crash happen
immediately in such cases.  It seems a bit better than getting kernel NULL
pointer dereference later.

Cc: Ingo Molnar <mingo@elte.hu>
Signed-off-by: Akinobu Mita <mita@miraclelinux.com>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
---
 kernel/sched.c      | 4 +++-
 kernel/softirq.c    | 4 +++-
 kernel/softlockup.c | 3 ++-
 kernel/timer.c      | 4 +++-
 4 files changed, 11 insertions(+), 4 deletions(-)

(limited to 'kernel')

diff --git a/kernel/sched.c b/kernel/sched.c
index 5c848fd4e461..b946209d9c15 100644
--- a/kernel/sched.c
+++ b/kernel/sched.c
@@ -5272,9 +5272,11 @@ static struct notifier_block __cpuinitdata migration_notifier = {
 int __init migration_init(void)
 {
 	void *cpu = (void *)(long)smp_processor_id();
+	int err;
 
 	/* Start one for the boot CPU: */
-	migration_call(&migration_notifier, CPU_UP_PREPARE, cpu);
+	err = migration_call(&migration_notifier, CPU_UP_PREPARE, cpu);
+	BUG_ON(err == NOTIFY_BAD);
 	migration_call(&migration_notifier, CPU_ONLINE, cpu);
 	register_cpu_notifier(&migration_notifier);
 
diff --git a/kernel/softirq.c b/kernel/softirq.c
index 3789ca98197c..bf25015dce16 100644
--- a/kernel/softirq.c
+++ b/kernel/softirq.c
@@ -612,7 +612,9 @@ static struct notifier_block __cpuinitdata cpu_nfb = {
 __init int spawn_ksoftirqd(void)
 {
 	void *cpu = (void *)(long)smp_processor_id();
-	cpu_callback(&cpu_nfb, CPU_UP_PREPARE, cpu);
+	int err = cpu_callback(&cpu_nfb, CPU_UP_PREPARE, cpu);
+
+	BUG_ON(err == NOTIFY_BAD);
 	cpu_callback(&cpu_nfb, CPU_ONLINE, cpu);
 	register_cpu_notifier(&cpu_nfb);
 	return 0;
diff --git a/kernel/softlockup.c b/kernel/softlockup.c
index 03e6a2b0b787..50afeb813305 100644
--- a/kernel/softlockup.c
+++ b/kernel/softlockup.c
@@ -149,8 +149,9 @@ static struct notifier_block __cpuinitdata cpu_nfb = {
 __init void spawn_softlockup_task(void)
 {
 	void *cpu = (void *)(long)smp_processor_id();
+	int err = cpu_callback(&cpu_nfb, CPU_UP_PREPARE, cpu);
 
-	cpu_callback(&cpu_nfb, CPU_UP_PREPARE, cpu);
+	BUG_ON(err == NOTIFY_BAD);
 	cpu_callback(&cpu_nfb, CPU_ONLINE, cpu);
 	register_cpu_notifier(&cpu_nfb);
 
diff --git a/kernel/timer.c b/kernel/timer.c
index d644f4e9ca0c..a2cb1ecb1b28 100644
--- a/kernel/timer.c
+++ b/kernel/timer.c
@@ -1694,8 +1694,10 @@ static struct notifier_block __cpuinitdata timers_nb = {
 
 void __init init_timers(void)
 {
-	timer_cpu_notify(&timers_nb, (unsigned long)CPU_UP_PREPARE,
+	int err = timer_cpu_notify(&timers_nb, (unsigned long)CPU_UP_PREPARE,
 				(void *)(long)smp_processor_id());
+
+	BUG_ON(err == NOTIFY_BAD);
 	register_cpu_notifier(&timers_nb);
 	open_softirq(TIMER_SOFTIRQ, run_timer_softirq, NULL);
 }
-- 
cgit v1.2.3


From 1711ef3866b0360e102327389fe4b76c849bbe83 Mon Sep 17 00:00:00 2001
From: Toyo Abe <toyoa@mvista.com>
Date: Fri, 29 Sep 2006 02:00:28 -0700
Subject: [PATCH] posix-timers: Fix clock_nanosleep() doesn't return the
 remaining time in compatibility mode

The clock_nanosleep() function does not return the time remaining when the
sleep is interrupted by a signal.

This patch creates a new call out, compat_clock_nanosleep_restart(), which
handles returning the remaining time after a sleep is interrupted.  This
patch revives clock_nanosleep_restart().  It is now accessed via the new
call out.  The compat_clock_nanosleep_restart() is used for compatibility
access.

Since this is implemented in compatibility mode the normal path is
virtually unaffected - no real performance impact.

Signed-off-by: Toyo Abe <toyoa@mvista.com>
Cc: Thomas Gleixner <tglx@linutronix.de>
Cc: Ingo Molnar <mingo@elte.hu>
Cc: Roland McGrath <roland@redhat.com>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
---
 include/linux/hrtimer.h      |  1 +
 include/linux/posix-timers.h |  4 ++++
 kernel/compat.c              | 33 +++++++++++++++++++++++++++++++++
 kernel/hrtimer.c             | 20 ++++++++++----------
 kernel/posix-cpu-timers.c    | 17 ++++++++++++-----
 kernel/posix-timers.c        | 21 +++++++++++++++++++++
 6 files changed, 81 insertions(+), 15 deletions(-)

(limited to 'kernel')

diff --git a/include/linux/hrtimer.h b/include/linux/hrtimer.h
index 4fc379de6c2f..fca93025ab51 100644
--- a/include/linux/hrtimer.h
+++ b/include/linux/hrtimer.h
@@ -138,6 +138,7 @@ extern long hrtimer_nanosleep(struct timespec *rqtp,
 			      struct timespec __user *rmtp,
 			      const enum hrtimer_mode mode,
 			      const clockid_t clockid);
+extern long hrtimer_nanosleep_restart(struct restart_block *restart_block);
 
 extern void hrtimer_init_sleeper(struct hrtimer_sleeper *sl,
 				 struct task_struct *tsk);
diff --git a/include/linux/posix-timers.h b/include/linux/posix-timers.h
index 95572c434bc9..a7dd38f30ade 100644
--- a/include/linux/posix-timers.h
+++ b/include/linux/posix-timers.h
@@ -72,6 +72,7 @@ struct k_clock {
 	int (*timer_create) (struct k_itimer *timer);
 	int (*nsleep) (const clockid_t which_clock, int flags,
 		       struct timespec *, struct timespec __user *);
+	long (*nsleep_restart) (struct restart_block *restart_block);
 	int (*timer_set) (struct k_itimer * timr, int flags,
 			  struct itimerspec * new_setting,
 			  struct itimerspec * old_setting);
@@ -97,6 +98,7 @@ int posix_cpu_clock_set(const clockid_t which_clock, const struct timespec *ts);
 int posix_cpu_timer_create(struct k_itimer *timer);
 int posix_cpu_nsleep(const clockid_t which_clock, int flags,
 		     struct timespec *rqtp, struct timespec __user *rmtp);
+long posix_cpu_nsleep_restart(struct restart_block *restart_block);
 int posix_cpu_timer_set(struct k_itimer *timer, int flags,
 			struct itimerspec *new, struct itimerspec *old);
 int posix_cpu_timer_del(struct k_itimer *timer);
@@ -111,4 +113,6 @@ void posix_cpu_timers_exit_group(struct task_struct *task);
 void set_process_cpu_timer(struct task_struct *task, unsigned int clock_idx,
 			   cputime_t *newval, cputime_t *oldval);
 
+long clock_nanosleep_restart(struct restart_block *restart_block);
+
 #endif
diff --git a/kernel/compat.c b/kernel/compat.c
index 126dee9530aa..75573e5d27b0 100644
--- a/kernel/compat.c
+++ b/kernel/compat.c
@@ -22,6 +22,7 @@
 #include <linux/security.h>
 #include <linux/timex.h>
 #include <linux/migrate.h>
+#include <linux/posix-timers.h>
 
 #include <asm/uaccess.h>
 
@@ -601,6 +602,30 @@ long compat_sys_clock_getres(clockid_t which_clock,
 	return err;
 } 
 
+static long compat_clock_nanosleep_restart(struct restart_block *restart)
+{
+	long err;
+	mm_segment_t oldfs;
+	struct timespec tu;
+	struct compat_timespec *rmtp = (struct compat_timespec *)(restart->arg1);
+
+	restart->arg1 = (unsigned long) &tu;
+	oldfs = get_fs();
+	set_fs(KERNEL_DS);
+	err = clock_nanosleep_restart(restart);
+	set_fs(oldfs);
+
+	if ((err == -ERESTART_RESTARTBLOCK) && rmtp &&
+	    put_compat_timespec(&tu, rmtp))
+		return -EFAULT;
+
+	if (err == -ERESTART_RESTARTBLOCK) {
+		restart->fn = compat_clock_nanosleep_restart;
+		restart->arg1 = (unsigned long) rmtp;
+	}
+	return err;
+}
+
 long compat_sys_clock_nanosleep(clockid_t which_clock, int flags,
 			    struct compat_timespec __user *rqtp,
 			    struct compat_timespec __user *rmtp)
@@ -608,6 +633,7 @@ long compat_sys_clock_nanosleep(clockid_t which_clock, int flags,
 	long err;
 	mm_segment_t oldfs;
 	struct timespec in, out; 
+	struct restart_block *restart;
 
 	if (get_compat_timespec(&in, rqtp)) 
 		return -EFAULT;
@@ -618,9 +644,16 @@ long compat_sys_clock_nanosleep(clockid_t which_clock, int flags,
 				  (struct timespec __user *) &in,
 				  (struct timespec __user *) &out);
 	set_fs(oldfs);
+
 	if ((err == -ERESTART_RESTARTBLOCK) && rmtp &&
 	    put_compat_timespec(&out, rmtp))
 		return -EFAULT;
+
+	if (err == -ERESTART_RESTARTBLOCK) {
+		restart = &current_thread_info()->restart_block;
+		restart->fn = compat_clock_nanosleep_restart;
+		restart->arg1 = (unsigned long) rmtp;
+	}
 	return err;	
 } 
 
diff --git a/kernel/hrtimer.c b/kernel/hrtimer.c
index 21c38a7e666b..d0ba190dfeb6 100644
--- a/kernel/hrtimer.c
+++ b/kernel/hrtimer.c
@@ -693,7 +693,7 @@ static int __sched do_nanosleep(struct hrtimer_sleeper *t, enum hrtimer_mode mod
 	return t->task == NULL;
 }
 
-static long __sched nanosleep_restart(struct restart_block *restart)
+long __sched hrtimer_nanosleep_restart(struct restart_block *restart)
 {
 	struct hrtimer_sleeper t;
 	struct timespec __user *rmtp;
@@ -702,13 +702,13 @@ static long __sched nanosleep_restart(struct restart_block *restart)
 
 	restart->fn = do_no_restart_syscall;
 
-	hrtimer_init(&t.timer, restart->arg3, HRTIMER_ABS);
-	t.timer.expires.tv64 = ((u64)restart->arg1 << 32) | (u64) restart->arg0;
+	hrtimer_init(&t.timer, restart->arg0, HRTIMER_ABS);
+	t.timer.expires.tv64 = ((u64)restart->arg3 << 32) | (u64) restart->arg2;
 
 	if (do_nanosleep(&t, HRTIMER_ABS))
 		return 0;
 
-	rmtp = (struct timespec __user *) restart->arg2;
+	rmtp = (struct timespec __user *) restart->arg1;
 	if (rmtp) {
 		time = ktime_sub(t.timer.expires, t.timer.base->get_time());
 		if (time.tv64 <= 0)
@@ -718,7 +718,7 @@ static long __sched nanosleep_restart(struct restart_block *restart)
 			return -EFAULT;
 	}
 
-	restart->fn = nanosleep_restart;
+	restart->fn = hrtimer_nanosleep_restart;
 
 	/* The other values in restart are already filled in */
 	return -ERESTART_RESTARTBLOCK;
@@ -751,11 +751,11 @@ long hrtimer_nanosleep(struct timespec *rqtp, struct timespec __user *rmtp,
 	}
 
 	restart = &current_thread_info()->restart_block;
-	restart->fn = nanosleep_restart;
-	restart->arg0 = t.timer.expires.tv64 & 0xFFFFFFFF;
-	restart->arg1 = t.timer.expires.tv64 >> 32;
-	restart->arg2 = (unsigned long) rmtp;
-	restart->arg3 = (unsigned long) t.timer.base->index;
+	restart->fn = hrtimer_nanosleep_restart;
+	restart->arg0 = (unsigned long) t.timer.base->index;
+	restart->arg1 = (unsigned long) rmtp;
+	restart->arg2 = t.timer.expires.tv64 & 0xFFFFFFFF;
+	restart->arg3 = t.timer.expires.tv64 >> 32;
 
 	return -ERESTART_RESTARTBLOCK;
 }
diff --git a/kernel/posix-cpu-timers.c b/kernel/posix-cpu-timers.c
index d38d9ec3276c..5667fef89dd1 100644
--- a/kernel/posix-cpu-timers.c
+++ b/kernel/posix-cpu-timers.c
@@ -1393,8 +1393,6 @@ void set_process_cpu_timer(struct task_struct *tsk, unsigned int clock_idx,
 	}
 }
 
-static long posix_cpu_clock_nanosleep_restart(struct restart_block *);
-
 int posix_cpu_nsleep(const clockid_t which_clock, int flags,
 		     struct timespec *rqtp, struct timespec __user *rmtp)
 {
@@ -1471,7 +1469,7 @@ int posix_cpu_nsleep(const clockid_t which_clock, int flags,
 		    copy_to_user(rmtp, &it.it_value, sizeof *rmtp))
 			return -EFAULT;
 
-		restart_block->fn = posix_cpu_clock_nanosleep_restart;
+		restart_block->fn = posix_cpu_nsleep_restart;
 		/* Caller already set restart_block->arg1 */
 		restart_block->arg0 = which_clock;
 		restart_block->arg1 = (unsigned long) rmtp;
@@ -1484,8 +1482,7 @@ int posix_cpu_nsleep(const clockid_t which_clock, int flags,
 	return error;
 }
 
-static long
-posix_cpu_clock_nanosleep_restart(struct restart_block *restart_block)
+long posix_cpu_nsleep_restart(struct restart_block *restart_block)
 {
 	clockid_t which_clock = restart_block->arg0;
 	struct timespec __user *rmtp;
@@ -1524,6 +1521,10 @@ static int process_cpu_nsleep(const clockid_t which_clock, int flags,
 {
 	return posix_cpu_nsleep(PROCESS_CLOCK, flags, rqtp, rmtp);
 }
+static long process_cpu_nsleep_restart(struct restart_block *restart_block)
+{
+	return -EINVAL;
+}
 static int thread_cpu_clock_getres(const clockid_t which_clock,
 				   struct timespec *tp)
 {
@@ -1544,6 +1545,10 @@ static int thread_cpu_nsleep(const clockid_t which_clock, int flags,
 {
 	return -EINVAL;
 }
+static long thread_cpu_nsleep_restart(struct restart_block *restart_block)
+{
+	return -EINVAL;
+}
 
 static __init int init_posix_cpu_timers(void)
 {
@@ -1553,6 +1558,7 @@ static __init int init_posix_cpu_timers(void)
 		.clock_set = do_posix_clock_nosettime,
 		.timer_create = process_cpu_timer_create,
 		.nsleep = process_cpu_nsleep,
+		.nsleep_restart = process_cpu_nsleep_restart,
 	};
 	struct k_clock thread = {
 		.clock_getres = thread_cpu_clock_getres,
@@ -1560,6 +1566,7 @@ static __init int init_posix_cpu_timers(void)
 		.clock_set = do_posix_clock_nosettime,
 		.timer_create = thread_cpu_timer_create,
 		.nsleep = thread_cpu_nsleep,
+		.nsleep_restart = thread_cpu_nsleep_restart,
 	};
 
 	register_posix_clock(CLOCK_PROCESS_CPUTIME_ID, &process);
diff --git a/kernel/posix-timers.c b/kernel/posix-timers.c
index ac6dc8744429..e5ebcc1ec3a0 100644
--- a/kernel/posix-timers.c
+++ b/kernel/posix-timers.c
@@ -973,3 +973,24 @@ sys_clock_nanosleep(const clockid_t which_clock, int flags,
 	return CLOCK_DISPATCH(which_clock, nsleep,
 			      (which_clock, flags, &t, rmtp));
 }
+
+/*
+ * nanosleep_restart for monotonic and realtime clocks
+ */
+static int common_nsleep_restart(struct restart_block *restart_block)
+{
+	return hrtimer_nanosleep_restart(restart_block);
+}
+
+/*
+ * This will restart clock_nanosleep. This is required only by
+ * compat_clock_nanosleep_restart for now.
+ */
+long
+clock_nanosleep_restart(struct restart_block *restart_block)
+{
+	clockid_t which_clock = restart_block->arg0;
+
+	return CLOCK_DISPATCH(which_clock, nsleep_restart,
+			      (restart_block));
+}
-- 
cgit v1.2.3


From e4b765551aa6355eae60b644bed851a9477c4e2b Mon Sep 17 00:00:00 2001
From: Toyo Abe <toyoa@mvista.com>
Date: Fri, 29 Sep 2006 02:00:29 -0700
Subject: [PATCH] posix-timers: Fix the flags handling in posix_cpu_nsleep()

When a posix_cpu_nsleep() sleep is interrupted by a signal more than twice, it
incorrectly reports the sleep time remaining to the user.  Because
posix_cpu_nsleep() doesn't report back to the user when it's called from
restart function due to the wrong flags handling.

This patch, which applies after previous one, moves the nanosleep() function
from posix_cpu_nsleep() to do_cpu_nanosleep() and cleans up the flags handling
appropriately.

Signed-off-by: Toyo Abe <toyoa@mvista.com>
Cc: Thomas Gleixner <tglx@linutronix.de>
Cc: Ingo Molnar <mingo@elte.hu>
Cc: Roland McGrath <roland@redhat.com>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
---
 kernel/posix-cpu-timers.c | 84 ++++++++++++++++++++++++++++++++---------------
 1 file changed, 58 insertions(+), 26 deletions(-)

(limited to 'kernel')

diff --git a/kernel/posix-cpu-timers.c b/kernel/posix-cpu-timers.c
index 5667fef89dd1..479b16b44f79 100644
--- a/kernel/posix-cpu-timers.c
+++ b/kernel/posix-cpu-timers.c
@@ -1393,22 +1393,12 @@ void set_process_cpu_timer(struct task_struct *tsk, unsigned int clock_idx,
 	}
 }
 
-int posix_cpu_nsleep(const clockid_t which_clock, int flags,
-		     struct timespec *rqtp, struct timespec __user *rmtp)
+static int do_cpu_nanosleep(const clockid_t which_clock, int flags,
+			    struct timespec *rqtp, struct itimerspec *it)
 {
-	struct restart_block *restart_block =
-	    &current_thread_info()->restart_block;
 	struct k_itimer timer;
 	int error;
 
-	/*
-	 * Diagnose required errors first.
-	 */
-	if (CPUCLOCK_PERTHREAD(which_clock) &&
-	    (CPUCLOCK_PID(which_clock) == 0 ||
-	     CPUCLOCK_PID(which_clock) == current->pid))
-		return -EINVAL;
-
 	/*
 	 * Set up a temporary timer and then wait for it to go off.
 	 */
@@ -1420,11 +1410,12 @@ int posix_cpu_nsleep(const clockid_t which_clock, int flags,
 	timer.it_process = current;
 	if (!error) {
 		static struct itimerspec zero_it;
-		struct itimerspec it = { .it_value = *rqtp,
-					 .it_interval = {} };
+
+		memset(it, 0, sizeof *it);
+		it->it_value = *rqtp;
 
 		spin_lock_irq(&timer.it_lock);
-		error = posix_cpu_timer_set(&timer, flags, &it, NULL);
+		error = posix_cpu_timer_set(&timer, flags, it, NULL);
 		if (error) {
 			spin_unlock_irq(&timer.it_lock);
 			return error;
@@ -1452,33 +1443,56 @@ int posix_cpu_nsleep(const clockid_t which_clock, int flags,
 		 * We were interrupted by a signal.
 		 */
 		sample_to_timespec(which_clock, timer.it.cpu.expires, rqtp);
-		posix_cpu_timer_set(&timer, 0, &zero_it, &it);
+		posix_cpu_timer_set(&timer, 0, &zero_it, it);
 		spin_unlock_irq(&timer.it_lock);
 
-		if ((it.it_value.tv_sec | it.it_value.tv_nsec) == 0) {
+		if ((it->it_value.tv_sec | it->it_value.tv_nsec) == 0) {
 			/*
 			 * It actually did fire already.
 			 */
 			return 0;
 		}
 
+		error = -ERESTART_RESTARTBLOCK;
+	}
+
+	return error;
+}
+
+int posix_cpu_nsleep(const clockid_t which_clock, int flags,
+		     struct timespec *rqtp, struct timespec __user *rmtp)
+{
+	struct restart_block *restart_block =
+	    &current_thread_info()->restart_block;
+	struct itimerspec it;
+	int error;
+
+	/*
+	 * Diagnose required errors first.
+	 */
+	if (CPUCLOCK_PERTHREAD(which_clock) &&
+	    (CPUCLOCK_PID(which_clock) == 0 ||
+	     CPUCLOCK_PID(which_clock) == current->pid))
+		return -EINVAL;
+
+	error = do_cpu_nanosleep(which_clock, flags, rqtp, &it);
+
+	if (error == -ERESTART_RESTARTBLOCK) {
+
+	       	if (flags & TIMER_ABSTIME)
+			return -ERESTARTNOHAND;
 		/*
-		 * Report back to the user the time still remaining.
-		 */
-		if (rmtp != NULL && !(flags & TIMER_ABSTIME) &&
-		    copy_to_user(rmtp, &it.it_value, sizeof *rmtp))
+	 	 * Report back to the user the time still remaining.
+	 	 */
+		if (rmtp != NULL && copy_to_user(rmtp, &it.it_value, sizeof *rmtp))
 			return -EFAULT;
 
 		restart_block->fn = posix_cpu_nsleep_restart;
-		/* Caller already set restart_block->arg1 */
 		restart_block->arg0 = which_clock;
 		restart_block->arg1 = (unsigned long) rmtp;
 		restart_block->arg2 = rqtp->tv_sec;
 		restart_block->arg3 = rqtp->tv_nsec;
-
-		error = -ERESTART_RESTARTBLOCK;
 	}
-
 	return error;
 }
 
@@ -1487,13 +1501,31 @@ long posix_cpu_nsleep_restart(struct restart_block *restart_block)
 	clockid_t which_clock = restart_block->arg0;
 	struct timespec __user *rmtp;
 	struct timespec t;
+	struct itimerspec it;
+	int error;
 
 	rmtp = (struct timespec __user *) restart_block->arg1;
 	t.tv_sec = restart_block->arg2;
 	t.tv_nsec = restart_block->arg3;
 
 	restart_block->fn = do_no_restart_syscall;
-	return posix_cpu_nsleep(which_clock, TIMER_ABSTIME, &t, rmtp);
+	error = do_cpu_nanosleep(which_clock, TIMER_ABSTIME, &t, &it);
+
+	if (error == -ERESTART_RESTARTBLOCK) {
+		/*
+	 	 * Report back to the user the time still remaining.
+	 	 */
+		if (rmtp != NULL && copy_to_user(rmtp, &it.it_value, sizeof *rmtp))
+			return -EFAULT;
+
+		restart_block->fn = posix_cpu_nsleep_restart;
+		restart_block->arg0 = which_clock;
+		restart_block->arg1 = (unsigned long) rmtp;
+		restart_block->arg2 = t.tv_sec;
+		restart_block->arg3 = t.tv_nsec;
+	}
+	return error;
+
 }
 
 
-- 
cgit v1.2.3


From b9ecb2bd5d3ab8904752685696cb76aac1f3fef2 Mon Sep 17 00:00:00 2001
From: Roland McGrath <roland@redhat.com>
Date: Fri, 29 Sep 2006 02:00:31 -0700
Subject: [PATCH] has_stopped_jobs() cleanup

This check has been obsolete since the introduction of TASK_TRACED.  Now
TASK_STOPPED always means job control stop.

Signed-off-by: Roland McGrath <roland@redhat.com>
Cc: Oleg Nesterov <oleg@tv-sign.ru>
Cc: Ingo Molnar <mingo@elte.hu>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
---
 kernel/exit.c | 11 -----------
 1 file changed, 11 deletions(-)

(limited to 'kernel')

diff --git a/kernel/exit.c b/kernel/exit.c
index 9961192d6055..3ec7b10eae38 100644
--- a/kernel/exit.c
+++ b/kernel/exit.c
@@ -249,17 +249,6 @@ static int has_stopped_jobs(int pgrp)
 	do_each_task_pid(pgrp, PIDTYPE_PGID, p) {
 		if (p->state != TASK_STOPPED)
 			continue;
-
-		/* If p is stopped by a debugger on a signal that won't
-		   stop it, then don't count p as stopped.  This isn't
-		   perfect but it's a good approximation.  */
-		if (unlikely (p->ptrace)
-		    && p->exit_code != SIGSTOP
-		    && p->exit_code != SIGTSTP
-		    && p->exit_code != SIGTTOU
-		    && p->exit_code != SIGTTIN)
-			continue;
-
 		retval = 1;
 		break;
 	} while_each_task_pid(pgrp, PIDTYPE_PGID, p);
-- 
cgit v1.2.3


From 27d91e07f9b863fa94491ffafe250580f0c2ce78 Mon Sep 17 00:00:00 2001
From: Roland McGrath <roland@redhat.com>
Date: Fri, 29 Sep 2006 02:00:31 -0700
Subject: [PATCH] __dequeue_signal() cleanup

This tightens up __dequeue_signal a little.  It also avoids doing
recalc_sigpending twice in a row, instead doing it once in dequeue_signal.

Signed-off-by: Roland McGrath <roland@redhat.com>
Cc: Oleg Nesterov <oleg@tv-sign.ru>
Cc: Ingo Molnar <mingo@elte.hu>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
---
 kernel/signal.c | 6 ++----
 1 file changed, 2 insertions(+), 4 deletions(-)

(limited to 'kernel')

diff --git a/kernel/signal.c b/kernel/signal.c
index 05853a7337e3..fb5da6d19f14 100644
--- a/kernel/signal.c
+++ b/kernel/signal.c
@@ -417,9 +417,8 @@ static int collect_signal(int sig, struct sigpending *list, siginfo_t *info)
 static int __dequeue_signal(struct sigpending *pending, sigset_t *mask,
 			siginfo_t *info)
 {
-	int sig = 0;
+	int sig = next_signal(pending, mask);
 
-	sig = next_signal(pending, mask);
 	if (sig) {
 		if (current->notifier) {
 			if (sigismember(current->notifier_mask, sig)) {
@@ -432,9 +431,7 @@ static int __dequeue_signal(struct sigpending *pending, sigset_t *mask,
 
 		if (!collect_signal(sig, pending, info))
 			sig = 0;
-				
 	}
-	recalc_sigpending();
 
 	return sig;
 }
@@ -451,6 +448,7 @@ int dequeue_signal(struct task_struct *tsk, sigset_t *mask, siginfo_t *info)
 	if (!signr)
 		signr = __dequeue_signal(&tsk->signal->shared_pending,
 					 mask, info);
+	recalc_sigpending_tsk(tsk);
  	if (signr && unlikely(sig_kernel_stop(signr))) {
  		/*
  		 * Set a marker that we have dequeued a stop signal.  Our
-- 
cgit v1.2.3


From 3171a0305d62e6627a24bff35af4f997e4988a80 Mon Sep 17 00:00:00 2001
From: Atsushi Nemoto <anemo@mba.ocn.ne.jp>
Date: Fri, 29 Sep 2006 02:00:32 -0700
Subject: [PATCH] simplify update_times (avoid jiffies/jiffies_64 aliasing
 problem)

Pass ticks to do_timer() and update_times(), and adjust x86_64 and s390
timer interrupt handler with this change.

Currently update_times() calculates ticks by "jiffies - wall_jiffies", but
callers of do_timer() should know how many ticks to update.  Passing ticks
get rid of this redundant calculation.  Also there are another redundancy
pointed out by Martin Schwidefsky.

This cleanup make a barrier added by
5aee405c662ca644980c184774277fc6d0769a84 needless.  So this patch removes
it.

As a bonus, this cleanup make wall_jiffies can be removed easily, since now
wall_jiffies is always synced with jiffies.  (This patch does not really
remove wall_jiffies.  It would be another cleanup patch)

Signed-off-by: Atsushi Nemoto <anemo@mba.ocn.ne.jp>
Cc: Martin Schwidefsky <schwidefsky@de.ibm.com>
Cc: "Eric W. Biederman" <ebiederm@xmission.com>
Cc: Thomas Gleixner <tglx@linutronix.de>
Cc: Ingo Molnar <mingo@elte.hu>
Cc: john stultz <johnstul@us.ibm.com>
Cc: Andi Kleen <ak@muc.de>
Cc: Paul Mackerras <paulus@samba.org>
Cc: Benjamin Herrenschmidt <benh@kernel.crashing.org>
Cc: Richard Henderson <rth@twiddle.net>
Cc: Ivan Kokshaysky <ink@jurassic.park.msu.ru>
Acked-by: Russell King <rmk@arm.linux.org.uk>
Cc: Ian Molton <spyro@f2s.com>
Cc: Mikael Starvik <starvik@axis.com>
Acked-by: David Howells <dhowells@redhat.com>
Cc: Yoshinori Sato <ysato@users.sourceforge.jp>
Cc: Hirokazu Takata <takata.hirokazu@renesas.com>
Acked-by: Ralf Baechle <ralf@linux-mips.org>
Cc: Kyle McMartin <kyle@mcmartin.ca>
Cc: Heiko Carstens <heiko.carstens@de.ibm.com>
Cc: Martin Schwidefsky <schwidefsky@de.ibm.com>
Cc: Paul Mundt <lethal@linux-sh.org>
Cc: Kazumoto Kojima <kkojima@rr.iij4u.or.jp>
Cc: Richard Curnow <rc@rc0.org.uk>
Cc: William Lee Irwin III <wli@holomorphy.com>
Cc: "David S. Miller" <davem@davemloft.net>
Cc: Jeff Dike <jdike@addtoit.com>
Cc: Paolo 'Blaisorblade' Giarrusso <blaisorblade@yahoo.it>
Cc: Miles Bader <uclinux-v850@lsi.nec.co.jp>
Cc: Chris Zankel <chris@zankel.net>
Acked-by: "Luck, Tony" <tony.luck@intel.com>
Cc: Geert Uytterhoeven <geert@linux-m68k.org>
Cc: Roman Zippel <zippel@linux-m68k.org>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
---
 arch/alpha/kernel/time.c                 |  2 +-
 arch/arm/kernel/time.c                   |  2 +-
 arch/arm26/kernel/time.c                 |  2 +-
 arch/avr32/kernel/time.c                 |  2 +-
 arch/cris/arch-v10/kernel/time.c         |  2 +-
 arch/cris/arch-v32/kernel/time.c         |  2 +-
 arch/frv/kernel/time.c                   |  2 +-
 arch/h8300/kernel/time.c                 |  2 +-
 arch/ia64/kernel/time.c                  |  2 +-
 arch/m32r/kernel/time.c                  |  2 +-
 arch/m68k/kernel/time.c                  |  2 +-
 arch/m68k/sun3/sun3ints.c                |  2 +-
 arch/m68knommu/kernel/time.c             |  2 +-
 arch/mips/au1000/common/time.c           |  6 +++---
 arch/mips/gt64120/common/time.c          |  2 +-
 arch/mips/kernel/time.c                  |  2 +-
 arch/mips/momentum/ocelot_g/gt-irq.c     |  2 +-
 arch/mips/sgi-ip27/ip27-timer.c          |  2 +-
 arch/parisc/kernel/time.c                |  2 +-
 arch/powerpc/kernel/time.c               |  2 +-
 arch/ppc/kernel/time.c                   |  2 +-
 arch/s390/kernel/time.c                  |  9 ++++-----
 arch/sh/kernel/time.c                    |  2 +-
 arch/sh64/kernel/time.c                  |  2 +-
 arch/sparc/kernel/pcic.c                 |  2 +-
 arch/sparc/kernel/time.c                 |  2 +-
 arch/sparc64/kernel/time.c               |  4 ++--
 arch/um/kernel/time.c                    |  2 +-
 arch/v850/kernel/time.c                  |  2 +-
 arch/x86_64/kernel/time.c                |  8 ++++----
 arch/xtensa/kernel/time.c                |  2 +-
 include/asm-arm/arch-clps711x/time.h     |  2 +-
 include/asm-arm/arch-l7200/time.h        |  2 +-
 include/asm-i386/mach-default/do_timer.h |  2 +-
 include/asm-i386/mach-visws/do_timer.h   |  2 +-
 include/asm-i386/mach-voyager/do_timer.h |  2 +-
 include/linux/sched.h                    |  2 +-
 kernel/timer.c                           | 19 ++++++-------------
 38 files changed, 52 insertions(+), 60 deletions(-)

(limited to 'kernel')

diff --git a/arch/alpha/kernel/time.c b/arch/alpha/kernel/time.c
index b191cc759737..7c1e44420a78 100644
--- a/arch/alpha/kernel/time.c
+++ b/arch/alpha/kernel/time.c
@@ -132,7 +132,7 @@ irqreturn_t timer_interrupt(int irq, void *dev, struct pt_regs * regs)
 	nticks = delta >> FIX_SHIFT;
 
 	while (nticks > 0) {
-		do_timer(regs);
+		do_timer(1);
 #ifndef CONFIG_SMP
 		update_process_times(user_mode(regs));
 #endif
diff --git a/arch/arm/kernel/time.c b/arch/arm/kernel/time.c
index d4dceb5f06e9..f7d5165796ef 100644
--- a/arch/arm/kernel/time.c
+++ b/arch/arm/kernel/time.c
@@ -337,7 +337,7 @@ void timer_tick(struct pt_regs *regs)
 	profile_tick(CPU_PROFILING, regs);
 	do_leds();
 	do_set_rtc();
-	do_timer(regs);
+	do_timer(1);
 #ifndef CONFIG_SMP
 	update_process_times(user_mode(regs));
 #endif
diff --git a/arch/arm26/kernel/time.c b/arch/arm26/kernel/time.c
index db63d75d0715..80adbd005fc5 100644
--- a/arch/arm26/kernel/time.c
+++ b/arch/arm26/kernel/time.c
@@ -194,7 +194,7 @@ EXPORT_SYMBOL(do_settimeofday);
 
 static irqreturn_t timer_interrupt(int irq, void *dev_id, struct pt_regs *regs)
 {
-        do_timer(regs);
+        do_timer(1);
 #ifndef CONFIG_SMP
 	update_process_times(user_mode(regs));
 #endif
diff --git a/arch/avr32/kernel/time.c b/arch/avr32/kernel/time.c
index b0e6b5855a38..3e56b9f4358a 100644
--- a/arch/avr32/kernel/time.c
+++ b/arch/avr32/kernel/time.c
@@ -148,7 +148,7 @@ timer_interrupt(int irq, void *dev_id, struct pt_regs *regs)
 	 * Call the generic timer interrupt handler
 	 */
 	write_seqlock(&xtime_lock);
-	do_timer(regs);
+	do_timer(1);
 	write_sequnlock(&xtime_lock);
 
 	/*
diff --git a/arch/cris/arch-v10/kernel/time.c b/arch/cris/arch-v10/kernel/time.c
index 9c22b76e129a..ebacf1457d91 100644
--- a/arch/cris/arch-v10/kernel/time.c
+++ b/arch/cris/arch-v10/kernel/time.c
@@ -227,7 +227,7 @@ timer_interrupt(int irq, void *dev_id, struct pt_regs *regs)
 	
 	/* call the real timer interrupt handler */
 
-	do_timer(regs);
+	do_timer(1);
 	
         cris_do_profile(regs); /* Save profiling information */
 
diff --git a/arch/cris/arch-v32/kernel/time.c b/arch/cris/arch-v32/kernel/time.c
index 50f3f93293d6..be0a01657d4f 100644
--- a/arch/cris/arch-v32/kernel/time.c
+++ b/arch/cris/arch-v32/kernel/time.c
@@ -219,7 +219,7 @@ timer_interrupt(int irq, void *dev_id, struct pt_regs *regs)
 		return IRQ_HANDLED;
 
 	/* call the real timer interrupt handler */
-	do_timer(regs);
+	do_timer(1);
 
 	/*
 	 * If we have an externally synchronized Linux clock, then update
diff --git a/arch/frv/kernel/time.c b/arch/frv/kernel/time.c
index 3d0284bccb94..7e55884135ed 100644
--- a/arch/frv/kernel/time.c
+++ b/arch/frv/kernel/time.c
@@ -70,7 +70,7 @@ static irqreturn_t timer_interrupt(int irq, void *dummy, struct pt_regs * regs)
 	 */
 	write_seqlock(&xtime_lock);
 
-	do_timer(regs);
+	do_timer(1);
 	update_process_times(user_mode(regs));
 	profile_tick(CPU_PROFILING, regs);
 
diff --git a/arch/h8300/kernel/time.c b/arch/h8300/kernel/time.c
index 688a5100604c..e569d17b4ae6 100644
--- a/arch/h8300/kernel/time.c
+++ b/arch/h8300/kernel/time.c
@@ -41,7 +41,7 @@ static void timer_interrupt(int irq, void *dummy, struct pt_regs * regs)
 	/* may need to kick the hardware timer */
 	platform_timer_eoi();
 
-	do_timer(regs);
+	do_timer(1);
 #ifndef CONFIG_SMP
 	update_process_times(user_mode(regs));
 #endif
diff --git a/arch/ia64/kernel/time.c b/arch/ia64/kernel/time.c
index 6928ef0d64d8..16262687a103 100644
--- a/arch/ia64/kernel/time.c
+++ b/arch/ia64/kernel/time.c
@@ -78,7 +78,7 @@ timer_interrupt (int irq, void *dev_id, struct pt_regs *regs)
 			 * xtime_lock.
 			 */
 			write_seqlock(&xtime_lock);
-			do_timer(regs);
+			do_timer(1);
 			local_cpu_data->itm_next = new_itm;
 			write_sequnlock(&xtime_lock);
 		} else
diff --git a/arch/m32r/kernel/time.c b/arch/m32r/kernel/time.c
index ded0be07a476..7a896893cd28 100644
--- a/arch/m32r/kernel/time.c
+++ b/arch/m32r/kernel/time.c
@@ -202,7 +202,7 @@ irqreturn_t timer_interrupt(int irq, void *dev_id, struct pt_regs *regs)
 #ifndef CONFIG_SMP
 	profile_tick(CPU_PROFILING, regs);
 #endif
-	do_timer(regs);
+	do_timer(1);
 
 #ifndef CONFIG_SMP
 	update_process_times(user_mode(regs));
diff --git a/arch/m68k/kernel/time.c b/arch/m68k/kernel/time.c
index 98e4b1adfa29..1072e4946a4a 100644
--- a/arch/m68k/kernel/time.c
+++ b/arch/m68k/kernel/time.c
@@ -40,7 +40,7 @@ static inline int set_rtc_mmss(unsigned long nowtime)
  */
 static irqreturn_t timer_interrupt(int irq, void *dummy, struct pt_regs * regs)
 {
-	do_timer(regs);
+	do_timer(1);
 #ifndef CONFIG_SMP
 	update_process_times(user_mode(regs));
 #endif
diff --git a/arch/m68k/sun3/sun3ints.c b/arch/m68k/sun3/sun3ints.c
index f18b9d3ef16d..dc4ea7e074a6 100644
--- a/arch/m68k/sun3/sun3ints.c
+++ b/arch/m68k/sun3/sun3ints.c
@@ -65,7 +65,7 @@ static irqreturn_t sun3_int5(int irq, void *dev_id, struct pt_regs *fp)
 #ifdef CONFIG_SUN3
 	intersil_clear();
 #endif
-        do_timer(fp);
+        do_timer(1);
 #ifndef CONFIG_SMP
 	update_process_times(user_mode(fp));
 #endif
diff --git a/arch/m68knommu/kernel/time.c b/arch/m68knommu/kernel/time.c
index 1db987272220..db1e1ce0a349 100644
--- a/arch/m68knommu/kernel/time.c
+++ b/arch/m68knommu/kernel/time.c
@@ -51,7 +51,7 @@ static irqreturn_t timer_interrupt(int irq, void *dummy, struct pt_regs * regs)
 
 	write_seqlock(&xtime_lock);
 
-	do_timer(regs);
+	do_timer(1);
 #ifndef CONFIG_SMP
 	update_process_times(user_mode(regs));
 #endif
diff --git a/arch/mips/au1000/common/time.c b/arch/mips/au1000/common/time.c
index 7fbea1bf7b48..0a067f3113a5 100644
--- a/arch/mips/au1000/common/time.c
+++ b/arch/mips/au1000/common/time.c
@@ -96,7 +96,7 @@ void mips_timer_interrupt(struct pt_regs *regs)
 		timerlo = count;
 
 		kstat_this_cpu.irqs[irq]++;
-		do_timer(regs);
+		do_timer(1);
 #ifndef CONFIG_SMP
 		update_process_times(user_mode(regs));
 #endif
@@ -137,7 +137,7 @@ irqreturn_t counter0_irq(int irq, void *dev_id, struct pt_regs *regs)
 	}
 
 	while (time_elapsed > 0) {
-		do_timer(regs);
+		do_timer(1);
 #ifndef CONFIG_SMP
 		update_process_times(user_mode(regs));
 #endif
@@ -156,7 +156,7 @@ irqreturn_t counter0_irq(int irq, void *dev_id, struct pt_regs *regs)
 
 	if (jiffie_drift >= 999) {
 		jiffie_drift -= 999;
-		do_timer(regs); /* increment jiffies by one */
+		do_timer(1); /* increment jiffies by one */
 #ifndef CONFIG_SMP
 		update_process_times(user_mode(regs));
 #endif
diff --git a/arch/mips/gt64120/common/time.c b/arch/mips/gt64120/common/time.c
index d837b26fbe51..7feca49350d1 100644
--- a/arch/mips/gt64120/common/time.c
+++ b/arch/mips/gt64120/common/time.c
@@ -34,7 +34,7 @@ static void gt64120_irq(int irq, void *dev_id, struct pt_regs *regs)
 	if (irq_src & 0x00000800) {	/* Check for timer interrupt */
 		handled = 1;
 		irq_src &= ~0x00000800;
-		do_timer(regs);
+		do_timer(1);
 #ifndef CONFIG_SMP
 		update_process_times(user_mode(regs));
 #endif
diff --git a/arch/mips/kernel/time.c b/arch/mips/kernel/time.c
index 170cb67f4ede..6ab8d975a974 100644
--- a/arch/mips/kernel/time.c
+++ b/arch/mips/kernel/time.c
@@ -434,7 +434,7 @@ irqreturn_t timer_interrupt(int irq, void *dev_id, struct pt_regs *regs)
 	/*
 	 * call the generic timer interrupt handling
 	 */
-	do_timer(regs);
+	do_timer(1);
 
 	/*
 	 * If we have an externally synchronized Linux clock, then update
diff --git a/arch/mips/momentum/ocelot_g/gt-irq.c b/arch/mips/momentum/ocelot_g/gt-irq.c
index 9fb2493fff02..6cd87cf0195a 100644
--- a/arch/mips/momentum/ocelot_g/gt-irq.c
+++ b/arch/mips/momentum/ocelot_g/gt-irq.c
@@ -133,7 +133,7 @@ static irqreturn_t gt64240_p0int_irq(int irq, void *dev, struct pt_regs *regs)
 		MV_WRITE(TIMER_COUNTER_0_3_INTERRUPT_CAUSE, 0x0);
 
 		/* handle the timer call */
-		do_timer(regs);
+		do_timer(1);
 #ifndef CONFIG_SMP
 		update_process_times(user_mode(regs));
 #endif
diff --git a/arch/mips/sgi-ip27/ip27-timer.c b/arch/mips/sgi-ip27/ip27-timer.c
index b029ba79c27a..c62a3a9ef867 100644
--- a/arch/mips/sgi-ip27/ip27-timer.c
+++ b/arch/mips/sgi-ip27/ip27-timer.c
@@ -111,7 +111,7 @@ again:
 	kstat_this_cpu.irqs[irq]++;		/* kstat only for bootcpu? */
 
 	if (cpu == 0)
-		do_timer(regs);
+		do_timer(1);
 
 	update_process_times(user_mode(regs));
 
diff --git a/arch/parisc/kernel/time.c b/arch/parisc/kernel/time.c
index 5facc9bff4ef..700df10924dd 100644
--- a/arch/parisc/kernel/time.c
+++ b/arch/parisc/kernel/time.c
@@ -79,7 +79,7 @@ irqreturn_t timer_interrupt(int irq, void *dev_id, struct pt_regs *regs)
 #endif
 		if (cpu == 0) {
 			write_seqlock(&xtime_lock);
-			do_timer(regs);
+			do_timer(1);
 			write_sequnlock(&xtime_lock);
 		}
 	}
diff --git a/arch/powerpc/kernel/time.c b/arch/powerpc/kernel/time.c
index 7a3c3f791ade..71f71da98e7d 100644
--- a/arch/powerpc/kernel/time.c
+++ b/arch/powerpc/kernel/time.c
@@ -693,7 +693,7 @@ void timer_interrupt(struct pt_regs * regs)
 		tb_next_jiffy = tb_last_jiffy + tb_ticks_per_jiffy;
 		if (per_cpu(last_jiffy, cpu) >= tb_next_jiffy) {
 			tb_last_jiffy = tb_next_jiffy;
-			do_timer(regs);
+			do_timer(1);
 			timer_recalc_offset(tb_last_jiffy);
 			timer_check_rtc();
 		}
diff --git a/arch/ppc/kernel/time.c b/arch/ppc/kernel/time.c
index 6ab8cc7226ab..1e1f31554767 100644
--- a/arch/ppc/kernel/time.c
+++ b/arch/ppc/kernel/time.c
@@ -153,7 +153,7 @@ void timer_interrupt(struct pt_regs * regs)
 		/* We are in an interrupt, no need to save/restore flags */
 		write_seqlock(&xtime_lock);
 		tb_last_stamp = jiffy_stamp;
-		do_timer(regs);
+		do_timer(1);
 
 		/*
 		 * update the rtc when needed, this should be performed on the
diff --git a/arch/s390/kernel/time.c b/arch/s390/kernel/time.c
index 1981c6199fa2..abab42e9f5f8 100644
--- a/arch/s390/kernel/time.c
+++ b/arch/s390/kernel/time.c
@@ -166,7 +166,7 @@ EXPORT_SYMBOL(do_settimeofday);
 void account_ticks(struct pt_regs *regs)
 {
 	__u64 tmp;
-	__u32 ticks, xticks;
+	__u32 ticks;
 
 	/* Calculate how many ticks have passed. */
 	if (S390_lowcore.int_clock < S390_lowcore.jiffy_timer) {
@@ -204,6 +204,7 @@ void account_ticks(struct pt_regs *regs)
 	 */
 	write_seqlock(&xtime_lock);
 	if (S390_lowcore.jiffy_timer > xtime_cc) {
+		__u32 xticks;
 		tmp = S390_lowcore.jiffy_timer - xtime_cc;
 		if (tmp >= 2*CLK_TICKS_PER_JIFFY) {
 			xticks = __div(tmp, CLK_TICKS_PER_JIFFY);
@@ -212,13 +213,11 @@ void account_ticks(struct pt_regs *regs)
 			xticks = 1;
 			xtime_cc += CLK_TICKS_PER_JIFFY;
 		}
-		while (xticks--)
-			do_timer(regs);
+		do_timer(xticks);
 	}
 	write_sequnlock(&xtime_lock);
 #else
-	for (xticks = ticks; xticks > 0; xticks--)
-		do_timer(regs);
+	do_timer(ticks);
 #endif
 
 #ifdef CONFIG_VIRT_CPU_ACCOUNTING
diff --git a/arch/sh/kernel/time.c b/arch/sh/kernel/time.c
index 149d9713eddf..f664a196c4f5 100644
--- a/arch/sh/kernel/time.c
+++ b/arch/sh/kernel/time.c
@@ -117,7 +117,7 @@ static long last_rtc_update;
  */
 void handle_timer_tick(struct pt_regs *regs)
 {
-	do_timer(regs);
+	do_timer(1);
 #ifndef CONFIG_SMP
 	update_process_times(user_mode(regs));
 #endif
diff --git a/arch/sh64/kernel/time.c b/arch/sh64/kernel/time.c
index b8162e59030e..3b61e06f9d72 100644
--- a/arch/sh64/kernel/time.c
+++ b/arch/sh64/kernel/time.c
@@ -298,7 +298,7 @@ static inline void do_timer_interrupt(int irq, struct pt_regs *regs)
 	asm ("getcon cr62, %0" : "=r" (current_ctc));
 	ctc_last_interrupt = (unsigned long) current_ctc;
 
-	do_timer(regs);
+	do_timer(1);
 #ifndef CONFIG_SMP
 	update_process_times(user_mode(regs));
 #endif
diff --git a/arch/sparc/kernel/pcic.c b/arch/sparc/kernel/pcic.c
index bfd31aac2df3..e19b1bad9bc5 100644
--- a/arch/sparc/kernel/pcic.c
+++ b/arch/sparc/kernel/pcic.c
@@ -712,7 +712,7 @@ static irqreturn_t pcic_timer_handler (int irq, void *h, struct pt_regs *regs)
 {
 	write_seqlock(&xtime_lock);	/* Dummy, to show that we remember */
 	pcic_clear_clock_irq();
-	do_timer(regs);
+	do_timer(1);
 #ifndef CONFIG_SMP
 	update_process_times(user_mode(regs));
 #endif
diff --git a/arch/sparc/kernel/time.c b/arch/sparc/kernel/time.c
index 845081b01267..6f84fa1b58e5 100644
--- a/arch/sparc/kernel/time.c
+++ b/arch/sparc/kernel/time.c
@@ -128,7 +128,7 @@ irqreturn_t timer_interrupt(int irq, void *dev_id, struct pt_regs * regs)
 #endif
 	clear_clock_irq();
 
-	do_timer(regs);
+	do_timer(1);
 #ifndef CONFIG_SMP
 	update_process_times(user_mode(regs));
 #endif
diff --git a/arch/sparc64/kernel/time.c b/arch/sparc64/kernel/time.c
index b0b4feeec098..ca1193482f07 100644
--- a/arch/sparc64/kernel/time.c
+++ b/arch/sparc64/kernel/time.c
@@ -465,7 +465,7 @@ irqreturn_t timer_interrupt(int irq, void *dev_id, struct pt_regs * regs)
 		profile_tick(CPU_PROFILING, regs);
 		update_process_times(user_mode(regs));
 #endif
-		do_timer(regs);
+		do_timer(1);
 
 		/* Guarantee that the following sequences execute
 		 * uninterrupted.
@@ -496,7 +496,7 @@ void timer_tick_interrupt(struct pt_regs *regs)
 {
 	write_seqlock(&xtime_lock);
 
-	do_timer(regs);
+	do_timer(1);
 
 	timer_check_rtc();
 
diff --git a/arch/um/kernel/time.c b/arch/um/kernel/time.c
index 820affbf3e16..a92965f8f9cd 100644
--- a/arch/um/kernel/time.c
+++ b/arch/um/kernel/time.c
@@ -93,7 +93,7 @@ irqreturn_t um_timer(int irq, void *dev, struct pt_regs *regs)
 
 	write_seqlock_irqsave(&xtime_lock, flags);
 
-	do_timer(regs);
+	do_timer(1);
 
 	nsecs = get_time();
 	xtime.tv_sec = nsecs / NSEC_PER_SEC;
diff --git a/arch/v850/kernel/time.c b/arch/v850/kernel/time.c
index a0b46695f186..f4d1a4d3cdc2 100644
--- a/arch/v850/kernel/time.c
+++ b/arch/v850/kernel/time.c
@@ -51,7 +51,7 @@ static irqreturn_t timer_interrupt (int irq, void *dummy, struct pt_regs *regs)
 	if (mach_tick)
 	  mach_tick ();
 
-	do_timer (regs);
+	do_timer (1);
 #ifndef CONFIG_SMP
 	update_process_times(user_mode(regs));
 #endif
diff --git a/arch/x86_64/kernel/time.c b/arch/x86_64/kernel/time.c
index 1c255ee76e7c..7ea3bf2a858c 100644
--- a/arch/x86_64/kernel/time.c
+++ b/arch/x86_64/kernel/time.c
@@ -415,16 +415,16 @@ void main_timer_handler(struct pt_regs *regs)
 				(((long) offset << US_SCALE) / vxtime.tsc_quot) - 1;
 	}
 
-	if (lost > 0) {
+	if (lost > 0)
 		handle_lost_ticks(lost, regs);
-		jiffies += lost;
-	}
+	else
+		lost = 0;
 
 /*
  * Do the timer stuff.
  */
 
-	do_timer(regs);
+	do_timer(lost + 1);
 #ifndef CONFIG_SMP
 	update_process_times(user_mode(regs));
 #endif
diff --git a/arch/xtensa/kernel/time.c b/arch/xtensa/kernel/time.c
index 412ab32de391..241db201f40e 100644
--- a/arch/xtensa/kernel/time.c
+++ b/arch/xtensa/kernel/time.c
@@ -175,7 +175,7 @@ again:
 
 		last_ccount_stamp = next;
 		next += CCOUNT_PER_JIFFY;
-		do_timer (regs); /* Linux handler in kernel/timer.c */
+		do_timer (1); /* Linux handler in kernel/timer.c */
 
 		if (ntp_synced() &&
 		    xtime.tv_sec - last_rtc_update >= 659 &&
diff --git a/include/asm-arm/arch-clps711x/time.h b/include/asm-arm/arch-clps711x/time.h
index 9cb27cd4e6ae..0e4a3901d3b3 100644
--- a/include/asm-arm/arch-clps711x/time.h
+++ b/include/asm-arm/arch-clps711x/time.h
@@ -29,7 +29,7 @@ static irqreturn_t
 p720t_timer_interrupt(int irq, void *dev_id, struct pt_regs *regs)
 {
 	do_leds();
-	do_timer(regs);
+	do_timer(1);
 #ifndef CONFIG_SMP
 	update_process_times(user_mode(regs));
 #endif
diff --git a/include/asm-arm/arch-l7200/time.h b/include/asm-arm/arch-l7200/time.h
index 7b98b533e63a..c69cb508735f 100644
--- a/include/asm-arm/arch-l7200/time.h
+++ b/include/asm-arm/arch-l7200/time.h
@@ -45,7 +45,7 @@
 static irqreturn_t
 timer_interrupt(int irq, void *dev_id, struct pt_regs *regs)
 {
-	do_timer(regs);
+	do_timer(1);
 #ifndef CONFIG_SMP
 	update_process_times(user_mode(regs));
 #endif
diff --git a/include/asm-i386/mach-default/do_timer.h b/include/asm-i386/mach-default/do_timer.h
index 6312c3e79814..4182c347ef85 100644
--- a/include/asm-i386/mach-default/do_timer.h
+++ b/include/asm-i386/mach-default/do_timer.h
@@ -16,7 +16,7 @@
 
 static inline void do_timer_interrupt_hook(struct pt_regs *regs)
 {
-	do_timer(regs);
+	do_timer(1);
 #ifndef CONFIG_SMP
 	update_process_times(user_mode_vm(regs));
 #endif
diff --git a/include/asm-i386/mach-visws/do_timer.h b/include/asm-i386/mach-visws/do_timer.h
index 95568e6ca91c..8db618c5a72b 100644
--- a/include/asm-i386/mach-visws/do_timer.h
+++ b/include/asm-i386/mach-visws/do_timer.h
@@ -9,7 +9,7 @@ static inline void do_timer_interrupt_hook(struct pt_regs *regs)
 	/* Clear the interrupt */
 	co_cpu_write(CO_CPU_STAT,co_cpu_read(CO_CPU_STAT) & ~CO_STAT_TIMEINTR);
 
-	do_timer(regs);
+	do_timer(1);
 #ifndef CONFIG_SMP
 	update_process_times(user_mode_vm(regs));
 #endif
diff --git a/include/asm-i386/mach-voyager/do_timer.h b/include/asm-i386/mach-voyager/do_timer.h
index eaf518098981..099fe9f5c1b2 100644
--- a/include/asm-i386/mach-voyager/do_timer.h
+++ b/include/asm-i386/mach-voyager/do_timer.h
@@ -3,7 +3,7 @@
 
 static inline void do_timer_interrupt_hook(struct pt_regs *regs)
 {
-	do_timer(regs);
+	do_timer(1);
 #ifndef CONFIG_SMP
 	update_process_times(user_mode_vm(regs));
 #endif
diff --git a/include/linux/sched.h b/include/linux/sched.h
index ed2af8671589..503dea61ff99 100644
--- a/include/linux/sched.h
+++ b/include/linux/sched.h
@@ -1206,7 +1206,7 @@ extern void switch_uid(struct user_struct *);
 
 #include <asm/current.h>
 
-extern void do_timer(struct pt_regs *);
+extern void do_timer(unsigned long ticks);
 
 extern int FASTCALL(wake_up_state(struct task_struct * tsk, unsigned int state));
 extern int FASTCALL(wake_up_process(struct task_struct * tsk));
diff --git a/kernel/timer.c b/kernel/timer.c
index a2cb1ecb1b28..4f55622b0d38 100644
--- a/kernel/timer.c
+++ b/kernel/timer.c
@@ -1222,10 +1222,8 @@ static inline void calc_load(unsigned long ticks)
 	unsigned long active_tasks; /* fixed-point */
 	static int count = LOAD_FREQ;
 
-	count -= ticks;
-	if (count < 0) {
-		count += LOAD_FREQ;
-		active_tasks = count_active_tasks();
+	active_tasks = count_active_tasks();
+	for (count -= ticks; count < 0; count += LOAD_FREQ) {
 		CALC_LOAD(avenrun[0], EXP_1, active_tasks);
 		CALC_LOAD(avenrun[1], EXP_5, active_tasks);
 		CALC_LOAD(avenrun[2], EXP_15, active_tasks);
@@ -1270,11 +1268,8 @@ void run_local_timers(void)
  * Called by the timer interrupt. xtime_lock must already be taken
  * by the timer IRQ!
  */
-static inline void update_times(void)
+static inline void update_times(unsigned long ticks)
 {
-	unsigned long ticks;
-
-	ticks = jiffies - wall_jiffies;
 	wall_jiffies += ticks;
 	update_wall_time();
 	calc_load(ticks);
@@ -1286,12 +1281,10 @@ static inline void update_times(void)
  * jiffies is defined in the linker script...
  */
 
-void do_timer(struct pt_regs *regs)
+void do_timer(unsigned long ticks)
 {
-	jiffies_64++;
-	/* prevent loading jiffies before storing new jiffies_64 value. */
-	barrier();
-	update_times();
+	jiffies_64 += ticks;
+	update_times(ticks);
 }
 
 #ifdef __ARCH_WANT_SYS_ALARM
-- 
cgit v1.2.3


From 0b4a8a789a328af6aac613734c362cf6aad72201 Mon Sep 17 00:00:00 2001
From: Roland McGrath <roland@redhat.com>
Date: Fri, 29 Sep 2006 02:00:39 -0700
Subject: [PATCH] kexec warning fix

This fixes a couple of compiler warnings, and adds paranoia checks as well.

Signed-off-by: Roland McGrath <roland@redhat.com>
Cc: "Eric W. Biederman" <ebiederm@xmission.com>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
---
 kernel/kexec.c | 6 ++++--
 1 file changed, 4 insertions(+), 2 deletions(-)

(limited to 'kernel')

diff --git a/kernel/kexec.c b/kernel/kexec.c
index 37cad75cf494..fcdd5d2bc3f4 100644
--- a/kernel/kexec.c
+++ b/kernel/kexec.c
@@ -995,7 +995,8 @@ asmlinkage long sys_kexec_load(unsigned long entry, unsigned long nr_segments,
 	image = xchg(dest_image, image);
 
 out:
-	xchg(&kexec_lock, 0); /* Release the mutex */
+	locked = xchg(&kexec_lock, 0); /* Release the mutex */
+	BUG_ON(!locked);
 	kimage_free(image);
 
 	return result;
@@ -1061,7 +1062,8 @@ void crash_kexec(struct pt_regs *regs)
 			machine_crash_shutdown(&fixed_regs);
 			machine_kexec(kexec_crash_image);
 		}
-		xchg(&kexec_lock, 0);
+		locked = xchg(&kexec_lock, 0);
+		BUG_ON(!locked);
 	}
 }
 
-- 
cgit v1.2.3


From 54306cf04c0ea0a8c432603dbc657ab62a438668 Mon Sep 17 00:00:00 2001
From: Alan Cox <alan@lxorguk.ukuu.org.uk>
Date: Fri, 29 Sep 2006 02:00:42 -0700
Subject: [PATCH] exit: fix crash case

If we are going to BUG() not panic() here then we should cover the case of
the BUG being compiled out

Signed-off-by: Alan Cox <alan@redhat.com>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
---
 kernel/exit.c | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

(limited to 'kernel')

diff --git a/kernel/exit.c b/kernel/exit.c
index 3ec7b10eae38..a51b347ae67b 100644
--- a/kernel/exit.c
+++ b/kernel/exit.c
@@ -963,7 +963,8 @@ fastcall NORET_TYPE void do_exit(long code)
 	schedule();
 	BUG();
 	/* Avoid "noreturn function does return".  */
-	for (;;) ;
+	for (;;)
+		cpu_relax();	/* For when BUG is null */
 }
 
 EXPORT_SYMBOL_GPL(do_exit);
-- 
cgit v1.2.3


From 111dbe0c8a21dffa473239861be47ebc87f593b3 Mon Sep 17 00:00:00 2001
From: Björn Steinbrink <B.Steinbrink@gmx.de>
Date: Fri, 29 Sep 2006 02:00:46 -0700
Subject: [PATCH] Fix ____call_usermodehelper errors being silently ignored
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

If ____call_usermodehelper fails, we're not interested in the child
process' exit value, but the real error, so let's stop wait_for_helper from
overwriting it in that case.

Issue discovered by Benedikt Böhm while working on a Linux-VServer usermode
helper.

Signed-off-by: Björn Steinbrink <B.Steinbrink@gmx.de>
Cc: Rusty Russell <rusty@rustcorp.com.au>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
---
 kernel/kmod.c | 12 +++++++++++-
 1 file changed, 11 insertions(+), 1 deletion(-)

(limited to 'kernel')

diff --git a/kernel/kmod.c b/kernel/kmod.c
index 5c470c57fb57..842f8015d7fd 100644
--- a/kernel/kmod.c
+++ b/kernel/kmod.c
@@ -176,6 +176,8 @@ static int wait_for_helper(void *data)
 	if (pid < 0) {
 		sub_info->retval = pid;
 	} else {
+		int ret;
+
 		/*
 		 * Normally it is bogus to call wait4() from in-kernel because
 		 * wait4() wants to write the exit code to a userspace address.
@@ -185,7 +187,15 @@ static int wait_for_helper(void *data)
 		 *
 		 * Thus the __user pointer cast is valid here.
 		 */
-		sys_wait4(pid, (int __user *) &sub_info->retval, 0, NULL);
+		sys_wait4(pid, (int __user *)&ret, 0, NULL);
+
+		/*
+		 * If ret is 0, either ____call_usermodehelper failed and the
+		 * real error code is already in sub_info->retval or
+		 * sub_info->retval is 0 anyway, so don't mess with it then.
+		 */
+		if (ret)
+			sub_info->retval = ret;
 	}
 
 	complete(sub_info->complete);
-- 
cgit v1.2.3


From c9472e0f28cd2f0695a0ac3a0b4bd33f21714a7e Mon Sep 17 00:00:00 2001
From: Cal Peake <cp@absolutedigital.net>
Date: Fri, 29 Sep 2006 02:00:47 -0700
Subject: [PATCH] kill extraneous printk in kernel_restart()

Get rid of an extraneous printk in kernel_restart().

Signed-off-by: Cal Peake <cp@absolutedigital.net>
Acked-by: Eric W. Biederman <ebiederm@xmission.com>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
---
 kernel/sys.c | 1 -
 1 file changed, 1 deletion(-)

(limited to 'kernel')

diff --git a/kernel/sys.c b/kernel/sys.c
index 3f894775488d..8647061c084a 100644
--- a/kernel/sys.c
+++ b/kernel/sys.c
@@ -612,7 +612,6 @@ void kernel_restart(char *cmd)
 	} else {
 		printk(KERN_EMERG "Restarting system with command '%s'.\n", cmd);
 	}
-	printk(".\n");
 	machine_restart(cmd);
 }
 EXPORT_SYMBOL_GPL(kernel_restart);
-- 
cgit v1.2.3


From 5fe1d75f34974046fffcca5e22fb8a7b42fded33 Mon Sep 17 00:00:00 2001
From: Oleg Nesterov <oleg@tv-sign.ru>
Date: Fri, 29 Sep 2006 02:00:48 -0700
Subject: [PATCH] do_sched_setscheduler(): don't take tasklist_lock

Use rcu locks instead. sched_setscheduler() now takes ->siglock
before reading ->signal->rlim[].

Signed-off-by: Oleg Nesterov <oleg@tv-sign.ru>
Cc: Thomas Gleixner <tglx@linutronix.de>
Cc: Ingo Molnar <mingo@elte.hu>
Cc: Steven Rostedt <rostedt@goodmis.org>
Cc: Nick Piggin <nickpiggin@yahoo.com.au>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
---
 kernel/sched.c | 28 ++++++++++++++++++----------
 1 file changed, 18 insertions(+), 10 deletions(-)

(limited to 'kernel')

diff --git a/kernel/sched.c b/kernel/sched.c
index b946209d9c15..f9b3c6a414f1 100644
--- a/kernel/sched.c
+++ b/kernel/sched.c
@@ -4080,6 +4080,8 @@ static void __setscheduler(struct task_struct *p, int policy, int prio)
  * @p: the task in question.
  * @policy: new policy.
  * @param: structure containing the new RT priority.
+ *
+ * NOTE: the task may be already dead
  */
 int sched_setscheduler(struct task_struct *p, int policy,
 		       struct sched_param *param)
@@ -4115,19 +4117,26 @@ recheck:
 	 * Allow unprivileged RT tasks to decrease priority:
 	 */
 	if (!capable(CAP_SYS_NICE)) {
+		unsigned long rlim_rtprio;
+		unsigned long flags;
+
+		if (!lock_task_sighand(p, &flags))
+			return -ESRCH;
+		rlim_rtprio = p->signal->rlim[RLIMIT_RTPRIO].rlim_cur;
+		unlock_task_sighand(p, &flags);
+
 		/*
 		 * can't change policy, except between SCHED_NORMAL
 		 * and SCHED_BATCH:
 		 */
 		if (((policy != SCHED_NORMAL && p->policy != SCHED_BATCH) &&
 			(policy != SCHED_BATCH && p->policy != SCHED_NORMAL)) &&
-				!p->signal->rlim[RLIMIT_RTPRIO].rlim_cur)
+				!rlim_rtprio)
 			return -EPERM;
 		/* can't increase priority */
 		if ((policy != SCHED_NORMAL && policy != SCHED_BATCH) &&
 		    param->sched_priority > p->rt_priority &&
-		    param->sched_priority >
-				p->signal->rlim[RLIMIT_RTPRIO].rlim_cur)
+		    param->sched_priority > rlim_rtprio)
 			return -EPERM;
 		/* can't change other user's priorities */
 		if ((current->euid != p->euid) &&
@@ -4193,14 +4202,13 @@ do_sched_setscheduler(pid_t pid, int policy, struct sched_param __user *param)
 		return -EINVAL;
 	if (copy_from_user(&lparam, param, sizeof(struct sched_param)))
 		return -EFAULT;
-	read_lock_irq(&tasklist_lock);
+
+	rcu_read_lock();
+	retval = -ESRCH;
 	p = find_process_by_pid(pid);
-	if (!p) {
-		read_unlock_irq(&tasklist_lock);
-		return -ESRCH;
-	}
-	retval = sched_setscheduler(p, policy, &lparam);
-	read_unlock_irq(&tasklist_lock);
+	if (p != NULL)
+		retval = sched_setscheduler(p, policy, &lparam);
+	rcu_read_unlock();
 
 	return retval;
 }
-- 
cgit v1.2.3


From 57a6f51c4281aa3119975473c70dce0480d322bd Mon Sep 17 00:00:00 2001
From: Oleg Nesterov <oleg@tv-sign.ru>
Date: Fri, 29 Sep 2006 02:00:49 -0700
Subject: [PATCH] introduce is_rt_policy() helper

Imho, makes the code a bit easier to read.

Signed-off-by: Oleg Nesterov <oleg@tv-sign.ru>
Cc: Thomas Gleixner <tglx@linutronix.de>
Cc: Ingo Molnar <mingo@elte.hu>
Cc: Steven Rostedt <rostedt@goodmis.org>
Cc: Nick Piggin <nickpiggin@yahoo.com.au>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
---
 include/linux/sched.h | 4 ++--
 kernel/sched.c        | 5 ++---
 2 files changed, 4 insertions(+), 5 deletions(-)

(limited to 'kernel')

diff --git a/include/linux/sched.h b/include/linux/sched.h
index 503dea61ff99..fbc69cc3923d 100644
--- a/include/linux/sched.h
+++ b/include/linux/sched.h
@@ -504,8 +504,8 @@ struct signal_struct {
 #define rt_prio(prio)		unlikely((prio) < MAX_RT_PRIO)
 #define rt_task(p)		rt_prio((p)->prio)
 #define batch_task(p)		(unlikely((p)->policy == SCHED_BATCH))
-#define has_rt_policy(p) \
-	unlikely((p)->policy != SCHED_NORMAL && (p)->policy != SCHED_BATCH)
+#define is_rt_policy(p)		((p) != SCHED_NORMAL && (p) != SCHED_BATCH)
+#define has_rt_policy(p)	unlikely(is_rt_policy((p)->policy))
 
 /*
  * Some day this will be a full-fledged user tracking system..
diff --git a/kernel/sched.c b/kernel/sched.c
index f9b3c6a414f1..c3c718aea618 100644
--- a/kernel/sched.c
+++ b/kernel/sched.c
@@ -4109,8 +4109,7 @@ recheck:
 	    (p->mm && param->sched_priority > MAX_USER_RT_PRIO-1) ||
 	    (!p->mm && param->sched_priority > MAX_RT_PRIO-1))
 		return -EINVAL;
-	if ((policy == SCHED_NORMAL || policy == SCHED_BATCH)
-					!= (param->sched_priority == 0))
+	if (is_rt_policy(policy) != (param->sched_priority != 0))
 		return -EINVAL;
 
 	/*
@@ -4134,7 +4133,7 @@ recheck:
 				!rlim_rtprio)
 			return -EPERM;
 		/* can't increase priority */
-		if ((policy != SCHED_NORMAL && policy != SCHED_BATCH) &&
+		if (is_rt_policy(policy) &&
 		    param->sched_priority > p->rt_priority &&
 		    param->sched_priority > rlim_rtprio)
 			return -EPERM;
-- 
cgit v1.2.3


From 8dc3e9099e01dfdd53f27f329c268eced03dfef6 Mon Sep 17 00:00:00 2001
From: Oleg Nesterov <oleg@tv-sign.ru>
Date: Fri, 29 Sep 2006 02:00:50 -0700
Subject: [PATCH] sched_setscheduler: fix? policy checks

I am not sure this patch is correct: I can't understand what the current
code does, and I don't know what it was supposed to do.

The comment says:

		 * can't change policy, except between SCHED_NORMAL
		 * and SCHED_BATCH:

The code:

		if (((policy != SCHED_NORMAL && p->policy != SCHED_BATCH) &&
			(policy != SCHED_BATCH && p->policy != SCHED_NORMAL)) &&

But this is equivalent to:

		if ( (is_rt_policy(policy) && has_rt_policy(p)) &&

which means something different.  We can't _decrease_ the current
->rt_priority with such a check (if rlim[RLIMIT_RTPRIO] == 0).

Probably, it was supposed to be:

		if (	!(policy == SCHED_NORMAL && p->policy == SCHED_BATCH)  &&
			!(policy == SCHED_BATCH  && p->policy == SCHED_NORMAL)

this matches the comment, but strange: it doesn't allow to _drop_ the
realtime priority when rlim[RLIMIT_RTPRIO] == 0.

I think the right check would be:

		/* can't set/change rt policy */
		if (is_rt_policy(policy) &&
				policy != p->policy &&
				!rlim_rtprio)
			return -EPERM;

Signed-off-by: Oleg Nesterov <oleg@tv-sign.ru>
Cc: Thomas Gleixner <tglx@linutronix.de>
Cc: Ingo Molnar <mingo@elte.hu>
Cc: Steven Rostedt <rostedt@goodmis.org>
Cc: Nick Piggin <nickpiggin@yahoo.com.au>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
---
 kernel/sched.c | 38 ++++++++++++++++++--------------------
 1 file changed, 18 insertions(+), 20 deletions(-)

(limited to 'kernel')

diff --git a/kernel/sched.c b/kernel/sched.c
index c3c718aea618..155a33da7aa7 100644
--- a/kernel/sched.c
+++ b/kernel/sched.c
@@ -4116,27 +4116,25 @@ recheck:
 	 * Allow unprivileged RT tasks to decrease priority:
 	 */
 	if (!capable(CAP_SYS_NICE)) {
-		unsigned long rlim_rtprio;
-		unsigned long flags;
-
-		if (!lock_task_sighand(p, &flags))
-			return -ESRCH;
-		rlim_rtprio = p->signal->rlim[RLIMIT_RTPRIO].rlim_cur;
-		unlock_task_sighand(p, &flags);
+		if (is_rt_policy(policy)) {
+			unsigned long rlim_rtprio;
+			unsigned long flags;
+
+			if (!lock_task_sighand(p, &flags))
+				return -ESRCH;
+			rlim_rtprio = p->signal->rlim[RLIMIT_RTPRIO].rlim_cur;
+			unlock_task_sighand(p, &flags);
+
+			/* can't set/change the rt policy */
+			if (policy != p->policy && !rlim_rtprio)
+				return -EPERM;
+
+			/* can't increase priority */
+			if (param->sched_priority > p->rt_priority &&
+			    param->sched_priority > rlim_rtprio)
+				return -EPERM;
+		}
 
-		/*
-		 * can't change policy, except between SCHED_NORMAL
-		 * and SCHED_BATCH:
-		 */
-		if (((policy != SCHED_NORMAL && p->policy != SCHED_BATCH) &&
-			(policy != SCHED_BATCH && p->policy != SCHED_NORMAL)) &&
-				!rlim_rtprio)
-			return -EPERM;
-		/* can't increase priority */
-		if (is_rt_policy(policy) &&
-		    param->sched_priority > p->rt_priority &&
-		    param->sched_priority > rlim_rtprio)
-			return -EPERM;
 		/* can't change other user's priorities */
 		if ((current->euid != p->euid) &&
 		    (current->euid != p->uid))
-- 
cgit v1.2.3


From 1c573afebc6213e4372e0d8352034c23d5262e1f Mon Sep 17 00:00:00 2001
From: Oleg Nesterov <oleg@tv-sign.ru>
Date: Fri, 29 Sep 2006 02:00:51 -0700
Subject: [PATCH] reparent_to_init(): use has_rt_policy()

Remove open-coded has_rt_policy(), no changes in kernel/exit.o

Signed-off-by: Oleg Nesterov <oleg@tv-sign.ru>
Cc: Thomas Gleixner <tglx@linutronix.de>
Cc: Ingo Molnar <mingo@elte.hu>
Cc: Steven Rostedt <rostedt@goodmis.org>
Cc: Nick Piggin <nickpiggin@yahoo.com.au>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
---
 kernel/exit.c | 4 +---
 1 file changed, 1 insertion(+), 3 deletions(-)

(limited to 'kernel')

diff --git a/kernel/exit.c b/kernel/exit.c
index a51b347ae67b..4a280856acd2 100644
--- a/kernel/exit.c
+++ b/kernel/exit.c
@@ -281,9 +281,7 @@ static void reparent_to_init(void)
 	/* Set the exit signal to SIGCHLD so we signal init on exit */
 	current->exit_signal = SIGCHLD;
 
-	if ((current->policy == SCHED_NORMAL ||
-			current->policy == SCHED_BATCH)
-				&& (task_nice(current) < 0))
+	if (!has_rt_policy(current) && (task_nice(current) < 0))
 		set_user_nice(current, 0);
 	/* cpus_allowed? */
 	/* rt_priority? */
-- 
cgit v1.2.3


From 5b160f5ecd2f1b6df2e0015dc1f319c8ef803d62 Mon Sep 17 00:00:00 2001
From: Oleg Nesterov <oleg@tv-sign.ru>
Date: Fri, 29 Sep 2006 02:00:52 -0700
Subject: [PATCH] copy_process: cosmetic ->ioprio tweak

copy_process:
// holds tasklist_lock + ->siglock
       /*
        * inherit ioprio
        */
       p->ioprio = current->ioprio;

Why?  ->ioprio was already copied in dup_task_struct().  I guess this is
needed to ensure that the child can't escape
sys_ioprio_set(IOPRIO_WHO_{PGRP,USER}), yes?

In that case we don't need ->siglock held, and the comment should be
updated.

Signed-off-by: Oleg Nesterov <oleg@tv-sign.ru>
Cc: Jens Axboe <axboe@suse.de>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
---
 kernel/fork.c | 9 +++------
 1 file changed, 3 insertions(+), 6 deletions(-)

(limited to 'kernel')

diff --git a/kernel/fork.c b/kernel/fork.c
index bca6ce6d3ded..1c999f3e0b47 100644
--- a/kernel/fork.c
+++ b/kernel/fork.c
@@ -1150,7 +1150,6 @@ static struct task_struct *copy_process(unsigned long clone_flags,
 
 	/* Our parent execution domain becomes current domain
 	   These must match for thread signalling to apply */
-	   
 	p->parent_exec_id = p->self_exec_id;
 
 	/* ok, now we should be set up.. */
@@ -1173,6 +1172,9 @@ static struct task_struct *copy_process(unsigned long clone_flags,
 	/* Need tasklist lock for parent etc handling! */
 	write_lock_irq(&tasklist_lock);
 
+	/* for sys_ioprio_set(IOPRIO_WHO_PGRP) */
+	p->ioprio = current->ioprio;
+
 	/*
 	 * The task hasn't been attached yet, so its cpus_allowed mask will
 	 * not be changed, nor will its assigned CPU.
@@ -1232,11 +1234,6 @@ static struct task_struct *copy_process(unsigned long clone_flags,
 		}
 	}
 
-	/*
-	 * inherit ioprio
-	 */
-	p->ioprio = current->ioprio;
-
 	if (likely(p->pid)) {
 		add_parent(p);
 		if (unlikely(p->ptrace & PT_PTRACED))
-- 
cgit v1.2.3


From d359b549bf3d7f42f0084918a4816ea4572e507c Mon Sep 17 00:00:00 2001
From: Oleg Nesterov <oleg@tv-sign.ru>
Date: Fri, 29 Sep 2006 02:00:55 -0700
Subject: [PATCH] futex_find_get_task(): don't take tasklist_lock

It is ok to do find_task_by_pid() + get_task_struct() under
rcu_read_lock(), we cand drop tasklist_lock.

Note that testing of ->exit_state is racy with or without tasklist anyway.

Signed-off-by: Oleg Nesterov <oleg@tv-sign.ru>
Acked-by: Ingo Molnar <mingo@elte.hu>
Cc: Thomas Gleixner <tglx@linutronix.de>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
---
 kernel/futex.c | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

(limited to 'kernel')

diff --git a/kernel/futex.c b/kernel/futex.c
index 9d260e838cff..ca8ef11feb65 100644
--- a/kernel/futex.c
+++ b/kernel/futex.c
@@ -389,7 +389,7 @@ static struct task_struct * futex_find_get_task(pid_t pid)
 {
 	struct task_struct *p;
 
-	read_lock(&tasklist_lock);
+	rcu_read_lock();
 	p = find_task_by_pid(pid);
 	if (!p)
 		goto out_unlock;
@@ -403,7 +403,7 @@ static struct task_struct * futex_find_get_task(pid_t pid)
 	}
 	get_task_struct(p);
 out_unlock:
-	read_unlock(&tasklist_lock);
+	rcu_read_unlock();
 
 	return p;
 }
-- 
cgit v1.2.3


From aaa2a97eb9c0e91d7abc66bf76811a9599fdb3ee Mon Sep 17 00:00:00 2001
From: Oleg Nesterov <oleg@tv-sign.ru>
Date: Fri, 29 Sep 2006 02:00:55 -0700
Subject: [PATCH] sys_get_robust_list(): don't take tasklist_lock

use rcu locks for find_task_by_pid().

Signed-off-by: Oleg Nesterov <oleg@tv-sign.ru>
Cc: Ingo Molnar <mingo@elte.hu>
Cc: Thomas Gleixner <tglx@linutronix.de>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
---
 kernel/futex.c | 6 +++---
 1 file changed, 3 insertions(+), 3 deletions(-)

(limited to 'kernel')

diff --git a/kernel/futex.c b/kernel/futex.c
index ca8ef11feb65..4b6770e9806d 100644
--- a/kernel/futex.c
+++ b/kernel/futex.c
@@ -1624,7 +1624,7 @@ sys_get_robust_list(int pid, struct robust_list_head __user **head_ptr,
 		struct task_struct *p;
 
 		ret = -ESRCH;
-		read_lock(&tasklist_lock);
+		rcu_read_lock();
 		p = find_task_by_pid(pid);
 		if (!p)
 			goto err_unlock;
@@ -1633,7 +1633,7 @@ sys_get_robust_list(int pid, struct robust_list_head __user **head_ptr,
 				!capable(CAP_SYS_PTRACE))
 			goto err_unlock;
 		head = p->robust_list;
-		read_unlock(&tasklist_lock);
+		rcu_read_unlock();
 	}
 
 	if (put_user(sizeof(*head), len_ptr))
@@ -1641,7 +1641,7 @@ sys_get_robust_list(int pid, struct robust_list_head __user **head_ptr,
 	return put_user(head, head_ptr);
 
 err_unlock:
-	read_unlock(&tasklist_lock);
+	rcu_read_unlock();
 
 	return ret;
 }
-- 
cgit v1.2.3


From 29b884921634e1e01cbd276e1c9b8fc07a7e4a90 Mon Sep 17 00:00:00 2001
From: Oleg Nesterov <oleg@tv-sign.ru>
Date: Fri, 29 Sep 2006 02:01:09 -0700
Subject: [PATCH] set EXIT_DEAD state in do_exit(), not in schedule()

schedule() checks PF_DEAD on every context switch and sets ->state = EXIT_DEAD
to ensure that the exiting task will be deactivated.  Note that this EXIT_DEAD
is in fact a "random" value, we can use any bit except normal TASK_XXX values.

It is better to set this state in do_exit() along with PF_DEAD flag and remove
that check in schedule().

We are safe wrt concurrent try_to_wake_up() (for example ptrace, tkill), it
can not change task's ->state: the 'state' argument of try_to_wake_up() can't
have EXIT_DEAD bit.  And in case when try_to_wake_up() sees a stale value of
->state == TASK_RUNNING it will do nothing.

Signed-off-by: Oleg Nesterov <oleg@tv-sign.ru>
Cc: Ingo Molnar <mingo@elte.hu>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
---
 kernel/exit.c  | 1 +
 kernel/sched.c | 3 ---
 2 files changed, 1 insertion(+), 3 deletions(-)

(limited to 'kernel')

diff --git a/kernel/exit.c b/kernel/exit.c
index 4a280856acd2..3d759c98fb11 100644
--- a/kernel/exit.c
+++ b/kernel/exit.c
@@ -957,6 +957,7 @@ fastcall NORET_TYPE void do_exit(long code)
 	preempt_disable();
 	BUG_ON(tsk->flags & PF_DEAD);
 	tsk->flags |= PF_DEAD;
+	tsk->state = EXIT_DEAD;
 
 	schedule();
 	BUG();
diff --git a/kernel/sched.c b/kernel/sched.c
index 155a33da7aa7..e1646b044b69 100644
--- a/kernel/sched.c
+++ b/kernel/sched.c
@@ -3348,9 +3348,6 @@ need_resched_nonpreemptible:
 
 	spin_lock_irq(&rq->lock);
 
-	if (unlikely(prev->flags & PF_DEAD))
-		prev->state = EXIT_DEAD;
-
 	switch_count = &prev->nivcsw;
 	if (prev->state && !(preempt_count() & PREEMPT_ACTIVE)) {
 		switch_count = &prev->nvcsw;
-- 
cgit v1.2.3


From 55a101f8f71a3d3dbda7b5c77083ffe47552f831 Mon Sep 17 00:00:00 2001
From: Oleg Nesterov <oleg@tv-sign.ru>
Date: Fri, 29 Sep 2006 02:01:10 -0700
Subject: [PATCH] kill PF_DEAD flag

After the previous change (->flags & PF_DEAD) <=> (->state == EXIT_DEAD), we
don't need PF_DEAD any longer.

Signed-off-by: Oleg Nesterov <oleg@tv-sign.ru>
Cc: Ingo Molnar <mingo@elte.hu>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
---
 include/linux/sched.h |  1 -
 kernel/exit.c         |  6 ++----
 kernel/sched.c        | 16 ++++++++--------
 mm/oom_kill.c         |  4 ++--
 4 files changed, 12 insertions(+), 15 deletions(-)

(limited to 'kernel')

diff --git a/include/linux/sched.h b/include/linux/sched.h
index fbc69cc3923d..9763de334f09 100644
--- a/include/linux/sched.h
+++ b/include/linux/sched.h
@@ -1061,7 +1061,6 @@ static inline void put_task_struct(struct task_struct *t)
 					/* Not implemented yet, only for 486*/
 #define PF_STARTING	0x00000002	/* being created */
 #define PF_EXITING	0x00000004	/* getting shut down */
-#define PF_DEAD		0x00000008	/* Dead */
 #define PF_FORKNOEXEC	0x00000040	/* forked but didn't exec */
 #define PF_SUPERPRIV	0x00000100	/* used super-user privileges */
 #define PF_DUMPCORE	0x00000200	/* dumped core */
diff --git a/kernel/exit.c b/kernel/exit.c
index 3d759c98fb11..9dd5f1336da2 100644
--- a/kernel/exit.c
+++ b/kernel/exit.c
@@ -953,10 +953,8 @@ fastcall NORET_TYPE void do_exit(long code)
 	if (tsk->splice_pipe)
 		__free_pipe_info(tsk->splice_pipe);
 
-	/* PF_DEAD causes final put_task_struct after we schedule. */
 	preempt_disable();
-	BUG_ON(tsk->flags & PF_DEAD);
-	tsk->flags |= PF_DEAD;
+	/* causes final put_task_struct in finish_task_switch(). */
 	tsk->state = EXIT_DEAD;
 
 	schedule();
@@ -972,7 +970,7 @@ NORET_TYPE void complete_and_exit(struct completion *comp, long code)
 {
 	if (comp)
 		complete(comp);
-	
+
 	do_exit(code);
 }
 
diff --git a/kernel/sched.c b/kernel/sched.c
index e1646b044b69..a9405d7cc6ab 100644
--- a/kernel/sched.c
+++ b/kernel/sched.c
@@ -1755,27 +1755,27 @@ static inline void finish_task_switch(struct rq *rq, struct task_struct *prev)
 	__releases(rq->lock)
 {
 	struct mm_struct *mm = rq->prev_mm;
-	unsigned long prev_task_flags;
+	long prev_state;
 
 	rq->prev_mm = NULL;
 
 	/*
 	 * A task struct has one reference for the use as "current".
-	 * If a task dies, then it sets EXIT_ZOMBIE in tsk->exit_state and
-	 * calls schedule one last time. The schedule call will never return,
-	 * and the scheduled task must drop that reference.
-	 * The test for EXIT_ZOMBIE must occur while the runqueue locks are
+	 * If a task dies, then it sets EXIT_DEAD in tsk->state and calls
+	 * schedule one last time. The schedule call will never return, and
+	 * the scheduled task must drop that reference.
+	 * The test for EXIT_DEAD must occur while the runqueue locks are
 	 * still held, otherwise prev could be scheduled on another cpu, die
 	 * there before we look at prev->state, and then the reference would
 	 * be dropped twice.
 	 *		Manfred Spraul <manfred@colorfullife.com>
 	 */
-	prev_task_flags = prev->flags;
+	prev_state = prev->state;
 	finish_arch_switch(prev);
 	finish_lock_switch(rq, prev);
 	if (mm)
 		mmdrop(mm);
-	if (unlikely(prev_task_flags & PF_DEAD)) {
+	if (unlikely(prev_state == EXIT_DEAD)) {
 		/*
 		 * Remove function-return probe instances associated with this
 		 * task and put them back on the free list.
@@ -5153,7 +5153,7 @@ static void migrate_dead(unsigned int dead_cpu, struct task_struct *p)
 	BUG_ON(p->exit_state != EXIT_ZOMBIE && p->exit_state != EXIT_DEAD);
 
 	/* Cannot have done final schedule yet: would have vanished. */
-	BUG_ON(p->flags & PF_DEAD);
+	BUG_ON(p->state == EXIT_DEAD);
 
 	get_task_struct(p);
 
diff --git a/mm/oom_kill.c b/mm/oom_kill.c
index f3dd79c1c367..202f186a753a 100644
--- a/mm/oom_kill.c
+++ b/mm/oom_kill.c
@@ -226,8 +226,8 @@ static struct task_struct *select_bad_process(unsigned long *ppoints)
 		releasing = test_tsk_thread_flag(p, TIF_MEMDIE) ||
 						p->flags & PF_EXITING;
 		if (releasing) {
-			/* PF_DEAD tasks have already released their mm */
-			if (p->flags & PF_DEAD)
+			/* TASK_DEAD tasks have already released their mm */
+			if (p->state == EXIT_DEAD)
 				continue;
 			if (p->flags & PF_EXITING && p == current) {
 				chosen = p;
-- 
cgit v1.2.3


From c394cc9fbb367f87faa2228ec2eabacd2d4701c6 Mon Sep 17 00:00:00 2001
From: Oleg Nesterov <oleg@tv-sign.ru>
Date: Fri, 29 Sep 2006 02:01:11 -0700
Subject: [PATCH] introduce TASK_DEAD state

I am not sure about this patch, I am asking Ingo to take a decision.

task_struct->state == EXIT_DEAD is a very special case, to avoid a confusion
it makes sense to introduce a new state, TASK_DEAD, while EXIT_DEAD should
live only in ->exit_state as documented in sched.h.

Note that this state is not visible to user-space, get_task_state() masks off
unsuitable states.

Signed-off-by: Oleg Nesterov <oleg@tv-sign.ru>
Cc: Ingo Molnar <mingo@elte.hu>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
---
 include/linux/sched.h | 1 +
 kernel/exit.c         | 2 +-
 kernel/sched.c        | 8 ++++----
 mm/oom_kill.c         | 2 +-
 4 files changed, 7 insertions(+), 6 deletions(-)

(limited to 'kernel')

diff --git a/include/linux/sched.h b/include/linux/sched.h
index 9763de334f09..a06fc89cf6e5 100644
--- a/include/linux/sched.h
+++ b/include/linux/sched.h
@@ -148,6 +148,7 @@ extern unsigned long weighted_cpuload(const int cpu);
 #define EXIT_DEAD		32
 /* in tsk->state again */
 #define TASK_NONINTERACTIVE	64
+#define TASK_DEAD		128
 
 #define __set_task_state(tsk, state_value)		\
 	do { (tsk)->state = (state_value); } while (0)
diff --git a/kernel/exit.c b/kernel/exit.c
index 9dd5f1336da2..2e4c13cba95a 100644
--- a/kernel/exit.c
+++ b/kernel/exit.c
@@ -955,7 +955,7 @@ fastcall NORET_TYPE void do_exit(long code)
 
 	preempt_disable();
 	/* causes final put_task_struct in finish_task_switch(). */
-	tsk->state = EXIT_DEAD;
+	tsk->state = TASK_DEAD;
 
 	schedule();
 	BUG();
diff --git a/kernel/sched.c b/kernel/sched.c
index a9405d7cc6ab..74f169ac0773 100644
--- a/kernel/sched.c
+++ b/kernel/sched.c
@@ -1761,10 +1761,10 @@ static inline void finish_task_switch(struct rq *rq, struct task_struct *prev)
 
 	/*
 	 * A task struct has one reference for the use as "current".
-	 * If a task dies, then it sets EXIT_DEAD in tsk->state and calls
+	 * If a task dies, then it sets TASK_DEAD in tsk->state and calls
 	 * schedule one last time. The schedule call will never return, and
 	 * the scheduled task must drop that reference.
-	 * The test for EXIT_DEAD must occur while the runqueue locks are
+	 * The test for TASK_DEAD must occur while the runqueue locks are
 	 * still held, otherwise prev could be scheduled on another cpu, die
 	 * there before we look at prev->state, and then the reference would
 	 * be dropped twice.
@@ -1775,7 +1775,7 @@ static inline void finish_task_switch(struct rq *rq, struct task_struct *prev)
 	finish_lock_switch(rq, prev);
 	if (mm)
 		mmdrop(mm);
-	if (unlikely(prev_state == EXIT_DEAD)) {
+	if (unlikely(prev_state == TASK_DEAD)) {
 		/*
 		 * Remove function-return probe instances associated with this
 		 * task and put them back on the free list.
@@ -5153,7 +5153,7 @@ static void migrate_dead(unsigned int dead_cpu, struct task_struct *p)
 	BUG_ON(p->exit_state != EXIT_ZOMBIE && p->exit_state != EXIT_DEAD);
 
 	/* Cannot have done final schedule yet: would have vanished. */
-	BUG_ON(p->state == EXIT_DEAD);
+	BUG_ON(p->state == TASK_DEAD);
 
 	get_task_struct(p);
 
diff --git a/mm/oom_kill.c b/mm/oom_kill.c
index 202f186a753a..21f0a7e8e514 100644
--- a/mm/oom_kill.c
+++ b/mm/oom_kill.c
@@ -227,7 +227,7 @@ static struct task_struct *select_bad_process(unsigned long *ppoints)
 						p->flags & PF_EXITING;
 		if (releasing) {
 			/* TASK_DEAD tasks have already released their mm */
-			if (p->state == EXIT_DEAD)
+			if (p->state == TASK_DEAD)
 				continue;
 			if (p->flags & PF_EXITING && p == current) {
 				chosen = p;
-- 
cgit v1.2.3


From 38837fc75acb7fa9b0e111b0241fe4fe76c5d4b3 Mon Sep 17 00:00:00 2001
From: Paul Jackson <pj@sgi.com>
Date: Fri, 29 Sep 2006 02:01:16 -0700
Subject: [PATCH] cpuset: top_cpuset tracks hotplug changes to node_online_map

Change the list of memory nodes allowed to tasks in the top (root) nodeset
to dynamically track what cpus are online, using a call to a cpuset hook
from the memory hotplug code.  Make this top cpus file read-only.

On systems that have cpusets configured in their kernel, but that aren't
actively using cpusets (for some distros, this covers the majority of
systems) all tasks end up in the top cpuset.

If that system does support memory hotplug, then these tasks cannot make
use of memory nodes that are added after system boot, because the memory
nodes are not allowed in the top cpuset.  This is a surprising regression
over earlier kernels that didn't have cpusets enabled.

One key motivation for this change is to remain consistent with the
behaviour for the top_cpuset's 'cpus', which is also read-only, and which
automatically tracks the cpu_online_map.

This change also has the minor benefit that it fixes a long standing,
little noticed, minor bug in cpusets.  The cpuset performance tweak to
short circuit the cpuset_zone_allowed() check on systems with just a single
cpuset (see 'number_of_cpusets', in linux/cpuset.h) meant that simply
changing the 'mems' of the top_cpuset had no affect, even though the change
(the write system call) appeared to succeed.  With the following change,
that write to the 'mems' file fails -EACCES, and the 'mems' file stubbornly
refuses to be changed via user space writes.  Thus no one should be mislead
into thinking they've changed the top_cpusets's 'mems' when in affect they
haven't.

In order to keep the behaviour of cpusets consistent between systems
actively making use of them and systems not using them, this patch changes
the behaviour of the 'mems' file in the top (root) cpuset, making it read
only, and making it automatically track the value of node_online_map.  Thus
tasks in the top cpuset will have automatic use of hot plugged memory nodes
allowed by their cpuset.

[akpm@osdl.org: build fix]
[bunk@stusta.de: build fix]
Signed-off-by: Paul Jackson <pj@sgi.com>
Signed-off-by: Adrian Bunk <bunk@stusta.de>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
---
 Documentation/cpusets.txt | 10 +++++-----
 include/linux/cpuset.h    |  4 ++++
 kernel/cpuset.c           | 28 +++++++++++++++++++++++++---
 mm/memory_hotplug.c       |  3 +++
 4 files changed, 37 insertions(+), 8 deletions(-)

(limited to 'kernel')

diff --git a/Documentation/cpusets.txt b/Documentation/cpusets.txt
index 76b44290c154..842f0d1ab216 100644
--- a/Documentation/cpusets.txt
+++ b/Documentation/cpusets.txt
@@ -217,11 +217,11 @@ exclusive cpuset.  Also, the use of a Linux virtual file system (vfs)
 to represent the cpuset hierarchy provides for a familiar permission
 and name space for cpusets, with a minimum of additional kernel code.
 
-The cpus file in the root (top_cpuset) cpuset is read-only.
-It automatically tracks the value of cpu_online_map, using a CPU
-hotplug notifier.  If and when memory nodes can be hotplugged,
-we expect to make the mems file in the root cpuset read-only
-as well, and have it track the value of node_online_map.
+The cpus and mems files in the root (top_cpuset) cpuset are
+read-only.  The cpus file automatically tracks the value of
+cpu_online_map using a CPU hotplug notifier, and the mems file
+automatically tracks the value of node_online_map using the
+cpuset_track_online_nodes() hook.
 
 
 1.4 What are exclusive cpusets ?
diff --git a/include/linux/cpuset.h b/include/linux/cpuset.h
index 9354722a9217..4d8adf663681 100644
--- a/include/linux/cpuset.h
+++ b/include/linux/cpuset.h
@@ -63,6 +63,8 @@ static inline int cpuset_do_slab_mem_spread(void)
 	return current->flags & PF_SPREAD_SLAB;
 }
 
+extern void cpuset_track_online_nodes(void);
+
 #else /* !CONFIG_CPUSETS */
 
 static inline int cpuset_init_early(void) { return 0; }
@@ -126,6 +128,8 @@ static inline int cpuset_do_slab_mem_spread(void)
 	return 0;
 }
 
+static inline void cpuset_track_online_nodes(void) {}
+
 #endif /* !CONFIG_CPUSETS */
 
 #endif /* _LINUX_CPUSET_H */
diff --git a/kernel/cpuset.c b/kernel/cpuset.c
index 584bb4e6c042..794af5024c2f 100644
--- a/kernel/cpuset.c
+++ b/kernel/cpuset.c
@@ -912,6 +912,10 @@ static int update_nodemask(struct cpuset *cs, char *buf)
 	int fudge;
 	int retval;
 
+	/* top_cpuset.mems_allowed tracks node_online_map; it's read-only */
+	if (cs == &top_cpuset)
+		return -EACCES;
+
 	trialcs = *cs;
 	retval = nodelist_parse(buf, trialcs.mems_allowed);
 	if (retval < 0)
@@ -2042,9 +2046,8 @@ out:
  * (of no affect) on systems that are actively using CPU hotplug
  * but making no active use of cpusets.
  *
- * This handles CPU hotplug (cpuhp) events.  If someday Memory
- * Nodes can be hotplugged (dynamically changing node_online_map)
- * then we should handle that too, perhaps in a similar way.
+ * This routine ensures that top_cpuset.cpus_allowed tracks
+ * cpu_online_map on each CPU hotplug (cpuhp) event.
  */
 
 #ifdef CONFIG_HOTPLUG_CPU
@@ -2063,6 +2066,25 @@ static int cpuset_handle_cpuhp(struct notifier_block *nb,
 }
 #endif
 
+/*
+ * Keep top_cpuset.mems_allowed tracking node_online_map.
+ * Call this routine anytime after you change node_online_map.
+ * See also the previous routine cpuset_handle_cpuhp().
+ */
+
+#ifdef CONFIG_MEMORY_HOTPLUG
+void cpuset_track_online_nodes()
+{
+	mutex_lock(&manage_mutex);
+	mutex_lock(&callback_mutex);
+
+	top_cpuset.mems_allowed = node_online_map;
+
+	mutex_unlock(&callback_mutex);
+	mutex_unlock(&manage_mutex);
+}
+#endif
+
 /**
  * cpuset_init_smp - initialize cpus_allowed
  *
diff --git a/mm/memory_hotplug.c b/mm/memory_hotplug.c
index c37319542b70..9576ed920c0a 100644
--- a/mm/memory_hotplug.c
+++ b/mm/memory_hotplug.c
@@ -21,6 +21,7 @@
 #include <linux/highmem.h>
 #include <linux/vmalloc.h>
 #include <linux/ioport.h>
+#include <linux/cpuset.h>
 
 #include <asm/tlbflush.h>
 
@@ -283,6 +284,8 @@ int add_memory(int nid, u64 start, u64 size)
 	/* we online node here. we can't roll back from here. */
 	node_set_online(nid);
 
+	cpuset_track_online_nodes();
+
 	if (new_pgdat) {
 		ret = register_one_node(nid);
 		/*
-- 
cgit v1.2.3


From b1aac8bb824c658ddebd296b088a8bff5029c288 Mon Sep 17 00:00:00 2001
From: Paul Jackson <pj@sgi.com>
Date: Fri, 29 Sep 2006 02:01:17 -0700
Subject: [PATCH] cpuset: hotunplug cpus and mems in all cpusets

The cpuset code handling hot unplug of CPUs or Memory Nodes was incorrect -
it could remove a CPU or Node from the top cpuset, while leaving it still
in some child cpusets.

One basic rule of cpusets is that each cpusets cpus and mems are subsets of
its parents.  The cpuset hot unplug code violated this rule.

So the cpuset hotunplug handler must walk down the tree, removing any
removed CPU or Node from all cpusets.

However, it is not allowed to make a cpusets cpus or mems become empty.
They can only transition from empty to non-empty, not back.

So if the last CPU or Node would be removed from a cpuset by the above
walk, we scan back up the cpuset hierarchy, finding the nearest ancestor
that still has something online, and copy its CPU or Memory placement.

Signed-off-by: Paul Jackson <pj@sgi.com>
Cc: Nathan Lynch <ntl@pobox.com>
Cc: Anton Blanchard <anton@samba.org>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
---
 kernel/cpuset.c | 87 ++++++++++++++++++++++++++++++++++++++++++++++-----------
 1 file changed, 70 insertions(+), 17 deletions(-)

(limited to 'kernel')

diff --git a/kernel/cpuset.c b/kernel/cpuset.c
index 794af5024c2f..cc0395d7eba1 100644
--- a/kernel/cpuset.c
+++ b/kernel/cpuset.c
@@ -2040,6 +2040,73 @@ out:
 	return err;
 }
 
+#if defined(CONFIG_HOTPLUG_CPU) || defined(CONFIG_MEMORY_HOTPLUG)
+/*
+ * If common_cpu_mem_hotplug_unplug(), below, unplugs any CPUs
+ * or memory nodes, we need to walk over the cpuset hierarchy,
+ * removing that CPU or node from all cpusets.  If this removes the
+ * last CPU or node from a cpuset, then the guarantee_online_cpus()
+ * or guarantee_online_mems() code will use that emptied cpusets
+ * parent online CPUs or nodes.  Cpusets that were already empty of
+ * CPUs or nodes are left empty.
+ *
+ * This routine is intentionally inefficient in a couple of regards.
+ * It will check all cpusets in a subtree even if the top cpuset of
+ * the subtree has no offline CPUs or nodes.  It checks both CPUs and
+ * nodes, even though the caller could have been coded to know that
+ * only one of CPUs or nodes needed to be checked on a given call.
+ * This was done to minimize text size rather than cpu cycles.
+ *
+ * Call with both manage_mutex and callback_mutex held.
+ *
+ * Recursive, on depth of cpuset subtree.
+ */
+
+static void guarantee_online_cpus_mems_in_subtree(const struct cpuset *cur)
+{
+	struct cpuset *c;
+
+	/* Each of our child cpusets mems must be online */
+	list_for_each_entry(c, &cur->children, sibling) {
+		guarantee_online_cpus_mems_in_subtree(c);
+		if (!cpus_empty(c->cpus_allowed))
+			guarantee_online_cpus(c, &c->cpus_allowed);
+		if (!nodes_empty(c->mems_allowed))
+			guarantee_online_mems(c, &c->mems_allowed);
+	}
+}
+
+/*
+ * The cpus_allowed and mems_allowed nodemasks in the top_cpuset track
+ * cpu_online_map and node_online_map.  Force the top cpuset to track
+ * whats online after any CPU or memory node hotplug or unplug event.
+ *
+ * To ensure that we don't remove a CPU or node from the top cpuset
+ * that is currently in use by a child cpuset (which would violate
+ * the rule that cpusets must be subsets of their parent), we first
+ * call the recursive routine guarantee_online_cpus_mems_in_subtree().
+ *
+ * Since there are two callers of this routine, one for CPU hotplug
+ * events and one for memory node hotplug events, we could have coded
+ * two separate routines here.  We code it as a single common routine
+ * in order to minimize text size.
+ */
+
+static void common_cpu_mem_hotplug_unplug(void)
+{
+	mutex_lock(&manage_mutex);
+	mutex_lock(&callback_mutex);
+
+	guarantee_online_cpus_mems_in_subtree(&top_cpuset);
+	top_cpuset.cpus_allowed = cpu_online_map;
+	top_cpuset.mems_allowed = node_online_map;
+
+	mutex_unlock(&callback_mutex);
+	mutex_unlock(&manage_mutex);
+}
+#endif
+
+#ifdef CONFIG_HOTPLUG_CPU
 /*
  * The top_cpuset tracks what CPUs and Memory Nodes are online,
  * period.  This is necessary in order to make cpusets transparent
@@ -2050,38 +2117,24 @@ out:
  * cpu_online_map on each CPU hotplug (cpuhp) event.
  */
 
-#ifdef CONFIG_HOTPLUG_CPU
 static int cpuset_handle_cpuhp(struct notifier_block *nb,
 				unsigned long phase, void *cpu)
 {
-	mutex_lock(&manage_mutex);
-	mutex_lock(&callback_mutex);
-
-	top_cpuset.cpus_allowed = cpu_online_map;
-
-	mutex_unlock(&callback_mutex);
-	mutex_unlock(&manage_mutex);
-
+	common_cpu_mem_hotplug_unplug();
 	return 0;
 }
 #endif
 
+#ifdef CONFIG_MEMORY_HOTPLUG
 /*
  * Keep top_cpuset.mems_allowed tracking node_online_map.
  * Call this routine anytime after you change node_online_map.
  * See also the previous routine cpuset_handle_cpuhp().
  */
 
-#ifdef CONFIG_MEMORY_HOTPLUG
 void cpuset_track_online_nodes()
 {
-	mutex_lock(&manage_mutex);
-	mutex_lock(&callback_mutex);
-
-	top_cpuset.mems_allowed = node_online_map;
-
-	mutex_unlock(&callback_mutex);
-	mutex_unlock(&manage_mutex);
+	common_cpu_mem_hotplug_unplug();
 }
 #endif
 
-- 
cgit v1.2.3


From 04b1db9fd7eea63c9663072feece616ea41b0a79 Mon Sep 17 00:00:00 2001
From: "Ian S. Nelson" <nelsonian@comcast.net>
Date: Fri, 29 Sep 2006 02:01:31 -0700
Subject: [PATCH] /sys/modules: allow full length section names

I've been using systemtap for some debugging and I noticed that it can't
probe a lot of modules.  Turns out it's kind of silly, the sections section
of /sys/module is limited to 32byte filenames and many of the actual
sections are a a bit longer than that.

[akpm@osdl.org: rewrite to use dymanic allocation]
Cc: Rusty Russell <rusty@rustcorp.com.au>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
---
 CREDITS                |  3 ++-
 include/linux/module.h |  4 ++--
 kernel/module.c        | 26 ++++++++++++++++++++------
 3 files changed, 24 insertions(+), 9 deletions(-)

(limited to 'kernel')

diff --git a/CREDITS b/CREDITS
index 8feb2bb49e35..66e82466dde8 100644
--- a/CREDITS
+++ b/CREDITS
@@ -2478,7 +2478,8 @@ S: Derbyshire DE4 3RL
 S: United Kingdom
 
 N: Ian S. Nelson
-E: ian.nelson@echostar.com
+E: nelsonis@earthlink.net
+P: 1024D/00D3D983 3EFD 7B86 B888 D7E2 29B6  9E97 576F 1B97 00D3 D983
 D: Minor mmap and ide hacks
 S: 1370 Atlantis Ave.
 S: Lafayette CO, 80026
diff --git a/include/linux/module.h b/include/linux/module.h
index d4486cc2e7fe..2c599175c583 100644
--- a/include/linux/module.h
+++ b/include/linux/module.h
@@ -232,17 +232,17 @@ enum module_state
 };
 
 /* Similar stuff for section attributes. */
-#define MODULE_SECT_NAME_LEN 32
 struct module_sect_attr
 {
 	struct module_attribute mattr;
-	char name[MODULE_SECT_NAME_LEN];
+	char *name;
 	unsigned long address;
 };
 
 struct module_sect_attrs
 {
 	struct attribute_group grp;
+	int nsections;
 	struct module_sect_attr attrs[0];
 };
 
diff --git a/kernel/module.c b/kernel/module.c
index b7fe6e840963..05625d5dc758 100644
--- a/kernel/module.c
+++ b/kernel/module.c
@@ -933,6 +933,15 @@ static ssize_t module_sect_show(struct module_attribute *mattr,
 	return sprintf(buf, "0x%lx\n", sattr->address);
 }
 
+static void free_sect_attrs(struct module_sect_attrs *sect_attrs)
+{
+	int section;
+
+	for (section = 0; section < sect_attrs->nsections; section++)
+		kfree(sect_attrs->attrs[section].name);
+	kfree(sect_attrs);
+}
+
 static void add_sect_attrs(struct module *mod, unsigned int nsect,
 		char *secstrings, Elf_Shdr *sechdrs)
 {
@@ -949,21 +958,26 @@ static void add_sect_attrs(struct module *mod, unsigned int nsect,
 			+ nloaded * sizeof(sect_attrs->attrs[0]),
 			sizeof(sect_attrs->grp.attrs[0]));
 	size[1] = (nloaded + 1) * sizeof(sect_attrs->grp.attrs[0]);
-	if (! (sect_attrs = kmalloc(size[0] + size[1], GFP_KERNEL)))
+	sect_attrs = kzalloc(size[0] + size[1], GFP_KERNEL);
+	if (sect_attrs == NULL)
 		return;
 
 	/* Setup section attributes. */
 	sect_attrs->grp.name = "sections";
 	sect_attrs->grp.attrs = (void *)sect_attrs + size[0];
 
+	sect_attrs->nsections = 0;
 	sattr = &sect_attrs->attrs[0];
 	gattr = &sect_attrs->grp.attrs[0];
 	for (i = 0; i < nsect; i++) {
 		if (! (sechdrs[i].sh_flags & SHF_ALLOC))
 			continue;
 		sattr->address = sechdrs[i].sh_addr;
-		strlcpy(sattr->name, secstrings + sechdrs[i].sh_name,
-			MODULE_SECT_NAME_LEN);
+		sattr->name = kstrdup(secstrings + sechdrs[i].sh_name,
+					GFP_KERNEL);
+		if (sattr->name == NULL)
+			goto out;
+		sect_attrs->nsections++;
 		sattr->mattr.show = module_sect_show;
 		sattr->mattr.store = NULL;
 		sattr->mattr.attr.name = sattr->name;
@@ -979,7 +993,7 @@ static void add_sect_attrs(struct module *mod, unsigned int nsect,
 	mod->sect_attrs = sect_attrs;
 	return;
   out:
-	kfree(sect_attrs);
+	free_sect_attrs(sect_attrs);
 }
 
 static void remove_sect_attrs(struct module *mod)
@@ -989,13 +1003,13 @@ static void remove_sect_attrs(struct module *mod)
 				   &mod->sect_attrs->grp);
 		/* We are positive that no one is using any sect attrs
 		 * at this point.  Deallocate immediately. */
-		kfree(mod->sect_attrs);
+		free_sect_attrs(mod->sect_attrs);
 		mod->sect_attrs = NULL;
 	}
 }
 
-
 #else
+
 static inline void add_sect_attrs(struct module *mod, unsigned int nsect,
 		char *sectstrings, Elf_Shdr *sechdrs)
 {
-- 
cgit v1.2.3


From e5582ca21af82929d5cd3613321ac9233c492ebc Mon Sep 17 00:00:00 2001
From: Rusty Russell <rusty@rustcorp.com.au>
Date: Fri, 29 Sep 2006 02:01:35 -0700
Subject: [PATCH] stop_machine.c copyright

I had to look back: this code was extracted from the module.c code in 2005.

Signed-off-by: Rusty Russell <rusty@rustcorp.com.au>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
---
 kernel/stop_machine.c | 3 +++
 1 file changed, 3 insertions(+)

(limited to 'kernel')

diff --git a/kernel/stop_machine.c b/kernel/stop_machine.c
index 51cacd111dbd..12458040e665 100644
--- a/kernel/stop_machine.c
+++ b/kernel/stop_machine.c
@@ -1,3 +1,6 @@
+/* Copyright 2005 Rusty Russell rusty@rustcorp.com.au IBM Corporation.
+ * GPL v2 and any later version.
+ */
 #include <linux/stop_machine.h>
 #include <linux/kthread.h>
 #include <linux/sched.h>
-- 
cgit v1.2.3


From eb84a20e9e6b98dcb33023ad22241d79107a08a7 Mon Sep 17 00:00:00 2001
From: Alan Cox <alan@lxorguk.ukuu.org.uk>
Date: Fri, 29 Sep 2006 02:01:41 -0700
Subject: [PATCH] audit/accounting: tty locking

Add tty locking around the audit and accounting code.

The whole current->signal-> locking is all deeply strange but it's for
someone else to sort out.  Add rather than replace the lock for acct.c

Signed-off-by: Alan Cox <alan@redhat.com>
Acked-by: Arjan van de Ven <arjan@linux.intel.com>
Cc: Al Viro <viro@zeniv.linux.org.uk>
Cc: Oleg Nesterov <oleg@tv-sign.ru>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
---
 kernel/acct.c    | 6 +++++-
 kernel/auditsc.c | 5 +++++
 2 files changed, 10 insertions(+), 1 deletion(-)

(limited to 'kernel')

diff --git a/kernel/acct.c b/kernel/acct.c
index 2a7c933651c7..f4330acead46 100644
--- a/kernel/acct.c
+++ b/kernel/acct.c
@@ -483,10 +483,14 @@ static void do_acct_process(struct file *file)
 	ac.ac_ppid = current->parent->tgid;
 #endif
 
-	read_lock(&tasklist_lock);	/* pin current->signal */
+	mutex_lock(&tty_mutex);
+	/* FIXME: Whoever is responsible for current->signal locking needs
+	   to use the same locking all over the kernel and document it */
+	read_lock(&tasklist_lock);
 	ac.ac_tty = current->signal->tty ?
 		old_encode_dev(tty_devnum(current->signal->tty)) : 0;
 	read_unlock(&tasklist_lock);
+	mutex_unlock(&tty_mutex);
 
 	spin_lock_irq(&current->sighand->siglock);
 	ac.ac_utime = encode_comp_t(jiffies_to_AHZ(cputime_to_jiffies(pacct->ac_utime)));
diff --git a/kernel/auditsc.c b/kernel/auditsc.c
index fb83c5cb8c32..105147631753 100644
--- a/kernel/auditsc.c
+++ b/kernel/auditsc.c
@@ -817,6 +817,8 @@ static void audit_log_exit(struct audit_context *context, struct task_struct *ts
 		audit_log_format(ab, " success=%s exit=%ld", 
 				 (context->return_valid==AUDITSC_SUCCESS)?"yes":"no",
 				 context->return_code);
+
+	mutex_lock(&tty_mutex);
 	if (tsk->signal && tsk->signal->tty && tsk->signal->tty->name)
 		tty = tsk->signal->tty->name;
 	else
@@ -838,6 +840,9 @@ static void audit_log_exit(struct audit_context *context, struct task_struct *ts
 		  context->gid,
 		  context->euid, context->suid, context->fsuid,
 		  context->egid, context->sgid, context->fsgid, tty);
+
+	mutex_unlock(&tty_mutex);
+
 	audit_log_task_info(ab, tsk);
 	if (context->filterkey) {
 		audit_log_format(ab, " key=");
-- 
cgit v1.2.3


From 03cbc358aab3faf34bfeaa02206fa660127e9b3f Mon Sep 17 00:00:00 2001
From: Ingo Molnar <mingo@elte.hu>
Date: Fri, 29 Sep 2006 02:01:46 -0700
Subject: [PATCH] lockdep core: improve the lock-chain-hash

With CONFIG_DEBUG_LOCK_ALLOC turned off i was getting sporadic failures in
the locking self-test:

  ------------>
  | Locking API testsuite:
  ----------------------------------------------------------------------------
                                   | spin |wlock |rlock |mutex | wsem | rsem |
    --------------------------------------------------------------------------
                       A-A deadlock:  ok  |  ok  |  ok  |  ok  |  ok  |  ok  |
                   A-B-B-A deadlock:  ok  |  ok  |  ok  |  ok  |  ok  |  ok  |
               A-B-B-C-C-A deadlock:  ok  |  ok  |  ok  |  ok  |  ok  |  ok  |
               A-B-C-A-B-C deadlock:  ok  |  ok  |  ok  |  ok  |  ok  |  ok  |
           A-B-B-C-C-D-D-A deadlock:  ok  |FAILED|  ok  |  ok  |  ok  |  ok  |
           A-B-C-D-B-D-D-A deadlock:  ok  |  ok  |  ok  |  ok  |  ok  |  ok  |
           A-B-C-D-B-C-D-A deadlock:  ok  |  ok  |  ok  |  ok  |  ok  |FAILED|

after much debugging it turned out to be caused by accidental chain-hash
key collisions.  The current hash is:

 #define iterate_chain_key(key1, key2) \
	(((key1) << MAX_LOCKDEP_KEYS_BITS/2) ^ \
	((key1) >> (64-MAX_LOCKDEP_KEYS_BITS/2)) ^ \
 	(key2))

where MAX_LOCKDEP_KEYS_BITS is 11.  This hash is pretty good as it will
shift by 5 bits in every iteration, where every new ID 'mixed' into the
hash would have up to 11 bits.  But because there was a 6 bits overlap
between subsequent IDs and their high bits tended to be similar, there was
a chance for accidental chain-hash collision for a low number of locks
held.

the solution is to shift by 11 bits:

 #define iterate_chain_key(key1, key2) \
	(((key1) << MAX_LOCKDEP_KEYS_BITS) ^ \
	((key1) >> (64-MAX_LOCKDEP_KEYS_BITS)) ^ \
 	(key2))

This keeps the hash perfect up to 5 locks held, but even above that the
hash is still good because 11 bits is a relative prime to the total 64
bits, so a complete match will only occur after 64 held locks (which doesnt
happen in Linux).  Even after 5 locks held, entropy of the 5 IDs mixed into
the hash is already good enough so that overlap doesnt generate a colliding
hash ID.

with this change the false positives went away.

Signed-off-by: Ingo Molnar <mingo@elte.hu>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
---
 kernel/lockdep.c | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

(limited to 'kernel')

diff --git a/kernel/lockdep.c b/kernel/lockdep.c
index df1c3594de31..e596525669ed 100644
--- a/kernel/lockdep.c
+++ b/kernel/lockdep.c
@@ -122,8 +122,8 @@ static struct list_head chainhash_table[CHAINHASH_SIZE];
  * unique.
  */
 #define iterate_chain_key(key1, key2) \
-	(((key1) << MAX_LOCKDEP_KEYS_BITS/2) ^ \
-	((key1) >> (64-MAX_LOCKDEP_KEYS_BITS/2)) ^ \
+	(((key1) << MAX_LOCKDEP_KEYS_BITS) ^ \
+	((key1) >> (64-MAX_LOCKDEP_KEYS_BITS)) ^ \
 	(key2))
 
 void lockdep_off(void)
-- 
cgit v1.2.3


From 181b64803661209cda64e5e874ad75f373a69de8 Mon Sep 17 00:00:00 2001
From: Paul Jackson <pj@sgi.com>
Date: Fri, 29 Sep 2006 02:01:48 -0700
Subject: [PATCH] cpuset: fix obscure attach_task vs exiting race

Fix obscure race condition in kernel/cpuset.c attach_task() code.

There is basically zero chance of anyone accidentally being harmed by this
race.

It requires a special 'micro-stress' load and a special timing loop hacks
in the kernel to hit in less than an hour, and even then you'd have to hit
it hundreds or thousands of times, followed by some unusual and senseless
cpuset configuration requests, including removing the top cpuset, to cause
any visibly harm affects.

One could, with perhaps a few days or weeks of such effort, get the
reference count on the top cpuset below zero, and manage to crash the
kernel by asking to remove the top cpuset.

I found it by code inspection.

The race was introduced when 'the_top_cpuset_hack' was introduced, and one
piece of code was not updated.  An old check for a possibly null task
cpuset pointer needed to be changed to a check for a task marked
PF_EXITING.  The pointer can't be null anymore, thanks to
the_top_cpuset_hack (documented in kernel/cpuset.c).  But the task could
have gone into PF_EXITING state after it was found in the task_list scan.

If a task is PF_EXITING in this code, it is possible that its task->cpuset
pointer is pointing to the top cpuset due to the_top_cpuset_hack, rather
than because the top_cpuset was that tasks last valid cpuset.  In that
case, the wrong cpuset reference counter would be decremented.

The fix is trivial.  Instead of failing the system call if the tasks cpuset
pointer is null here, fail it if the task is in PF_EXITING state.

The code for 'the_top_cpuset_hack' that changes an exiting tasks cpuset to
the top_cpuset is done without locking, so could happen at anytime.  But it
is done during the exit handling, after the PF_EXITING flag is set.  So if
we verify that a task is still not PF_EXITING after we copy out its cpuset
pointer (into 'oldcs', below), we know that 'oldcs' is not one of these
hack references to the top_cpuset.

Signed-off-by: Paul Jackson <pj@sgi.com>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
---
 kernel/cpuset.c | 7 ++++++-
 1 file changed, 6 insertions(+), 1 deletion(-)

(limited to 'kernel')

diff --git a/kernel/cpuset.c b/kernel/cpuset.c
index cc0395d7eba1..8c3c400cce91 100644
--- a/kernel/cpuset.c
+++ b/kernel/cpuset.c
@@ -1225,7 +1225,12 @@ static int attach_task(struct cpuset *cs, char *pidbuf, char **ppathbuf)
 
 	task_lock(tsk);
 	oldcs = tsk->cpuset;
-	if (!oldcs) {
+	/*
+	 * After getting 'oldcs' cpuset ptr, be sure still not exiting.
+	 * If 'oldcs' might be the top_cpuset due to the_top_cpuset_hack
+	 * then fail this attach_task(), to avoid breaking top_cpuset.count.
+	 */
+	if (tsk->flags & PF_EXITING) {
 		task_unlock(tsk);
 		mutex_unlock(&callback_mutex);
 		put_task_struct(tsk);
-- 
cgit v1.2.3


From 29cbc78b90a73ad80f2f58ba2927956cf663abed Mon Sep 17 00:00:00 2001
From: Andi Kleen <ak@suse.de>
Date: Sat, 30 Sep 2006 01:47:55 +0200
Subject: [PATCH] x86: Clean up x86 NMI sysctls

Use prototypes in headers
Don't define panic_on_unrecovered_nmi for all architectures

Cc: dzickus@redhat.com

Signed-off-by: Andi Kleen <ak@suse.de>
---
 arch/i386/kernel/nmi.c   |  3 +++
 arch/i386/kernel/traps.c |  2 ++
 arch/x86_64/kernel/nmi.c |  4 ++++
 include/asm-i386/nmi.h   |  6 ++++++
 include/asm-x86_64/nmi.h |  7 +++++++
 kernel/panic.c           |  1 -
 kernel/sysctl.c          | 11 ++++-------
 7 files changed, 26 insertions(+), 8 deletions(-)

(limited to 'kernel')

diff --git a/arch/i386/kernel/nmi.c b/arch/i386/kernel/nmi.c
index dbda706fdd14..0fc4997fb143 100644
--- a/arch/i386/kernel/nmi.c
+++ b/arch/i386/kernel/nmi.c
@@ -31,6 +31,9 @@
 
 #include "mach_traps.h"
 
+int unknown_nmi_panic;
+int nmi_watchdog_enabled;
+
 /* perfctr_nmi_owner tracks the ownership of the perfctr registers:
  * evtsel_nmi_owner tracks the ownership of the event selection
  * - different performance counters/ event selection may be reserved for
diff --git a/arch/i386/kernel/traps.c b/arch/i386/kernel/traps.c
index a13037fe0ee3..6820b8d643c7 100644
--- a/arch/i386/kernel/traps.c
+++ b/arch/i386/kernel/traps.c
@@ -57,6 +57,8 @@
 
 #include "mach_traps.h"
 
+int panic_on_unrecovered_nmi;
+
 asmlinkage int system_call(void);
 
 struct desc_struct default_ldt[] = { { 0, 0 }, { 0, 0 }, { 0, 0 },
diff --git a/arch/x86_64/kernel/nmi.c b/arch/x86_64/kernel/nmi.c
index 4d6fb047952e..7af9cb3e2d99 100644
--- a/arch/x86_64/kernel/nmi.c
+++ b/arch/x86_64/kernel/nmi.c
@@ -28,6 +28,10 @@
 #include <asm/mce.h>
 #include <asm/intel_arch_perfmon.h>
 
+int unknown_nmi_panic;
+int nmi_watchdog_enabled;
+int panic_on_unrecovered_nmi;
+
 /* perfctr_nmi_owner tracks the ownership of the perfctr registers:
  * evtsel_nmi_owner tracks the ownership of the event selection
  * - different performance counters/ event selection may be reserved for
diff --git a/include/asm-i386/nmi.h b/include/asm-i386/nmi.h
index 303bcd4592bb..269d315719ca 100644
--- a/include/asm-i386/nmi.h
+++ b/include/asm-i386/nmi.h
@@ -36,4 +36,10 @@ extern unsigned int nmi_watchdog;
 #define NMI_LOCAL_APIC	2
 #define NMI_INVALID	3
 
+struct ctl_table;
+struct file;
+extern int proc_nmi_enabled(struct ctl_table *, int , struct file *,
+			void __user *, size_t *, loff_t *);
+extern int unknown_nmi_panic;
+
 #endif /* ASM_NMI_H */
diff --git a/include/asm-x86_64/nmi.h b/include/asm-x86_64/nmi.h
index cbf2669bca71..f367d4014b42 100644
--- a/include/asm-x86_64/nmi.h
+++ b/include/asm-x86_64/nmi.h
@@ -70,4 +70,11 @@ extern unsigned int nmi_watchdog;
 #define NMI_LOCAL_APIC	2
 #define NMI_INVALID	3
 
+struct ctl_table;
+struct file;
+extern int proc_nmi_enabled(struct ctl_table *, int , struct file *,
+			void __user *, size_t *, loff_t *);
+
+extern int unknown_nmi_panic;
+
 #endif /* ASM_NMI_H */
diff --git a/kernel/panic.c b/kernel/panic.c
index 6ceb664fb52a..525e365f7239 100644
--- a/kernel/panic.c
+++ b/kernel/panic.c
@@ -21,7 +21,6 @@
 #include <linux/debug_locks.h>
 
 int panic_on_oops;
-int panic_on_unrecovered_nmi;
 int tainted;
 static int pause_on_oops;
 static int pause_on_oops_flag;
diff --git a/kernel/sysctl.c b/kernel/sysctl.c
index 9535a3839930..c57c4532e296 100644
--- a/kernel/sysctl.c
+++ b/kernel/sysctl.c
@@ -52,6 +52,10 @@
 extern int proc_nr_files(ctl_table *table, int write, struct file *filp,
                      void __user *buffer, size_t *lenp, loff_t *ppos);
 
+#ifdef CONFIG_X86
+#include <asm/nmi.h>
+#endif
+
 #if defined(CONFIG_SYSCTL)
 
 /* External variables not in a header file. */
@@ -74,13 +78,6 @@ extern int sysctl_drop_caches;
 extern int percpu_pagelist_fraction;
 extern int compat_log;
 
-#if defined(CONFIG_X86_LOCAL_APIC) && defined(CONFIG_X86)
-int unknown_nmi_panic;
-int nmi_watchdog_enabled;
-extern int proc_nmi_enabled(struct ctl_table *, int , struct file *,
-			void __user *, size_t *, loff_t *);
-#endif
-
 /* this is needed for the proc_dointvec_minmax for [fs_]overflow UID and GID */
 static int maxolduid = 65535;
 static int minolduid;
-- 
cgit v1.2.3


From 34596dc9e59d7bece16fe5aba08116b49465da26 Mon Sep 17 00:00:00 2001
From: Andi Kleen <ak@suse.de>
Date: Sat, 30 Sep 2006 01:47:55 +0200
Subject: [PATCH] Define vsyscall cache as blob to make clearer that user space
 shouldn't use it

Signed-off-by: Andi Kleen <ak@suse.de>
---
 arch/x86_64/kernel/vsyscall.c |  8 ++++----
 include/linux/getcpu.h        | 12 +++++++-----
 kernel/sys.c                  |  8 ++++----
 3 files changed, 15 insertions(+), 13 deletions(-)

(limited to 'kernel')

diff --git a/arch/x86_64/kernel/vsyscall.c b/arch/x86_64/kernel/vsyscall.c
index ac48c3857ddb..07c086382059 100644
--- a/arch/x86_64/kernel/vsyscall.c
+++ b/arch/x86_64/kernel/vsyscall.c
@@ -155,8 +155,8 @@ vgetcpu(unsigned *cpu, unsigned *node, struct getcpu_cache *tcache)
 	   We do this here because otherwise user space would do it on
 	   its own in a likely inferior way (no access to jiffies).
 	   If you don't like it pass NULL. */
-	if (tcache && tcache->t0 == (j = __jiffies)) {
-		p = tcache->t1;
+	if (tcache && tcache->blob[0] == (j = __jiffies)) {
+		p = tcache->blob[1];
 	} else if (__vgetcpu_mode == VGETCPU_RDTSCP) {
 		/* Load per CPU data from RDTSCP */
 		rdtscp(dummy, dummy, p);
@@ -165,8 +165,8 @@ vgetcpu(unsigned *cpu, unsigned *node, struct getcpu_cache *tcache)
 		asm("lsl %1,%0" : "=r" (p) : "r" (__PER_CPU_SEG));
 	}
 	if (tcache) {
-		tcache->t0 = j;
-		tcache->t1 = p;
+		tcache->blob[0] = j;
+		tcache->blob[1] = p;
 	}
 	if (cpu)
 		*cpu = p & 0xfff;
diff --git a/include/linux/getcpu.h b/include/linux/getcpu.h
index 031ed3780e45..c7372d7a97be 100644
--- a/include/linux/getcpu.h
+++ b/include/linux/getcpu.h
@@ -1,16 +1,18 @@
 #ifndef _LINUX_GETCPU_H
 #define _LINUX_GETCPU_H 1
 
-/* Cache for getcpu() to speed it up. Results might be upto a jiffie
+/* Cache for getcpu() to speed it up. Results might be a short time
    out of date, but will be faster.
+
    User programs should not refer to the contents of this structure.
-   It is only a cache for vgetcpu(). It might change in future kernels.
+   I repeat they should not refer to it. If they do they will break
+   in future kernels.
+
+   It is only a private cache for vgetcpu(). It will change in future kernels.
    The user program must store this information per thread (__thread)
    If you want 100% accurate information pass NULL instead. */
 struct getcpu_cache {
-	unsigned long t0;
-	unsigned long t1;
-	unsigned long res[4];
+	unsigned long blob[128 / sizeof(long)];
 };
 
 #endif
diff --git a/kernel/sys.c b/kernel/sys.c
index 8647061c084a..b88806c66244 100644
--- a/kernel/sys.c
+++ b/kernel/sys.c
@@ -2083,12 +2083,12 @@ asmlinkage long sys_getcpu(unsigned __user *cpup, unsigned __user *nodep,
 		 * padding
 		 */
 		unsigned long t0, t1;
-		get_user(t0, &cache->t0);
-		get_user(t1, &cache->t1);
+		get_user(t0, &cache->blob[0]);
+		get_user(t1, &cache->blob[1]);
 		t0++;
 		t1++;
-		put_user(t0, &cache->t0);
-		put_user(t1, &cache->t1);
+		put_user(t0, &cache->blob[0]);
+		put_user(t1, &cache->blob[1]);
 	}
 	return err ? -EFAULT : 0;
 }
-- 
cgit v1.2.3


From 0d67a46df0125e20d14f12dbd3646f1f1bf23e8c Mon Sep 17 00:00:00 2001
From: David Howells <dhowells@redhat.com>
Date: Tue, 29 Aug 2006 19:05:56 +0100
Subject: [PATCH] BLOCK: Remove duplicate declaration of exit_io_context() [try
 #6]

Remove the duplicate declaration of exit_io_context() from linux/sched.h.

Signed-Off-By: David Howells <dhowells@redhat.com>
Signed-off-by: Jens Axboe <axboe@kernel.dk>
---
 include/linux/sched.h | 1 -
 kernel/exit.c         | 1 +
 2 files changed, 1 insertion(+), 1 deletion(-)

(limited to 'kernel')

diff --git a/include/linux/sched.h b/include/linux/sched.h
index a06fc89cf6e5..fc4a9873ec10 100644
--- a/include/linux/sched.h
+++ b/include/linux/sched.h
@@ -710,7 +710,6 @@ extern unsigned int max_cache_size;
 
 
 struct io_context;			/* See blkdev.h */
-void exit_io_context(void);
 struct cpuset;
 
 #define NGROUPS_SMALL		32
diff --git a/kernel/exit.c b/kernel/exit.c
index 2e4c13cba95a..c189de2927ab 100644
--- a/kernel/exit.c
+++ b/kernel/exit.c
@@ -38,6 +38,7 @@
 #include <linux/pipe_fs_i.h>
 #include <linux/audit.h> /* for audit_free() */
 #include <linux/resource.h>
+#include <linux/blkdev.h>
 
 #include <asm/uaccess.h>
 #include <asm/unistd.h>
-- 
cgit v1.2.3


From 07f3f05c1e3052b8656129b2a5aca9f888241a34 Mon Sep 17 00:00:00 2001
From: David Howells <dhowells@redhat.com>
Date: Sat, 30 Sep 2006 20:52:18 +0200
Subject: [PATCH] BLOCK: Move extern declarations out of fs/*.c into header
 files [try #6]

Create a new header file, fs/internal.h, for common definitions local to the
sources in the fs/ directory.

Move extern definitions that should be in header files from fs/*.c to
fs/internal.h or other main header files where they span directories.

Signed-Off-By: David Howells <dhowells@redhat.com>
Signed-off-by: Jens Axboe <axboe@kernel.dk>
---
 arch/mips/kernel/signal_n32.c |  4 ++--
 fs/binfmt_elf.c               |  1 -
 fs/block_dev.c                |  1 +
 fs/char_dev.c                 |  1 +
 fs/compat.c                   | 10 +++-------
 fs/compat_ioctl.c             |  2 --
 fs/dcache.c                   |  4 +---
 fs/fs-writeback.c             |  3 +--
 fs/internal.h                 | 36 ++++++++++++++++++++++++++++++++++++
 fs/namespace.c                |  3 +--
 include/linux/ramfs.h         |  1 +
 include/linux/tty.h           |  3 +++
 kernel/compat.c               |  2 ++
 13 files changed, 52 insertions(+), 19 deletions(-)
 create mode 100644 fs/internal.h

(limited to 'kernel')

diff --git a/arch/mips/kernel/signal_n32.c b/arch/mips/kernel/signal_n32.c
index 477c5334ec1b..50c17eaa7f25 100644
--- a/arch/mips/kernel/signal_n32.c
+++ b/arch/mips/kernel/signal_n32.c
@@ -42,6 +42,8 @@
 
 #include "signal-common.h"
 
+extern void sigset_from_compat(sigset_t *set, compat_sigset_t *compat);
+
 /*
  * Including <asm/unistd.h> would give use the 64-bit syscall numbers ...
  */
@@ -81,8 +83,6 @@ struct rt_sigframe_n32 {
 #endif
 };
 
-extern void sigset_from_compat (sigset_t *set, compat_sigset_t *compat);
-
 save_static_function(sysn32_rt_sigsuspend);
 __attribute_used__ noinline static int
 _sysn32_rt_sigsuspend(nabi_no_regargs struct pt_regs regs)
diff --git a/fs/binfmt_elf.c b/fs/binfmt_elf.c
index 6eb48e1446ec..bad52433de69 100644
--- a/fs/binfmt_elf.c
+++ b/fs/binfmt_elf.c
@@ -46,7 +46,6 @@
 static int load_elf_binary(struct linux_binprm *bprm, struct pt_regs *regs);
 static int load_elf_library(struct file *);
 static unsigned long elf_map (struct file *, unsigned long, struct elf_phdr *, int, int);
-extern int dump_fpu (struct pt_regs *, elf_fpregset_t *);
 
 #ifndef elf_addr_t
 #define elf_addr_t unsigned long
diff --git a/fs/block_dev.c b/fs/block_dev.c
index 4346468139e8..20f7333ba372 100644
--- a/fs/block_dev.c
+++ b/fs/block_dev.c
@@ -22,6 +22,7 @@
 #include <linux/uio.h>
 #include <linux/namei.h>
 #include <asm/uaccess.h>
+#include "internal.h"
 
 struct bdev_inode {
 	struct block_device bdev;
diff --git a/fs/char_dev.c b/fs/char_dev.c
index 1f3285affa39..a885f46ca001 100644
--- a/fs/char_dev.c
+++ b/fs/char_dev.c
@@ -24,6 +24,7 @@
 #ifdef CONFIG_KMOD
 #include <linux/kmod.h>
 #endif
+#include "internal.h"
 
 /*
  * capabilities for /dev/mem, /dev/kmem and similar directly mappable character
diff --git a/fs/compat.c b/fs/compat.c
index ce982f6e8c80..122b4e3992b5 100644
--- a/fs/compat.c
+++ b/fs/compat.c
@@ -52,11 +52,12 @@
 #include <asm/uaccess.h>
 #include <asm/mmu_context.h>
 #include <asm/ioctls.h>
-
-extern void sigset_from_compat(sigset_t *set, compat_sigset_t *compat);
+#include "internal.h"
 
 int compat_log = 1;
 
+extern void sigset_from_compat(sigset_t *set, compat_sigset_t *compat);
+
 int compat_printk(const char *fmt, ...)
 {
 	va_list ap;
@@ -313,9 +314,6 @@ out:
 #define IOCTL_HASHSIZE 256
 static struct ioctl_trans *ioctl32_hash_table[IOCTL_HASHSIZE];
 
-extern struct ioctl_trans ioctl_start[];
-extern int ioctl_table_size;
-
 static inline unsigned long ioctl32_hash(unsigned long cmd)
 {
 	return (((cmd >> 6) ^ (cmd >> 4) ^ cmd)) % IOCTL_HASHSIZE;
@@ -838,8 +836,6 @@ static int do_nfs4_super_data_conv(void *raw_data)
 	return 0;
 }
 
-extern int copy_mount_options (const void __user *, unsigned long *);
-
 #define SMBFS_NAME      "smbfs"
 #define NCPFS_NAME      "ncpfs"
 #define NFS4_NAME	"nfs4"
diff --git a/fs/compat_ioctl.c b/fs/compat_ioctl.c
index 4063a9396977..ab74c9bd55fe 100644
--- a/fs/compat_ioctl.c
+++ b/fs/compat_ioctl.c
@@ -1279,8 +1279,6 @@ static int loop_status(unsigned int fd, unsigned int cmd, unsigned long arg)
 	return err;
 }
 
-extern int tty_ioctl(struct inode * inode, struct file * file, unsigned int cmd, unsigned long arg);
-
 #ifdef CONFIG_VT
 
 static int vt_check(struct file *file)
diff --git a/fs/dcache.c b/fs/dcache.c
index 17b392a2049e..fc2faa44f8d1 100644
--- a/fs/dcache.c
+++ b/fs/dcache.c
@@ -32,6 +32,7 @@
 #include <linux/seqlock.h>
 #include <linux/swap.h>
 #include <linux/bootmem.h>
+#include "internal.h"
 
 
 int sysctl_vfs_cache_pressure __read_mostly = 100;
@@ -1877,9 +1878,6 @@ kmem_cache_t *filp_cachep __read_mostly;
 
 EXPORT_SYMBOL(d_genocide);
 
-extern void bdev_cache_init(void);
-extern void chrdev_init(void);
-
 void __init vfs_caches_init_early(void)
 {
 	dcache_init_early();
diff --git a/fs/fs-writeback.c b/fs/fs-writeback.c
index 892643dc9af1..0639024d83a9 100644
--- a/fs/fs-writeback.c
+++ b/fs/fs-writeback.c
@@ -22,8 +22,7 @@
 #include <linux/blkdev.h>
 #include <linux/backing-dev.h>
 #include <linux/buffer_head.h>
-
-extern struct super_block *blockdev_superblock;
+#include "internal.h"
 
 /**
  *	__mark_inode_dirty -	internal function
diff --git a/fs/internal.h b/fs/internal.h
new file mode 100644
index 000000000000..c21ecd37b1e7
--- /dev/null
+++ b/fs/internal.h
@@ -0,0 +1,36 @@
+/* fs/ internal definitions
+ *
+ * Copyright (C) 2006 Red Hat, Inc. All Rights Reserved.
+ * Written by David Howells (dhowells@redhat.com)
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License
+ * as published by the Free Software Foundation; either version
+ * 2 of the License, or (at your option) any later version.
+ */
+
+#include <linux/ioctl32.h>
+
+/*
+ * block_dev.c
+ */
+extern struct super_block *blockdev_superblock;
+extern void __init bdev_cache_init(void);
+
+/*
+ * char_dev.c
+ */
+extern void __init chrdev_init(void);
+
+/*
+ * compat_ioctl.c
+ */
+#ifdef CONFIG_COMPAT
+extern struct ioctl_trans ioctl_start[];
+extern int ioctl_table_size;
+#endif
+
+/*
+ * namespace.c
+ */
+extern int copy_mount_options(const void __user *, unsigned long *);
diff --git a/fs/namespace.c b/fs/namespace.c
index 6ede3a539ed8..66d921e14fee 100644
--- a/fs/namespace.c
+++ b/fs/namespace.c
@@ -24,12 +24,11 @@
 #include <linux/namei.h>
 #include <linux/security.h>
 #include <linux/mount.h>
+#include <linux/ramfs.h>
 #include <asm/uaccess.h>
 #include <asm/unistd.h>
 #include "pnode.h"
 
-extern int __init init_rootfs(void);
-
 /* spinlock for vfsmount related operations, inplace of dcache_lock */
 __cacheline_aligned_in_smp DEFINE_SPINLOCK(vfsmount_lock);
 
diff --git a/include/linux/ramfs.h b/include/linux/ramfs.h
index 00b340ba6612..b160fb18e8d6 100644
--- a/include/linux/ramfs.h
+++ b/include/linux/ramfs.h
@@ -17,5 +17,6 @@ extern int ramfs_nommu_mmap(struct file *file, struct vm_area_struct *vma);
 
 extern const struct file_operations ramfs_file_operations;
 extern struct vm_operations_struct generic_file_vm_ops;
+extern int __init init_rootfs(void);
 
 #endif
diff --git a/include/linux/tty.h b/include/linux/tty.h
index ea4c2605f8da..44091c0db0b4 100644
--- a/include/linux/tty.h
+++ b/include/linux/tty.h
@@ -307,6 +307,9 @@ extern void tty_ldisc_put(int);
 extern void tty_wakeup(struct tty_struct *tty);
 extern void tty_ldisc_flush(struct tty_struct *tty);
 
+extern int tty_ioctl(struct inode *inode, struct file *file, unsigned int cmd,
+		     unsigned long arg);
+
 extern struct mutex tty_mutex;
 
 /* n_tty.c */
diff --git a/kernel/compat.c b/kernel/compat.c
index 75573e5d27b0..b4fbd838cd77 100644
--- a/kernel/compat.c
+++ b/kernel/compat.c
@@ -26,6 +26,8 @@
 
 #include <asm/uaccess.h>
 
+extern void sigset_from_compat(sigset_t *set, compat_sigset_t *compat);
+
 int get_compat_timespec(struct timespec *ts, const struct compat_timespec __user *cts)
 {
 	return (!access_ok(VERIFY_READ, cts, sizeof(*cts)) ||
-- 
cgit v1.2.3


From 9361401eb7619c033e2394e4f9f6d410d6719ac7 Mon Sep 17 00:00:00 2001
From: David Howells <dhowells@redhat.com>
Date: Sat, 30 Sep 2006 20:45:40 +0200
Subject: [PATCH] BLOCK: Make it possible to disable the block layer [try #6]

Make it possible to disable the block layer.  Not all embedded devices require
it, some can make do with just JFFS2, NFS, ramfs, etc - none of which require
the block layer to be present.

This patch does the following:

 (*) Introduces CONFIG_BLOCK to disable the block layer, buffering and blockdev
     support.

 (*) Adds dependencies on CONFIG_BLOCK to any configuration item that controls
     an item that uses the block layer.  This includes:

     (*) Block I/O tracing.

     (*) Disk partition code.

     (*) All filesystems that are block based, eg: Ext3, ReiserFS, ISOFS.

     (*) The SCSI layer.  As far as I can tell, even SCSI chardevs use the
     	 block layer to do scheduling.  Some drivers that use SCSI facilities -
     	 such as USB storage - end up disabled indirectly from this.

     (*) Various block-based device drivers, such as IDE and the old CDROM
     	 drivers.

     (*) MTD blockdev handling and FTL.

     (*) JFFS - which uses set_bdev_super(), something it could avoid doing by
     	 taking a leaf out of JFFS2's book.

 (*) Makes most of the contents of linux/blkdev.h, linux/buffer_head.h and
     linux/elevator.h contingent on CONFIG_BLOCK being set.  sector_div() is,
     however, still used in places, and so is still available.

 (*) Also made contingent are the contents of linux/mpage.h, linux/genhd.h and
     parts of linux/fs.h.

 (*) Makes a number of files in fs/ contingent on CONFIG_BLOCK.

 (*) Makes mm/bounce.c (bounce buffering) contingent on CONFIG_BLOCK.

 (*) set_page_dirty() doesn't call __set_page_dirty_buffers() if CONFIG_BLOCK
     is not enabled.

 (*) fs/no-block.c is created to hold out-of-line stubs and things that are
     required when CONFIG_BLOCK is not set:

     (*) Default blockdev file operations (to give error ENODEV on opening).

 (*) Makes some /proc changes:

     (*) /proc/devices does not list any blockdevs.

     (*) /proc/diskstats and /proc/partitions are contingent on CONFIG_BLOCK.

 (*) Makes some compat ioctl handling contingent on CONFIG_BLOCK.

 (*) If CONFIG_BLOCK is not defined, makes sys_quotactl() return -ENODEV if
     given command other than Q_SYNC or if a special device is specified.

 (*) In init/do_mounts.c, no reference is made to the blockdev routines if
     CONFIG_BLOCK is not defined.  This does not prohibit NFS roots or JFFS2.

 (*) The bdflush, ioprio_set and ioprio_get syscalls can now be absent (return
     error ENOSYS by way of cond_syscall if so).

 (*) The seclvl_bd_claim() and seclvl_bd_release() security calls do nothing if
     CONFIG_BLOCK is not set, since they can't then happen.

Signed-Off-By: David Howells <dhowells@redhat.com>
Signed-off-by: Jens Axboe <axboe@kernel.dk>
---
 block/Kconfig                | 20 ++++++++++++++++++
 block/Kconfig.iosched        |  3 +++
 block/Makefile               |  2 +-
 drivers/block/Kconfig        |  4 ++++
 drivers/cdrom/Kconfig        |  2 +-
 drivers/char/Kconfig         |  1 +
 drivers/char/random.c        |  4 ++++
 drivers/ide/Kconfig          |  4 ++++
 drivers/md/Kconfig           |  3 +++
 drivers/message/i2o/Kconfig  |  2 +-
 drivers/mmc/Kconfig          |  2 +-
 drivers/mmc/Makefile         |  3 ++-
 drivers/mtd/Kconfig          | 12 +++++------
 drivers/mtd/devices/Kconfig  |  2 +-
 drivers/s390/block/Kconfig   |  2 +-
 drivers/scsi/Kconfig         |  2 ++
 fs/Kconfig                   | 31 ++++++++++++++++++++-------
 fs/Makefile                  | 14 +++++++++----
 fs/compat_ioctl.c            | 18 ++++++++++++++++
 fs/internal.h                |  6 ++++++
 fs/no-block.c                | 22 +++++++++++++++++++
 fs/partitions/Makefile       |  2 +-
 fs/proc/proc_misc.c          | 11 +++++++++-
 fs/quota.c                   | 44 ++++++++++++++++++++++++++------------
 fs/super.c                   |  4 ++++
 fs/xfs/Kconfig               |  1 +
 include/linux/blkdev.h       | 50 +++++++++++++++++++++++++++++++-------------
 include/linux/buffer_head.h  | 16 ++++++++++++++
 include/linux/compat_ioctl.h |  2 ++
 include/linux/elevator.h     |  3 +++
 include/linux/fs.h           | 25 +++++++++++++++++++---
 include/linux/genhd.h        |  4 ++++
 include/linux/mpage.h        |  3 +++
 include/linux/raid/md.h      |  3 +++
 include/linux/raid/md_k.h    |  3 +++
 include/scsi/scsi_tcq.h      |  3 ++-
 init/Kconfig                 |  2 +-
 init/do_mounts.c             | 13 +++++++++++-
 kernel/sys_ni.c              |  5 +++++
 mm/Makefile                  |  2 +-
 mm/filemap.c                 |  4 ++++
 mm/migrate.c                 |  2 ++
 mm/page-writeback.c          |  8 ++++---
 mm/truncate.c                |  2 ++
 44 files changed, 308 insertions(+), 63 deletions(-)
 create mode 100644 fs/no-block.c

(limited to 'kernel')

diff --git a/block/Kconfig b/block/Kconfig
index b6f5f0a79655..9af6c614dfde 100644
--- a/block/Kconfig
+++ b/block/Kconfig
@@ -1,6 +1,24 @@
 #
 # Block layer core configuration
 #
+config BLOCK
+       bool "Enable the block layer"
+       default y
+       help
+	 This permits the block layer to be removed from the kernel if it's not
+	 needed (on some embedded devices for example).  If this option is
+	 disabled, then blockdev files will become unusable and some
+	 filesystems (such as ext3) will become unavailable.
+
+	 This option will also disable SCSI character devices and USB storage
+	 since they make use of various block layer definitions and
+	 facilities.
+
+	 Say Y here unless you know you really don't want to mount disks and
+	 suchlike.
+
+if BLOCK
+
 #XXX - it makes sense to enable this only for 32-bit subarch's, not for x86_64
 #for instance.
 config LBD
@@ -33,4 +51,6 @@ config LSF
 
 	  If unsure, say Y.
 
+endif
+
 source block/Kconfig.iosched
diff --git a/block/Kconfig.iosched b/block/Kconfig.iosched
index 48d090e266fc..903f0d3b6852 100644
--- a/block/Kconfig.iosched
+++ b/block/Kconfig.iosched
@@ -1,3 +1,4 @@
+if BLOCK
 
 menu "IO Schedulers"
 
@@ -67,3 +68,5 @@ config DEFAULT_IOSCHED
 	default "noop" if DEFAULT_NOOP
 
 endmenu
+
+endif
diff --git a/block/Makefile b/block/Makefile
index c05de0e0037f..4b84d0d5947b 100644
--- a/block/Makefile
+++ b/block/Makefile
@@ -2,7 +2,7 @@
 # Makefile for the kernel block layer
 #
 
-obj-y	:= elevator.o ll_rw_blk.o ioctl.o genhd.o scsi_ioctl.o
+obj-$(CONFIG_BLOCK) := elevator.o ll_rw_blk.o ioctl.o genhd.o scsi_ioctl.o
 
 obj-$(CONFIG_IOSCHED_NOOP)	+= noop-iosched.o
 obj-$(CONFIG_IOSCHED_AS)	+= as-iosched.o
diff --git a/drivers/block/Kconfig b/drivers/block/Kconfig
index b5382cedf0c0..422e31d5f8e5 100644
--- a/drivers/block/Kconfig
+++ b/drivers/block/Kconfig
@@ -2,6 +2,8 @@
 # Block device driver configuration
 #
 
+if BLOCK
+
 menu "Block devices"
 
 config BLK_DEV_FD
@@ -468,3 +470,5 @@ config ATA_OVER_ETH
 	devices like the Coraid EtherDrive (R) Storage Blade.
 
 endmenu
+
+endif
diff --git a/drivers/cdrom/Kconfig b/drivers/cdrom/Kconfig
index ff5652d40619..4b12e9031fb3 100644
--- a/drivers/cdrom/Kconfig
+++ b/drivers/cdrom/Kconfig
@@ -3,7 +3,7 @@
 #
 
 menu "Old CD-ROM drivers (not SCSI, not IDE)"
-	depends on ISA
+	depends on ISA && BLOCK
 
 config CD_NO_IDESCSI
 	bool "Support non-SCSI/IDE/ATAPI CDROM drives"
diff --git a/drivers/char/Kconfig b/drivers/char/Kconfig
index 4cc619edf424..bde1c665d9f4 100644
--- a/drivers/char/Kconfig
+++ b/drivers/char/Kconfig
@@ -1006,6 +1006,7 @@ config GPIO_VR41XX
 
 config RAW_DRIVER
 	tristate "RAW driver (/dev/raw/rawN) (OBSOLETE)"
+	depends on BLOCK
 	help
 	  The raw driver permits block devices to be bound to /dev/raw/rawN. 
 	  Once bound, I/O against /dev/raw/rawN uses efficient zero-copy I/O. 
diff --git a/drivers/char/random.c b/drivers/char/random.c
index 4c3a5ca9d8f7..b430a12eb819 100644
--- a/drivers/char/random.c
+++ b/drivers/char/random.c
@@ -655,6 +655,7 @@ void add_interrupt_randomness(int irq)
 	add_timer_randomness(irq_timer_state[irq], 0x100 + irq);
 }
 
+#ifdef CONFIG_BLOCK
 void add_disk_randomness(struct gendisk *disk)
 {
 	if (!disk || !disk->random)
@@ -667,6 +668,7 @@ void add_disk_randomness(struct gendisk *disk)
 }
 
 EXPORT_SYMBOL(add_disk_randomness);
+#endif
 
 #define EXTRACT_SIZE 10
 
@@ -918,6 +920,7 @@ void rand_initialize_irq(int irq)
 	}
 }
 
+#ifdef CONFIG_BLOCK
 void rand_initialize_disk(struct gendisk *disk)
 {
 	struct timer_rand_state *state;
@@ -932,6 +935,7 @@ void rand_initialize_disk(struct gendisk *disk)
 		disk->random = state;
 	}
 }
+#endif
 
 static ssize_t
 random_read(struct file * file, char __user * buf, size_t nbytes, loff_t *ppos)
diff --git a/drivers/ide/Kconfig b/drivers/ide/Kconfig
index b6fb167e20f6..69d627bd537a 100644
--- a/drivers/ide/Kconfig
+++ b/drivers/ide/Kconfig
@@ -4,6 +4,8 @@
 # Andre Hedrick <andre@linux-ide.org>
 #
 
+if BLOCK
+
 menu "ATA/ATAPI/MFM/RLL support"
 
 config IDE
@@ -1082,3 +1084,5 @@ config BLK_DEV_HD
 endif
 
 endmenu
+
+endif
diff --git a/drivers/md/Kconfig b/drivers/md/Kconfig
index bf869ed03eed..6dd31a291d84 100644
--- a/drivers/md/Kconfig
+++ b/drivers/md/Kconfig
@@ -2,6 +2,8 @@
 # Block device driver configuration
 #
 
+if BLOCK
+
 menu "Multi-device support (RAID and LVM)"
 
 config MD
@@ -251,3 +253,4 @@ config DM_MULTIPATH_EMC
 
 endmenu
 
+endif
diff --git a/drivers/message/i2o/Kconfig b/drivers/message/i2o/Kconfig
index fef677103880..6443392bffff 100644
--- a/drivers/message/i2o/Kconfig
+++ b/drivers/message/i2o/Kconfig
@@ -88,7 +88,7 @@ config I2O_BUS
 
 config I2O_BLOCK
 	tristate "I2O Block OSM"
-	depends on I2O
+	depends on I2O && BLOCK
 	---help---
 	  Include support for the I2O Block OSM. The Block OSM presents disk
 	  and other structured block devices to the operating system. If you
diff --git a/drivers/mmc/Kconfig b/drivers/mmc/Kconfig
index 45bcf098e762..f540bd88dc5a 100644
--- a/drivers/mmc/Kconfig
+++ b/drivers/mmc/Kconfig
@@ -21,7 +21,7 @@ config MMC_DEBUG
 
 config MMC_BLOCK
 	tristate "MMC block device driver"
-	depends on MMC
+	depends on MMC && BLOCK
 	default y
 	help
 	  Say Y here to enable the MMC block device driver support.
diff --git a/drivers/mmc/Makefile b/drivers/mmc/Makefile
index d2957e35cc6f..b1f6e03e7aa9 100644
--- a/drivers/mmc/Makefile
+++ b/drivers/mmc/Makefile
@@ -24,7 +24,8 @@ obj-$(CONFIG_MMC_AU1X)		+= au1xmmc.o
 obj-$(CONFIG_MMC_OMAP)		+= omap.o
 obj-$(CONFIG_MMC_AT91RM9200)	+= at91_mci.o
 
-mmc_core-y := mmc.o mmc_queue.o mmc_sysfs.o
+mmc_core-y := mmc.o mmc_sysfs.o
+mmc_core-$(CONFIG_BLOCK) += mmc_queue.o
 
 ifeq ($(CONFIG_MMC_DEBUG),y)
 EXTRA_CFLAGS += -DDEBUG
diff --git a/drivers/mtd/Kconfig b/drivers/mtd/Kconfig
index a03e862851db..a304b34c2632 100644
--- a/drivers/mtd/Kconfig
+++ b/drivers/mtd/Kconfig
@@ -166,7 +166,7 @@ config MTD_CHAR
 
 config MTD_BLOCK
 	tristate "Caching block device access to MTD devices"
-	depends on MTD
+	depends on MTD && BLOCK
 	---help---
 	  Although most flash chips have an erase size too large to be useful
 	  as block devices, it is possible to use MTD devices which are based
@@ -188,7 +188,7 @@ config MTD_BLOCK
 
 config MTD_BLOCK_RO
 	tristate "Readonly block device access to MTD devices"
-	depends on MTD_BLOCK!=y && MTD
+	depends on MTD_BLOCK!=y && MTD && BLOCK
 	help
 	  This allows you to mount read-only file systems (such as cramfs)
 	  from an MTD device, without the overhead (and danger) of the caching
@@ -199,7 +199,7 @@ config MTD_BLOCK_RO
 
 config FTL
 	tristate "FTL (Flash Translation Layer) support"
-	depends on MTD
+	depends on MTD && BLOCK
 	---help---
 	  This provides support for the original Flash Translation Layer which
 	  is part of the PCMCIA specification. It uses a kind of pseudo-
@@ -215,7 +215,7 @@ config FTL
 
 config NFTL
 	tristate "NFTL (NAND Flash Translation Layer) support"
-	depends on MTD
+	depends on MTD && BLOCK
 	---help---
 	  This provides support for the NAND Flash Translation Layer which is
 	  used on M-Systems' DiskOnChip devices. It uses a kind of pseudo-
@@ -238,7 +238,7 @@ config NFTL_RW
 
 config INFTL
 	tristate "INFTL (Inverse NAND Flash Translation Layer) support"
-	depends on MTD
+	depends on MTD && BLOCK
 	---help---
 	  This provides support for the Inverse NAND Flash Translation
 	  Layer which is used on M-Systems' newer DiskOnChip devices. It
@@ -255,7 +255,7 @@ config INFTL
 
 config RFD_FTL
         tristate "Resident Flash Disk (Flash Translation Layer) support"
-	depends on MTD
+	depends on MTD && BLOCK
 	---help---
 	  This provides support for the flash translation layer known
 	  as the Resident Flash Disk (RFD), as used by the Embedded BIOS
diff --git a/drivers/mtd/devices/Kconfig b/drivers/mtd/devices/Kconfig
index 16c02b5ccf7e..440f6851da69 100644
--- a/drivers/mtd/devices/Kconfig
+++ b/drivers/mtd/devices/Kconfig
@@ -136,7 +136,7 @@ config MTDRAM_ABS_POS
 
 config MTD_BLOCK2MTD
 	tristate "MTD using block device"
-	depends on MTD
+	depends on MTD && BLOCK
 	help
 	  This driver allows a block device to appear as an MTD. It would
 	  generally be used in the following cases:
diff --git a/drivers/s390/block/Kconfig b/drivers/s390/block/Kconfig
index 929d6fff6152..b250c5354503 100644
--- a/drivers/s390/block/Kconfig
+++ b/drivers/s390/block/Kconfig
@@ -1,4 +1,4 @@
-if S390
+if S390 && BLOCK
 
 comment "S/390 block device drivers"
 	depends on S390
diff --git a/drivers/scsi/Kconfig b/drivers/scsi/Kconfig
index c4dfcc91ddda..dab082002e6f 100644
--- a/drivers/scsi/Kconfig
+++ b/drivers/scsi/Kconfig
@@ -3,11 +3,13 @@ menu "SCSI device support"
 config RAID_ATTRS
 	tristate "RAID Transport Class"
 	default n
+	depends on BLOCK
 	---help---
 	  Provides RAID
 
 config SCSI
 	tristate "SCSI device support"
+	depends on BLOCK
 	---help---
 	  If you want to use a SCSI hard disk, SCSI tape drive, SCSI CD-ROM or
 	  any other SCSI device under Linux, say Y and make sure that you know
diff --git a/fs/Kconfig b/fs/Kconfig
index 4fd9efac29ab..1453d2d164f7 100644
--- a/fs/Kconfig
+++ b/fs/Kconfig
@@ -4,6 +4,8 @@
 
 menu "File systems"
 
+if BLOCK
+
 config EXT2_FS
 	tristate "Second extended fs support"
 	help
@@ -399,6 +401,8 @@ config ROMFS_FS
 	  If you don't know whether you need it, then you don't need it:
 	  answer N.
 
+endif
+
 config INOTIFY
 	bool "Inotify file change notification support"
 	default y
@@ -530,6 +534,7 @@ config FUSE_FS
 	  If you want to develop a userspace FS, or if you want to use
 	  a filesystem based on FUSE, answer Y or M.
 
+if BLOCK
 menu "CD-ROM/DVD Filesystems"
 
 config ISO9660_FS
@@ -597,7 +602,9 @@ config UDF_NLS
 	depends on (UDF_FS=m && NLS) || (UDF_FS=y && NLS=y)
 
 endmenu
+endif
 
+if BLOCK
 menu "DOS/FAT/NT Filesystems"
 
 config FAT_FS
@@ -782,6 +789,7 @@ config NTFS_RW
 	  It is perfectly safe to say N here.
 
 endmenu
+endif
 
 menu "Pseudo filesystems"
 
@@ -939,7 +947,7 @@ menu "Miscellaneous filesystems"
 
 config ADFS_FS
 	tristate "ADFS file system support (EXPERIMENTAL)"
-	depends on EXPERIMENTAL
+	depends on BLOCK && EXPERIMENTAL
 	help
 	  The Acorn Disc Filing System is the standard file system of the
 	  RiscOS operating system which runs on Acorn's ARM-based Risc PC
@@ -967,7 +975,7 @@ config ADFS_FS_RW
 
 config AFFS_FS
 	tristate "Amiga FFS file system support (EXPERIMENTAL)"
-	depends on EXPERIMENTAL
+	depends on BLOCK && EXPERIMENTAL
 	help
 	  The Fast File System (FFS) is the common file system used on hard
 	  disks by Amiga(tm) systems since AmigaOS Version 1.3 (34.20).  Say Y
@@ -989,7 +997,7 @@ config AFFS_FS
 
 config HFS_FS
 	tristate "Apple Macintosh file system support (EXPERIMENTAL)"
-	depends on EXPERIMENTAL
+	depends on BLOCK && EXPERIMENTAL
 	select NLS
 	help
 	  If you say Y here, you will be able to mount Macintosh-formatted
@@ -1002,6 +1010,7 @@ config HFS_FS
 
 config HFSPLUS_FS
 	tristate "Apple Extended HFS file system support"
+	depends on BLOCK
 	select NLS
 	select NLS_UTF8
 	help
@@ -1015,7 +1024,7 @@ config HFSPLUS_FS
 
 config BEFS_FS
 	tristate "BeOS file system (BeFS) support (read only) (EXPERIMENTAL)"
-	depends on EXPERIMENTAL
+	depends on BLOCK && EXPERIMENTAL
 	select NLS
 	help
 	  The BeOS File System (BeFS) is the native file system of Be, Inc's
@@ -1042,7 +1051,7 @@ config BEFS_DEBUG
 
 config BFS_FS
 	tristate "BFS file system support (EXPERIMENTAL)"
-	depends on EXPERIMENTAL
+	depends on BLOCK && EXPERIMENTAL
 	help
 	  Boot File System (BFS) is a file system used under SCO UnixWare to
 	  allow the bootloader access to the kernel image and other important
@@ -1064,7 +1073,7 @@ config BFS_FS
 
 config EFS_FS
 	tristate "EFS file system support (read only) (EXPERIMENTAL)"
-	depends on EXPERIMENTAL
+	depends on BLOCK && EXPERIMENTAL
 	help
 	  EFS is an older file system used for non-ISO9660 CD-ROMs and hard
 	  disk partitions by SGI's IRIX operating system (IRIX 6.0 and newer
@@ -1079,7 +1088,7 @@ config EFS_FS
 
 config JFFS_FS
 	tristate "Journalling Flash File System (JFFS) support"
-	depends on MTD
+	depends on MTD && BLOCK
 	help
 	  JFFS is the Journaling Flash File System developed by Axis
 	  Communications in Sweden, aimed at providing a crash/powerdown-safe
@@ -1264,6 +1273,7 @@ endchoice
 
 config CRAMFS
 	tristate "Compressed ROM file system support (cramfs)"
+	depends on BLOCK
 	select ZLIB_INFLATE
 	help
 	  Saying Y here includes support for CramFs (Compressed ROM File
@@ -1283,6 +1293,7 @@ config CRAMFS
 
 config VXFS_FS
 	tristate "FreeVxFS file system support (VERITAS VxFS(TM) compatible)"
+	depends on BLOCK
 	help
 	  FreeVxFS is a file system driver that support the VERITAS VxFS(TM)
 	  file system format.  VERITAS VxFS(TM) is the standard file system
@@ -1300,6 +1311,7 @@ config VXFS_FS
 
 config HPFS_FS
 	tristate "OS/2 HPFS file system support"
+	depends on BLOCK
 	help
 	  OS/2 is IBM's operating system for PC's, the same as Warp, and HPFS
 	  is the file system used for organizing files on OS/2 hard disk
@@ -1316,6 +1328,7 @@ config HPFS_FS
 
 config QNX4FS_FS
 	tristate "QNX4 file system support (read only)"
+	depends on BLOCK
 	help
 	  This is the file system used by the real-time operating systems
 	  QNX 4 and QNX 6 (the latter is also called QNX RTP).
@@ -1343,6 +1356,7 @@ config QNX4FS_RW
 
 config SYSV_FS
 	tristate "System V/Xenix/V7/Coherent file system support"
+	depends on BLOCK
 	help
 	  SCO, Xenix and Coherent are commercial Unix systems for Intel
 	  machines, and Version 7 was used on the DEC PDP-11. Saying Y
@@ -1381,6 +1395,7 @@ config SYSV_FS
 
 config UFS_FS
 	tristate "UFS file system support (read only)"
+	depends on BLOCK
 	help
 	  BSD and derivate versions of Unix (such as SunOS, FreeBSD, NetBSD,
 	  OpenBSD and NeXTstep) use a file system called UFS. Some System V
@@ -1959,11 +1974,13 @@ config GENERIC_ACL
 
 endmenu
 
+if BLOCK
 menu "Partition Types"
 
 source "fs/partitions/Kconfig"
 
 endmenu
+endif
 
 source "fs/nls/Kconfig"
 
diff --git a/fs/Makefile b/fs/Makefile
index 46b8cfe497b2..a503e6ce0f32 100644
--- a/fs/Makefile
+++ b/fs/Makefile
@@ -5,12 +5,18 @@
 # Rewritten to use lists instead of if-statements.
 # 
 
-obj-y :=	open.o read_write.o file_table.o buffer.o  bio.o super.o \
-		block_dev.o char_dev.o stat.o exec.o pipe.o namei.o fcntl.o \
+obj-y :=	open.o read_write.o file_table.o super.o \
+		char_dev.o stat.o exec.o pipe.o namei.o fcntl.o \
 		ioctl.o readdir.o select.o fifo.o locks.o dcache.o inode.o \
 		attr.o bad_inode.o file.o filesystems.o namespace.o aio.o \
-		seq_file.o xattr.o libfs.o fs-writeback.o mpage.o direct-io.o \
-		ioprio.o pnode.o drop_caches.o splice.o sync.o
+		seq_file.o xattr.o libfs.o fs-writeback.o \
+		pnode.o drop_caches.o splice.o sync.o
+
+ifeq ($(CONFIG_BLOCK),y)
+obj-y +=	buffer.o bio.o block_dev.o direct-io.o mpage.o ioprio.o
+else
+obj-y +=	no-block.o
+endif
 
 obj-$(CONFIG_INOTIFY)		+= inotify.o
 obj-$(CONFIG_INOTIFY_USER)	+= inotify_user.o
diff --git a/fs/compat_ioctl.c b/fs/compat_ioctl.c
index e1a56437040a..64b34533edea 100644
--- a/fs/compat_ioctl.c
+++ b/fs/compat_ioctl.c
@@ -645,6 +645,7 @@ out:
 }
 #endif
 
+#ifdef CONFIG_BLOCK
 struct hd_geometry32 {
 	unsigned char heads;
 	unsigned char sectors;
@@ -869,6 +870,7 @@ static int sg_grt_trans(unsigned int fd, unsigned int cmd, unsigned long arg)
 	}
 	return err;
 }
+#endif /* CONFIG_BLOCK */
 
 struct sock_fprog32 {
 	unsigned short	len;
@@ -992,6 +994,7 @@ static int ppp_ioctl_trans(unsigned int fd, unsigned int cmd, unsigned long arg)
 }
 
 
+#ifdef CONFIG_BLOCK
 struct mtget32 {
 	compat_long_t	mt_type;
 	compat_long_t	mt_resid;
@@ -1164,6 +1167,7 @@ static int cdrom_ioctl_trans(unsigned int fd, unsigned int cmd, unsigned long ar
 
 	return err;
 }
+#endif /* CONFIG_BLOCK */
 
 #ifdef CONFIG_VT
 
@@ -1491,6 +1495,7 @@ ret_einval(unsigned int fd, unsigned int cmd, unsigned long arg)
 	return -EINVAL;
 }
 
+#ifdef CONFIG_BLOCK
 static int broken_blkgetsize(unsigned int fd, unsigned int cmd, unsigned long arg)
 {
 	/* The mkswap binary hard codes it to Intel value :-((( */
@@ -1525,12 +1530,14 @@ static int blkpg_ioctl_trans(unsigned int fd, unsigned int cmd, unsigned long ar
 
 	return sys_ioctl(fd, cmd, (unsigned long)a);
 }
+#endif
 
 static int ioc_settimeout(unsigned int fd, unsigned int cmd, unsigned long arg)
 {
 	return rw_long(fd, AUTOFS_IOC_SETTIMEOUT, arg);
 }
 
+#ifdef CONFIG_BLOCK
 /* Fix sizeof(sizeof()) breakage */
 #define BLKBSZGET_32   _IOR(0x12,112,int)
 #define BLKBSZSET_32   _IOW(0x12,113,int)
@@ -1551,6 +1558,7 @@ static int do_blkgetsize64(unsigned int fd, unsigned int cmd,
 {
        return sys_ioctl(fd, BLKGETSIZE64, (unsigned long)compat_ptr(arg));
 }
+#endif
 
 /* Bluetooth ioctls */
 #define HCIUARTSETPROTO	_IOW('U', 200, int)
@@ -1571,6 +1579,7 @@ static int do_blkgetsize64(unsigned int fd, unsigned int cmd,
 #define HIDPGETCONNLIST	_IOR('H', 210, int)
 #define HIDPGETCONNINFO	_IOR('H', 211, int)
 
+#ifdef CONFIG_BLOCK
 struct floppy_struct32 {
 	compat_uint_t	size;
 	compat_uint_t	sect;
@@ -1895,6 +1904,7 @@ out:
 	kfree(karg);
 	return err;
 }
+#endif
 
 struct mtd_oob_buf32 {
 	u_int32_t start;
@@ -1936,6 +1946,7 @@ static int mtd_rw_oob(unsigned int fd, unsigned int cmd, unsigned long arg)
 	return err;
 }	
 
+#ifdef CONFIG_BLOCK
 struct raw32_config_request
 {
         compat_int_t    raw_minor;
@@ -2000,6 +2011,7 @@ static int raw_ioctl(unsigned fd, unsigned cmd, unsigned long arg)
         }
         return ret;
 }
+#endif /* CONFIG_BLOCK */
 
 struct serial_struct32 {
         compat_int_t    type;
@@ -2606,6 +2618,7 @@ HANDLE_IOCTL(SIOCBRDELIF, dev_ifsioc)
 HANDLE_IOCTL(SIOCRTMSG, ret_einval)
 HANDLE_IOCTL(SIOCGSTAMP, do_siocgstamp)
 #endif
+#ifdef CONFIG_BLOCK
 HANDLE_IOCTL(HDIO_GETGEO, hdio_getgeo)
 HANDLE_IOCTL(BLKRAGET, w_long)
 HANDLE_IOCTL(BLKGETSIZE, w_long)
@@ -2631,14 +2644,17 @@ HANDLE_IOCTL(FDGETFDCSTAT32, fd_ioctl_trans)
 HANDLE_IOCTL(FDWERRORGET32, fd_ioctl_trans)
 HANDLE_IOCTL(SG_IO,sg_ioctl_trans)
 HANDLE_IOCTL(SG_GET_REQUEST_TABLE, sg_grt_trans)
+#endif
 HANDLE_IOCTL(PPPIOCGIDLE32, ppp_ioctl_trans)
 HANDLE_IOCTL(PPPIOCSCOMPRESS32, ppp_ioctl_trans)
 HANDLE_IOCTL(PPPIOCSPASS32, ppp_sock_fprog_ioctl_trans)
 HANDLE_IOCTL(PPPIOCSACTIVE32, ppp_sock_fprog_ioctl_trans)
+#ifdef CONFIG_BLOCK
 HANDLE_IOCTL(MTIOCGET32, mt_ioctl_trans)
 HANDLE_IOCTL(MTIOCPOS32, mt_ioctl_trans)
 HANDLE_IOCTL(CDROMREADAUDIO, cdrom_ioctl_trans)
 HANDLE_IOCTL(CDROM_SEND_PACKET, cdrom_ioctl_trans)
+#endif
 #define AUTOFS_IOC_SETTIMEOUT32 _IOWR(0x93,0x64,unsigned int)
 HANDLE_IOCTL(AUTOFS_IOC_SETTIMEOUT32, ioc_settimeout)
 #ifdef CONFIG_VT
@@ -2677,12 +2693,14 @@ HANDLE_IOCTL(SONET_SETFRAMING, do_atm_ioctl)
 HANDLE_IOCTL(SONET_GETFRAMING, do_atm_ioctl)
 HANDLE_IOCTL(SONET_GETFRSENSE, do_atm_ioctl)
 /* block stuff */
+#ifdef CONFIG_BLOCK
 HANDLE_IOCTL(BLKBSZGET_32, do_blkbszget)
 HANDLE_IOCTL(BLKBSZSET_32, do_blkbszset)
 HANDLE_IOCTL(BLKGETSIZE64_32, do_blkgetsize64)
 /* Raw devices */
 HANDLE_IOCTL(RAW_SETBIND, raw_ioctl)
 HANDLE_IOCTL(RAW_GETBIND, raw_ioctl)
+#endif
 /* Serial */
 HANDLE_IOCTL(TIOCGSERIAL, serial_struct_ioctl)
 HANDLE_IOCTL(TIOCSSERIAL, serial_struct_ioctl)
diff --git a/fs/internal.h b/fs/internal.h
index f662b703bb97..f07147d63255 100644
--- a/fs/internal.h
+++ b/fs/internal.h
@@ -14,10 +14,16 @@
 /*
  * block_dev.c
  */
+#ifdef CONFIG_BLOCK
 extern struct super_block *blockdev_superblock;
 extern void __init bdev_cache_init(void);
 
 #define sb_is_blkdev_sb(sb) ((sb) == blockdev_superblock)
+#else
+static inline void bdev_cache_init(void) {}
+
+#define sb_is_blkdev_sb(sb) 0
+#endif
 
 /*
  * char_dev.c
diff --git a/fs/no-block.c b/fs/no-block.c
new file mode 100644
index 000000000000..d269a93d3467
--- /dev/null
+++ b/fs/no-block.c
@@ -0,0 +1,22 @@
+/* no-block.c: implementation of routines required for non-BLOCK configuration
+ *
+ * Copyright (C) 2006 Red Hat, Inc. All Rights Reserved.
+ * Written by David Howells (dhowells@redhat.com)
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License
+ * as published by the Free Software Foundation; either version
+ * 2 of the License, or (at your option) any later version.
+ */
+
+#include <linux/kernel.h>
+#include <linux/fs.h>
+
+static int no_blkdev_open(struct inode * inode, struct file * filp)
+{
+	return -ENODEV;
+}
+
+const struct file_operations def_blk_fops = {
+	.open		= no_blkdev_open,
+};
diff --git a/fs/partitions/Makefile b/fs/partitions/Makefile
index d713ce6b3e12..67e665fdb7fc 100644
--- a/fs/partitions/Makefile
+++ b/fs/partitions/Makefile
@@ -2,7 +2,7 @@
 # Makefile for the linux kernel.
 #
 
-obj-y := check.o
+obj-$(CONFIG_BLOCK) := check.o
 
 obj-$(CONFIG_ACORN_PARTITION) += acorn.o
 obj-$(CONFIG_AMIGA_PARTITION) += amiga.o
diff --git a/fs/proc/proc_misc.c b/fs/proc/proc_misc.c
index 5bbd60896050..66bc425f2f3d 100644
--- a/fs/proc/proc_misc.c
+++ b/fs/proc/proc_misc.c
@@ -277,12 +277,15 @@ static int devinfo_show(struct seq_file *f, void *v)
 		if (i == 0)
 			seq_printf(f, "Character devices:\n");
 		chrdev_show(f, i);
-	} else {
+	}
+#ifdef CONFIG_BLOCK
+	else {
 		i -= CHRDEV_MAJOR_HASH_SIZE;
 		if (i == 0)
 			seq_printf(f, "\nBlock devices:\n");
 		blkdev_show(f, i);
 	}
+#endif
 	return 0;
 }
 
@@ -355,6 +358,7 @@ static int stram_read_proc(char *page, char **start, off_t off,
 }
 #endif
 
+#ifdef CONFIG_BLOCK
 extern struct seq_operations partitions_op;
 static int partitions_open(struct inode *inode, struct file *file)
 {
@@ -378,6 +382,7 @@ static struct file_operations proc_diskstats_operations = {
 	.llseek		= seq_lseek,
 	.release	= seq_release,
 };
+#endif
 
 #ifdef CONFIG_MODULES
 extern struct seq_operations modules_op;
@@ -695,7 +700,9 @@ void __init proc_misc_init(void)
 		entry->proc_fops = &proc_kmsg_operations;
 	create_seq_entry("devices", 0, &proc_devinfo_operations);
 	create_seq_entry("cpuinfo", 0, &proc_cpuinfo_operations);
+#ifdef CONFIG_BLOCK
 	create_seq_entry("partitions", 0, &proc_partitions_operations);
+#endif
 	create_seq_entry("stat", 0, &proc_stat_operations);
 	create_seq_entry("interrupts", 0, &proc_interrupts_operations);
 #ifdef CONFIG_SLAB
@@ -707,7 +714,9 @@ void __init proc_misc_init(void)
 	create_seq_entry("buddyinfo",S_IRUGO, &fragmentation_file_operations);
 	create_seq_entry("vmstat",S_IRUGO, &proc_vmstat_file_operations);
 	create_seq_entry("zoneinfo",S_IRUGO, &proc_zoneinfo_file_operations);
+#ifdef CONFIG_BLOCK
 	create_seq_entry("diskstats", 0, &proc_diskstats_operations);
+#endif
 #ifdef CONFIG_MODULES
 	create_seq_entry("modules", 0, &proc_modules_operations);
 #endif
diff --git a/fs/quota.c b/fs/quota.c
index d6a2be826e29..b9dae76a0b6e 100644
--- a/fs/quota.c
+++ b/fs/quota.c
@@ -337,6 +337,34 @@ static int do_quotactl(struct super_block *sb, int type, int cmd, qid_t id, void
 	return 0;
 }
 
+/*
+ * look up a superblock on which quota ops will be performed
+ * - use the name of a block device to find the superblock thereon
+ */
+static inline struct super_block *quotactl_block(const char __user *special)
+{
+#ifdef CONFIG_BLOCK
+	struct block_device *bdev;
+	struct super_block *sb;
+	char *tmp = getname(special);
+
+	if (IS_ERR(tmp))
+		return ERR_PTR(PTR_ERR(tmp));
+	bdev = lookup_bdev(tmp);
+	putname(tmp);
+	if (IS_ERR(bdev))
+		return ERR_PTR(PTR_ERR(bdev));
+	sb = get_super(bdev);
+	bdput(bdev);
+	if (!sb)
+		return ERR_PTR(-ENODEV);
+
+	return sb;
+#else
+	return ERR_PTR(-ENODEV);
+#endif
+}
+
 /*
  * This is the system call interface. This communicates with
  * the user-level programs. Currently this only supports diskquota
@@ -347,25 +375,15 @@ asmlinkage long sys_quotactl(unsigned int cmd, const char __user *special, qid_t
 {
 	uint cmds, type;
 	struct super_block *sb = NULL;
-	struct block_device *bdev;
-	char *tmp;
 	int ret;
 
 	cmds = cmd >> SUBCMDSHIFT;
 	type = cmd & SUBCMDMASK;
 
 	if (cmds != Q_SYNC || special) {
-		tmp = getname(special);
-		if (IS_ERR(tmp))
-			return PTR_ERR(tmp);
-		bdev = lookup_bdev(tmp);
-		putname(tmp);
-		if (IS_ERR(bdev))
-			return PTR_ERR(bdev);
-		sb = get_super(bdev);
-		bdput(bdev);
-		if (!sb)
-			return -ENODEV;
+		sb = quotactl_block(special);
+		if (IS_ERR(sb))
+			return PTR_ERR(sb);
 	}
 
 	ret = check_quotactl_valid(sb, type, cmds, id);
diff --git a/fs/super.c b/fs/super.c
index 15671cd048b1..aec99ddbe53f 100644
--- a/fs/super.c
+++ b/fs/super.c
@@ -571,8 +571,10 @@ int do_remount_sb(struct super_block *sb, int flags, void *data, int force)
 {
 	int retval;
 	
+#ifdef CONFIG_BLOCK
 	if (!(flags & MS_RDONLY) && bdev_read_only(sb->s_bdev))
 		return -EACCES;
+#endif
 	if (flags & MS_RDONLY)
 		acct_auto_close(sb);
 	shrink_dcache_sb(sb);
@@ -692,6 +694,7 @@ void kill_litter_super(struct super_block *sb)
 
 EXPORT_SYMBOL(kill_litter_super);
 
+#ifdef CONFIG_BLOCK
 static int set_bdev_super(struct super_block *s, void *data)
 {
 	s->s_bdev = data;
@@ -787,6 +790,7 @@ void kill_block_super(struct super_block *sb)
 }
 
 EXPORT_SYMBOL(kill_block_super);
+#endif
 
 int get_sb_nodev(struct file_system_type *fs_type,
 	int flags, void *data,
diff --git a/fs/xfs/Kconfig b/fs/xfs/Kconfig
index 26b364c9d62c..35115bca036e 100644
--- a/fs/xfs/Kconfig
+++ b/fs/xfs/Kconfig
@@ -1,5 +1,6 @@
 config XFS_FS
 	tristate "XFS filesystem support"
+	depends on BLOCK
 	help
 	  XFS is a high performance journaling filesystem which originated
 	  on the SGI IRIX platform.  It is completely multi-threaded, can
diff --git a/include/linux/blkdev.h b/include/linux/blkdev.h
index 2c01a90998a7..3e36107d342a 100644
--- a/include/linux/blkdev.h
+++ b/include/linux/blkdev.h
@@ -16,6 +16,22 @@
 
 #include <asm/scatterlist.h>
 
+#ifdef CONFIG_LBD
+# include <asm/div64.h>
+# define sector_div(a, b) do_div(a, b)
+#else
+# define sector_div(n, b)( \
+{ \
+	int _res; \
+	_res = (n) % (b); \
+	(n) /= (b); \
+	_res; \
+} \
+)
+#endif
+
+#ifdef CONFIG_BLOCK
+
 struct scsi_ioctl_command;
 
 struct request_queue;
@@ -818,24 +834,30 @@ struct work_struct;
 int kblockd_schedule_work(struct work_struct *work);
 void kblockd_flush(void);
 
-#ifdef CONFIG_LBD
-# include <asm/div64.h>
-# define sector_div(a, b) do_div(a, b)
-#else
-# define sector_div(n, b)( \
-{ \
-	int _res; \
-	_res = (n) % (b); \
-	(n) /= (b); \
-	_res; \
-} \
-)
-#endif 
-
 #define MODULE_ALIAS_BLOCKDEV(major,minor) \
 	MODULE_ALIAS("block-major-" __stringify(major) "-" __stringify(minor))
 #define MODULE_ALIAS_BLOCKDEV_MAJOR(major) \
 	MODULE_ALIAS("block-major-" __stringify(major) "-*")
 
 
+#else /* CONFIG_BLOCK */
+/*
+ * stubs for when the block layer is configured out
+ */
+#define buffer_heads_over_limit 0
+
+static inline long blk_congestion_wait(int rw, long timeout)
+{
+	return timeout;
+}
+
+static inline long nr_blockdev_pages(void)
+{
+	return 0;
+}
+
+static inline void exit_io_context(void) {}
+
+#endif /* CONFIG_BLOCK */
+
 #endif
diff --git a/include/linux/buffer_head.h b/include/linux/buffer_head.h
index 64b508e35d2a..131ffd37e716 100644
--- a/include/linux/buffer_head.h
+++ b/include/linux/buffer_head.h
@@ -14,6 +14,8 @@
 #include <linux/wait.h>
 #include <asm/atomic.h>
 
+#ifdef CONFIG_BLOCK
+
 enum bh_state_bits {
 	BH_Uptodate,	/* Contains valid data */
 	BH_Dirty,	/* Is dirty */
@@ -301,4 +303,18 @@ static inline void lock_buffer(struct buffer_head *bh)
 }
 
 extern int __set_page_dirty_buffers(struct page *page);
+
+#else /* CONFIG_BLOCK */
+
+static inline void buffer_init(void) {}
+static inline int try_to_free_buffers(struct page *page) { return 1; }
+static inline int sync_blockdev(struct block_device *bdev) { return 0; }
+static inline int inode_has_buffers(struct inode *inode) { return 0; }
+static inline void invalidate_inode_buffers(struct inode *inode) {}
+static inline int remove_inode_buffers(struct inode *inode) { return 1; }
+static inline int sync_mapping_buffers(struct address_space *mapping) { return 0; }
+static inline void invalidate_bdev(struct block_device *bdev, int destroy_dirty_buffers) {}
+
+
+#endif /* CONFIG_BLOCK */
 #endif /* _LINUX_BUFFER_HEAD_H */
diff --git a/include/linux/compat_ioctl.h b/include/linux/compat_ioctl.h
index 98d40e08ba6e..d61ef5951538 100644
--- a/include/linux/compat_ioctl.h
+++ b/include/linux/compat_ioctl.h
@@ -90,6 +90,7 @@ COMPATIBLE_IOCTL(FDTWADDLE)
 COMPATIBLE_IOCTL(FDFMTTRK)
 COMPATIBLE_IOCTL(FDRAWCMD)
 /* 0x12 */
+#ifdef CONFIG_BLOCK
 COMPATIBLE_IOCTL(BLKRASET)
 COMPATIBLE_IOCTL(BLKROSET)
 COMPATIBLE_IOCTL(BLKROGET)
@@ -103,6 +104,7 @@ COMPATIBLE_IOCTL(BLKTRACESETUP)
 COMPATIBLE_IOCTL(BLKTRACETEARDOWN)
 ULONG_IOCTL(BLKRASET)
 ULONG_IOCTL(BLKFRASET)
+#endif
 /* RAID */
 COMPATIBLE_IOCTL(RAID_VERSION)
 COMPATIBLE_IOCTL(GET_ARRAY_INFO)
diff --git a/include/linux/elevator.h b/include/linux/elevator.h
index 9c5a04f6114c..b3370ef5164d 100644
--- a/include/linux/elevator.h
+++ b/include/linux/elevator.h
@@ -3,6 +3,8 @@
 
 #include <linux/percpu.h>
 
+#ifdef CONFIG_BLOCK
+
 typedef int (elevator_merge_fn) (request_queue_t *, struct request **,
 				 struct bio *);
 
@@ -203,4 +205,5 @@ enum {
 	__val;							\
 })
 
+#endif /* CONFIG_BLOCK */
 #endif
diff --git a/include/linux/fs.h b/include/linux/fs.h
index b73a47582dbe..5baf3a153403 100644
--- a/include/linux/fs.h
+++ b/include/linux/fs.h
@@ -1482,6 +1482,7 @@ extern void __init vfs_caches_init(unsigned long);
 extern void putname(const char *name);
 #endif
 
+#ifdef CONFIG_BLOCK
 extern int register_blkdev(unsigned int, const char *);
 extern int unregister_blkdev(unsigned int, const char *);
 extern struct block_device *bdget(dev_t);
@@ -1490,11 +1491,15 @@ extern void bd_forget(struct inode *inode);
 extern void bdput(struct block_device *);
 extern struct block_device *open_by_devnum(dev_t, unsigned);
 extern struct block_device *open_partition_by_devnum(dev_t, unsigned);
-extern const struct file_operations def_blk_fops;
 extern const struct address_space_operations def_blk_aops;
+#else
+static inline void bd_forget(struct inode *inode) {}
+#endif
+extern const struct file_operations def_blk_fops;
 extern const struct file_operations def_chr_fops;
 extern const struct file_operations bad_sock_fops;
 extern const struct file_operations def_fifo_fops;
+#ifdef CONFIG_BLOCK
 extern int ioctl_by_bdev(struct block_device *, unsigned, unsigned long);
 extern int blkdev_ioctl(struct inode *, struct file *, unsigned, unsigned long);
 extern long compat_blkdev_ioctl(struct file *, unsigned, unsigned long);
@@ -1510,6 +1515,7 @@ extern void bd_release_from_disk(struct block_device *, struct gendisk *);
 #define bd_claim_by_disk(bdev, holder, disk)	bd_claim(bdev, holder)
 #define bd_release_from_disk(bdev, disk)	bd_release(bdev)
 #endif
+#endif
 
 /* fs/char_dev.c */
 #define CHRDEV_MAJOR_HASH_SIZE	255
@@ -1523,14 +1529,19 @@ extern int chrdev_open(struct inode *, struct file *);
 extern void chrdev_show(struct seq_file *,off_t);
 
 /* fs/block_dev.c */
-#define BLKDEV_MAJOR_HASH_SIZE	255
 #define BDEVNAME_SIZE	32	/* Largest string for a blockdev identifier */
+
+#ifdef CONFIG_BLOCK
+#define BLKDEV_MAJOR_HASH_SIZE	255
 extern const char *__bdevname(dev_t, char *buffer);
 extern const char *bdevname(struct block_device *bdev, char *buffer);
 extern struct block_device *lookup_bdev(const char *);
 extern struct block_device *open_bdev_excl(const char *, int, void *);
 extern void close_bdev_excl(struct block_device *);
 extern void blkdev_show(struct seq_file *,off_t);
+#else
+#define BLKDEV_MAJOR_HASH_SIZE	0
+#endif
 
 extern void init_special_inode(struct inode *, umode_t, dev_t);
 
@@ -1544,6 +1555,7 @@ extern const struct file_operations rdwr_fifo_fops;
 
 extern int fs_may_remount_ro(struct super_block *);
 
+#ifdef CONFIG_BLOCK
 /*
  * return READ, READA, or WRITE
  */
@@ -1555,9 +1567,10 @@ extern int fs_may_remount_ro(struct super_block *);
 #define bio_data_dir(bio)	((bio)->bi_rw & 1)
 
 extern int check_disk_change(struct block_device *);
-extern int invalidate_inodes(struct super_block *);
 extern int __invalidate_device(struct block_device *);
 extern int invalidate_partition(struct gendisk *, int);
+#endif
+extern int invalidate_inodes(struct super_block *);
 unsigned long invalidate_mapping_pages(struct address_space *mapping,
 					pgoff_t start, pgoff_t end);
 unsigned long invalidate_inode_pages(struct address_space *mapping);
@@ -1590,7 +1603,9 @@ extern void emergency_sync(void);
 extern void emergency_remount(void);
 extern int do_remount_sb(struct super_block *sb, int flags,
 			 void *data, int force);
+#ifdef CONFIG_BLOCK
 extern sector_t bmap(struct inode *, sector_t);
+#endif
 extern int notify_change(struct dentry *, struct iattr *);
 extern int permission(struct inode *, int, struct nameidata *);
 extern int generic_permission(struct inode *, int,
@@ -1673,9 +1688,11 @@ static inline void insert_inode_hash(struct inode *inode) {
 extern struct file * get_empty_filp(void);
 extern void file_move(struct file *f, struct list_head *list);
 extern void file_kill(struct file *f);
+#ifdef CONFIG_BLOCK
 struct bio;
 extern void submit_bio(int, struct bio *);
 extern int bdev_read_only(struct block_device *);
+#endif
 extern int set_blocksize(struct block_device *, int);
 extern int sb_set_blocksize(struct super_block *, int);
 extern int sb_min_blocksize(struct super_block *, int);
@@ -1756,6 +1773,7 @@ static inline void do_generic_file_read(struct file * filp, loff_t *ppos,
 				actor);
 }
 
+#ifdef CONFIG_BLOCK
 ssize_t __blockdev_direct_IO(int rw, struct kiocb *iocb, struct inode *inode,
 	struct block_device *bdev, const struct iovec *iov, loff_t offset,
 	unsigned long nr_segs, get_block_t get_block, dio_iodone_t end_io,
@@ -1793,6 +1811,7 @@ static inline ssize_t blockdev_direct_IO_own_locking(int rw, struct kiocb *iocb,
 	return __blockdev_direct_IO(rw, iocb, inode, bdev, iov, offset,
 				nr_segs, get_block, end_io, DIO_OWN_LOCKING);
 }
+#endif
 
 extern const struct file_operations generic_ro_fops;
 
diff --git a/include/linux/genhd.h b/include/linux/genhd.h
index e4af57e87c17..41f276fdd185 100644
--- a/include/linux/genhd.h
+++ b/include/linux/genhd.h
@@ -11,6 +11,8 @@
 
 #include <linux/types.h>
 
+#ifdef CONFIG_BLOCK
+
 enum {
 /* These three have identical behaviour; use the second one if DOS FDISK gets
    confused about extended/logical partitions starting past cylinder 1023. */
@@ -420,3 +422,5 @@ static inline struct block_device *bdget_disk(struct gendisk *disk, int index)
 #endif
 
 #endif
+
+#endif
diff --git a/include/linux/mpage.h b/include/linux/mpage.h
index 517c098fde20..cc5fb75af78a 100644
--- a/include/linux/mpage.h
+++ b/include/linux/mpage.h
@@ -9,6 +9,7 @@
  * (And no, it doesn't do the #ifdef __MPAGE_H thing, and it doesn't do
  * nested includes.  Get it right in the .c file).
  */
+#ifdef CONFIG_BLOCK
 
 struct writeback_control;
 typedef int (writepage_t)(struct page *page, struct writeback_control *wbc);
@@ -20,3 +21,5 @@ int mpage_writepages(struct address_space *mapping,
 		struct writeback_control *wbc, get_block_t get_block);
 int mpage_writepage(struct page *page, get_block_t *get_block,
 		struct writeback_control *wbc);
+
+#endif
diff --git a/include/linux/raid/md.h b/include/linux/raid/md.h
index eb3e547c8fee..c588709acbbc 100644
--- a/include/linux/raid/md.h
+++ b/include/linux/raid/md.h
@@ -53,6 +53,8 @@
 #include <linux/raid/md_u.h>
 #include <linux/raid/md_k.h>
 
+#ifdef CONFIG_MD
+
 /*
  * Different major versions are not compatible.
  * Different minor versions are only downward compatible.
@@ -95,5 +97,6 @@ extern void md_new_event(mddev_t *mddev);
 
 extern void md_update_sb(mddev_t * mddev);
 
+#endif /* CONFIG_MD */
 #endif 
 
diff --git a/include/linux/raid/md_k.h b/include/linux/raid/md_k.h
index d28890295852..920b94fe31fa 100644
--- a/include/linux/raid/md_k.h
+++ b/include/linux/raid/md_k.h
@@ -18,6 +18,8 @@
 /* and dm-bio-list.h is not under include/linux because.... ??? */
 #include "../../../drivers/md/dm-bio-list.h"
 
+#ifdef CONFIG_BLOCK
+
 #define	LEVEL_MULTIPATH		(-4)
 #define	LEVEL_LINEAR		(-1)
 #define	LEVEL_FAULTY		(-5)
@@ -362,5 +364,6 @@ static inline void safe_put_page(struct page *p)
 	if (p) put_page(p);
 }
 
+#endif /* CONFIG_BLOCK */
 #endif
 
diff --git a/include/scsi/scsi_tcq.h b/include/scsi/scsi_tcq.h
index bbf66219b769..c247a28259bc 100644
--- a/include/scsi/scsi_tcq.h
+++ b/include/scsi/scsi_tcq.h
@@ -6,7 +6,6 @@
 #include <scsi/scsi_device.h>
 #include <scsi/scsi_host.h>
 
-
 #define MSG_SIMPLE_TAG	0x20
 #define MSG_HEAD_TAG	0x21
 #define MSG_ORDERED_TAG	0x22
@@ -14,6 +13,7 @@
 #define SCSI_NO_TAG	(-1)    /* identify no tag in use */
 
 
+#ifdef CONFIG_BLOCK
 
 /**
  * scsi_get_tag_type - get the type of tag the device supports
@@ -144,4 +144,5 @@ static inline int scsi_init_shared_tag_map(struct Scsi_Host *shost, int depth)
 	return shost->bqt ? 0 : -ENOMEM;
 }
 
+#endif /* CONFIG_BLOCK */
 #endif /* _SCSI_SCSI_TCQ_H */
diff --git a/init/Kconfig b/init/Kconfig
index 4381006dd666..d2eb7a84a264 100644
--- a/init/Kconfig
+++ b/init/Kconfig
@@ -92,7 +92,7 @@ config LOCALVERSION_AUTO
 
 config SWAP
 	bool "Support for paging of anonymous memory (swap)"
-	depends on MMU
+	depends on MMU && BLOCK
 	default y
 	help
 	  This option allows you to choose whether you want to have support
diff --git a/init/do_mounts.c b/init/do_mounts.c
index b290aadb1d3f..dc1ec0803ef9 100644
--- a/init/do_mounts.c
+++ b/init/do_mounts.c
@@ -285,7 +285,11 @@ void __init mount_block_root(char *name, int flags)
 {
 	char *fs_names = __getname();
 	char *p;
+#ifdef CONFIG_BLOCK
 	char b[BDEVNAME_SIZE];
+#else
+	const char *b = name;
+#endif
 
 	get_fs_names(fs_names);
 retry:
@@ -304,7 +308,9 @@ retry:
 		 * Allow the user to distinguish between failed sys_open
 		 * and bad superblock on root device.
 		 */
+#ifdef CONFIG_BLOCK
 		__bdevname(ROOT_DEV, b);
+#endif
 		printk("VFS: Cannot open root device \"%s\" or %s\n",
 				root_device_name, b);
 		printk("Please append a correct \"root=\" boot option\n");
@@ -316,7 +322,10 @@ retry:
 	for (p = fs_names; *p; p += strlen(p)+1)
 		printk(" %s", p);
 	printk("\n");
-	panic("VFS: Unable to mount root fs on %s", __bdevname(ROOT_DEV, b));
+#ifdef CONFIG_BLOCK
+	__bdevname(ROOT_DEV, b);
+#endif
+	panic("VFS: Unable to mount root fs on %s", b);
 out:
 	putname(fs_names);
 }
@@ -387,8 +396,10 @@ void __init mount_root(void)
 			change_floppy("root floppy");
 	}
 #endif
+#ifdef CONFIG_BLOCK
 	create_dev("/dev/root", ROOT_DEV);
 	mount_block_root("/dev/root", root_mountflags);
+#endif
 }
 
 /*
diff --git a/kernel/sys_ni.c b/kernel/sys_ni.c
index 6991bece67e8..7a3b2e75f040 100644
--- a/kernel/sys_ni.c
+++ b/kernel/sys_ni.c
@@ -134,3 +134,8 @@ cond_syscall(sys_madvise);
 cond_syscall(sys_mremap);
 cond_syscall(sys_remap_file_pages);
 cond_syscall(compat_sys_move_pages);
+
+/* block-layer dependent */
+cond_syscall(sys_bdflush);
+cond_syscall(sys_ioprio_set);
+cond_syscall(sys_ioprio_get);
diff --git a/mm/Makefile b/mm/Makefile
index 4f2166a833b9..12b3a4eee88d 100644
--- a/mm/Makefile
+++ b/mm/Makefile
@@ -12,7 +12,7 @@ obj-y			:= bootmem.o filemap.o mempool.o oom_kill.o fadvise.o \
 			   readahead.o swap.o truncate.o vmscan.o \
 			   prio_tree.o util.o mmzone.o vmstat.o $(mmu-y)
 
-ifeq ($(CONFIG_MMU),y)
+ifeq ($(CONFIG_MMU)$(CONFIG_BLOCK),yy)
 obj-y			+= bounce.o
 endif
 obj-$(CONFIG_SWAP)	+= page_io.o swap_state.o swapfile.o thrash.o
diff --git a/mm/filemap.c b/mm/filemap.c
index d6846de08887..c4fe97f5ace0 100644
--- a/mm/filemap.c
+++ b/mm/filemap.c
@@ -2020,6 +2020,7 @@ inline int generic_write_checks(struct file *file, loff_t *pos, size_t *count, i
 		if (unlikely(*pos + *count > inode->i_sb->s_maxbytes))
 			*count = inode->i_sb->s_maxbytes - *pos;
 	} else {
+#ifdef CONFIG_BLOCK
 		loff_t isize;
 		if (bdev_read_only(I_BDEV(inode)))
 			return -EPERM;
@@ -2031,6 +2032,9 @@ inline int generic_write_checks(struct file *file, loff_t *pos, size_t *count, i
 
 		if (*pos + *count > isize)
 			*count = isize - *pos;
+#else
+		return -EPERM;
+#endif
 	}
 	return 0;
 }
diff --git a/mm/migrate.c b/mm/migrate.c
index 7f50e3ff54cd..ba2453f9483d 100644
--- a/mm/migrate.c
+++ b/mm/migrate.c
@@ -409,6 +409,7 @@ int migrate_page(struct address_space *mapping,
 }
 EXPORT_SYMBOL(migrate_page);
 
+#ifdef CONFIG_BLOCK
 /*
  * Migration function for pages with buffers. This function can only be used
  * if the underlying filesystem guarantees that no other references to "page"
@@ -466,6 +467,7 @@ int buffer_migrate_page(struct address_space *mapping,
 	return 0;
 }
 EXPORT_SYMBOL(buffer_migrate_page);
+#endif
 
 /*
  * Writeback a page to clean the dirty state
diff --git a/mm/page-writeback.c b/mm/page-writeback.c
index ecf27839c203..c0d4ce144dec 100644
--- a/mm/page-writeback.c
+++ b/mm/page-writeback.c
@@ -807,9 +807,11 @@ int fastcall set_page_dirty(struct page *page)
 
 	if (likely(mapping)) {
 		int (*spd)(struct page *) = mapping->a_ops->set_page_dirty;
-		if (spd)
-			return (*spd)(page);
-		return __set_page_dirty_buffers(page);
+#ifdef CONFIG_BLOCK
+		if (!spd)
+			spd = __set_page_dirty_buffers;
+#endif
+		return (*spd)(page);
 	}
 	if (!PageDirty(page)) {
 		if (!TestSetPageDirty(page))
diff --git a/mm/truncate.c b/mm/truncate.c
index cd3e34b816db..8fde6580657e 100644
--- a/mm/truncate.c
+++ b/mm/truncate.c
@@ -35,8 +35,10 @@ void do_invalidatepage(struct page *page, unsigned long offset)
 {
 	void (*invalidatepage)(struct page *, unsigned long);
 	invalidatepage = page->mapping->a_ops->invalidatepage;
+#ifdef CONFIG_BLOCK
 	if (!invalidatepage)
 		invalidatepage = block_invalidatepage;
+#endif
 	if (invalidatepage)
 		(*invalidatepage)(page, offset);
 }
-- 
cgit v1.2.3


From 5c87579e65ee4f419b2369407f82326d38b5d2d8 Mon Sep 17 00:00:00 2001
From: Arjan van de Ven <arjan@linux.intel.com>
Date: Sat, 30 Sep 2006 23:27:17 -0700
Subject: [PATCH] maximum latency tracking infrastructure

Add infrastructure to track "maximum allowable latency" for power saving
policies.

The reason for adding this infrastructure is that power management in the
idle loop needs to make a tradeoff between latency and power savings
(deeper power save modes have a longer latency to running code again).  The
code that today makes this tradeoff just does a rather simple algorithm;
however this is not good enough: There are devices and use cases where a
lower latency is required than that the higher power saving states provide.
 An example would be audio playback, but another example is the ipw2100
wireless driver that right now has a very direct and ugly acpi hook to
disable some higher power states randomly when it gets certain types of
error.

The proposed solution is to have an interface where drivers can

* announce the maximum latency (in microseconds) that they can deal with
* modify this latency
* give up their constraint

and a function where the code that decides on power saving strategy can
query the current global desired maximum.

This patch has a user of each side: on the consumer side, ACPI is patched
to use this, on the producer side the ipw2100 driver is patched.

A generic maximum latency is also registered of 2 timer ticks (more and you
lose accurate time tracking after all).

While the existing users of the patch are x86 specific, the infrastructure
is not.  I'd like to ask the arch maintainers of other architectures if the
infrastructure is generic enough for their use (assuming the architecture
has such a tradeoff as concept at all), and the sound/multimedia driver
owners to look at the driver facing API to see if this is something they
can use.

[akpm@osdl.org: cleanups]
Signed-off-by: Arjan van de Ven <arjan@linux.intel.com>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
Acked-by: Jesse Barnes <jesse.barnes@intel.com>
Cc: "Brown, Len" <len.brown@intel.com>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
---
 drivers/acpi/processor_idle.c  |  38 +++++-
 drivers/net/wireless/ipw2100.c |  10 ++
 include/linux/latency.h        |  25 ++++
 kernel/Makefile                |   2 +-
 kernel/latency.c               | 279 +++++++++++++++++++++++++++++++++++++++++
 5 files changed, 349 insertions(+), 5 deletions(-)
 create mode 100644 include/linux/latency.h
 create mode 100644 kernel/latency.c

(limited to 'kernel')

diff --git a/drivers/acpi/processor_idle.c b/drivers/acpi/processor_idle.c
index 71066066d626..0a395fca843b 100644
--- a/drivers/acpi/processor_idle.c
+++ b/drivers/acpi/processor_idle.c
@@ -38,6 +38,7 @@
 #include <linux/dmi.h>
 #include <linux/moduleparam.h>
 #include <linux/sched.h>	/* need_resched() */
+#include <linux/latency.h>
 
 #include <asm/io.h>
 #include <asm/uaccess.h>
@@ -453,7 +454,8 @@ static void acpi_processor_idle(void)
 	 */
 	if (cx->promotion.state &&
 	    ((cx->promotion.state - pr->power.states) <= max_cstate)) {
-		if (sleep_ticks > cx->promotion.threshold.ticks) {
+		if (sleep_ticks > cx->promotion.threshold.ticks &&
+		  cx->promotion.state->latency <= system_latency_constraint()) {
 			cx->promotion.count++;
 			cx->demotion.count = 0;
 			if (cx->promotion.count >=
@@ -494,8 +496,10 @@ static void acpi_processor_idle(void)
       end:
 	/*
 	 * Demote if current state exceeds max_cstate
+	 * or if the latency of the current state is unacceptable
 	 */
-	if ((pr->power.state - pr->power.states) > max_cstate) {
+	if ((pr->power.state - pr->power.states) > max_cstate ||
+		pr->power.state->latency > system_latency_constraint()) {
 		if (cx->demotion.state)
 			next_state = cx->demotion.state;
 	}
@@ -1009,9 +1013,11 @@ static int acpi_processor_power_seq_show(struct seq_file *seq, void *offset)
 
 	seq_printf(seq, "active state:            C%zd\n"
 		   "max_cstate:              C%d\n"
-		   "bus master activity:     %08x\n",
+		   "bus master activity:     %08x\n"
+		   "maximum allowed latency: %d usec\n",
 		   pr->power.state ? pr->power.state - pr->power.states : 0,
-		   max_cstate, (unsigned)pr->power.bm_activity);
+		   max_cstate, (unsigned)pr->power.bm_activity,
+		   system_latency_constraint());
 
 	seq_puts(seq, "states:\n");
 
@@ -1077,6 +1083,28 @@ static const struct file_operations acpi_processor_power_fops = {
 	.release = single_release,
 };
 
+static void smp_callback(void *v)
+{
+	/* we already woke the CPU up, nothing more to do */
+}
+
+/*
+ * This function gets called when a part of the kernel has a new latency
+ * requirement.  This means we need to get all processors out of their C-state,
+ * and then recalculate a new suitable C-state. Just do a cross-cpu IPI; that
+ * wakes them all right up.
+ */
+static int acpi_processor_latency_notify(struct notifier_block *b,
+		unsigned long l, void *v)
+{
+	smp_call_function(smp_callback, NULL, 0, 1);
+	return NOTIFY_OK;
+}
+
+static struct notifier_block acpi_processor_latency_notifier = {
+	.notifier_call = acpi_processor_latency_notify,
+};
+
 int acpi_processor_power_init(struct acpi_processor *pr,
 			      struct acpi_device *device)
 {
@@ -1093,6 +1121,7 @@ int acpi_processor_power_init(struct acpi_processor *pr,
 			       "ACPI: processor limited to max C-state %d\n",
 			       max_cstate);
 		first_run++;
+		register_latency_notifier(&acpi_processor_latency_notifier);
 	}
 
 	if (!pr)
@@ -1164,6 +1193,7 @@ int acpi_processor_power_exit(struct acpi_processor *pr,
 		 * copies of pm_idle before proceeding.
 		 */
 		cpu_idle_wait();
+		unregister_latency_notifier(&acpi_processor_latency_notifier);
 	}
 
 	return 0;
diff --git a/drivers/net/wireless/ipw2100.c b/drivers/net/wireless/ipw2100.c
index 6c5add701a6f..97937809de09 100644
--- a/drivers/net/wireless/ipw2100.c
+++ b/drivers/net/wireless/ipw2100.c
@@ -163,6 +163,7 @@ that only one external action is invoked at a time.
 #include <linux/firmware.h>
 #include <linux/acpi.h>
 #include <linux/ctype.h>
+#include <linux/latency.h>
 
 #include "ipw2100.h"
 
@@ -1697,6 +1698,11 @@ static int ipw2100_up(struct ipw2100_priv *priv, int deferred)
 		return 0;
 	}
 
+	/* the ipw2100 hardware really doesn't want power management delays
+	 * longer than 175usec
+	 */
+	modify_acceptable_latency("ipw2100", 175);
+
 	/* If the interrupt is enabled, turn it off... */
 	spin_lock_irqsave(&priv->low_lock, flags);
 	ipw2100_disable_interrupts(priv);
@@ -1849,6 +1855,8 @@ static void ipw2100_down(struct ipw2100_priv *priv)
 	ipw2100_disable_interrupts(priv);
 	spin_unlock_irqrestore(&priv->low_lock, flags);
 
+	modify_acceptable_latency("ipw2100", INFINITE_LATENCY);
+
 #ifdef ACPI_CSTATE_LIMIT_DEFINED
 	if (priv->config & CFG_C3_DISABLED) {
 		IPW_DEBUG_INFO(": Resetting C3 transitions.\n");
@@ -6534,6 +6542,7 @@ static int __init ipw2100_init(void)
 
 	ret = pci_register_driver(&ipw2100_pci_driver);
 
+	set_acceptable_latency("ipw2100", INFINITE_LATENCY);
 #ifdef CONFIG_IPW2100_DEBUG
 	ipw2100_debug_level = debug;
 	driver_create_file(&ipw2100_pci_driver.driver,
@@ -6554,6 +6563,7 @@ static void __exit ipw2100_exit(void)
 			   &driver_attr_debug_level);
 #endif
 	pci_unregister_driver(&ipw2100_pci_driver);
+	remove_acceptable_latency("ipw2100");
 }
 
 module_init(ipw2100_init);
diff --git a/include/linux/latency.h b/include/linux/latency.h
new file mode 100644
index 000000000000..c08b52bb55b0
--- /dev/null
+++ b/include/linux/latency.h
@@ -0,0 +1,25 @@
+/*
+ * latency.h: Explicit system-wide latency-expectation infrastructure
+ *
+ * (C) Copyright 2006 Intel Corporation
+ * Author: Arjan van de Ven <arjan@linux.intel.com>
+ *
+ */
+
+#ifndef _INCLUDE_GUARD_LATENCY_H_
+#define _INCLUDE_GUARD_LATENCY_H_
+
+#include <linux/notifier.h>
+
+void set_acceptable_latency(char *identifier, int usecs);
+void modify_acceptable_latency(char *identifier, int usecs);
+void remove_acceptable_latency(char *identifier);
+void synchronize_acceptable_latency(void);
+int system_latency_constraint(void);
+
+int register_latency_notifier(struct notifier_block * nb);
+int unregister_latency_notifier(struct notifier_block * nb);
+
+#define INFINITE_LATENCY 1000000
+
+#endif
diff --git a/kernel/Makefile b/kernel/Makefile
index d62ec66c1af2..e210e8cf7237 100644
--- a/kernel/Makefile
+++ b/kernel/Makefile
@@ -8,7 +8,7 @@ obj-y     = sched.o fork.o exec_domain.o panic.o printk.o profile.o \
 	    signal.o sys.o kmod.o workqueue.o pid.o \
 	    rcupdate.o extable.o params.o posix-timers.o \
 	    kthread.o wait.o kfifo.o sys_ni.o posix-cpu-timers.o mutex.o \
-	    hrtimer.o rwsem.o
+	    hrtimer.o rwsem.o latency.o
 
 obj-$(CONFIG_STACKTRACE) += stacktrace.o
 obj-y += time/
diff --git a/kernel/latency.c b/kernel/latency.c
new file mode 100644
index 000000000000..258f2555abbc
--- /dev/null
+++ b/kernel/latency.c
@@ -0,0 +1,279 @@
+/*
+ * latency.c: Explicit system-wide latency-expectation infrastructure
+ *
+ * The purpose of this infrastructure is to allow device drivers to set
+ * latency constraint they have and to collect and summarize these
+ * expectations globally. The cummulated result can then be used by
+ * power management and similar users to make decisions that have
+ * tradoffs with a latency component.
+ *
+ * An example user of this are the x86 C-states; each higher C state saves
+ * more power, but has a higher exit latency. For the idle loop power
+ * code to make a good decision which C-state to use, information about
+ * acceptable latencies is required.
+ *
+ * An example announcer of latency is an audio driver that knowns it
+ * will get an interrupt when the hardware has 200 usec of samples
+ * left in the DMA buffer; in that case the driver can set a latency
+ * constraint of, say, 150 usec.
+ *
+ * Multiple drivers can each announce their maximum accepted latency,
+ * to keep these appart, a string based identifier is used.
+ *
+ *
+ * (C) Copyright 2006 Intel Corporation
+ * Author: Arjan van de Ven <arjan@linux.intel.com>
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License
+ * as published by the Free Software Foundation; version 2
+ * of the License.
+ */
+
+#include <linux/latency.h>
+#include <linux/list.h>
+#include <linux/spinlock.h>
+#include <linux/slab.h>
+#include <linux/module.h>
+#include <linux/notifier.h>
+#include <asm/atomic.h>
+
+struct latency_info {
+	struct list_head list;
+	int usecs;
+	char *identifier;
+};
+
+/*
+ * locking rule: all modifications to current_max_latency and
+ * latency_list need to be done while holding the latency_lock.
+ * latency_lock needs to be taken _irqsave.
+ */
+static atomic_t current_max_latency;
+static DEFINE_SPINLOCK(latency_lock);
+
+static LIST_HEAD(latency_list);
+static BLOCKING_NOTIFIER_HEAD(latency_notifier);
+
+/*
+ * This function returns the maximum latency allowed, which
+ * happens to be the minimum of all maximum latencies on the
+ * list.
+ */
+static int __find_max_latency(void)
+{
+	int min = INFINITE_LATENCY;
+	struct latency_info *info;
+
+	list_for_each_entry(info, &latency_list, list) {
+		if (info->usecs < min)
+			min = info->usecs;
+	}
+	return min;
+}
+
+/**
+ * set_acceptable_latency - sets the maximum latency acceptable
+ * @identifier: string that identifies this driver
+ * @usecs: maximum acceptable latency for this driver
+ *
+ * This function informs the kernel that this device(driver)
+ * can accept at most usecs latency. This setting is used for
+ * power management and similar tradeoffs.
+ *
+ * This function sleeps and can only be called from process
+ * context.
+ * Calling this function with an existing identifier is valid
+ * and will cause the existing latency setting to be changed.
+ */
+void set_acceptable_latency(char *identifier, int usecs)
+{
+	struct latency_info *info, *iter;
+	unsigned long flags;
+	int found_old = 0;
+
+	info = kzalloc(sizeof(struct latency_info), GFP_KERNEL);
+	if (!info)
+		return;
+	info->usecs = usecs;
+	info->identifier = kstrdup(identifier, GFP_KERNEL);
+	if (!info->identifier)
+		goto free_info;
+
+	spin_lock_irqsave(&latency_lock, flags);
+	list_for_each_entry(iter, &latency_list, list) {
+		if (strcmp(iter->identifier, identifier)==0) {
+			found_old = 1;
+			iter->usecs = usecs;
+			break;
+		}
+	}
+	if (!found_old)
+		list_add(&info->list, &latency_list);
+
+	if (usecs < atomic_read(&current_max_latency))
+		atomic_set(&current_max_latency, usecs);
+
+	spin_unlock_irqrestore(&latency_lock, flags);
+
+	blocking_notifier_call_chain(&latency_notifier,
+		atomic_read(&current_max_latency), NULL);
+
+	/*
+	 * if we inserted the new one, we're done; otherwise there was
+	 * an existing one so we need to free the redundant data
+	 */
+	if (!found_old)
+		return;
+
+	kfree(info->identifier);
+free_info:
+	kfree(info);
+}
+EXPORT_SYMBOL_GPL(set_acceptable_latency);
+
+/**
+ * modify_acceptable_latency - changes the maximum latency acceptable
+ * @identifier: string that identifies this driver
+ * @usecs: maximum acceptable latency for this driver
+ *
+ * This function informs the kernel that this device(driver)
+ * can accept at most usecs latency. This setting is used for
+ * power management and similar tradeoffs.
+ *
+ * This function does not sleep and can be called in any context.
+ * Trying to use a non-existing identifier silently gets ignored.
+ *
+ * Due to the atomic nature of this function, the modified latency
+ * value will only be used for future decisions; past decisions
+ * can still lead to longer latencies in the near future.
+ */
+void modify_acceptable_latency(char *identifier, int usecs)
+{
+	struct latency_info *iter;
+	unsigned long flags;
+
+	spin_lock_irqsave(&latency_lock, flags);
+	list_for_each_entry(iter, &latency_list, list) {
+		if (strcmp(iter->identifier, identifier) == 0) {
+			iter->usecs = usecs;
+			break;
+		}
+	}
+	if (usecs < atomic_read(&current_max_latency))
+		atomic_set(&current_max_latency, usecs);
+	spin_unlock_irqrestore(&latency_lock, flags);
+}
+EXPORT_SYMBOL_GPL(modify_acceptable_latency);
+
+/**
+ * remove_acceptable_latency - removes the maximum latency acceptable
+ * @identifier: string that identifies this driver
+ *
+ * This function removes a previously set maximum latency setting
+ * for the driver and frees up any resources associated with the
+ * bookkeeping needed for this.
+ *
+ * This function does not sleep and can be called in any context.
+ * Trying to use a non-existing identifier silently gets ignored.
+ */
+void remove_acceptable_latency(char *identifier)
+{
+	unsigned long flags;
+	int newmax = 0;
+	struct latency_info *iter, *temp;
+
+	spin_lock_irqsave(&latency_lock, flags);
+
+	list_for_each_entry_safe(iter,  temp, &latency_list, list) {
+		if (strcmp(iter->identifier, identifier) == 0) {
+			list_del(&iter->list);
+			newmax = iter->usecs;
+			kfree(iter->identifier);
+			kfree(iter);
+			break;
+		}
+	}
+
+	/* If we just deleted the system wide value, we need to
+	 * recalculate with a full search
+	 */
+	if (newmax == atomic_read(&current_max_latency)) {
+		newmax = __find_max_latency();
+		atomic_set(&current_max_latency, newmax);
+	}
+	spin_unlock_irqrestore(&latency_lock, flags);
+}
+EXPORT_SYMBOL_GPL(remove_acceptable_latency);
+
+/**
+ * system_latency_constraint - queries the system wide latency maximum
+ *
+ * This function returns the system wide maximum latency in
+ * microseconds.
+ *
+ * This function does not sleep and can be called in any context.
+ */
+int system_latency_constraint(void)
+{
+	return atomic_read(&current_max_latency);
+}
+EXPORT_SYMBOL_GPL(system_latency_constraint);
+
+/**
+ * synchronize_acceptable_latency - recalculates all latency decisions
+ *
+ * This function will cause a callback to various kernel pieces that
+ * will make those pieces rethink their latency decisions. This implies
+ * that if there are overlong latencies in hardware state already, those
+ * latencies get taken right now. When this call completes no overlong
+ * latency decisions should be active anymore.
+ *
+ * Typical usecase of this is after a modify_acceptable_latency() call,
+ * which in itself is non-blocking and non-synchronizing.
+ *
+ * This function blocks and should not be called with locks held.
+ */
+
+void synchronize_acceptable_latency(void)
+{
+	blocking_notifier_call_chain(&latency_notifier,
+		atomic_read(&current_max_latency), NULL);
+}
+EXPORT_SYMBOL_GPL(synchronize_acceptable_latency);
+
+/*
+ * Latency notifier: this notifier gets called when a non-atomic new
+ * latency value gets set. The expectation nof the caller of the
+ * non-atomic set is that when the call returns, future latencies
+ * are within bounds, so the functions on the notifier list are
+ * expected to take the overlong latencies immediately, inside the
+ * callback, and not make a overlong latency decision anymore.
+ *
+ * The callback gets called when the new latency value is made
+ * active so system_latency_constraint() returns the new latency.
+ */
+int register_latency_notifier(struct notifier_block * nb)
+{
+	return blocking_notifier_chain_register(&latency_notifier, nb);
+}
+EXPORT_SYMBOL_GPL(register_latency_notifier);
+
+int unregister_latency_notifier(struct notifier_block * nb)
+{
+	return blocking_notifier_chain_unregister(&latency_notifier, nb);
+}
+EXPORT_SYMBOL_GPL(unregister_latency_notifier);
+
+static __init int latency_init(void)
+{
+	atomic_set(&current_max_latency, INFINITE_LATENCY);
+	/*
+	 * we don't want by default to have longer latencies than 2 ticks,
+	 * since that would cause lost ticks
+	 */
+	set_acceptable_latency("kernel", 2*1000000/HZ);
+	return 0;
+}
+
+module_init(latency_init);
-- 
cgit v1.2.3


From 756184b7d771992f4fb1998d62aebcaf3e028076 Mon Sep 17 00:00:00 2001
From: Cal Peake <cp@absolutedigital.net>
Date: Sat, 30 Sep 2006 23:27:24 -0700
Subject: [PATCH] CodingStyle cleanup for kernel/sys.c

Fix up kernel/sys.c to be consistent with CodingStyle and the rest of the
file.

Signed-off-by: Cal Peake <cp@absolutedigital.net>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
---
 kernel/sys.c | 80 +++++++++++++++++++++++-------------------------------------
 1 file changed, 30 insertions(+), 50 deletions(-)

(limited to 'kernel')

diff --git a/kernel/sys.c b/kernel/sys.c
index b88806c66244..2460581c928c 100644
--- a/kernel/sys.c
+++ b/kernel/sys.c
@@ -607,11 +607,10 @@ static void kernel_restart_prepare(char *cmd)
 void kernel_restart(char *cmd)
 {
 	kernel_restart_prepare(cmd);
-	if (!cmd) {
+	if (!cmd)
 		printk(KERN_EMERG "Restarting system.\n");
-	} else {
+	else
 		printk(KERN_EMERG "Restarting system with command '%s'.\n", cmd);
-	}
 	machine_restart(cmd);
 }
 EXPORT_SYMBOL_GPL(kernel_restart);
@@ -627,9 +626,8 @@ static void kernel_kexec(void)
 #ifdef CONFIG_KEXEC
 	struct kimage *image;
 	image = xchg(&kexec_image, NULL);
-	if (!image) {
+	if (!image)
 		return;
-	}
 	kernel_restart_prepare(NULL);
 	printk(KERN_EMERG "Starting new kernel\n");
 	machine_shutdown();
@@ -823,12 +821,10 @@ asmlinkage long sys_setregid(gid_t rgid, gid_t egid)
 		    (current->sgid == egid) ||
 		    capable(CAP_SETGID))
 			new_egid = egid;
-		else {
+		else
 			return -EPERM;
-		}
 	}
-	if (new_egid != old_egid)
-	{
+	if (new_egid != old_egid) {
 		current->mm->dumpable = suid_dumpable;
 		smp_wmb();
 	}
@@ -857,19 +853,14 @@ asmlinkage long sys_setgid(gid_t gid)
 	if (retval)
 		return retval;
 
-	if (capable(CAP_SETGID))
-	{
-		if(old_egid != gid)
-		{
+	if (capable(CAP_SETGID)) {
+		if (old_egid != gid) {
 			current->mm->dumpable = suid_dumpable;
 			smp_wmb();
 		}
 		current->gid = current->egid = current->sgid = current->fsgid = gid;
-	}
-	else if ((gid == current->gid) || (gid == current->sgid))
-	{
-		if(old_egid != gid)
-		{
+	} else if ((gid == current->gid) || (gid == current->sgid)) {
+		if (old_egid != gid) {
 			current->mm->dumpable = suid_dumpable;
 			smp_wmb();
 		}
@@ -900,8 +891,7 @@ static int set_user(uid_t new_ruid, int dumpclear)
 
 	switch_uid(new_user);
 
-	if(dumpclear)
-	{
+	if (dumpclear) {
 		current->mm->dumpable = suid_dumpable;
 		smp_wmb();
 	}
@@ -957,8 +947,7 @@ asmlinkage long sys_setreuid(uid_t ruid, uid_t euid)
 	if (new_ruid != old_ruid && set_user(new_ruid, new_euid != old_euid) < 0)
 		return -EAGAIN;
 
-	if (new_euid != old_euid)
-	{
+	if (new_euid != old_euid) {
 		current->mm->dumpable = suid_dumpable;
 		smp_wmb();
 	}
@@ -1008,8 +997,7 @@ asmlinkage long sys_setuid(uid_t uid)
 	} else if ((uid != current->uid) && (uid != new_suid))
 		return -EPERM;
 
-	if (old_euid != uid)
-	{
+	if (old_euid != uid) {
 		current->mm->dumpable = suid_dumpable;
 		smp_wmb();
 	}
@@ -1054,8 +1042,7 @@ asmlinkage long sys_setresuid(uid_t ruid, uid_t euid, uid_t suid)
 			return -EAGAIN;
 	}
 	if (euid != (uid_t) -1) {
-		if (euid != current->euid)
-		{
+		if (euid != current->euid) {
 			current->mm->dumpable = suid_dumpable;
 			smp_wmb();
 		}
@@ -1105,8 +1092,7 @@ asmlinkage long sys_setresgid(gid_t rgid, gid_t egid, gid_t sgid)
 			return -EPERM;
 	}
 	if (egid != (gid_t) -1) {
-		if (egid != current->egid)
-		{
+		if (egid != current->egid) {
 			current->mm->dumpable = suid_dumpable;
 			smp_wmb();
 		}
@@ -1151,10 +1137,8 @@ asmlinkage long sys_setfsuid(uid_t uid)
 
 	if (uid == current->uid || uid == current->euid ||
 	    uid == current->suid || uid == current->fsuid || 
-	    capable(CAP_SETUID))
-	{
-		if (uid != old_fsuid)
-		{
+	    capable(CAP_SETUID)) {
+		if (uid != old_fsuid) {
 			current->mm->dumpable = suid_dumpable;
 			smp_wmb();
 		}
@@ -1182,10 +1166,8 @@ asmlinkage long sys_setfsgid(gid_t gid)
 
 	if (gid == current->gid || gid == current->egid ||
 	    gid == current->sgid || gid == current->fsgid || 
-	    capable(CAP_SETGID))
-	{
-		if (gid != old_fsgid)
-		{
+	    capable(CAP_SETGID)) {
+		if (gid != old_fsgid) {
 			current->mm->dumpable = suid_dumpable;
 			smp_wmb();
 		}
@@ -1321,9 +1303,9 @@ out:
 
 asmlinkage long sys_getpgid(pid_t pid)
 {
-	if (!pid) {
+	if (!pid)
 		return process_group(current);
-	} else {
+	else {
 		int retval;
 		struct task_struct *p;
 
@@ -1353,9 +1335,9 @@ asmlinkage long sys_getpgrp(void)
 
 asmlinkage long sys_getsid(pid_t pid)
 {
-	if (!pid) {
+	if (!pid)
 		return current->signal->session;
-	} else {
+	else {
 		int retval;
 		struct task_struct *p;
 
@@ -1363,7 +1345,7 @@ asmlinkage long sys_getsid(pid_t pid)
 		p = find_task_by_pid(pid);
 
 		retval = -ESRCH;
-		if(p) {
+		if (p) {
 			retval = security_task_getsid(p);
 			if (!retval)
 				retval = p->signal->session;
@@ -1431,9 +1413,9 @@ struct group_info *groups_alloc(int gidsetsize)
 	group_info->nblocks = nblocks;
 	atomic_set(&group_info->usage, 1);
 
-	if (gidsetsize <= NGROUPS_SMALL) {
+	if (gidsetsize <= NGROUPS_SMALL)
 		group_info->blocks[0] = group_info->small_block;
-	} else {
+	else {
 		for (i = 0; i < nblocks; i++) {
 			gid_t *b;
 			b = (void *)__get_free_page(GFP_USER);
@@ -1489,7 +1471,7 @@ static int groups_to_user(gid_t __user *grouplist,
 /* fill a group_info from a user-space array - it must be allocated already */
 static int groups_from_user(struct group_info *group_info,
     gid_t __user *grouplist)
- {
+{
 	int i;
 	int count = group_info->ngroups;
 
@@ -1647,9 +1629,8 @@ asmlinkage long sys_setgroups(int gidsetsize, gid_t __user *grouplist)
 int in_group_p(gid_t grp)
 {
 	int retval = 1;
-	if (grp != current->fsgid) {
+	if (grp != current->fsgid)
 		retval = groups_search(current->group_info, grp);
-	}
 	return retval;
 }
 
@@ -1658,9 +1639,8 @@ EXPORT_SYMBOL(in_group_p);
 int in_egroup_p(gid_t grp)
 {
 	int retval = 1;
-	if (grp != current->egid) {
+	if (grp != current->egid)
 		retval = groups_search(current->group_info, grp);
-	}
 	return retval;
 }
 
@@ -1775,9 +1755,9 @@ asmlinkage long sys_old_getrlimit(unsigned int resource, struct rlimit __user *r
 	task_lock(current->group_leader);
 	x = current->signal->rlim[resource];
 	task_unlock(current->group_leader);
-	if(x.rlim_cur > 0x7FFFFFFF)
+	if (x.rlim_cur > 0x7FFFFFFF)
 		x.rlim_cur = 0x7FFFFFFF;
-	if(x.rlim_max > 0x7FFFFFFF)
+	if (x.rlim_max > 0x7FFFFFFF)
 		x.rlim_max = 0x7FFFFFFF;
 	return copy_to_user(rlim, &x, sizeof(x))?-EFAULT:0;
 }
-- 
cgit v1.2.3


From ef6edc9746dc2bfdacf44eefd5f881179971c478 Mon Sep 17 00:00:00 2001
From: Martin Schwidefsky <schwidefsky@de.ibm.com>
Date: Sat, 30 Sep 2006 23:27:43 -0700
Subject: [PATCH] Directed yield: cpu_relax variants for spinlocks and rw-locks

On systems running with virtual cpus there is optimization potential in
regard to spinlocks and rw-locks.  If the virtual cpu that has taken a lock
is known to a cpu that wants to acquire the same lock it is beneficial to
yield the timeslice of the virtual cpu in favour of the cpu that has the
lock (directed yield).

With CONFIG_PREEMPT="n" this can be implemented by the architecture without
common code changes.  Powerpc already does this.

With CONFIG_PREEMPT="y" the lock loops are coded with _raw_spin_trylock,
_raw_read_trylock and _raw_write_trylock in kernel/spinlock.c.  If the lock
could not be taken cpu_relax is called.  A directed yield is not possible
because cpu_relax doesn't know anything about the lock.  To be able to
yield the lock in favour of the current lock holder variants of cpu_relax
for spinlocks and rw-locks are needed.  The new _raw_spin_relax,
_raw_read_relax and _raw_write_relax primitives differ from cpu_relax
insofar that they have an argument: a pointer to the lock structure.

Signed-off-by: Martin Schwidefsky <schwidefsky@de.ibm.com>
Cc: Ingo Molnar <mingo@elte.hu>
Cc: Paul Mackerras <paulus@samba.org>
Cc: Haavard Skinnemoen <hskinnemoen@atmel.com>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
---
 include/asm-alpha/spinlock.h         | 4 ++++
 include/asm-arm/spinlock.h           | 4 ++++
 include/asm-cris/arch-v32/spinlock.h | 4 ++++
 include/asm-i386/spinlock.h          | 4 ++++
 include/asm-ia64/spinlock.h          | 4 ++++
 include/asm-m32r/spinlock.h          | 4 ++++
 include/asm-mips/spinlock.h          | 4 ++++
 include/asm-parisc/spinlock.h        | 4 ++++
 include/asm-powerpc/spinlock.h       | 4 ++++
 include/asm-ppc/spinlock.h           | 4 ++++
 include/asm-s390/spinlock.h          | 4 ++++
 include/asm-sh/spinlock.h            | 4 ++++
 include/asm-sparc/spinlock.h         | 4 ++++
 include/asm-sparc64/spinlock.h       | 4 ++++
 include/asm-x86_64/spinlock.h        | 4 ++++
 kernel/spinlock.c                    | 4 ++--
 16 files changed, 62 insertions(+), 2 deletions(-)

(limited to 'kernel')

diff --git a/include/asm-alpha/spinlock.h b/include/asm-alpha/spinlock.h
index 0c294c9b0c55..aeeb125f6851 100644
--- a/include/asm-alpha/spinlock.h
+++ b/include/asm-alpha/spinlock.h
@@ -166,4 +166,8 @@ static inline void __raw_write_unlock(raw_rwlock_t * lock)
 	lock->lock = 0;
 }
 
+#define _raw_spin_relax(lock)	cpu_relax()
+#define _raw_read_relax(lock)	cpu_relax()
+#define _raw_write_relax(lock)	cpu_relax()
+
 #endif /* _ALPHA_SPINLOCK_H */
diff --git a/include/asm-arm/spinlock.h b/include/asm-arm/spinlock.h
index 01b7c26a3038..861092fbaa53 100644
--- a/include/asm-arm/spinlock.h
+++ b/include/asm-arm/spinlock.h
@@ -218,4 +218,8 @@ static inline int __raw_read_trylock(raw_rwlock_t *rw)
 /* read_can_lock - would read_trylock() succeed? */
 #define __raw_read_can_lock(x)		((x)->lock < 0x80000000)
 
+#define _raw_spin_relax(lock)	cpu_relax()
+#define _raw_read_relax(lock)	cpu_relax()
+#define _raw_write_relax(lock)	cpu_relax()
+
 #endif /* __ASM_SPINLOCK_H */
diff --git a/include/asm-cris/arch-v32/spinlock.h b/include/asm-cris/arch-v32/spinlock.h
index 52df72a62232..5f43df0a5fb4 100644
--- a/include/asm-cris/arch-v32/spinlock.h
+++ b/include/asm-cris/arch-v32/spinlock.h
@@ -160,4 +160,8 @@ static __inline__ int is_write_locked(rwlock_t *rw)
 	return rw->counter < 0;
 }
 
+#define _raw_spin_relax(lock)	cpu_relax()
+#define _raw_read_relax(lock)	cpu_relax()
+#define _raw_write_relax(lock)	cpu_relax()
+
 #endif /* __ASM_ARCH_SPINLOCK_H */
diff --git a/include/asm-i386/spinlock.h b/include/asm-i386/spinlock.h
index b0b3043f05e1..c18b71fae6b3 100644
--- a/include/asm-i386/spinlock.h
+++ b/include/asm-i386/spinlock.h
@@ -205,4 +205,8 @@ static inline void __raw_write_unlock(raw_rwlock_t *rw)
 				 : "+m" (rw->lock) : : "memory");
 }
 
+#define _raw_spin_relax(lock)	cpu_relax()
+#define _raw_read_relax(lock)	cpu_relax()
+#define _raw_write_relax(lock)	cpu_relax()
+
 #endif /* __ASM_SPINLOCK_H */
diff --git a/include/asm-ia64/spinlock.h b/include/asm-ia64/spinlock.h
index 9e83210dc312..ff857e31738a 100644
--- a/include/asm-ia64/spinlock.h
+++ b/include/asm-ia64/spinlock.h
@@ -213,4 +213,8 @@ static inline int __raw_read_trylock(raw_rwlock_t *x)
 	return (u32)ia64_cmpxchg4_acq((__u32 *)(x), new.word, old.word) == old.word;
 }
 
+#define _raw_spin_relax(lock)	cpu_relax()
+#define _raw_read_relax(lock)	cpu_relax()
+#define _raw_write_relax(lock)	cpu_relax()
+
 #endif /*  _ASM_IA64_SPINLOCK_H */
diff --git a/include/asm-m32r/spinlock.h b/include/asm-m32r/spinlock.h
index f9f90727a4a1..f5cfba81ee10 100644
--- a/include/asm-m32r/spinlock.h
+++ b/include/asm-m32r/spinlock.h
@@ -316,4 +316,8 @@ static inline int __raw_write_trylock(raw_rwlock_t *lock)
 	return 0;
 }
 
+#define _raw_spin_relax(lock)	cpu_relax()
+#define _raw_read_relax(lock)	cpu_relax()
+#define _raw_write_relax(lock)	cpu_relax()
+
 #endif	/* _ASM_M32R_SPINLOCK_H */
diff --git a/include/asm-mips/spinlock.h b/include/asm-mips/spinlock.h
index 4c1a1b53aeaf..c8d5587467bb 100644
--- a/include/asm-mips/spinlock.h
+++ b/include/asm-mips/spinlock.h
@@ -328,4 +328,8 @@ static inline int __raw_write_trylock(raw_rwlock_t *rw)
 }
 
 
+#define _raw_spin_relax(lock)	cpu_relax()
+#define _raw_read_relax(lock)	cpu_relax()
+#define _raw_write_relax(lock)	cpu_relax()
+
 #endif /* _ASM_SPINLOCK_H */
diff --git a/include/asm-parisc/spinlock.h b/include/asm-parisc/spinlock.h
index a93960e232cf..e1825530365d 100644
--- a/include/asm-parisc/spinlock.h
+++ b/include/asm-parisc/spinlock.h
@@ -152,4 +152,8 @@ static __inline__ int __raw_write_can_lock(raw_rwlock_t *rw)
 	return !rw->counter;
 }
 
+#define _raw_spin_relax(lock)	cpu_relax()
+#define _raw_read_relax(lock)	cpu_relax()
+#define _raw_write_relax(lock)	cpu_relax()
+
 #endif /* __ASM_SPINLOCK_H */
diff --git a/include/asm-powerpc/spinlock.h b/include/asm-powerpc/spinlock.h
index c31e4382a775..eaccac8327f2 100644
--- a/include/asm-powerpc/spinlock.h
+++ b/include/asm-powerpc/spinlock.h
@@ -285,5 +285,9 @@ static __inline__ void __raw_write_unlock(raw_rwlock_t *rw)
 	rw->lock = 0;
 }
 
+#define _raw_spin_relax(lock)	cpu_relax()
+#define _raw_read_relax(lock)	cpu_relax()
+#define _raw_write_relax(lock)	cpu_relax()
+
 #endif /* __KERNEL__ */
 #endif /* __ASM_SPINLOCK_H */
diff --git a/include/asm-ppc/spinlock.h b/include/asm-ppc/spinlock.h
index 5c64b75f0295..fccaf5531e57 100644
--- a/include/asm-ppc/spinlock.h
+++ b/include/asm-ppc/spinlock.h
@@ -161,4 +161,8 @@ static __inline__ void __raw_write_unlock(raw_rwlock_t *rw)
 	rw->lock = 0;
 }
 
+#define _raw_spin_relax(lock)	cpu_relax()
+#define _raw_read_relax(lock)	cpu_relax()
+#define _raw_write_relax(lock)	cpu_relax()
+
 #endif /* __ASM_SPINLOCK_H */
diff --git a/include/asm-s390/spinlock.h b/include/asm-s390/spinlock.h
index ce3edf6d63b3..5f00feaf1be6 100644
--- a/include/asm-s390/spinlock.h
+++ b/include/asm-s390/spinlock.h
@@ -154,4 +154,8 @@ static inline int __raw_write_trylock(raw_rwlock_t *rw)
 	return _raw_write_trylock_retry(rw);
 }
 
+#define _raw_spin_relax(lock)	cpu_relax()
+#define _raw_read_relax(lock)	cpu_relax()
+#define _raw_write_relax(lock)	cpu_relax()
+
 #endif /* __ASM_SPINLOCK_H */
diff --git a/include/asm-sh/spinlock.h b/include/asm-sh/spinlock.h
index 846322d4c35d..54458fd24981 100644
--- a/include/asm-sh/spinlock.h
+++ b/include/asm-sh/spinlock.h
@@ -100,4 +100,8 @@ static inline int __raw_write_trylock(raw_rwlock_t *rw)
 	return 0;
 }
 
+#define _raw_spin_relax(lock)	cpu_relax()
+#define _raw_read_relax(lock)	cpu_relax()
+#define _raw_write_relax(lock)	cpu_relax()
+
 #endif /* __ASM_SH_SPINLOCK_H */
diff --git a/include/asm-sparc/spinlock.h b/include/asm-sparc/spinlock.h
index 1c75474ba1df..557d08959d2f 100644
--- a/include/asm-sparc/spinlock.h
+++ b/include/asm-sparc/spinlock.h
@@ -154,6 +154,10 @@ static inline int __raw_write_trylock(raw_rwlock_t *rw)
 #define __raw_spin_lock_flags(lock, flags) __raw_spin_lock(lock)
 #define __raw_read_trylock(lock) generic__raw_read_trylock(lock)
 
+#define _raw_spin_relax(lock)	cpu_relax()
+#define _raw_read_relax(lock)	cpu_relax()
+#define _raw_write_relax(lock)	cpu_relax()
+
 #define __raw_read_can_lock(rw) (!((rw)->lock & 0xff))
 #define __raw_write_can_lock(rw) (!(rw)->lock)
 
diff --git a/include/asm-sparc64/spinlock.h b/include/asm-sparc64/spinlock.h
index bd5ffc76bc7e..0006fe9f8c7a 100644
--- a/include/asm-sparc64/spinlock.h
+++ b/include/asm-sparc64/spinlock.h
@@ -241,6 +241,10 @@ static int inline __write_trylock(raw_rwlock_t *lock)
 #define __raw_read_can_lock(rw)		(!((rw)->lock & 0x80000000UL))
 #define __raw_write_can_lock(rw)	(!(rw)->lock)
 
+#define _raw_spin_relax(lock)	cpu_relax()
+#define _raw_read_relax(lock)	cpu_relax()
+#define _raw_write_relax(lock)	cpu_relax()
+
 #endif /* !(__ASSEMBLY__) */
 
 #endif /* !(__SPARC64_SPINLOCK_H) */
diff --git a/include/asm-x86_64/spinlock.h b/include/asm-x86_64/spinlock.h
index 3daf5b005905..05ef097ba55b 100644
--- a/include/asm-x86_64/spinlock.h
+++ b/include/asm-x86_64/spinlock.h
@@ -133,4 +133,8 @@ static inline void __raw_write_unlock(raw_rwlock_t *rw)
 				: "=m" (rw->lock) : : "memory");
 }
 
+#define _raw_spin_relax(lock)	cpu_relax()
+#define _raw_read_relax(lock)	cpu_relax()
+#define _raw_write_relax(lock)	cpu_relax()
+
 #endif /* __ASM_SPINLOCK_H */
diff --git a/kernel/spinlock.c b/kernel/spinlock.c
index d48143eafbfd..476c3741511b 100644
--- a/kernel/spinlock.c
+++ b/kernel/spinlock.c
@@ -215,7 +215,7 @@ void __lockfunc _##op##_lock(locktype##_t *lock)			\
 		if (!(lock)->break_lock)				\
 			(lock)->break_lock = 1;				\
 		while (!op##_can_lock(lock) && (lock)->break_lock)	\
-			cpu_relax();					\
+			_raw_##op##_relax(&lock->raw_lock);		\
 	}								\
 	(lock)->break_lock = 0;						\
 }									\
@@ -237,7 +237,7 @@ unsigned long __lockfunc _##op##_lock_irqsave(locktype##_t *lock)	\
 		if (!(lock)->break_lock)				\
 			(lock)->break_lock = 1;				\
 		while (!op##_can_lock(lock) && (lock)->break_lock)	\
-			cpu_relax();					\
+			_raw_##op##_relax(&lock->raw_lock);		\
 	}								\
 	(lock)->break_lock = 0;						\
 	return flags;							\
-- 
cgit v1.2.3


From 4c7ee8de956fc250fe31e2fa91f6da980fabe317 Mon Sep 17 00:00:00 2001
From: john stultz <johnstul@us.ibm.com>
Date: Sat, 30 Sep 2006 23:28:22 -0700
Subject: [PATCH] NTP: Move all the NTP related code to ntp.c

Move all the NTP related code to ntp.c

[akpm@osdl.org: cleanups, build fix]
Signed-off-by: John Stultz <johnstul@us.ibm.com>
Cc: Ingo Molnar <mingo@elte.hu>
Cc: Roman Zippel <zippel@linux-m68k.org>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
---
 include/linux/timex.h |  10 +-
 kernel/time.c         | 173 ----------------------
 kernel/time/Makefile  |   2 +-
 kernel/time/ntp.c     | 389 ++++++++++++++++++++++++++++++++++++++++++++++++++
 kernel/timer.c        | 211 +--------------------------
 5 files changed, 399 insertions(+), 386 deletions(-)
 create mode 100644 kernel/time/ntp.c

(limited to 'kernel')

diff --git a/include/linux/timex.h b/include/linux/timex.h
index d543d3871e38..2a21485bf183 100644
--- a/include/linux/timex.h
+++ b/include/linux/timex.h
@@ -294,11 +294,15 @@ extern void register_time_interpolator(struct time_interpolator *);
 extern void unregister_time_interpolator(struct time_interpolator *);
 extern void time_interpolator_reset(void);
 extern unsigned long time_interpolator_get_offset(void);
+extern void time_interpolator_update(long delta_nsec);
 
 #else /* !CONFIG_TIME_INTERPOLATION */
 
-static inline void
-time_interpolator_reset(void)
+static inline void time_interpolator_reset(void)
+{
+}
+
+static inline void time_interpolator_update(long delta_nsec)
 {
 }
 
@@ -309,6 +313,8 @@ time_interpolator_reset(void)
 /* Returns how long ticks are at present, in ns / 2^(SHIFT_SCALE-10). */
 extern u64 current_tick_length(void);
 
+extern void second_overflow(void);
+extern void update_ntp_one_tick(void);
 extern int do_adjtimex(struct timex *);
 
 #endif /* KERNEL */
diff --git a/kernel/time.c b/kernel/time.c
index 5bd489747643..0e017bff4c19 100644
--- a/kernel/time.c
+++ b/kernel/time.c
@@ -202,179 +202,6 @@ asmlinkage long sys_settimeofday(struct timeval __user *tv,
 	return do_sys_settimeofday(tv ? &new_ts : NULL, tz ? &new_tz : NULL);
 }
 
-/* we call this to notify the arch when the clock is being
- * controlled.  If no such arch routine, do nothing.
- */
-void __attribute__ ((weak)) notify_arch_cmos_timer(void)
-{
-	return;
-}
-
-/* adjtimex mainly allows reading (and writing, if superuser) of
- * kernel time-keeping variables. used by xntpd.
- */
-int do_adjtimex(struct timex *txc)
-{
-        long ltemp, mtemp, save_adjust;
-	int result;
-
-	/* In order to modify anything, you gotta be super-user! */
-	if (txc->modes && !capable(CAP_SYS_TIME))
-		return -EPERM;
-		
-	/* Now we validate the data before disabling interrupts */
-
-	if ((txc->modes & ADJ_OFFSET_SINGLESHOT) == ADJ_OFFSET_SINGLESHOT)
-	  /* singleshot must not be used with any other mode bits */
-		if (txc->modes != ADJ_OFFSET_SINGLESHOT)
-			return -EINVAL;
-
-	if (txc->modes != ADJ_OFFSET_SINGLESHOT && (txc->modes & ADJ_OFFSET))
-	  /* adjustment Offset limited to +- .512 seconds */
-		if (txc->offset <= - MAXPHASE || txc->offset >= MAXPHASE )
-			return -EINVAL;	
-
-	/* if the quartz is off by more than 10% something is VERY wrong ! */
-	if (txc->modes & ADJ_TICK)
-		if (txc->tick <  900000/USER_HZ ||
-		    txc->tick > 1100000/USER_HZ)
-			return -EINVAL;
-
-	write_seqlock_irq(&xtime_lock);
-	result = time_state;	/* mostly `TIME_OK' */
-
-	/* Save for later - semantics of adjtime is to return old value */
-	save_adjust = time_next_adjust ? time_next_adjust : time_adjust;
-
-#if 0	/* STA_CLOCKERR is never set yet */
-	time_status &= ~STA_CLOCKERR;		/* reset STA_CLOCKERR */
-#endif
-	/* If there are input parameters, then process them */
-	if (txc->modes)
-	{
-	    if (txc->modes & ADJ_STATUS)	/* only set allowed bits */
-		time_status =  (txc->status & ~STA_RONLY) |
-			      (time_status & STA_RONLY);
-
-	    if (txc->modes & ADJ_FREQUENCY) {	/* p. 22 */
-		if (txc->freq > MAXFREQ || txc->freq < -MAXFREQ) {
-		    result = -EINVAL;
-		    goto leave;
-		}
-		time_freq = txc->freq;
-	    }
-
-	    if (txc->modes & ADJ_MAXERROR) {
-		if (txc->maxerror < 0 || txc->maxerror >= NTP_PHASE_LIMIT) {
-		    result = -EINVAL;
-		    goto leave;
-		}
-		time_maxerror = txc->maxerror;
-	    }
-
-	    if (txc->modes & ADJ_ESTERROR) {
-		if (txc->esterror < 0 || txc->esterror >= NTP_PHASE_LIMIT) {
-		    result = -EINVAL;
-		    goto leave;
-		}
-		time_esterror = txc->esterror;
-	    }
-
-	    if (txc->modes & ADJ_TIMECONST) {	/* p. 24 */
-		if (txc->constant < 0) {	/* NTP v4 uses values > 6 */
-		    result = -EINVAL;
-		    goto leave;
-		}
-		time_constant = txc->constant;
-	    }
-
-	    if (txc->modes & ADJ_OFFSET) {	/* values checked earlier */
-		if (txc->modes == ADJ_OFFSET_SINGLESHOT) {
-		    /* adjtime() is independent from ntp_adjtime() */
-		    if ((time_next_adjust = txc->offset) == 0)
-			 time_adjust = 0;
-		}
-		else if (time_status & STA_PLL) {
-		    ltemp = txc->offset;
-
-		    /*
-		     * Scale the phase adjustment and
-		     * clamp to the operating range.
-		     */
-		    if (ltemp > MAXPHASE)
-		        time_offset = MAXPHASE << SHIFT_UPDATE;
-		    else if (ltemp < -MAXPHASE)
-			time_offset = -(MAXPHASE << SHIFT_UPDATE);
-		    else
-		        time_offset = ltemp << SHIFT_UPDATE;
-
-		    /*
-		     * Select whether the frequency is to be controlled
-		     * and in which mode (PLL or FLL). Clamp to the operating
-		     * range. Ugly multiply/divide should be replaced someday.
-		     */
-
-		    if (time_status & STA_FREQHOLD || time_reftime == 0)
-		        time_reftime = xtime.tv_sec;
-		    mtemp = xtime.tv_sec - time_reftime;
-		    time_reftime = xtime.tv_sec;
-		    if (time_status & STA_FLL) {
-		        if (mtemp >= MINSEC) {
-			    ltemp = (time_offset / mtemp) << (SHIFT_USEC -
-							      SHIFT_UPDATE);
-			    time_freq += shift_right(ltemp, SHIFT_KH);
-			} else /* calibration interval too short (p. 12) */
-				result = TIME_ERROR;
-		    } else {	/* PLL mode */
-		        if (mtemp < MAXSEC) {
-			    ltemp *= mtemp;
-			    time_freq += shift_right(ltemp,(time_constant +
-						       time_constant +
-						       SHIFT_KF - SHIFT_USEC));
-			} else /* calibration interval too long (p. 12) */
-				result = TIME_ERROR;
-		    }
-		    time_freq = min(time_freq, time_tolerance);
-		    time_freq = max(time_freq, -time_tolerance);
-		} /* STA_PLL */
-	    } /* txc->modes & ADJ_OFFSET */
-	    if (txc->modes & ADJ_TICK) {
-		tick_usec = txc->tick;
-		tick_nsec = TICK_USEC_TO_NSEC(tick_usec);
-	    }
-	} /* txc->modes */
-leave:	if ((time_status & (STA_UNSYNC|STA_CLOCKERR)) != 0)
-		result = TIME_ERROR;
-	
-	if ((txc->modes & ADJ_OFFSET_SINGLESHOT) == ADJ_OFFSET_SINGLESHOT)
-	    txc->offset	   = save_adjust;
-	else {
-	    txc->offset = shift_right(time_offset, SHIFT_UPDATE);
-	}
-	txc->freq	   = time_freq;
-	txc->maxerror	   = time_maxerror;
-	txc->esterror	   = time_esterror;
-	txc->status	   = time_status;
-	txc->constant	   = time_constant;
-	txc->precision	   = time_precision;
-	txc->tolerance	   = time_tolerance;
-	txc->tick	   = tick_usec;
-
-	/* PPS is not implemented, so these are zero */
-	txc->ppsfreq	   = 0;
-	txc->jitter	   = 0;
-	txc->shift	   = 0;
-	txc->stabil	   = 0;
-	txc->jitcnt	   = 0;
-	txc->calcnt	   = 0;
-	txc->errcnt	   = 0;
-	txc->stbcnt	   = 0;
-	write_sequnlock_irq(&xtime_lock);
-	do_gettimeofday(&txc->time);
-	notify_arch_cmos_timer();
-	return(result);
-}
-
 asmlinkage long sys_adjtimex(struct timex __user *txc_p)
 {
 	struct timex txc;		/* Local copy of parameter */
diff --git a/kernel/time/Makefile b/kernel/time/Makefile
index e1dfd8e86cce..61a3907d16fb 100644
--- a/kernel/time/Makefile
+++ b/kernel/time/Makefile
@@ -1 +1 @@
-obj-y += clocksource.o jiffies.o
+obj-y += ntp.o clocksource.o jiffies.o
diff --git a/kernel/time/ntp.c b/kernel/time/ntp.c
new file mode 100644
index 000000000000..8ccce15b4b23
--- /dev/null
+++ b/kernel/time/ntp.c
@@ -0,0 +1,389 @@
+/*
+ * linux/kernel/time/ntp.c
+ *
+ * NTP state machine interfaces and logic.
+ *
+ * This code was mainly moved from kernel/timer.c and kernel/time.c
+ * Please see those files for relevant copyright info and historical
+ * changelogs.
+ */
+
+#include <linux/mm.h>
+#include <linux/time.h>
+#include <linux/timex.h>
+
+#include <asm/div64.h>
+#include <asm/timex.h>
+
+/* Don't completely fail for HZ > 500.  */
+int tickadj = 500/HZ ? : 1;		/* microsecs */
+
+/*
+ * phase-lock loop variables
+ */
+/* TIME_ERROR prevents overwriting the CMOS clock */
+int time_state = TIME_OK;		/* clock synchronization status	*/
+int time_status = STA_UNSYNC;		/* clock status bits		*/
+long time_offset;			/* time adjustment (us)		*/
+long time_constant = 2;			/* pll time constant		*/
+long time_tolerance = MAXFREQ;		/* frequency tolerance (ppm)	*/
+long time_precision = 1;		/* clock precision (us)		*/
+long time_maxerror = NTP_PHASE_LIMIT;	/* maximum error (us)		*/
+long time_esterror = NTP_PHASE_LIMIT;	/* estimated error (us)		*/
+long time_freq = (((NSEC_PER_SEC + HZ/2) % HZ - HZ/2) << SHIFT_USEC) / NSEC_PER_USEC;
+					/* frequency offset (scaled ppm)*/
+static long time_adj;			/* tick adjust (scaled 1 / HZ)	*/
+long time_reftime;			/* time at last adjustment (s)	*/
+long time_adjust;
+long time_next_adjust;
+
+/*
+ * this routine handles the overflow of the microsecond field
+ *
+ * The tricky bits of code to handle the accurate clock support
+ * were provided by Dave Mills (Mills@UDEL.EDU) of NTP fame.
+ * They were originally developed for SUN and DEC kernels.
+ * All the kudos should go to Dave for this stuff.
+ */
+void second_overflow(void)
+{
+	long ltemp;
+
+	/* Bump the maxerror field */
+	time_maxerror += time_tolerance >> SHIFT_USEC;
+	if (time_maxerror > NTP_PHASE_LIMIT) {
+		time_maxerror = NTP_PHASE_LIMIT;
+		time_status |= STA_UNSYNC;
+	}
+
+	/*
+	 * Leap second processing. If in leap-insert state at the end of the
+	 * day, the system clock is set back one second; if in leap-delete
+	 * state, the system clock is set ahead one second. The microtime()
+	 * routine or external clock driver will insure that reported time is
+	 * always monotonic. The ugly divides should be replaced.
+	 */
+	switch (time_state) {
+	case TIME_OK:
+		if (time_status & STA_INS)
+			time_state = TIME_INS;
+		else if (time_status & STA_DEL)
+			time_state = TIME_DEL;
+		break;
+	case TIME_INS:
+		if (xtime.tv_sec % 86400 == 0) {
+			xtime.tv_sec--;
+			wall_to_monotonic.tv_sec++;
+			/*
+			 * The timer interpolator will make time change
+			 * gradually instead of an immediate jump by one second
+			 */
+			time_interpolator_update(-NSEC_PER_SEC);
+			time_state = TIME_OOP;
+			clock_was_set();
+			printk(KERN_NOTICE "Clock: inserting leap second "
+					"23:59:60 UTC\n");
+		}
+		break;
+	case TIME_DEL:
+		if ((xtime.tv_sec + 1) % 86400 == 0) {
+			xtime.tv_sec++;
+			wall_to_monotonic.tv_sec--;
+			/*
+			 * Use of time interpolator for a gradual change of
+			 * time
+			 */
+			time_interpolator_update(NSEC_PER_SEC);
+			time_state = TIME_WAIT;
+			clock_was_set();
+			printk(KERN_NOTICE "Clock: deleting leap second "
+					"23:59:59 UTC\n");
+		}
+		break;
+	case TIME_OOP:
+		time_state = TIME_WAIT;
+		break;
+	case TIME_WAIT:
+		if (!(time_status & (STA_INS | STA_DEL)))
+		time_state = TIME_OK;
+	}
+
+	/*
+	 * Compute the phase adjustment for the next second. In PLL mode, the
+	 * offset is reduced by a fixed factor times the time constant. In FLL
+	 * mode the offset is used directly. In either mode, the maximum phase
+	 * adjustment for each second is clamped so as to spread the adjustment
+	 * over not more than the number of seconds between updates.
+	 */
+	ltemp = time_offset;
+	if (!(time_status & STA_FLL))
+		ltemp = shift_right(ltemp, SHIFT_KG + time_constant);
+	ltemp = min(ltemp, (MAXPHASE / MINSEC) << SHIFT_UPDATE);
+	ltemp = max(ltemp, -(MAXPHASE / MINSEC) << SHIFT_UPDATE);
+	time_offset -= ltemp;
+	time_adj = ltemp << (SHIFT_SCALE - SHIFT_HZ - SHIFT_UPDATE);
+
+	/*
+	 * Compute the frequency estimate and additional phase adjustment due
+	 * to frequency error for the next second.
+	 */
+	ltemp = time_freq;
+	time_adj += shift_right(ltemp,(SHIFT_USEC + SHIFT_HZ - SHIFT_SCALE));
+
+#if HZ == 100
+	/*
+	 * Compensate for (HZ==100) != (1 << SHIFT_HZ).  Add 25% and 3.125% to
+	 * get 128.125; => only 0.125% error (p. 14)
+	 */
+	time_adj += shift_right(time_adj, 2) + shift_right(time_adj, 5);
+#endif
+#if HZ == 250
+	/*
+	 * Compensate for (HZ==250) != (1 << SHIFT_HZ).  Add 1.5625% and
+	 * 0.78125% to get 255.85938; => only 0.05% error (p. 14)
+	 */
+	time_adj += shift_right(time_adj, 6) + shift_right(time_adj, 7);
+#endif
+#if HZ == 1000
+	/*
+	 * Compensate for (HZ==1000) != (1 << SHIFT_HZ).  Add 1.5625% and
+	 * 0.78125% to get 1023.4375; => only 0.05% error (p. 14)
+	 */
+	time_adj += shift_right(time_adj, 6) + shift_right(time_adj, 7);
+#endif
+}
+
+/*
+ * Returns how many microseconds we need to add to xtime this tick
+ * in doing an adjustment requested with adjtime.
+ */
+static long adjtime_adjustment(void)
+{
+	long time_adjust_step;
+
+	time_adjust_step = time_adjust;
+	if (time_adjust_step) {
+		/*
+		 * We are doing an adjtime thing.  Prepare time_adjust_step to
+		 * be within bounds.  Note that a positive time_adjust means we
+		 * want the clock to run faster.
+		 *
+		 * Limit the amount of the step to be in the range
+		 * -tickadj .. +tickadj
+		 */
+		time_adjust_step = min(time_adjust_step, (long)tickadj);
+		time_adjust_step = max(time_adjust_step, (long)-tickadj);
+	}
+	return time_adjust_step;
+}
+
+/* in the NTP reference this is called "hardclock()" */
+void update_ntp_one_tick(void)
+{
+	long time_adjust_step;
+
+	time_adjust_step = adjtime_adjustment();
+	if (time_adjust_step)
+		/* Reduce by this step the amount of time left  */
+		time_adjust -= time_adjust_step;
+
+	/* Changes by adjtime() do not take effect till next tick. */
+	if (time_next_adjust != 0) {
+		time_adjust = time_next_adjust;
+		time_next_adjust = 0;
+	}
+}
+
+/*
+ * Return how long ticks are at the moment, that is, how much time
+ * update_wall_time_one_tick will add to xtime next time we call it
+ * (assuming no calls to do_adjtimex in the meantime).
+ * The return value is in fixed-point nanoseconds shifted by the
+ * specified number of bits to the right of the binary point.
+ * This function has no side-effects.
+ */
+u64 current_tick_length(void)
+{
+	long delta_nsec;
+	u64 ret;
+
+	/* calculate the finest interval NTP will allow.
+	 *    ie: nanosecond value shifted by (SHIFT_SCALE - 10)
+	 */
+	delta_nsec = tick_nsec + adjtime_adjustment() * 1000;
+	ret = (u64)delta_nsec << TICK_LENGTH_SHIFT;
+	ret += (s64)time_adj << (TICK_LENGTH_SHIFT - (SHIFT_SCALE - 10));
+
+	return ret;
+}
+
+
+void __attribute__ ((weak)) notify_arch_cmos_timer(void)
+{
+	return;
+}
+
+/* adjtimex mainly allows reading (and writing, if superuser) of
+ * kernel time-keeping variables. used by xntpd.
+ */
+int do_adjtimex(struct timex *txc)
+{
+	long ltemp, mtemp, save_adjust;
+	int result;
+
+	/* In order to modify anything, you gotta be super-user! */
+	if (txc->modes && !capable(CAP_SYS_TIME))
+		return -EPERM;
+
+	/* Now we validate the data before disabling interrupts */
+
+	if ((txc->modes & ADJ_OFFSET_SINGLESHOT) == ADJ_OFFSET_SINGLESHOT)
+	  /* singleshot must not be used with any other mode bits */
+		if (txc->modes != ADJ_OFFSET_SINGLESHOT)
+			return -EINVAL;
+
+	if (txc->modes != ADJ_OFFSET_SINGLESHOT && (txc->modes & ADJ_OFFSET))
+	  /* adjustment Offset limited to +- .512 seconds */
+		if (txc->offset <= - MAXPHASE || txc->offset >= MAXPHASE )
+			return -EINVAL;
+
+	/* if the quartz is off by more than 10% something is VERY wrong ! */
+	if (txc->modes & ADJ_TICK)
+		if (txc->tick <  900000/USER_HZ ||
+		    txc->tick > 1100000/USER_HZ)
+			return -EINVAL;
+
+	write_seqlock_irq(&xtime_lock);
+	result = time_state;	/* mostly `TIME_OK' */
+
+	/* Save for later - semantics of adjtime is to return old value */
+	save_adjust = time_next_adjust ? time_next_adjust : time_adjust;
+
+#if 0	/* STA_CLOCKERR is never set yet */
+	time_status &= ~STA_CLOCKERR;		/* reset STA_CLOCKERR */
+#endif
+	/* If there are input parameters, then process them */
+	if (txc->modes)
+	{
+	    if (txc->modes & ADJ_STATUS)	/* only set allowed bits */
+		time_status =  (txc->status & ~STA_RONLY) |
+			      (time_status & STA_RONLY);
+
+	    if (txc->modes & ADJ_FREQUENCY) {	/* p. 22 */
+		if (txc->freq > MAXFREQ || txc->freq < -MAXFREQ) {
+		    result = -EINVAL;
+		    goto leave;
+		}
+		time_freq = txc->freq;
+	    }
+
+	    if (txc->modes & ADJ_MAXERROR) {
+		if (txc->maxerror < 0 || txc->maxerror >= NTP_PHASE_LIMIT) {
+		    result = -EINVAL;
+		    goto leave;
+		}
+		time_maxerror = txc->maxerror;
+	    }
+
+	    if (txc->modes & ADJ_ESTERROR) {
+		if (txc->esterror < 0 || txc->esterror >= NTP_PHASE_LIMIT) {
+		    result = -EINVAL;
+		    goto leave;
+		}
+		time_esterror = txc->esterror;
+	    }
+
+	    if (txc->modes & ADJ_TIMECONST) {	/* p. 24 */
+		if (txc->constant < 0) {	/* NTP v4 uses values > 6 */
+		    result = -EINVAL;
+		    goto leave;
+		}
+		time_constant = txc->constant;
+	    }
+
+	    if (txc->modes & ADJ_OFFSET) {	/* values checked earlier */
+		if (txc->modes == ADJ_OFFSET_SINGLESHOT) {
+		    /* adjtime() is independent from ntp_adjtime() */
+		    if ((time_next_adjust = txc->offset) == 0)
+			 time_adjust = 0;
+		}
+		else if (time_status & STA_PLL) {
+		    ltemp = txc->offset;
+
+		    /*
+		     * Scale the phase adjustment and
+		     * clamp to the operating range.
+		     */
+		    if (ltemp > MAXPHASE)
+		        time_offset = MAXPHASE << SHIFT_UPDATE;
+		    else if (ltemp < -MAXPHASE)
+			time_offset = -(MAXPHASE << SHIFT_UPDATE);
+		    else
+		        time_offset = ltemp << SHIFT_UPDATE;
+
+		    /*
+		     * Select whether the frequency is to be controlled
+		     * and in which mode (PLL or FLL). Clamp to the operating
+		     * range. Ugly multiply/divide should be replaced someday.
+		     */
+
+		    if (time_status & STA_FREQHOLD || time_reftime == 0)
+		        time_reftime = xtime.tv_sec;
+		    mtemp = xtime.tv_sec - time_reftime;
+		    time_reftime = xtime.tv_sec;
+		    if (time_status & STA_FLL) {
+		        if (mtemp >= MINSEC) {
+			    ltemp = (time_offset / mtemp) << (SHIFT_USEC -
+							      SHIFT_UPDATE);
+			    time_freq += shift_right(ltemp, SHIFT_KH);
+			} else /* calibration interval too short (p. 12) */
+				result = TIME_ERROR;
+		    } else {	/* PLL mode */
+		        if (mtemp < MAXSEC) {
+			    ltemp *= mtemp;
+			    time_freq += shift_right(ltemp,(time_constant +
+						       time_constant +
+						       SHIFT_KF - SHIFT_USEC));
+			} else /* calibration interval too long (p. 12) */
+				result = TIME_ERROR;
+		    }
+		    time_freq = min(time_freq, time_tolerance);
+		    time_freq = max(time_freq, -time_tolerance);
+		} /* STA_PLL */
+	    } /* txc->modes & ADJ_OFFSET */
+	    if (txc->modes & ADJ_TICK) {
+		tick_usec = txc->tick;
+		tick_nsec = TICK_USEC_TO_NSEC(tick_usec);
+	    }
+	} /* txc->modes */
+leave:	if ((time_status & (STA_UNSYNC|STA_CLOCKERR)) != 0)
+		result = TIME_ERROR;
+
+	if ((txc->modes & ADJ_OFFSET_SINGLESHOT) == ADJ_OFFSET_SINGLESHOT)
+	    txc->offset	   = save_adjust;
+	else {
+	    txc->offset = shift_right(time_offset, SHIFT_UPDATE);
+	}
+	txc->freq	   = time_freq;
+	txc->maxerror	   = time_maxerror;
+	txc->esterror	   = time_esterror;
+	txc->status	   = time_status;
+	txc->constant	   = time_constant;
+	txc->precision	   = time_precision;
+	txc->tolerance	   = time_tolerance;
+	txc->tick	   = tick_usec;
+
+	/* PPS is not implemented, so these are zero */
+	txc->ppsfreq	   = 0;
+	txc->jitter	   = 0;
+	txc->shift	   = 0;
+	txc->stabil	   = 0;
+	txc->jitcnt	   = 0;
+	txc->calcnt	   = 0;
+	txc->errcnt	   = 0;
+	txc->stbcnt	   = 0;
+	write_sequnlock_irq(&xtime_lock);
+	do_gettimeofday(&txc->time);
+	notify_arch_cmos_timer();
+	return(result);
+}
diff --git a/kernel/timer.c b/kernel/timer.c
index 4f55622b0d38..5fccc7cbf3b4 100644
--- a/kernel/timer.c
+++ b/kernel/timer.c
@@ -41,12 +41,6 @@
 #include <asm/timex.h>
 #include <asm/io.h>
 
-#ifdef CONFIG_TIME_INTERPOLATION
-static void time_interpolator_update(long delta_nsec);
-#else
-#define time_interpolator_update(x)
-#endif
-
 u64 jiffies_64 __cacheline_aligned_in_smp = INITIAL_JIFFIES;
 
 EXPORT_SYMBOL(jiffies_64);
@@ -587,209 +581,6 @@ struct timespec wall_to_monotonic __attribute__ ((aligned (16)));
 
 EXPORT_SYMBOL(xtime);
 
-/* Don't completely fail for HZ > 500.  */
-int tickadj = 500/HZ ? : 1;		/* microsecs */
-
-
-/*
- * phase-lock loop variables
- */
-/* TIME_ERROR prevents overwriting the CMOS clock */
-int time_state = TIME_OK;		/* clock synchronization status	*/
-int time_status = STA_UNSYNC;		/* clock status bits		*/
-long time_offset;			/* time adjustment (us)		*/
-long time_constant = 2;			/* pll time constant		*/
-long time_tolerance = MAXFREQ;		/* frequency tolerance (ppm)	*/
-long time_precision = 1;		/* clock precision (us)		*/
-long time_maxerror = NTP_PHASE_LIMIT;	/* maximum error (us)		*/
-long time_esterror = NTP_PHASE_LIMIT;	/* estimated error (us)		*/
-long time_freq = (((NSEC_PER_SEC + HZ/2) % HZ - HZ/2) << SHIFT_USEC) / NSEC_PER_USEC;
-					/* frequency offset (scaled ppm)*/
-static long time_adj;			/* tick adjust (scaled 1 / HZ)	*/
-long time_reftime;			/* time at last adjustment (s)	*/
-long time_adjust;
-long time_next_adjust;
-
-/*
- * this routine handles the overflow of the microsecond field
- *
- * The tricky bits of code to handle the accurate clock support
- * were provided by Dave Mills (Mills@UDEL.EDU) of NTP fame.
- * They were originally developed for SUN and DEC kernels.
- * All the kudos should go to Dave for this stuff.
- *
- */
-static void second_overflow(void)
-{
-	long ltemp;
-
-	/* Bump the maxerror field */
-	time_maxerror += time_tolerance >> SHIFT_USEC;
-	if (time_maxerror > NTP_PHASE_LIMIT) {
-		time_maxerror = NTP_PHASE_LIMIT;
-		time_status |= STA_UNSYNC;
-	}
-
-	/*
-	 * Leap second processing. If in leap-insert state at the end of the
-	 * day, the system clock is set back one second; if in leap-delete
-	 * state, the system clock is set ahead one second. The microtime()
-	 * routine or external clock driver will insure that reported time is
-	 * always monotonic. The ugly divides should be replaced.
-	 */
-	switch (time_state) {
-	case TIME_OK:
-		if (time_status & STA_INS)
-			time_state = TIME_INS;
-		else if (time_status & STA_DEL)
-			time_state = TIME_DEL;
-		break;
-	case TIME_INS:
-		if (xtime.tv_sec % 86400 == 0) {
-			xtime.tv_sec--;
-			wall_to_monotonic.tv_sec++;
-			/*
-			 * The timer interpolator will make time change
-			 * gradually instead of an immediate jump by one second
-			 */
-			time_interpolator_update(-NSEC_PER_SEC);
-			time_state = TIME_OOP;
-			clock_was_set();
-			printk(KERN_NOTICE "Clock: inserting leap second "
-					"23:59:60 UTC\n");
-		}
-		break;
-	case TIME_DEL:
-		if ((xtime.tv_sec + 1) % 86400 == 0) {
-			xtime.tv_sec++;
-			wall_to_monotonic.tv_sec--;
-			/*
-			 * Use of time interpolator for a gradual change of
-			 * time
-			 */
-			time_interpolator_update(NSEC_PER_SEC);
-			time_state = TIME_WAIT;
-			clock_was_set();
-			printk(KERN_NOTICE "Clock: deleting leap second "
-					"23:59:59 UTC\n");
-		}
-		break;
-	case TIME_OOP:
-		time_state = TIME_WAIT;
-		break;
-	case TIME_WAIT:
-		if (!(time_status & (STA_INS | STA_DEL)))
-		time_state = TIME_OK;
-	}
-
-	/*
-	 * Compute the phase adjustment for the next second. In PLL mode, the
-	 * offset is reduced by a fixed factor times the time constant. In FLL
-	 * mode the offset is used directly. In either mode, the maximum phase
-	 * adjustment for each second is clamped so as to spread the adjustment
-	 * over not more than the number of seconds between updates.
-	 */
-	ltemp = time_offset;
-	if (!(time_status & STA_FLL))
-		ltemp = shift_right(ltemp, SHIFT_KG + time_constant);
-	ltemp = min(ltemp, (MAXPHASE / MINSEC) << SHIFT_UPDATE);
-	ltemp = max(ltemp, -(MAXPHASE / MINSEC) << SHIFT_UPDATE);
-	time_offset -= ltemp;
-	time_adj = ltemp << (SHIFT_SCALE - SHIFT_HZ - SHIFT_UPDATE);
-
-	/*
-	 * Compute the frequency estimate and additional phase adjustment due
-	 * to frequency error for the next second.
-	 */
-	ltemp = time_freq;
-	time_adj += shift_right(ltemp,(SHIFT_USEC + SHIFT_HZ - SHIFT_SCALE));
-
-#if HZ == 100
-	/*
-	 * Compensate for (HZ==100) != (1 << SHIFT_HZ).  Add 25% and 3.125% to
-	 * get 128.125; => only 0.125% error (p. 14)
-	 */
-	time_adj += shift_right(time_adj, 2) + shift_right(time_adj, 5);
-#endif
-#if HZ == 250
-	/*
-	 * Compensate for (HZ==250) != (1 << SHIFT_HZ).  Add 1.5625% and
-	 * 0.78125% to get 255.85938; => only 0.05% error (p. 14)
-	 */
-	time_adj += shift_right(time_adj, 6) + shift_right(time_adj, 7);
-#endif
-#if HZ == 1000
-	/*
-	 * Compensate for (HZ==1000) != (1 << SHIFT_HZ).  Add 1.5625% and
-	 * 0.78125% to get 1023.4375; => only 0.05% error (p. 14)
-	 */
-	time_adj += shift_right(time_adj, 6) + shift_right(time_adj, 7);
-#endif
-}
-
-/*
- * Returns how many microseconds we need to add to xtime this tick
- * in doing an adjustment requested with adjtime.
- */
-static long adjtime_adjustment(void)
-{
-	long time_adjust_step;
-
-	time_adjust_step = time_adjust;
-	if (time_adjust_step) {
-		/*
-		 * We are doing an adjtime thing.  Prepare time_adjust_step to
-		 * be within bounds.  Note that a positive time_adjust means we
-		 * want the clock to run faster.
-		 *
-		 * Limit the amount of the step to be in the range
-		 * -tickadj .. +tickadj
-		 */
-		time_adjust_step = min(time_adjust_step, (long)tickadj);
-		time_adjust_step = max(time_adjust_step, (long)-tickadj);
-	}
-	return time_adjust_step;
-}
-
-/* in the NTP reference this is called "hardclock()" */
-static void update_ntp_one_tick(void)
-{
-	long time_adjust_step;
-
-	time_adjust_step = adjtime_adjustment();
-	if (time_adjust_step)
-		/* Reduce by this step the amount of time left  */
-		time_adjust -= time_adjust_step;
-
-	/* Changes by adjtime() do not take effect till next tick. */
-	if (time_next_adjust != 0) {
-		time_adjust = time_next_adjust;
-		time_next_adjust = 0;
-	}
-}
-
-/*
- * Return how long ticks are at the moment, that is, how much time
- * update_wall_time_one_tick will add to xtime next time we call it
- * (assuming no calls to do_adjtimex in the meantime).
- * The return value is in fixed-point nanoseconds shifted by the
- * specified number of bits to the right of the binary point.
- * This function has no side-effects.
- */
-u64 current_tick_length(void)
-{
-	long delta_nsec;
-	u64 ret;
-
-	/* calculate the finest interval NTP will allow.
-	 *    ie: nanosecond value shifted by (SHIFT_SCALE - 10)
-	 */
-	delta_nsec = tick_nsec + adjtime_adjustment() * 1000;
-	ret = (u64)delta_nsec << TICK_LENGTH_SHIFT;
-	ret += (s64)time_adj << (TICK_LENGTH_SHIFT - (SHIFT_SCALE - 10));
-
-	return ret;
-}
 
 /* XXX - all of this timekeeping code should be later moved to time.c */
 #include <linux/clocksource.h>
@@ -1775,7 +1566,7 @@ unsigned long time_interpolator_get_offset(void)
 #define INTERPOLATOR_ADJUST 65536
 #define INTERPOLATOR_MAX_SKIP 10*INTERPOLATOR_ADJUST
 
-static void time_interpolator_update(long delta_nsec)
+void time_interpolator_update(long delta_nsec)
 {
 	u64 counter;
 	unsigned long offset;
-- 
cgit v1.2.3


From b0ee75561beadc4db4d9a899c8ef4a7db50aa0ab Mon Sep 17 00:00:00 2001
From: Roman Zippel <zippel@linux-m68k.org>
Date: Sat, 30 Sep 2006 23:28:22 -0700
Subject: [PATCH] ntp: add ntp_update_frequency

This introduces ntp_update_frequency() and deinlines ntp_clear() (as it's not
performance critical).  ntp_update_frequency() calculates the base tick length
using tick_usec and adds a base adjustment, in case the frequency doesn't
divide evenly by HZ.

Signed-off-by: Roman Zippel <zippel@linux-m68k.org>
Cc: john stultz <johnstul@us.ibm.com>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
---
 include/linux/timex.h | 14 ++------------
 kernel/time/ntp.c     | 50 ++++++++++++++++++++++++++++++++++++++++++++------
 kernel/timer.c        | 11 ++++-------
 3 files changed, 50 insertions(+), 25 deletions(-)

(limited to 'kernel')

diff --git a/include/linux/timex.h b/include/linux/timex.h
index 2a21485bf183..b589c8218bb9 100644
--- a/include/linux/timex.h
+++ b/include/linux/timex.h
@@ -219,18 +219,8 @@ extern long time_reftime;	/* time at last adjustment (s) */
 extern long time_adjust;	/* The amount of adjtime left */
 extern long time_next_adjust;	/* Value for time_adjust at next tick */
 
-/**
- * ntp_clear - Clears the NTP state variables
- *
- * Must be called while holding a write on the xtime_lock
- */
-static inline void ntp_clear(void)
-{
-	time_adjust = 0;		/* stop active adjtime() */
-	time_status |= STA_UNSYNC;
-	time_maxerror = NTP_PHASE_LIMIT;
-	time_esterror = NTP_PHASE_LIMIT;
-}
+extern void ntp_clear(void);
+extern void ntp_update_frequency(void);
 
 /**
  * ntp_synced - Returns 1 if the NTP status is not UNSYNC
diff --git a/kernel/time/ntp.c b/kernel/time/ntp.c
index 8ccce15b4b23..77137bec2aea 100644
--- a/kernel/time/ntp.c
+++ b/kernel/time/ntp.c
@@ -15,6 +15,13 @@
 #include <asm/div64.h>
 #include <asm/timex.h>
 
+/*
+ * Timekeeping variables
+ */
+unsigned long tick_usec = TICK_USEC; 		/* USER_HZ period (usec) */
+unsigned long tick_nsec;			/* ACTHZ period (nsec) */
+static u64 tick_length, tick_length_base;
+
 /* Don't completely fail for HZ > 500.  */
 int tickadj = 500/HZ ? : 1;		/* microsecs */
 
@@ -37,6 +44,36 @@ long time_reftime;			/* time at last adjustment (s)	*/
 long time_adjust;
 long time_next_adjust;
 
+/**
+ * ntp_clear - Clears the NTP state variables
+ *
+ * Must be called while holding a write on the xtime_lock
+ */
+void ntp_clear(void)
+{
+	time_adjust = 0;		/* stop active adjtime() */
+	time_status |= STA_UNSYNC;
+	time_maxerror = NTP_PHASE_LIMIT;
+	time_esterror = NTP_PHASE_LIMIT;
+
+	ntp_update_frequency();
+
+	tick_length = tick_length_base;
+}
+
+#define CLOCK_TICK_OVERFLOW	(LATCH * HZ - CLOCK_TICK_RATE)
+#define CLOCK_TICK_ADJUST	(((s64)CLOCK_TICK_OVERFLOW * NSEC_PER_SEC) / (s64)CLOCK_TICK_RATE)
+
+void ntp_update_frequency(void)
+{
+	tick_length_base = (u64)(tick_usec * NSEC_PER_USEC * USER_HZ) << TICK_LENGTH_SHIFT;
+	tick_length_base += (s64)CLOCK_TICK_ADJUST << TICK_LENGTH_SHIFT;
+
+	do_div(tick_length_base, HZ);
+
+	tick_nsec = tick_length_base >> TICK_LENGTH_SHIFT;
+}
+
 /*
  * this routine handles the overflow of the microsecond field
  *
@@ -151,6 +188,7 @@ void second_overflow(void)
 	 */
 	time_adj += shift_right(time_adj, 6) + shift_right(time_adj, 7);
 #endif
+	tick_length = tick_length_base;
 }
 
 /*
@@ -204,14 +242,13 @@ void update_ntp_one_tick(void)
  */
 u64 current_tick_length(void)
 {
-	long delta_nsec;
 	u64 ret;
 
 	/* calculate the finest interval NTP will allow.
 	 *    ie: nanosecond value shifted by (SHIFT_SCALE - 10)
 	 */
-	delta_nsec = tick_nsec + adjtime_adjustment() * 1000;
-	ret = (u64)delta_nsec << TICK_LENGTH_SHIFT;
+	ret = tick_length;
+	ret += (u64)(adjtime_adjustment() * 1000) << TICK_LENGTH_SHIFT;
 	ret += (s64)time_adj << (TICK_LENGTH_SHIFT - (SHIFT_SCALE - 10));
 
 	return ret;
@@ -351,10 +388,11 @@ int do_adjtimex(struct timex *txc)
 		    time_freq = max(time_freq, -time_tolerance);
 		} /* STA_PLL */
 	    } /* txc->modes & ADJ_OFFSET */
-	    if (txc->modes & ADJ_TICK) {
+	    if (txc->modes & ADJ_TICK)
 		tick_usec = txc->tick;
-		tick_nsec = TICK_USEC_TO_NSEC(tick_usec);
-	    }
+
+	    if (txc->modes & ADJ_TICK)
+		    ntp_update_frequency();
 	} /* txc->modes */
 leave:	if ((time_status & (STA_UNSYNC|STA_CLOCKERR)) != 0)
 		result = TIME_ERROR;
diff --git a/kernel/timer.c b/kernel/timer.c
index 5fccc7cbf3b4..78d3fa10fcd6 100644
--- a/kernel/timer.c
+++ b/kernel/timer.c
@@ -562,12 +562,6 @@ found:
 
 /******************************************************************/
 
-/*
- * Timekeeping variables
- */
-unsigned long tick_usec = TICK_USEC; 		/* USER_HZ period (usec) */
-unsigned long tick_nsec = TICK_NSEC;		/* ACTHZ period (nsec) */
-
 /* 
  * The current time 
  * wall_to_monotonic is what we need to add to xtime (or xtime corrected 
@@ -757,10 +751,13 @@ void __init timekeeping_init(void)
 	unsigned long flags;
 
 	write_seqlock_irqsave(&xtime_lock, flags);
+
+	ntp_clear();
+
 	clock = clocksource_get_next();
 	clocksource_calculate_interval(clock, tick_nsec);
 	clock->cycle_last = clocksource_read(clock);
-	ntp_clear();
+
 	write_sequnlock_irqrestore(&xtime_lock, flags);
 }
 
-- 
cgit v1.2.3


From ab8783b688f33c40ed7b37b814a4a1e7d341ce11 Mon Sep 17 00:00:00 2001
From: Roman Zippel <zippel@linux-m68k.org>
Date: Sat, 30 Sep 2006 23:28:23 -0700
Subject: [PATCH] ntp: add time_adj to tick length

This makes time_adj local to second_overflow() and integrates it into the tick
length instead of adding it everytime.

Signed-off-by: Roman Zippel <zippel@linux-m68k.org>
Cc: john stultz <johnstul@us.ibm.com>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
---
 kernel/time/ntp.c | 6 ++----
 1 file changed, 2 insertions(+), 4 deletions(-)

(limited to 'kernel')

diff --git a/kernel/time/ntp.c b/kernel/time/ntp.c
index 77137bec2aea..c09628d6b848 100644
--- a/kernel/time/ntp.c
+++ b/kernel/time/ntp.c
@@ -39,7 +39,6 @@ long time_maxerror = NTP_PHASE_LIMIT;	/* maximum error (us)		*/
 long time_esterror = NTP_PHASE_LIMIT;	/* estimated error (us)		*/
 long time_freq = (((NSEC_PER_SEC + HZ/2) % HZ - HZ/2) << SHIFT_USEC) / NSEC_PER_USEC;
 					/* frequency offset (scaled ppm)*/
-static long time_adj;			/* tick adjust (scaled 1 / HZ)	*/
 long time_reftime;			/* time at last adjustment (s)	*/
 long time_adjust;
 long time_next_adjust;
@@ -84,7 +83,7 @@ void ntp_update_frequency(void)
  */
 void second_overflow(void)
 {
-	long ltemp;
+	long ltemp, time_adj;
 
 	/* Bump the maxerror field */
 	time_maxerror += time_tolerance >> SHIFT_USEC;
@@ -189,6 +188,7 @@ void second_overflow(void)
 	time_adj += shift_right(time_adj, 6) + shift_right(time_adj, 7);
 #endif
 	tick_length = tick_length_base;
+	tick_length += (s64)time_adj << (TICK_LENGTH_SHIFT - (SHIFT_SCALE - 10));
 }
 
 /*
@@ -245,11 +245,9 @@ u64 current_tick_length(void)
 	u64 ret;
 
 	/* calculate the finest interval NTP will allow.
-	 *    ie: nanosecond value shifted by (SHIFT_SCALE - 10)
 	 */
 	ret = tick_length;
 	ret += (u64)(adjtime_adjustment() * 1000) << TICK_LENGTH_SHIFT;
-	ret += (s64)time_adj << (TICK_LENGTH_SHIFT - (SHIFT_SCALE - 10));
 
 	return ret;
 }
-- 
cgit v1.2.3


From dc6a43e46f1b6de22701f97bec022e97088cfa90 Mon Sep 17 00:00:00 2001
From: Roman Zippel <zippel@linux-m68k.org>
Date: Sat, 30 Sep 2006 23:28:24 -0700
Subject: [PATCH] ntp: add time_freq to tick length

This adds the frequency part to ntp_update_frequency().

Signed-off-by: Roman Zippel <zippel@linux-m68k.org>
Cc: john stultz <johnstul@us.ibm.com>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
---
 kernel/time/ntp.c | 8 +++-----
 1 file changed, 3 insertions(+), 5 deletions(-)

(limited to 'kernel')

diff --git a/kernel/time/ntp.c b/kernel/time/ntp.c
index c09628d6b848..ab21eb06e09b 100644
--- a/kernel/time/ntp.c
+++ b/kernel/time/ntp.c
@@ -37,8 +37,7 @@ long time_tolerance = MAXFREQ;		/* frequency tolerance (ppm)	*/
 long time_precision = 1;		/* clock precision (us)		*/
 long time_maxerror = NTP_PHASE_LIMIT;	/* maximum error (us)		*/
 long time_esterror = NTP_PHASE_LIMIT;	/* estimated error (us)		*/
-long time_freq = (((NSEC_PER_SEC + HZ/2) % HZ - HZ/2) << SHIFT_USEC) / NSEC_PER_USEC;
-					/* frequency offset (scaled ppm)*/
+long time_freq;				/* frequency offset (scaled ppm)*/
 long time_reftime;			/* time at last adjustment (s)	*/
 long time_adjust;
 long time_next_adjust;
@@ -67,6 +66,7 @@ void ntp_update_frequency(void)
 {
 	tick_length_base = (u64)(tick_usec * NSEC_PER_USEC * USER_HZ) << TICK_LENGTH_SHIFT;
 	tick_length_base += (s64)CLOCK_TICK_ADJUST << TICK_LENGTH_SHIFT;
+	tick_length_base += ((s64)time_freq * NSEC_PER_USEC) << (TICK_LENGTH_SHIFT - SHIFT_USEC);
 
 	do_div(tick_length_base, HZ);
 
@@ -163,8 +163,6 @@ void second_overflow(void)
 	 * Compute the frequency estimate and additional phase adjustment due
 	 * to frequency error for the next second.
 	 */
-	ltemp = time_freq;
-	time_adj += shift_right(ltemp,(SHIFT_USEC + SHIFT_HZ - SHIFT_SCALE));
 
 #if HZ == 100
 	/*
@@ -389,7 +387,7 @@ int do_adjtimex(struct timex *txc)
 	    if (txc->modes & ADJ_TICK)
 		tick_usec = txc->tick;
 
-	    if (txc->modes & ADJ_TICK)
+	    if (txc->modes & (ADJ_TICK|ADJ_FREQUENCY|ADJ_OFFSET))
 		    ntp_update_frequency();
 	} /* txc->modes */
 leave:	if ((time_status & (STA_UNSYNC|STA_CLOCKERR)) != 0)
-- 
cgit v1.2.3


From 3d3675cc3d04d7fd4bb11e8c1ea79e5ade4f5e44 Mon Sep 17 00:00:00 2001
From: Roman Zippel <zippel@linux-m68k.org>
Date: Sat, 30 Sep 2006 23:28:25 -0700
Subject: [PATCH] ntp: prescale time_offset

This converts time_offset into a scaled per tick value.  This avoids now
completely the crude compensation in second_overflow().

Signed-off-by: Roman Zippel <zippel@linux-m68k.org>
Cc: john stultz <johnstul@us.ibm.com>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
---
 include/linux/timex.h |  2 +-
 kernel/time/ntp.c     | 64 +++++++++++++--------------------------------------
 2 files changed, 17 insertions(+), 49 deletions(-)

(limited to 'kernel')

diff --git a/include/linux/timex.h b/include/linux/timex.h
index b589c8218bb9..1cde6f6a2712 100644
--- a/include/linux/timex.h
+++ b/include/linux/timex.h
@@ -89,7 +89,7 @@
  * FINENSEC is 1 ns in SHIFT_UPDATE units of the time_phase variable.
  */
 #define SHIFT_SCALE 22		/* phase scale (shift) */
-#define SHIFT_UPDATE (SHIFT_KG + MAXTC) /* time offset scale (shift) */
+#define SHIFT_UPDATE (SHIFT_HZ + 1) /* time offset scale (shift) */
 #define SHIFT_USEC 16		/* frequency offset scale (shift) */
 #define FINENSEC (1L << (SHIFT_SCALE - 10)) /* ~1 ns in phase units */
 
diff --git a/kernel/time/ntp.c b/kernel/time/ntp.c
index ab21eb06e09b..238ce47ef09d 100644
--- a/kernel/time/ntp.c
+++ b/kernel/time/ntp.c
@@ -31,7 +31,7 @@ int tickadj = 500/HZ ? : 1;		/* microsecs */
 /* TIME_ERROR prevents overwriting the CMOS clock */
 int time_state = TIME_OK;		/* clock synchronization status	*/
 int time_status = STA_UNSYNC;		/* clock status bits		*/
-long time_offset;			/* time adjustment (us)		*/
+long time_offset;			/* time adjustment (ns)		*/
 long time_constant = 2;			/* pll time constant		*/
 long time_tolerance = MAXFREQ;		/* frequency tolerance (ppm)	*/
 long time_precision = 1;		/* clock precision (us)		*/
@@ -57,6 +57,7 @@ void ntp_clear(void)
 	ntp_update_frequency();
 
 	tick_length = tick_length_base;
+	time_offset = 0;
 }
 
 #define CLOCK_TICK_OVERFLOW	(LATCH * HZ - CLOCK_TICK_RATE)
@@ -83,7 +84,7 @@ void ntp_update_frequency(void)
  */
 void second_overflow(void)
 {
-	long ltemp, time_adj;
+	long time_adj;
 
 	/* Bump the maxerror field */
 	time_maxerror += time_tolerance >> SHIFT_USEC;
@@ -151,42 +152,14 @@ void second_overflow(void)
 	 * adjustment for each second is clamped so as to spread the adjustment
 	 * over not more than the number of seconds between updates.
 	 */
-	ltemp = time_offset;
-	if (!(time_status & STA_FLL))
-		ltemp = shift_right(ltemp, SHIFT_KG + time_constant);
-	ltemp = min(ltemp, (MAXPHASE / MINSEC) << SHIFT_UPDATE);
-	ltemp = max(ltemp, -(MAXPHASE / MINSEC) << SHIFT_UPDATE);
-	time_offset -= ltemp;
-	time_adj = ltemp << (SHIFT_SCALE - SHIFT_HZ - SHIFT_UPDATE);
-
-	/*
-	 * Compute the frequency estimate and additional phase adjustment due
-	 * to frequency error for the next second.
-	 */
-
-#if HZ == 100
-	/*
-	 * Compensate for (HZ==100) != (1 << SHIFT_HZ).  Add 25% and 3.125% to
-	 * get 128.125; => only 0.125% error (p. 14)
-	 */
-	time_adj += shift_right(time_adj, 2) + shift_right(time_adj, 5);
-#endif
-#if HZ == 250
-	/*
-	 * Compensate for (HZ==250) != (1 << SHIFT_HZ).  Add 1.5625% and
-	 * 0.78125% to get 255.85938; => only 0.05% error (p. 14)
-	 */
-	time_adj += shift_right(time_adj, 6) + shift_right(time_adj, 7);
-#endif
-#if HZ == 1000
-	/*
-	 * Compensate for (HZ==1000) != (1 << SHIFT_HZ).  Add 1.5625% and
-	 * 0.78125% to get 1023.4375; => only 0.05% error (p. 14)
-	 */
-	time_adj += shift_right(time_adj, 6) + shift_right(time_adj, 7);
-#endif
 	tick_length = tick_length_base;
-	tick_length += (s64)time_adj << (TICK_LENGTH_SHIFT - (SHIFT_SCALE - 10));
+	time_adj = time_offset;
+	if (!(time_status & STA_FLL))
+		time_adj = shift_right(time_adj, SHIFT_KG + time_constant);
+	time_adj = min(time_adj, -((MAXPHASE / HZ) << SHIFT_UPDATE) / MINSEC);
+	time_adj = max(time_adj, ((MAXPHASE / HZ) << SHIFT_UPDATE) / MINSEC);
+	time_offset -= time_adj;
+	tick_length += (s64)time_adj << (TICK_LENGTH_SHIFT - SHIFT_UPDATE);
 }
 
 /*
@@ -347,12 +320,8 @@ int do_adjtimex(struct timex *txc)
 		     * Scale the phase adjustment and
 		     * clamp to the operating range.
 		     */
-		    if (ltemp > MAXPHASE)
-		        time_offset = MAXPHASE << SHIFT_UPDATE;
-		    else if (ltemp < -MAXPHASE)
-			time_offset = -(MAXPHASE << SHIFT_UPDATE);
-		    else
-		        time_offset = ltemp << SHIFT_UPDATE;
+		    time_offset = min(ltemp, MAXPHASE);
+		    time_offset = max(time_offset, -MAXPHASE);
 
 		    /*
 		     * Select whether the frequency is to be controlled
@@ -366,8 +335,7 @@ int do_adjtimex(struct timex *txc)
 		    time_reftime = xtime.tv_sec;
 		    if (time_status & STA_FLL) {
 		        if (mtemp >= MINSEC) {
-			    ltemp = (time_offset / mtemp) << (SHIFT_USEC -
-							      SHIFT_UPDATE);
+			    ltemp = ((time_offset << 12) / mtemp) << (SHIFT_USEC - 12);
 			    time_freq += shift_right(ltemp, SHIFT_KH);
 			} else /* calibration interval too short (p. 12) */
 				result = TIME_ERROR;
@@ -382,6 +350,7 @@ int do_adjtimex(struct timex *txc)
 		    }
 		    time_freq = min(time_freq, time_tolerance);
 		    time_freq = max(time_freq, -time_tolerance);
+		    time_offset = (time_offset * NSEC_PER_USEC / HZ) << SHIFT_UPDATE;
 		} /* STA_PLL */
 	    } /* txc->modes & ADJ_OFFSET */
 	    if (txc->modes & ADJ_TICK)
@@ -395,9 +364,8 @@ leave:	if ((time_status & (STA_UNSYNC|STA_CLOCKERR)) != 0)
 
 	if ((txc->modes & ADJ_OFFSET_SINGLESHOT) == ADJ_OFFSET_SINGLESHOT)
 	    txc->offset	   = save_adjust;
-	else {
-	    txc->offset = shift_right(time_offset, SHIFT_UPDATE);
-	}
+	else
+	    txc->offset    = shift_right(time_offset, SHIFT_UPDATE) * HZ / 1000;
 	txc->freq	   = time_freq;
 	txc->maxerror	   = time_maxerror;
 	txc->esterror	   = time_esterror;
-- 
cgit v1.2.3


From 8f807f8d2137ba728d22820103131038639b68a9 Mon Sep 17 00:00:00 2001
From: Roman Zippel <zippel@linux-m68k.org>
Date: Sat, 30 Sep 2006 23:28:25 -0700
Subject: [PATCH] ntp: add time_adjust to tick length

This folds update_ntp_one_tick() into second_overflow() and adds time_adjust
to the tick length, this makes time_next_adjust unnecessary.  This slightly
changes the adjtime() behaviour, instead of applying it to the next tick, it's
applied to the next second.

Signed-off-by: Roman Zippel <zippel@linux-m68k.org>
Cc: john stultz <johnstul@us.ibm.com>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
---
 include/linux/timex.h |  1 -
 kernel/time/ntp.c     | 71 +++++++++++++--------------------------------------
 kernel/timer.c        |  2 --
 3 files changed, 18 insertions(+), 56 deletions(-)

(limited to 'kernel')

diff --git a/include/linux/timex.h b/include/linux/timex.h
index 1cde6f6a2712..b5f297e17668 100644
--- a/include/linux/timex.h
+++ b/include/linux/timex.h
@@ -217,7 +217,6 @@ extern long time_freq;		/* frequency offset (scaled ppm) */
 extern long time_reftime;	/* time at last adjustment (s) */
 
 extern long time_adjust;	/* The amount of adjtime left */
-extern long time_next_adjust;	/* Value for time_adjust at next tick */
 
 extern void ntp_clear(void);
 extern void ntp_update_frequency(void);
diff --git a/kernel/time/ntp.c b/kernel/time/ntp.c
index 238ce47ef09d..65223b7ed810 100644
--- a/kernel/time/ntp.c
+++ b/kernel/time/ntp.c
@@ -22,8 +22,9 @@ unsigned long tick_usec = TICK_USEC; 		/* USER_HZ period (usec) */
 unsigned long tick_nsec;			/* ACTHZ period (nsec) */
 static u64 tick_length, tick_length_base;
 
-/* Don't completely fail for HZ > 500.  */
-int tickadj = 500/HZ ? : 1;		/* microsecs */
+#define MAX_TICKADJ		500		/* microsecs */
+#define MAX_TICKADJ_SCALED	(((u64)(MAX_TICKADJ * NSEC_PER_USEC) << \
+				  TICK_LENGTH_SHIFT) / HZ)
 
 /*
  * phase-lock loop variables
@@ -40,7 +41,6 @@ long time_esterror = NTP_PHASE_LIMIT;	/* estimated error (us)		*/
 long time_freq;				/* frequency offset (scaled ppm)*/
 long time_reftime;			/* time at last adjustment (s)	*/
 long time_adjust;
-long time_next_adjust;
 
 /**
  * ntp_clear - Clears the NTP state variables
@@ -160,46 +160,19 @@ void second_overflow(void)
 	time_adj = max(time_adj, ((MAXPHASE / HZ) << SHIFT_UPDATE) / MINSEC);
 	time_offset -= time_adj;
 	tick_length += (s64)time_adj << (TICK_LENGTH_SHIFT - SHIFT_UPDATE);
-}
-
-/*
- * Returns how many microseconds we need to add to xtime this tick
- * in doing an adjustment requested with adjtime.
- */
-static long adjtime_adjustment(void)
-{
-	long time_adjust_step;
-
-	time_adjust_step = time_adjust;
-	if (time_adjust_step) {
-		/*
-		 * We are doing an adjtime thing.  Prepare time_adjust_step to
-		 * be within bounds.  Note that a positive time_adjust means we
-		 * want the clock to run faster.
-		 *
-		 * Limit the amount of the step to be in the range
-		 * -tickadj .. +tickadj
-		 */
-		time_adjust_step = min(time_adjust_step, (long)tickadj);
-		time_adjust_step = max(time_adjust_step, (long)-tickadj);
-	}
-	return time_adjust_step;
-}
 
-/* in the NTP reference this is called "hardclock()" */
-void update_ntp_one_tick(void)
-{
-	long time_adjust_step;
-
-	time_adjust_step = adjtime_adjustment();
-	if (time_adjust_step)
-		/* Reduce by this step the amount of time left  */
-		time_adjust -= time_adjust_step;
-
-	/* Changes by adjtime() do not take effect till next tick. */
-	if (time_next_adjust != 0) {
-		time_adjust = time_next_adjust;
-		time_next_adjust = 0;
+	if (unlikely(time_adjust)) {
+		if (time_adjust > MAX_TICKADJ) {
+			time_adjust -= MAX_TICKADJ;
+			tick_length += MAX_TICKADJ_SCALED;
+		} else if (time_adjust < -MAX_TICKADJ) {
+			time_adjust += MAX_TICKADJ;
+			tick_length -= MAX_TICKADJ_SCALED;
+		} else {
+			time_adjust = 0;
+			tick_length += (s64)(time_adjust * NSEC_PER_USEC /
+					     HZ) << TICK_LENGTH_SHIFT;
+		}
 	}
 }
 
@@ -213,14 +186,7 @@ void update_ntp_one_tick(void)
  */
 u64 current_tick_length(void)
 {
-	u64 ret;
-
-	/* calculate the finest interval NTP will allow.
-	 */
-	ret = tick_length;
-	ret += (u64)(adjtime_adjustment() * 1000) << TICK_LENGTH_SHIFT;
-
-	return ret;
+	return tick_length;
 }
 
 
@@ -263,7 +229,7 @@ int do_adjtimex(struct timex *txc)
 	result = time_state;	/* mostly `TIME_OK' */
 
 	/* Save for later - semantics of adjtime is to return old value */
-	save_adjust = time_next_adjust ? time_next_adjust : time_adjust;
+	save_adjust = time_adjust;
 
 #if 0	/* STA_CLOCKERR is never set yet */
 	time_status &= ~STA_CLOCKERR;		/* reset STA_CLOCKERR */
@@ -310,8 +276,7 @@ int do_adjtimex(struct timex *txc)
 	    if (txc->modes & ADJ_OFFSET) {	/* values checked earlier */
 		if (txc->modes == ADJ_OFFSET_SINGLESHOT) {
 		    /* adjtime() is independent from ntp_adjtime() */
-		    if ((time_next_adjust = txc->offset) == 0)
-			 time_adjust = 0;
+		    time_adjust = txc->offset;
 		}
 		else if (time_status & STA_PLL) {
 		    ltemp = txc->offset;
diff --git a/kernel/timer.c b/kernel/timer.c
index 78d3fa10fcd6..654dd5df2417 100644
--- a/kernel/timer.c
+++ b/kernel/timer.c
@@ -937,8 +937,6 @@ static void update_wall_time(void)
 		/* interpolator bits */
 		time_interpolator_update(clock->xtime_interval
 						>> clock->shift);
-		/* increment the NTP state machine */
-		update_ntp_one_tick();
 
 		/* accumulate error between NTP and clock interval */
 		clock->error += current_tick_length();
-- 
cgit v1.2.3


From 97eebe138caaf78354b1fad233e63bafdcc4fd54 Mon Sep 17 00:00:00 2001
From: Roman Zippel <zippel@linux-m68k.org>
Date: Sat, 30 Sep 2006 23:28:26 -0700
Subject: [PATCH] ntp: remove time_tolerance

time_tolerance isn't changed at all in the kernel, so simply remove it, this
simplifies the next patch, as it avoids a number of conversions.

Signed-off-by: Roman Zippel <zippel@linux-m68k.org>
Cc: john stultz <johnstul@us.ibm.com>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
---
 include/linux/timex.h | 1 -
 kernel/time/ntp.c     | 9 ++++-----
 2 files changed, 4 insertions(+), 6 deletions(-)

(limited to 'kernel')

diff --git a/include/linux/timex.h b/include/linux/timex.h
index b5f297e17668..7715b4c0caf9 100644
--- a/include/linux/timex.h
+++ b/include/linux/timex.h
@@ -208,7 +208,6 @@ extern int time_state;		/* clock status */
 extern int time_status;		/* clock synchronization status bits */
 extern long time_offset;	/* time adjustment (us) */
 extern long time_constant;	/* pll time constant */
-extern long time_tolerance;	/* frequency tolerance (ppm) */
 extern long time_precision;	/* clock precision (us) */
 extern long time_maxerror;	/* maximum error */
 extern long time_esterror;	/* estimated error */
diff --git a/kernel/time/ntp.c b/kernel/time/ntp.c
index 65223b7ed810..af7563f5d4e2 100644
--- a/kernel/time/ntp.c
+++ b/kernel/time/ntp.c
@@ -34,7 +34,6 @@ int time_state = TIME_OK;		/* clock synchronization status	*/
 int time_status = STA_UNSYNC;		/* clock status bits		*/
 long time_offset;			/* time adjustment (ns)		*/
 long time_constant = 2;			/* pll time constant		*/
-long time_tolerance = MAXFREQ;		/* frequency tolerance (ppm)	*/
 long time_precision = 1;		/* clock precision (us)		*/
 long time_maxerror = NTP_PHASE_LIMIT;	/* maximum error (us)		*/
 long time_esterror = NTP_PHASE_LIMIT;	/* estimated error (us)		*/
@@ -87,7 +86,7 @@ void second_overflow(void)
 	long time_adj;
 
 	/* Bump the maxerror field */
-	time_maxerror += time_tolerance >> SHIFT_USEC;
+	time_maxerror += MAXFREQ >> SHIFT_USEC;
 	if (time_maxerror > NTP_PHASE_LIMIT) {
 		time_maxerror = NTP_PHASE_LIMIT;
 		time_status |= STA_UNSYNC;
@@ -313,8 +312,8 @@ int do_adjtimex(struct timex *txc)
 			} else /* calibration interval too long (p. 12) */
 				result = TIME_ERROR;
 		    }
-		    time_freq = min(time_freq, time_tolerance);
-		    time_freq = max(time_freq, -time_tolerance);
+		    time_freq = min(time_freq, MAXFREQ);
+		    time_freq = max(time_freq, -MAXFREQ);
 		    time_offset = (time_offset * NSEC_PER_USEC / HZ) << SHIFT_UPDATE;
 		} /* STA_PLL */
 	    } /* txc->modes & ADJ_OFFSET */
@@ -337,7 +336,7 @@ leave:	if ((time_status & (STA_UNSYNC|STA_CLOCKERR)) != 0)
 	txc->status	   = time_status;
 	txc->constant	   = time_constant;
 	txc->precision	   = time_precision;
-	txc->tolerance	   = time_tolerance;
+	txc->tolerance	   = MAXFREQ;
 	txc->tick	   = tick_usec;
 
 	/* PPS is not implemented, so these are zero */
-- 
cgit v1.2.3


From 04b617e71e363e640e88be1e43f53fa6a3afef9f Mon Sep 17 00:00:00 2001
From: Roman Zippel <zippel@linux-m68k.org>
Date: Sat, 30 Sep 2006 23:28:27 -0700
Subject: [PATCH] ntp: convert time_freq to nsec value

This converts time_freq to a scaled nsec value and adds around 6bit of extra
resolution.  This pushes the time_freq to its 32bit limits so the calculatons
have to be done with 64bit.

Signed-off-by: Roman Zippel <zippel@linux-m68k.org>
Cc: john stultz <johnstul@us.ibm.com>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
---
 include/linux/timex.h |  2 ++
 kernel/time/ntp.c     | 36 ++++++++++++++++++++++--------------
 2 files changed, 24 insertions(+), 14 deletions(-)

(limited to 'kernel')

diff --git a/include/linux/timex.h b/include/linux/timex.h
index 7715b4c0caf9..671609ee1a3d 100644
--- a/include/linux/timex.h
+++ b/include/linux/timex.h
@@ -91,10 +91,12 @@
 #define SHIFT_SCALE 22		/* phase scale (shift) */
 #define SHIFT_UPDATE (SHIFT_HZ + 1) /* time offset scale (shift) */
 #define SHIFT_USEC 16		/* frequency offset scale (shift) */
+#define SHIFT_NSEC 12		/* kernel frequency offset scale */
 #define FINENSEC (1L << (SHIFT_SCALE - 10)) /* ~1 ns in phase units */
 
 #define MAXPHASE 512000L        /* max phase error (us) */
 #define MAXFREQ (512L << SHIFT_USEC)  /* max frequency error (ppm) */
+#define MAXFREQ_NSEC (512000L << SHIFT_NSEC) /* max frequency error (ppb) */
 #define MINSEC 16L              /* min interval between updates (s) */
 #define MAXSEC 1200L            /* max interval between updates (s) */
 #define	NTP_PHASE_LIMIT	(MAXPHASE << 5)	/* beyond max. dispersion */
diff --git a/kernel/time/ntp.c b/kernel/time/ntp.c
index af7563f5d4e2..9137b54613e0 100644
--- a/kernel/time/ntp.c
+++ b/kernel/time/ntp.c
@@ -66,7 +66,7 @@ void ntp_update_frequency(void)
 {
 	tick_length_base = (u64)(tick_usec * NSEC_PER_USEC * USER_HZ) << TICK_LENGTH_SHIFT;
 	tick_length_base += (s64)CLOCK_TICK_ADJUST << TICK_LENGTH_SHIFT;
-	tick_length_base += ((s64)time_freq * NSEC_PER_USEC) << (TICK_LENGTH_SHIFT - SHIFT_USEC);
+	tick_length_base += (s64)time_freq << (TICK_LENGTH_SHIFT - SHIFT_NSEC);
 
 	do_div(tick_length_base, HZ);
 
@@ -200,6 +200,7 @@ void __attribute__ ((weak)) notify_arch_cmos_timer(void)
 int do_adjtimex(struct timex *txc)
 {
 	long ltemp, mtemp, save_adjust;
+	s64 freq_adj;
 	int result;
 
 	/* In order to modify anything, you gotta be super-user! */
@@ -245,7 +246,7 @@ int do_adjtimex(struct timex *txc)
 		    result = -EINVAL;
 		    goto leave;
 		}
-		time_freq = txc->freq;
+		time_freq = ((s64)txc->freq * NSEC_PER_USEC) >> (SHIFT_USEC - SHIFT_NSEC);
 	    }
 
 	    if (txc->modes & ADJ_MAXERROR) {
@@ -278,14 +279,14 @@ int do_adjtimex(struct timex *txc)
 		    time_adjust = txc->offset;
 		}
 		else if (time_status & STA_PLL) {
-		    ltemp = txc->offset;
+		    ltemp = txc->offset * NSEC_PER_USEC;
 
 		    /*
 		     * Scale the phase adjustment and
 		     * clamp to the operating range.
 		     */
-		    time_offset = min(ltemp, MAXPHASE);
-		    time_offset = max(time_offset, -MAXPHASE);
+		    time_offset = min(ltemp, MAXPHASE * NSEC_PER_USEC);
+		    time_offset = max(time_offset, -MAXPHASE * NSEC_PER_USEC);
 
 		    /*
 		     * Select whether the frequency is to be controlled
@@ -297,24 +298,31 @@ int do_adjtimex(struct timex *txc)
 		        time_reftime = xtime.tv_sec;
 		    mtemp = xtime.tv_sec - time_reftime;
 		    time_reftime = xtime.tv_sec;
+		    freq_adj = 0;
 		    if (time_status & STA_FLL) {
 		        if (mtemp >= MINSEC) {
-			    ltemp = ((time_offset << 12) / mtemp) << (SHIFT_USEC - 12);
-			    time_freq += shift_right(ltemp, SHIFT_KH);
+			    freq_adj = (s64)time_offset << (SHIFT_NSEC - SHIFT_KH);
+			    if (time_offset < 0) {
+				freq_adj = -freq_adj;
+				do_div(freq_adj, mtemp);
+				freq_adj = -freq_adj;
+			    } else
+				do_div(freq_adj, mtemp);
 			} else /* calibration interval too short (p. 12) */
 				result = TIME_ERROR;
 		    } else {	/* PLL mode */
 		        if (mtemp < MAXSEC) {
-			    ltemp *= mtemp;
-			    time_freq += shift_right(ltemp,(time_constant +
+			    freq_adj = (s64)ltemp * mtemp;
+			    freq_adj = shift_right(freq_adj,(time_constant +
 						       time_constant +
-						       SHIFT_KF - SHIFT_USEC));
+						       SHIFT_KF - SHIFT_NSEC));
 			} else /* calibration interval too long (p. 12) */
 				result = TIME_ERROR;
 		    }
-		    time_freq = min(time_freq, MAXFREQ);
-		    time_freq = max(time_freq, -MAXFREQ);
-		    time_offset = (time_offset * NSEC_PER_USEC / HZ) << SHIFT_UPDATE;
+		    freq_adj += time_freq;
+		    freq_adj = min(freq_adj, (s64)MAXFREQ_NSEC);
+		    time_freq = max(freq_adj, (s64)-MAXFREQ_NSEC);
+		    time_offset = (time_offset / HZ) << SHIFT_UPDATE;
 		} /* STA_PLL */
 	    } /* txc->modes & ADJ_OFFSET */
 	    if (txc->modes & ADJ_TICK)
@@ -330,7 +338,7 @@ leave:	if ((time_status & (STA_UNSYNC|STA_CLOCKERR)) != 0)
 	    txc->offset	   = save_adjust;
 	else
 	    txc->offset    = shift_right(time_offset, SHIFT_UPDATE) * HZ / 1000;
-	txc->freq	   = time_freq;
+	txc->freq	   = (time_freq / NSEC_PER_USEC) << (SHIFT_USEC - SHIFT_NSEC);
 	txc->maxerror	   = time_maxerror;
 	txc->esterror	   = time_esterror;
 	txc->status	   = time_status;
-- 
cgit v1.2.3


From f19923937321244e7dc334767eb4b67e0e3d5c74 Mon Sep 17 00:00:00 2001
From: Roman Zippel <zippel@linux-m68k.org>
Date: Sat, 30 Sep 2006 23:28:28 -0700
Subject: [PATCH] ntp: convert to the NTP4 reference model

This converts the kernel ntp model into a model which matches the nanokernel
reference implementations.  The previous patches already increased the
resolution and precision of the computations, so that this conversion becomes
quite simple.

<linux@horizon.com> explains:

The original NTP kernel interface was defined in units of microseconds.
That's what Linux implements.  As computers have gotten faster and can now
split microseconds easily, a new kernel interface using nanosecond units was
defined ("the nanokernel", confusing as that name is to OS hackers), and
there's an STA_NANO bit in the adjtimex() status field to tell the application
which units it's using.

The current ntpd supports both, but Linux loses some possible timing
resolution because of quantization effects, and the ntpd hackers would really
like to be able to drop the backwards compatibility code.

Ulrich Windl has been maintaining a patch set to do the conversion for years,
but it's hard to keep in sync.

Signed-off-by: Roman Zippel <zippel@linux-m68k.org>
Cc: john stultz <johnstul@us.ibm.com>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
---
 include/linux/timex.h | 11 +++++------
 kernel/time/ntp.c     | 51 +++++++++++++++++++--------------------------------
 2 files changed, 24 insertions(+), 38 deletions(-)

(limited to 'kernel')

diff --git a/include/linux/timex.h b/include/linux/timex.h
index 671609ee1a3d..ac808f13fa0e 100644
--- a/include/linux/timex.h
+++ b/include/linux/timex.h
@@ -69,10 +69,9 @@
  * zero to MAXTC, the PLL will converge in 15 minutes to 16 hours,
  * respectively.
  */
-#define SHIFT_KG 6		/* phase factor (shift) */
-#define SHIFT_KF 16		/* PLL frequency factor (shift) */
-#define SHIFT_KH 2		/* FLL frequency factor (shift) */
-#define MAXTC 6			/* maximum time constant (shift) */
+#define SHIFT_PLL	4	/* PLL frequency factor (shift) */
+#define SHIFT_FLL	2	/* FLL frequency factor (shift) */
+#define MAXTC		10	/* maximum time constant (shift) */
 
 /*
  * The SHIFT_SCALE define establishes the decimal point of the time_phase
@@ -97,8 +96,8 @@
 #define MAXPHASE 512000L        /* max phase error (us) */
 #define MAXFREQ (512L << SHIFT_USEC)  /* max frequency error (ppm) */
 #define MAXFREQ_NSEC (512000L << SHIFT_NSEC) /* max frequency error (ppb) */
-#define MINSEC 16L              /* min interval between updates (s) */
-#define MAXSEC 1200L            /* max interval between updates (s) */
+#define MINSEC 256		/* min interval between updates (s) */
+#define MAXSEC 2048		/* max interval between updates (s) */
 #define	NTP_PHASE_LIMIT	(MAXPHASE << 5)	/* beyond max. dispersion */
 
 /*
diff --git a/kernel/time/ntp.c b/kernel/time/ntp.c
index 9137b54613e0..1ab5e9d7fa50 100644
--- a/kernel/time/ntp.c
+++ b/kernel/time/ntp.c
@@ -145,18 +145,11 @@ void second_overflow(void)
 	}
 
 	/*
-	 * Compute the phase adjustment for the next second. In PLL mode, the
-	 * offset is reduced by a fixed factor times the time constant. In FLL
-	 * mode the offset is used directly. In either mode, the maximum phase
-	 * adjustment for each second is clamped so as to spread the adjustment
-	 * over not more than the number of seconds between updates.
+	 * Compute the phase adjustment for the next second. The offset is
+	 * reduced by a fixed factor times the time constant.
 	 */
 	tick_length = tick_length_base;
-	time_adj = time_offset;
-	if (!(time_status & STA_FLL))
-		time_adj = shift_right(time_adj, SHIFT_KG + time_constant);
-	time_adj = min(time_adj, -((MAXPHASE / HZ) << SHIFT_UPDATE) / MINSEC);
-	time_adj = max(time_adj, ((MAXPHASE / HZ) << SHIFT_UPDATE) / MINSEC);
+	time_adj = shift_right(time_offset, SHIFT_PLL + time_constant);
 	time_offset -= time_adj;
 	tick_length += (s64)time_adj << (TICK_LENGTH_SHIFT - SHIFT_UPDATE);
 
@@ -200,7 +193,7 @@ void __attribute__ ((weak)) notify_arch_cmos_timer(void)
 int do_adjtimex(struct timex *txc)
 {
 	long ltemp, mtemp, save_adjust;
-	s64 freq_adj;
+	s64 freq_adj, temp64;
 	int result;
 
 	/* In order to modify anything, you gotta be super-user! */
@@ -270,7 +263,7 @@ int do_adjtimex(struct timex *txc)
 		    result = -EINVAL;
 		    goto leave;
 		}
-		time_constant = txc->constant;
+		time_constant = min(txc->constant + 4, (long)MAXTC);
 	    }
 
 	    if (txc->modes & ADJ_OFFSET) {	/* values checked earlier */
@@ -298,26 +291,20 @@ int do_adjtimex(struct timex *txc)
 		        time_reftime = xtime.tv_sec;
 		    mtemp = xtime.tv_sec - time_reftime;
 		    time_reftime = xtime.tv_sec;
-		    freq_adj = 0;
-		    if (time_status & STA_FLL) {
-		        if (mtemp >= MINSEC) {
-			    freq_adj = (s64)time_offset << (SHIFT_NSEC - SHIFT_KH);
-			    if (time_offset < 0) {
-				freq_adj = -freq_adj;
-				do_div(freq_adj, mtemp);
-				freq_adj = -freq_adj;
-			    } else
-				do_div(freq_adj, mtemp);
-			} else /* calibration interval too short (p. 12) */
-				result = TIME_ERROR;
-		    } else {	/* PLL mode */
-		        if (mtemp < MAXSEC) {
-			    freq_adj = (s64)ltemp * mtemp;
-			    freq_adj = shift_right(freq_adj,(time_constant +
-						       time_constant +
-						       SHIFT_KF - SHIFT_NSEC));
-			} else /* calibration interval too long (p. 12) */
-				result = TIME_ERROR;
+
+		    freq_adj = (s64)time_offset * mtemp;
+		    freq_adj = shift_right(freq_adj, time_constant * 2 +
+					   (SHIFT_PLL + 2) * 2 - SHIFT_NSEC);
+		    if (mtemp >= MINSEC && (time_status & STA_FLL || mtemp > MAXSEC)) {
+			temp64 = (s64)time_offset << (SHIFT_NSEC - SHIFT_FLL);
+			if (time_offset < 0) {
+			    temp64 = -temp64;
+			    do_div(temp64, mtemp);
+			    freq_adj -= temp64;
+			} else {
+			    do_div(temp64, mtemp);
+			    freq_adj += temp64;
+			}
 		    }
 		    freq_adj += time_freq;
 		    freq_adj = min(freq_adj, (s64)MAXFREQ_NSEC);
-- 
cgit v1.2.3


From 70bc42f90a3f4721c89dbe865e6c95da8565b41c Mon Sep 17 00:00:00 2001
From: Adrian Bunk <bunk@stusta.de>
Date: Sat, 30 Sep 2006 23:28:29 -0700
Subject: [PATCH] kernel/time/ntp.c: possible cleanups

This patch contains the following possible cleanups:
- make the following needlessly global function static:
  - ntp_update_frequency()
- make the following needlessly global variables static:
  - time_state
  - time_offset
  - time_constant
  - time_reftime
- remove the following read-only global variable:
  - time_precision

Signed-off-by: Adrian Bunk <bunk@stusta.de>
Cc: Roman Zippel <zippel@linux-m68k.org>
Cc: john stultz <johnstul@us.ibm.com>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
---
 include/linux/timex.h |  6 ------
 kernel/time/ntp.c     | 40 ++++++++++++++++++++--------------------
 2 files changed, 20 insertions(+), 26 deletions(-)

(limited to 'kernel')

diff --git a/include/linux/timex.h b/include/linux/timex.h
index 261381b5da82..049dfe4a11f2 100644
--- a/include/linux/timex.h
+++ b/include/linux/timex.h
@@ -198,21 +198,15 @@ extern int tickadj;			/* amount of adjustment per tick */
 /*
  * phase-lock loop variables
  */
-extern int time_state;		/* clock status */
 extern int time_status;		/* clock synchronization status bits */
-extern long time_offset;	/* time adjustment (us) */
-extern long time_constant;	/* pll time constant */
-extern long time_precision;	/* clock precision (us) */
 extern long time_maxerror;	/* maximum error */
 extern long time_esterror;	/* estimated error */
 
 extern long time_freq;		/* frequency offset (scaled ppm) */
-extern long time_reftime;	/* time at last adjustment (s) */
 
 extern long time_adjust;	/* The amount of adjtime left */
 
 extern void ntp_clear(void);
-extern void ntp_update_frequency(void);
 
 /**
  * ntp_synced - Returns 1 if the NTP status is not UNSYNC
diff --git a/kernel/time/ntp.c b/kernel/time/ntp.c
index 1ab5e9d7fa50..47195fa0ec4f 100644
--- a/kernel/time/ntp.c
+++ b/kernel/time/ntp.c
@@ -30,17 +30,31 @@ static u64 tick_length, tick_length_base;
  * phase-lock loop variables
  */
 /* TIME_ERROR prevents overwriting the CMOS clock */
-int time_state = TIME_OK;		/* clock synchronization status	*/
+static int time_state = TIME_OK;	/* clock synchronization status	*/
 int time_status = STA_UNSYNC;		/* clock status bits		*/
-long time_offset;			/* time adjustment (ns)		*/
-long time_constant = 2;			/* pll time constant		*/
-long time_precision = 1;		/* clock precision (us)		*/
+static long time_offset;		/* time adjustment (ns)		*/
+static long time_constant = 2;		/* pll time constant		*/
 long time_maxerror = NTP_PHASE_LIMIT;	/* maximum error (us)		*/
 long time_esterror = NTP_PHASE_LIMIT;	/* estimated error (us)		*/
 long time_freq;				/* frequency offset (scaled ppm)*/
-long time_reftime;			/* time at last adjustment (s)	*/
+static long time_reftime;		/* time at last adjustment (s)	*/
 long time_adjust;
 
+#define CLOCK_TICK_OVERFLOW	(LATCH * HZ - CLOCK_TICK_RATE)
+#define CLOCK_TICK_ADJUST	(((s64)CLOCK_TICK_OVERFLOW * NSEC_PER_SEC) / \
+					(s64)CLOCK_TICK_RATE)
+
+static void ntp_update_frequency(void)
+{
+	tick_length_base = (u64)(tick_usec * NSEC_PER_USEC * USER_HZ) << TICK_LENGTH_SHIFT;
+	tick_length_base += (s64)CLOCK_TICK_ADJUST << TICK_LENGTH_SHIFT;
+	tick_length_base += (s64)time_freq << (TICK_LENGTH_SHIFT - SHIFT_NSEC);
+
+	do_div(tick_length_base, HZ);
+
+	tick_nsec = tick_length_base >> TICK_LENGTH_SHIFT;
+}
+
 /**
  * ntp_clear - Clears the NTP state variables
  *
@@ -59,20 +73,6 @@ void ntp_clear(void)
 	time_offset = 0;
 }
 
-#define CLOCK_TICK_OVERFLOW	(LATCH * HZ - CLOCK_TICK_RATE)
-#define CLOCK_TICK_ADJUST	(((s64)CLOCK_TICK_OVERFLOW * NSEC_PER_SEC) / (s64)CLOCK_TICK_RATE)
-
-void ntp_update_frequency(void)
-{
-	tick_length_base = (u64)(tick_usec * NSEC_PER_USEC * USER_HZ) << TICK_LENGTH_SHIFT;
-	tick_length_base += (s64)CLOCK_TICK_ADJUST << TICK_LENGTH_SHIFT;
-	tick_length_base += (s64)time_freq << (TICK_LENGTH_SHIFT - SHIFT_NSEC);
-
-	do_div(tick_length_base, HZ);
-
-	tick_nsec = tick_length_base >> TICK_LENGTH_SHIFT;
-}
-
 /*
  * this routine handles the overflow of the microsecond field
  *
@@ -330,7 +330,7 @@ leave:	if ((time_status & (STA_UNSYNC|STA_CLOCKERR)) != 0)
 	txc->esterror	   = time_esterror;
 	txc->status	   = time_status;
 	txc->constant	   = time_constant;
-	txc->precision	   = time_precision;
+	txc->precision	   = 1;
 	txc->tolerance	   = MAXFREQ;
 	txc->tick	   = tick_usec;
 
-- 
cgit v1.2.3


From 8ef386092d7c2891bd7acefb2a87f878f7e9a0d6 Mon Sep 17 00:00:00 2001
From: Atsushi Nemoto <anemo@mba.ocn.ne.jp>
Date: Sat, 30 Sep 2006 23:28:31 -0700
Subject: [PATCH] kill wall_jiffies

With 2.6.18-rc4-mm2, now wall_jiffies will always be the same as jiffies.
So we can kill wall_jiffies completely.

This is just a cleanup and logically should not change any real behavior
except for one thing: RTC updating code in (old) ppc and xtensa use a
condition "jiffies - wall_jiffies == 1".  This condition is never met so I
suppose it is just a bug.  I just remove that condition only instead of
kill the whole "if" block.

[heiko.carstens@de.ibm.com: s390 build fix and cleanup]
Signed-off-by: Atsushi Nemoto <anemo@mba.ocn.ne.jp>
Cc: Andi Kleen <ak@muc.de>
Cc: Paul Mackerras <paulus@samba.org>
Cc: Benjamin Herrenschmidt <benh@kernel.crashing.org>
Cc: Richard Henderson <rth@twiddle.net>
Cc: Ivan Kokshaysky <ink@jurassic.park.msu.ru>
Cc: Russell King <rmk@arm.linux.org.uk>
Cc: Ian Molton <spyro@f2s.com>
Cc: Mikael Starvik <starvik@axis.com>
Cc: David Howells <dhowells@redhat.com>
Cc: Yoshinori Sato <ysato@users.sourceforge.jp>
Cc: Hirokazu Takata <takata.hirokazu@renesas.com>
Cc: Ralf Baechle <ralf@linux-mips.org>
Cc: Kyle McMartin <kyle@mcmartin.ca>
Cc: Heiko Carstens <heiko.carstens@de.ibm.com>
Cc: Martin Schwidefsky <schwidefsky@de.ibm.com>
Cc: Paul Mundt <lethal@linux-sh.org>
Cc: Kazumoto Kojima <kkojima@rr.iij4u.or.jp>
Cc: Richard Curnow <rc@rc0.org.uk>
Cc: William Lee Irwin III <wli@holomorphy.com>
Cc: "David S. Miller" <davem@davemloft.net>
Cc: Jeff Dike <jdike@addtoit.com>
Cc: Paolo 'Blaisorblade' Giarrusso <blaisorblade@yahoo.it>
Cc: Miles Bader <uclinux-v850@lsi.nec.co.jp>
Cc: Chris Zankel <chris@zankel.net>
Cc: "Luck, Tony" <tony.luck@intel.com>
Cc: Geert Uytterhoeven <geert@linux-m68k.org>
Cc: Roman Zippel <zippel@linux-m68k.org>
Signed-off-by: Heiko Carstens <heiko.carstens@de.ibm.com>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
---
 arch/alpha/kernel/time.c         | 15 +++++----------
 arch/arm/kernel/time.c           | 10 +---------
 arch/arm26/kernel/time.c         | 12 ++----------
 arch/cris/kernel/time.c          |  7 -------
 arch/i386/kernel/time.c          |  3 ---
 arch/ia64/kernel/time.c          |  2 --
 arch/m32r/kernel/time.c          | 11 +----------
 arch/m68k/kernel/time.c          | 16 +++-------------
 arch/m68knommu/kernel/time.c     |  7 +------
 arch/mips/kernel/time.c          | 12 +-----------
 arch/mips/sgi-ip27/ip27-timer.c  |  2 --
 arch/parisc/kernel/time.c        |  5 +----
 arch/powerpc/kernel/time.c       |  7 -------
 arch/ppc/kernel/time.c           | 11 +++--------
 arch/s390/kernel/time.c          |  7 ++-----
 arch/sh/kernel/time.c            | 10 +---------
 arch/sh64/kernel/time.c          | 12 +-----------
 arch/sparc/kernel/pcic.c         | 16 ++--------------
 arch/sparc/kernel/time.c         | 18 +++---------------
 arch/sparc64/kernel/time.c       |  2 --
 arch/x86_64/kernel/time.c        | 12 +++---------
 arch/x86_64/kernel/vmlinux.lds.S |  3 ---
 arch/x86_64/kernel/vsyscall.c    |  3 +--
 arch/xtensa/kernel/time.c        | 13 +++----------
 include/asm-x86_64/vsyscall.h    |  3 ---
 kernel/timer.c                   |  6 +-----
 26 files changed, 35 insertions(+), 190 deletions(-)

(limited to 'kernel')

diff --git a/arch/alpha/kernel/time.c b/arch/alpha/kernel/time.c
index 7c1e44420a78..581ddcc22fc5 100644
--- a/arch/alpha/kernel/time.c
+++ b/arch/alpha/kernel/time.c
@@ -54,8 +54,6 @@
 #include "proto.h"
 #include "irq_impl.h"
 
-extern unsigned long wall_jiffies;	/* kernel/timer.c */
-
 static int set_rtc_mmss(unsigned long);
 
 DEFINE_SPINLOCK(rtc_lock);
@@ -413,7 +411,7 @@ void
 do_gettimeofday(struct timeval *tv)
 {
 	unsigned long flags;
-	unsigned long sec, usec, lost, seq;
+	unsigned long sec, usec, seq;
 	unsigned long delta_cycles, delta_usec, partial_tick;
 
 	do {
@@ -423,14 +421,13 @@ do_gettimeofday(struct timeval *tv)
 		sec = xtime.tv_sec;
 		usec = (xtime.tv_nsec / 1000);
 		partial_tick = state.partial_tick;
-		lost = jiffies - wall_jiffies;
 
 	} while (read_seqretry_irqrestore(&xtime_lock, seq, flags));
 
 #ifdef CONFIG_SMP
 	/* Until and unless we figure out how to get cpu cycle counters
 	   in sync and keep them there, we can't use the rpcc tricks.  */
-	delta_usec = lost * (1000000 / HZ);
+	delta_usec = 0;
 #else
 	/*
 	 * usec = cycles * ticks_per_cycle * 2**48 * 1e6 / (2**48 * ticks)
@@ -446,8 +443,7 @@ do_gettimeofday(struct timeval *tv)
 	 */
 
 	delta_usec = (delta_cycles * state.scaled_ticks_per_cycle 
-		      + partial_tick
-		      + (lost << FIX_SHIFT)) * 15625;
+		      + partial_tick) * 15625;
 	delta_usec = ((delta_usec / ((1UL << (FIX_SHIFT-6-1)) * HZ)) + 1) / 2;
 #endif
 
@@ -480,12 +476,11 @@ do_settimeofday(struct timespec *tv)
 	   time.  Without this, a full-tick error is possible.  */
 
 #ifdef CONFIG_SMP
-	delta_nsec = (jiffies - wall_jiffies) * (NSEC_PER_SEC / HZ);
+	delta_nsec = 0;
 #else
 	delta_nsec = rpcc() - state.last_time;
 	delta_nsec = (delta_nsec * state.scaled_ticks_per_cycle 
-		      + state.partial_tick
-		      + ((jiffies - wall_jiffies) << FIX_SHIFT)) * 15625;
+		      + state.partial_tick) * 15625;
 	delta_nsec = ((delta_nsec / ((1UL << (FIX_SHIFT-6-1)) * HZ)) + 1) / 2;
 	delta_nsec *= 1000;
 #endif
diff --git a/arch/arm/kernel/time.c b/arch/arm/kernel/time.c
index f7d5165796ef..b030320b17c7 100644
--- a/arch/arm/kernel/time.c
+++ b/arch/arm/kernel/time.c
@@ -37,8 +37,6 @@
  */
 struct sys_timer *system_timer;
 
-extern unsigned long wall_jiffies;
-
 /* this needs a better home */
 DEFINE_SPINLOCK(rtc_lock);
 
@@ -237,16 +235,11 @@ void do_gettimeofday(struct timeval *tv)
 {
 	unsigned long flags;
 	unsigned long seq;
-	unsigned long usec, sec, lost;
+	unsigned long usec, sec;
 
 	do {
 		seq = read_seqbegin_irqsave(&xtime_lock, flags);
 		usec = system_timer->offset();
-
-		lost = jiffies - wall_jiffies;
-		if (lost)
-			usec += lost * USECS_PER_JIFFY;
-
 		sec = xtime.tv_sec;
 		usec += xtime.tv_nsec / 1000;
 	} while (read_seqretry_irqrestore(&xtime_lock, seq, flags));
@@ -279,7 +272,6 @@ int do_settimeofday(struct timespec *tv)
 	 * done, and then undo it!
 	 */
 	nsec -= system_timer->offset() * NSEC_PER_USEC;
-	nsec -= (jiffies - wall_jiffies) * TICK_NSEC;
 
 	wtm_sec  = wall_to_monotonic.tv_sec + (xtime.tv_sec - sec);
 	wtm_nsec = wall_to_monotonic.tv_nsec + (xtime.tv_nsec - nsec);
diff --git a/arch/arm26/kernel/time.c b/arch/arm26/kernel/time.c
index 80adbd005fc5..1206469b2b86 100644
--- a/arch/arm26/kernel/time.c
+++ b/arch/arm26/kernel/time.c
@@ -33,8 +33,6 @@
 #include <asm/irq.h>
 #include <asm/ioc.h>
 
-extern unsigned long wall_jiffies;
-
 /* this needs a better home */
 DEFINE_SPINLOCK(rtc_lock);
 
@@ -136,16 +134,11 @@ void do_gettimeofday(struct timeval *tv)
 {
 	unsigned long flags;
 	unsigned long seq;
-	unsigned long usec, sec, lost;
+	unsigned long usec, sec;
 
 	do {
 		seq = read_seqbegin_irqsave(&xtime_lock, flags);
 		usec = gettimeoffset();
-
-		lost = jiffies - wall_jiffies;
-		if (lost)
-			usec += lost * USECS_PER_JIFFY;
-
 		sec = xtime.tv_sec;
 		usec += xtime.tv_nsec / 1000;
 	} while (read_seqretry_irqrestore(&xtime_lock, seq, flags));
@@ -174,8 +167,7 @@ int do_settimeofday(struct timespec *tv)
 	 * wall time.  Discover what correction gettimeofday() would have
 	 * done, and then undo it!
 	 */
-	tv->tv_nsec -= 1000 * (gettimeoffset() +
-			(jiffies - wall_jiffies) * USECS_PER_JIFFY);
+	tv->tv_nsec -= 1000 * gettimeoffset();
 
 	while (tv->tv_nsec < 0) {
 		tv->tv_nsec += NSEC_PER_SEC;
diff --git a/arch/cris/kernel/time.c b/arch/cris/kernel/time.c
index 66ba8898db07..0f9213cbd48e 100644
--- a/arch/cris/kernel/time.c
+++ b/arch/cris/kernel/time.c
@@ -37,7 +37,6 @@ int have_rtc;  /* used to remember if we have an RTC or not */;
 
 #define TICK_SIZE tick
 
-extern unsigned long wall_jiffies;
 extern unsigned long loops_per_jiffy; /* init/main.c */
 unsigned long loops_per_usec;
 
@@ -58,11 +57,6 @@ void do_gettimeofday(struct timeval *tv)
 	local_irq_save(flags);
 	local_irq_disable();
 	usec = do_gettimeoffset();
-	{
-		unsigned long lost = jiffies - wall_jiffies;
-		if (lost)
-			usec += lost * (1000000 / HZ);
-	}
 
         /*
 	 * If time_adjust is negative then NTP is slowing the clock
@@ -103,7 +97,6 @@ int do_settimeofday(struct timespec *tv)
 	 * made, and then undo it!
 	 */
 	nsec -= do_gettimeoffset() * NSEC_PER_USEC;
-	nsec -= (jiffies - wall_jiffies) * TICK_NSEC;
 
 	wtm_sec  = wall_to_monotonic.tv_sec + (xtime.tv_sec - sec);
 	wtm_nsec = wall_to_monotonic.tv_nsec + (xtime.tv_nsec - nsec);
diff --git a/arch/i386/kernel/time.c b/arch/i386/kernel/time.c
index 86944acfb647..58a2d5582419 100644
--- a/arch/i386/kernel/time.c
+++ b/arch/i386/kernel/time.c
@@ -76,8 +76,6 @@ int pit_latch_buggy;              /* extern */
 unsigned int cpu_khz;	/* Detected as we calibrate the TSC */
 EXPORT_SYMBOL(cpu_khz);
 
-extern unsigned long wall_jiffies;
-
 DEFINE_SPINLOCK(rtc_lock);
 EXPORT_SYMBOL(rtc_lock);
 
@@ -329,7 +327,6 @@ static int timer_resume(struct sys_device *dev)
 	do_settimeofday(&ts);
 	write_seqlock_irqsave(&xtime_lock, flags);
 	jiffies_64 += sleep_length;
-	wall_jiffies += sleep_length;
 	write_sequnlock_irqrestore(&xtime_lock, flags);
 	touch_softlockup_watchdog();
 	return 0;
diff --git a/arch/ia64/kernel/time.c b/arch/ia64/kernel/time.c
index 16262687a103..62e07f906e05 100644
--- a/arch/ia64/kernel/time.c
+++ b/arch/ia64/kernel/time.c
@@ -29,8 +29,6 @@
 #include <asm/sections.h>
 #include <asm/system.h>
 
-extern unsigned long wall_jiffies;
-
 volatile int time_keeper_id = 0; /* smp_processor_id() of time-keeper */
 
 #ifdef CONFIG_IA64_DEBUG_IRQ
diff --git a/arch/m32r/kernel/time.c b/arch/m32r/kernel/time.c
index 7a896893cd28..d8af155db984 100644
--- a/arch/m32r/kernel/time.c
+++ b/arch/m32r/kernel/time.c
@@ -38,7 +38,6 @@ extern void send_IPI_allbutself(int, int);
 extern void smp_local_timer_interrupt(struct pt_regs *);
 #endif
 
-extern unsigned long wall_jiffies;
 #define TICK_SIZE	(tick_nsec / 1000)
 
 /*
@@ -108,24 +107,17 @@ void do_gettimeofday(struct timeval *tv)
 	unsigned long max_ntp_tick = tick_usec - tickadj;
 
 	do {
-		unsigned long lost;
-
 		seq = read_seqbegin(&xtime_lock);
 
 		usec = do_gettimeoffset();
-		lost = jiffies - wall_jiffies;
 
 		/*
 		 * If time_adjust is negative then NTP is slowing the clock
 		 * so make sure not to go into next possible interval.
 		 * Better to lose some accuracy than have time go backwards..
 		 */
-		if (unlikely(time_adjust < 0)) {
+		if (unlikely(time_adjust < 0))
 			usec = min(usec, max_ntp_tick);
-			if (lost)
-				usec += lost * max_ntp_tick;
-		} else if (unlikely(lost))
-			usec += lost * tick_usec;
 
 		sec = xtime.tv_sec;
 		usec += (xtime.tv_nsec / 1000);
@@ -158,7 +150,6 @@ int do_settimeofday(struct timespec *tv)
 	 * made, and then undo it!
 	 */
 	nsec -= do_gettimeoffset() * NSEC_PER_USEC;
-	nsec -= (jiffies - wall_jiffies) * TICK_NSEC;
 
 	wtm_sec = wall_to_monotonic.tv_sec + (xtime.tv_sec - sec);
 	wtm_nsec = wall_to_monotonic.tv_nsec + (xtime.tv_nsec - nsec);
diff --git a/arch/m68k/kernel/time.c b/arch/m68k/kernel/time.c
index 1072e4946a4a..6cfc984380d9 100644
--- a/arch/m68k/kernel/time.c
+++ b/arch/m68k/kernel/time.c
@@ -96,31 +96,23 @@ void time_init(void)
 void do_gettimeofday(struct timeval *tv)
 {
 	unsigned long flags;
-	extern unsigned long wall_jiffies;
 	unsigned long seq;
-	unsigned long usec, sec, lost;
+	unsigned long usec, sec;
 	unsigned long max_ntp_tick = tick_usec - tickadj;
 
 	do {
 		seq = read_seqbegin_irqsave(&xtime_lock, flags);
 
 		usec = mach_gettimeoffset();
-		lost = jiffies - wall_jiffies;
 
 		/*
 		 * If time_adjust is negative then NTP is slowing the clock
 		 * so make sure not to go into next possible interval.
 		 * Better to lose some accuracy than have time go backwards..
 		 */
-		if (unlikely(time_adjust < 0)) {
+		if (unlikely(time_adjust < 0))
 			usec = min(usec, max_ntp_tick);
 
-			if (lost)
-				usec += lost * max_ntp_tick;
-		}
-		else if (unlikely(lost))
-			usec += lost * tick_usec;
-
 		sec = xtime.tv_sec;
 		usec += xtime.tv_nsec/1000;
 	} while (read_seqretry_irqrestore(&xtime_lock, seq, flags));
@@ -141,7 +133,6 @@ int do_settimeofday(struct timespec *tv)
 {
 	time_t wtm_sec, sec = tv->tv_sec;
 	long wtm_nsec, nsec = tv->tv_nsec;
-	extern unsigned long wall_jiffies;
 
 	if ((unsigned long)tv->tv_nsec >= NSEC_PER_SEC)
 		return -EINVAL;
@@ -153,8 +144,7 @@ int do_settimeofday(struct timespec *tv)
 	 * Discover what correction gettimeofday
 	 * would have done, and then undo it!
 	 */
-	nsec -= 1000 * (mach_gettimeoffset() +
-			(jiffies - wall_jiffies) * (1000000 / HZ));
+	nsec -= 1000 * mach_gettimeoffset();
 
 	wtm_sec  = wall_to_monotonic.tv_sec + (xtime.tv_sec - sec);
 	wtm_nsec = wall_to_monotonic.tv_nsec + (xtime.tv_nsec - nsec);
diff --git a/arch/m68knommu/kernel/time.c b/arch/m68knommu/kernel/time.c
index db1e1ce0a349..c5667bdddd5e 100644
--- a/arch/m68knommu/kernel/time.c
+++ b/arch/m68knommu/kernel/time.c
@@ -26,8 +26,6 @@
 
 #define	TICK_SIZE (tick_nsec / 1000)
 
-extern unsigned long wall_jiffies;
-
 
 static inline int set_rtc_mmss(unsigned long nowtime)
 {
@@ -124,15 +122,12 @@ void time_init(void)
 void do_gettimeofday(struct timeval *tv)
 {
 	unsigned long flags;
-	unsigned long lost, seq;
+	unsigned long seq;
 	unsigned long usec, sec;
 
 	do {
 		seq = read_seqbegin_irqsave(&xtime_lock, flags);
 		usec = mach_gettimeoffset ? mach_gettimeoffset() : 0;
-		lost = jiffies - wall_jiffies;
-		if (lost)
-			usec += lost * (1000000 / HZ);
 		sec = xtime.tv_sec;
 		usec += (xtime.tv_nsec / 1000);
 	} while (read_seqretry_irqrestore(&xtime_lock, seq, flags));
diff --git a/arch/mips/kernel/time.c b/arch/mips/kernel/time.c
index 6ab8d975a974..845c7e55505d 100644
--- a/arch/mips/kernel/time.c
+++ b/arch/mips/kernel/time.c
@@ -47,8 +47,6 @@
 /*
  * forward reference
  */
-extern volatile unsigned long wall_jiffies;
-
 DEFINE_SPINLOCK(rtc_lock);
 
 /*
@@ -159,7 +157,6 @@ void (*mips_hpt_init)(unsigned int);
 void do_gettimeofday(struct timeval *tv)
 {
 	unsigned long seq;
-	unsigned long lost;
 	unsigned long usec, sec;
 	unsigned long max_ntp_tick;
 
@@ -168,8 +165,6 @@ void do_gettimeofday(struct timeval *tv)
 
 		usec = do_gettimeoffset();
 
-		lost = jiffies - wall_jiffies;
-
 		/*
 		 * If time_adjust is negative then NTP is slowing the clock
 		 * so make sure not to go into next possible interval.
@@ -178,11 +173,7 @@ void do_gettimeofday(struct timeval *tv)
 		if (unlikely(time_adjust < 0)) {
 			max_ntp_tick = (USEC_PER_SEC / HZ) - tickadj;
 			usec = min(usec, max_ntp_tick);
-
-			if (lost)
-				usec += lost * max_ntp_tick;
-		} else if (unlikely(lost))
-			usec += lost * (USEC_PER_SEC / HZ);
+		}
 
 		sec = xtime.tv_sec;
 		usec += (xtime.tv_nsec / 1000);
@@ -217,7 +208,6 @@ int do_settimeofday(struct timespec *tv)
 	 * made, and then undo it!
 	 */
 	nsec -= do_gettimeoffset() * NSEC_PER_USEC;
-	nsec -= (jiffies - wall_jiffies) * tick_nsec;
 
 	wtm_sec  = wall_to_monotonic.tv_sec + (xtime.tv_sec - sec);
 	wtm_nsec = wall_to_monotonic.tv_nsec + (xtime.tv_nsec - nsec);
diff --git a/arch/mips/sgi-ip27/ip27-timer.c b/arch/mips/sgi-ip27/ip27-timer.c
index c62a3a9ef867..257ce118e380 100644
--- a/arch/mips/sgi-ip27/ip27-timer.c
+++ b/arch/mips/sgi-ip27/ip27-timer.c
@@ -42,8 +42,6 @@
 static unsigned long ct_cur[NR_CPUS];	/* What counter should be at next timer irq */
 static long last_rtc_update;		/* Last time the rtc clock got updated */
 
-extern volatile unsigned long wall_jiffies;
-
 #if 0
 static int set_rtc_mmss(unsigned long nowtime)
 {
diff --git a/arch/parisc/kernel/time.c b/arch/parisc/kernel/time.c
index 700df10924dd..ab641d67f551 100644
--- a/arch/parisc/kernel/time.c
+++ b/arch/parisc/kernel/time.c
@@ -32,9 +32,6 @@
 
 #include <linux/timex.h>
 
-/* xtime and wall_jiffies keep wall-clock time */
-extern unsigned long wall_jiffies;
-
 static long clocktick __read_mostly;	/* timer cycles per tick */
 static long halftick __read_mostly;
 
@@ -112,7 +109,7 @@ EXPORT_SYMBOL(profile_pc);
 /*** converted from ia64 ***/
 /*
  * Return the number of micro-seconds that elapsed since the last
- * update to wall time (aka xtime aka wall_jiffies).  The xtime_lock
+ * update to wall time (aka xtime).  The xtime_lock
  * must be at least read-locked when calling this routine.
  */
 static inline unsigned long
diff --git a/arch/powerpc/kernel/time.c b/arch/powerpc/kernel/time.c
index 71f71da98e7d..8b278d85ca4e 100644
--- a/arch/powerpc/kernel/time.c
+++ b/arch/powerpc/kernel/time.c
@@ -117,8 +117,6 @@ unsigned tb_to_ns_shift;
 
 struct gettimeofday_struct do_gtod;
 
-extern unsigned long wall_jiffies;
-
 extern struct timezone sys_tz;
 static long timezone_offset;
 
@@ -816,11 +814,6 @@ int do_settimeofday(struct timespec *tv)
 	/*
 	 * Subtract off the number of nanoseconds since the
 	 * beginning of the last tick.
-	 * Note that since we don't increment jiffies_64 anywhere other
-	 * than in do_timer (since we don't have a lost tick problem),
-	 * wall_jiffies will always be the same as jiffies,
-	 * and therefore the (jiffies - wall_jiffies) computation
-	 * has been removed.
 	 */
 	tb_delta = tb_ticks_since(tb_last_jiffy);
 	tb_delta = mulhdu(tb_delta, do_gtod.varp->tb_to_xs); /* in xsec */
diff --git a/arch/ppc/kernel/time.c b/arch/ppc/kernel/time.c
index 1e1f31554767..187388625a76 100644
--- a/arch/ppc/kernel/time.c
+++ b/arch/ppc/kernel/time.c
@@ -80,8 +80,6 @@ unsigned tb_to_us;
 unsigned tb_last_stamp;
 unsigned long tb_to_ns_scale;
 
-extern unsigned long wall_jiffies;
-
 /* used for timezone offset */
 static long timezone_offset;
 
@@ -173,8 +171,7 @@ void timer_interrupt(struct pt_regs * regs)
 		 */
 		if ( ppc_md.set_rtc_time && ntp_synced() &&
 		     xtime.tv_sec - last_rtc_update >= 659 &&
-		     abs((xtime.tv_nsec / 1000) - (1000000-1000000/HZ)) < 500000/HZ &&
-		     jiffies - wall_jiffies == 1) {
+		     abs((xtime.tv_nsec / 1000) - (1000000-1000000/HZ)) < 500000/HZ) {
 		  	if (ppc_md.set_rtc_time(xtime.tv_sec+1 + timezone_offset) == 0)
 				last_rtc_update = xtime.tv_sec+1;
 			else
@@ -200,7 +197,7 @@ void do_gettimeofday(struct timeval *tv)
 {
 	unsigned long flags;
 	unsigned long seq;
-	unsigned delta, lost_ticks, usec, sec;
+	unsigned delta, usec, sec;
 
 	do {
 		seq = read_seqbegin_irqsave(&xtime_lock, flags);
@@ -214,10 +211,9 @@ void do_gettimeofday(struct timeval *tv)
 		if (!smp_tb_synchronized)
 			delta = 0;
 #endif /* CONFIG_SMP */
-		lost_ticks = jiffies - wall_jiffies;
 	} while (read_seqretry_irqrestore(&xtime_lock, seq, flags));
 
-	usec += mulhwu(tb_to_us, tb_ticks_per_jiffy * lost_ticks + delta);
+	usec += mulhwu(tb_to_us, delta);
 	while (usec >= 1000000) {
 	  	sec++;
 		usec -= 1000000;
@@ -258,7 +254,6 @@ int do_settimeofday(struct timespec *tv)
 	 * still reasonable when gettimeofday resolution is 1 jiffy.
 	 */
 	tb_delta = tb_ticks_since(last_jiffy_stamp(smp_processor_id()));
-	tb_delta += (jiffies - wall_jiffies) * tb_ticks_per_jiffy;
 
 	new_nsec -= 1000 * mulhwu(tb_to_us, tb_delta);
 
diff --git a/arch/s390/kernel/time.c b/arch/s390/kernel/time.c
index abab42e9f5f8..4bf66cc4a267 100644
--- a/arch/s390/kernel/time.c
+++ b/arch/s390/kernel/time.c
@@ -53,8 +53,6 @@ static u64 init_timer_cc;
 static u64 jiffies_timer_cc;
 static u64 xtime_cc;
 
-extern unsigned long wall_jiffies;
-
 /*
  * Scheduler clock - returns current time in nanosec units.
  */
@@ -87,9 +85,8 @@ static inline unsigned long do_gettimeoffset(void)
 {
 	__u64 now;
 
-        now = (get_clock() - jiffies_timer_cc) >> 12;
-	/* We require the offset from the latest update of xtime */
-	now -= (__u64) wall_jiffies*USECS_PER_JIFFY;
+	now = (get_clock() - jiffies_timer_cc) >> 12;
+	now -= (__u64) jiffies * USECS_PER_JIFFY;
 	return (unsigned long) now;
 }
 
diff --git a/arch/sh/kernel/time.c b/arch/sh/kernel/time.c
index f664a196c4f5..450c68f1df05 100644
--- a/arch/sh/kernel/time.c
+++ b/arch/sh/kernel/time.c
@@ -18,7 +18,6 @@
 #include <asm/timer.h>
 #include <asm/kgdb.h>
 
-extern unsigned long wall_jiffies;
 struct sys_timer *sys_timer;
 
 /* Move this somewhere more sensible.. */
@@ -52,16 +51,10 @@ void do_gettimeofday(struct timeval *tv)
 {
 	unsigned long seq;
 	unsigned long usec, sec;
-	unsigned long lost;
 
 	do {
 		seq = read_seqbegin(&xtime_lock);
 		usec = get_timer_offset();
-
-		lost = jiffies - wall_jiffies;
-		if (lost)
-			usec += lost * (1000000 / HZ);
-
 		sec = xtime.tv_sec;
 		usec += xtime.tv_nsec / 1000;
 	} while (read_seqretry(&xtime_lock, seq));
@@ -91,8 +84,7 @@ int do_settimeofday(struct timespec *tv)
 	 * wall time.  Discover what correction gettimeofday() would have
 	 * made, and then undo it!
 	 */
-	nsec -= 1000 * (get_timer_offset() +
-				(jiffies - wall_jiffies) * (1000000 / HZ));
+	nsec -= 1000 * get_timer_offset();
 
 	wtm_sec  = wall_to_monotonic.tv_sec + (xtime.tv_sec - sec);
 	wtm_nsec = wall_to_monotonic.tv_nsec + (xtime.tv_nsec - nsec);
diff --git a/arch/sh64/kernel/time.c b/arch/sh64/kernel/time.c
index 3b61e06f9d72..9c4a38a8698c 100644
--- a/arch/sh64/kernel/time.c
+++ b/arch/sh64/kernel/time.c
@@ -107,8 +107,6 @@
 
 #define TICK_SIZE (tick_nsec / 1000)
 
-extern unsigned long wall_jiffies;
-
 static unsigned long tmu_base, rtc_base;
 unsigned long cprc_base;
 
@@ -194,13 +192,6 @@ void do_gettimeofday(struct timeval *tv)
 	do {
 		seq = read_seqbegin_irqsave(&xtime_lock, flags);
 		usec = usecs_since_tick();
-		{
-			unsigned long lost = jiffies - wall_jiffies;
-
-			if (lost)
-				usec += lost * (1000000 / HZ);
-		}
-
 		sec = xtime.tv_sec;
 		usec += xtime.tv_nsec / 1000;
 	} while (read_seqretry_irqrestore(&xtime_lock, seq, flags));
@@ -229,8 +220,7 @@ int do_settimeofday(struct timespec *tv)
 	 * wall time.  Discover what correction gettimeofday() would have
 	 * made, and then undo it!
 	 */
-	nsec -= 1000 * (usecs_since_tick() +
-				(jiffies - wall_jiffies) * (1000000 / HZ));
+	nsec -= 1000 * usecs_since_tick();
 
 	wtm_sec  = wall_to_monotonic.tv_sec + (xtime.tv_sec - sec);
 	wtm_nsec = wall_to_monotonic.tv_nsec + (xtime.tv_nsec - nsec);
diff --git a/arch/sparc/kernel/pcic.c b/arch/sparc/kernel/pcic.c
index e19b1bad9bc5..edb6cc665f56 100644
--- a/arch/sparc/kernel/pcic.c
+++ b/arch/sparc/kernel/pcic.c
@@ -765,8 +765,6 @@ static __inline__ unsigned long do_gettimeoffset(void)
 	return count;
 }
 
-extern unsigned long wall_jiffies;
-
 static void pci_do_gettimeofday(struct timeval *tv)
 {
 	unsigned long flags;
@@ -775,26 +773,17 @@ static void pci_do_gettimeofday(struct timeval *tv)
 	unsigned long max_ntp_tick = tick_usec - tickadj;
 
 	do {
-		unsigned long lost;
-
 		seq = read_seqbegin_irqsave(&xtime_lock, flags);
 		usec = do_gettimeoffset();
-		lost = jiffies - wall_jiffies;
 
 		/*
 		 * If time_adjust is negative then NTP is slowing the clock
 		 * so make sure not to go into next possible interval.
 		 * Better to lose some accuracy than have time go backwards..
 		 */
-		if (unlikely(time_adjust < 0)) {
+		if (unlikely(time_adjust < 0))
 			usec = min(usec, max_ntp_tick);
 
-			if (lost)
-				usec += lost * max_ntp_tick;
-		}
-		else if (unlikely(lost))
-			usec += lost * tick_usec;
-
 		sec = xtime.tv_sec;
 		usec += (xtime.tv_nsec / 1000);
 	} while (read_seqretry_irqrestore(&xtime_lock, seq, flags));
@@ -819,8 +808,7 @@ static int pci_do_settimeofday(struct timespec *tv)
 	 * wall time.  Discover what correction gettimeofday() would have
 	 * made, and then undo it!
 	 */
-	tv->tv_nsec -= 1000 * (do_gettimeoffset() + 
-				(jiffies - wall_jiffies) * (USEC_PER_SEC / HZ));
+	tv->tv_nsec -= 1000 * do_gettimeoffset();
 	while (tv->tv_nsec < 0) {
 		tv->tv_nsec += NSEC_PER_SEC;
 		tv->tv_sec--;
diff --git a/arch/sparc/kernel/time.c b/arch/sparc/kernel/time.c
index 6f84fa1b58e5..e10dc831944d 100644
--- a/arch/sparc/kernel/time.c
+++ b/arch/sparc/kernel/time.c
@@ -43,8 +43,6 @@
 #include <asm/pcic.h>
 #include <asm/of_device.h>
 
-extern unsigned long wall_jiffies;
-
 DEFINE_SPINLOCK(rtc_lock);
 enum sparc_clock_type sp_clock_typ;
 DEFINE_SPINLOCK(mostek_lock);
@@ -449,7 +447,7 @@ unsigned long long sched_clock(void)
 
 /* Ok, my cute asm atomicity trick doesn't work anymore.
  * There are just too many variables that need to be protected
- * now (both members of xtime, wall_jiffies, et al.)
+ * now (both members of xtime, et al.)
  */
 void do_gettimeofday(struct timeval *tv)
 {
@@ -459,26 +457,17 @@ void do_gettimeofday(struct timeval *tv)
 	unsigned long max_ntp_tick = tick_usec - tickadj;
 
 	do {
-		unsigned long lost;
-
 		seq = read_seqbegin_irqsave(&xtime_lock, flags);
 		usec = do_gettimeoffset();
-		lost = jiffies - wall_jiffies;
 
 		/*
 		 * If time_adjust is negative then NTP is slowing the clock
 		 * so make sure not to go into next possible interval.
 		 * Better to lose some accuracy than have time go backwards..
 		 */
-		if (unlikely(time_adjust < 0)) {
+		if (unlikely(time_adjust < 0))
 			usec = min(usec, max_ntp_tick);
 
-			if (lost)
-				usec += lost * max_ntp_tick;
-		}
-		else if (unlikely(lost))
-			usec += lost * tick_usec;
-
 		sec = xtime.tv_sec;
 		usec += (xtime.tv_nsec / 1000);
 	} while (read_seqretry_irqrestore(&xtime_lock, seq, flags));
@@ -521,8 +510,7 @@ static int sbus_do_settimeofday(struct timespec *tv)
 	 * wall time.  Discover what correction gettimeofday() would have
 	 * made, and then undo it!
 	 */
-	nsec -= 1000 * (do_gettimeoffset() +
-			(jiffies - wall_jiffies) * (USEC_PER_SEC / HZ));
+	nsec -= 1000 * do_gettimeoffset();
 
 	wtm_sec  = wall_to_monotonic.tv_sec + (xtime.tv_sec - sec);
 	wtm_nsec = wall_to_monotonic.tv_nsec + (xtime.tv_nsec - nsec);
diff --git a/arch/sparc64/kernel/time.c b/arch/sparc64/kernel/time.c
index ca1193482f07..00f6fc4aaaff 100644
--- a/arch/sparc64/kernel/time.c
+++ b/arch/sparc64/kernel/time.c
@@ -53,8 +53,6 @@ void __iomem *mstk48t02_regs = NULL;
 unsigned long ds1287_regs = 0UL;
 #endif
 
-extern unsigned long wall_jiffies;
-
 static void __iomem *mstk48t08_regs;
 static void __iomem *mstk48t59_regs;
 
diff --git a/arch/x86_64/kernel/time.c b/arch/x86_64/kernel/time.c
index 7ea3bf2a858c..557e92af7bea 100644
--- a/arch/x86_64/kernel/time.c
+++ b/arch/x86_64/kernel/time.c
@@ -77,7 +77,6 @@ unsigned long long monotonic_base;
 struct vxtime_data __vxtime __section_vxtime;	/* for vsyscalls */
 
 volatile unsigned long __jiffies __section_jiffies = INITIAL_JIFFIES;
-unsigned long __wall_jiffies __section_wall_jiffies = INITIAL_JIFFIES;
 struct timespec __xtime __section_xtime;
 struct timezone __sys_tz __section_sys_tz;
 
@@ -119,7 +118,7 @@ unsigned int (*do_gettimeoffset)(void) = do_gettimeoffset_tsc;
 
 void do_gettimeofday(struct timeval *tv)
 {
-	unsigned long seq, t;
+	unsigned long seq;
  	unsigned int sec, usec;
 
 	do {
@@ -136,10 +135,7 @@ void do_gettimeofday(struct timeval *tv)
 		   be found. Note when you fix it here you need to do the same
 		   in arch/x86_64/kernel/vsyscall.c and export all needed
 		   variables in vmlinux.lds. -AK */ 
-
-		t = (jiffies - wall_jiffies) * USEC_PER_TICK +
-			do_gettimeoffset();
-		usec += t;
+		usec += do_gettimeoffset();
 
 	} while (read_seqretry(&xtime_lock, seq));
 
@@ -165,8 +161,7 @@ int do_settimeofday(struct timespec *tv)
 
 	write_seqlock_irq(&xtime_lock);
 
-	nsec -= do_gettimeoffset() * NSEC_PER_USEC +
-		(jiffies - wall_jiffies) * NSEC_PER_TICK;
+	nsec -= do_gettimeoffset() * NSEC_PER_USEC;
 
 	wtm_sec  = wall_to_monotonic.tv_sec + (xtime.tv_sec - sec);
 	wtm_nsec = wall_to_monotonic.tv_nsec + (xtime.tv_nsec - nsec);
@@ -1071,7 +1066,6 @@ static int timer_resume(struct sys_device *dev)
 		vxtime.last_tsc = get_cycles_sync();
 	write_sequnlock_irqrestore(&xtime_lock,flags);
 	jiffies += sleep_length;
-	wall_jiffies += sleep_length;
 	monotonic_base += sleep_length * (NSEC_PER_SEC/HZ);
 	touch_softlockup_watchdog();
 	return 0;
diff --git a/arch/x86_64/kernel/vmlinux.lds.S b/arch/x86_64/kernel/vmlinux.lds.S
index f8aeccf105fa..b9df2ab6529f 100644
--- a/arch/x86_64/kernel/vmlinux.lds.S
+++ b/arch/x86_64/kernel/vmlinux.lds.S
@@ -101,9 +101,6 @@ SECTIONS
   .vgetcpu_mode : AT(VLOAD(.vgetcpu_mode)) { *(.vgetcpu_mode) }
   vgetcpu_mode = VVIRT(.vgetcpu_mode);
 
-  .wall_jiffies : AT(VLOAD(.wall_jiffies)) { *(.wall_jiffies) }
-  wall_jiffies = VVIRT(.wall_jiffies);
-
   .sys_tz : AT(VLOAD(.sys_tz)) { *(.sys_tz) }
   sys_tz = VVIRT(.sys_tz);
 
diff --git a/arch/x86_64/kernel/vsyscall.c b/arch/x86_64/kernel/vsyscall.c
index 07c086382059..a98b460af6a1 100644
--- a/arch/x86_64/kernel/vsyscall.c
+++ b/arch/x86_64/kernel/vsyscall.c
@@ -66,8 +66,7 @@ static __always_inline void do_vgettimeofday(struct timeval * tv)
 		sequence = read_seqbegin(&__xtime_lock);
 		
 		sec = __xtime.tv_sec;
-		usec = (__xtime.tv_nsec / 1000) +
-			(__jiffies - __wall_jiffies) * (1000000 / HZ);
+		usec = __xtime.tv_nsec / 1000;
 
 		if (__vxtime.mode != VXTIME_HPET) {
 			t = get_cycles_sync();
diff --git a/arch/xtensa/kernel/time.c b/arch/xtensa/kernel/time.c
index 241db201f40e..37347e369987 100644
--- a/arch/xtensa/kernel/time.c
+++ b/arch/xtensa/kernel/time.c
@@ -26,8 +26,6 @@
 #include <asm/platform.h>
 
 
-extern volatile unsigned long wall_jiffies;
-
 DEFINE_SPINLOCK(rtc_lock);
 EXPORT_SYMBOL(rtc_lock);
 
@@ -110,7 +108,6 @@ int do_settimeofday(struct timespec *tv)
 	 */
 	ccount = get_ccount();
 	nsec -= (ccount - last_ccount_stamp) * CCOUNT_NSEC;
-	nsec -= (jiffies - wall_jiffies) * CCOUNT_PER_JIFFY * CCOUNT_NSEC;
 
 	wtm_sec  = wall_to_monotonic.tv_sec + (xtime.tv_sec - sec);
 	wtm_nsec = wall_to_monotonic.tv_nsec + (xtime.tv_nsec - nsec);
@@ -129,7 +126,7 @@ EXPORT_SYMBOL(do_settimeofday);
 void do_gettimeofday(struct timeval *tv)
 {
 	unsigned long flags;
-	unsigned long sec, usec, delta, lost, seq;
+	unsigned long sec, usec, delta, seq;
 
 	do {
 		seq = read_seqbegin_irqsave(&xtime_lock, flags);
@@ -137,12 +134,9 @@ void do_gettimeofday(struct timeval *tv)
 		delta = get_ccount() - last_ccount_stamp;
 		sec = xtime.tv_sec;
 		usec = (xtime.tv_nsec / NSEC_PER_USEC);
-
-		lost = jiffies - wall_jiffies;
-
 	} while (read_seqretry_irqrestore(&xtime_lock, seq, flags));
 
-	usec += lost * (1000000UL/HZ) + (delta * CCOUNT_NSEC) / NSEC_PER_USEC;
+	usec += (delta * CCOUNT_NSEC) / NSEC_PER_USEC;
 	for (; usec >= 1000000; sec++, usec -= 1000000)
 		;
 
@@ -179,8 +173,7 @@ again:
 
 		if (ntp_synced() &&
 		    xtime.tv_sec - last_rtc_update >= 659 &&
-		    abs((xtime.tv_nsec/1000)-(1000000-1000000/HZ))<5000000/HZ &&
-		    jiffies - wall_jiffies == 1) {
+		    abs((xtime.tv_nsec/1000)-(1000000-1000000/HZ))<5000000/HZ) {
 
 			if (platform_set_rtc_time(xtime.tv_sec+1) == 0)
 				last_rtc_update = xtime.tv_sec+1;
diff --git a/include/asm-x86_64/vsyscall.h b/include/asm-x86_64/vsyscall.h
index 2281e9399b96..fd452fc2c037 100644
--- a/include/asm-x86_64/vsyscall.h
+++ b/include/asm-x86_64/vsyscall.h
@@ -17,7 +17,6 @@ enum vsyscall_num {
 
 #define __section_vxtime __attribute__ ((unused, __section__ (".vxtime"), aligned(16)))
 #define __section_vgetcpu_mode __attribute__ ((unused, __section__ (".vgetcpu_mode"), aligned(16)))
-#define __section_wall_jiffies __attribute__ ((unused, __section__ (".wall_jiffies"), aligned(16)))
 #define __section_jiffies __attribute__ ((unused, __section__ (".jiffies"), aligned(16)))
 #define __section_sys_tz __attribute__ ((unused, __section__ (".sys_tz"), aligned(16)))
 #define __section_sysctl_vsyscall __attribute__ ((unused, __section__ (".sysctl_vsyscall"), aligned(16)))
@@ -48,14 +47,12 @@ extern struct vxtime_data __vxtime;
 extern int __vgetcpu_mode;
 extern struct timespec __xtime;
 extern volatile unsigned long __jiffies;
-extern unsigned long __wall_jiffies;
 extern struct timezone __sys_tz;
 extern seqlock_t __xtime_lock;
 
 /* kernel space (writeable) */
 extern struct vxtime_data vxtime;
 extern int vgetcpu_mode;
-extern unsigned long wall_jiffies;
 extern struct timezone sys_tz;
 extern int sysctl_vsyscall;
 extern seqlock_t xtime_lock;
diff --git a/kernel/timer.c b/kernel/timer.c
index 654dd5df2417..c1c7fbcffec1 100644
--- a/kernel/timer.c
+++ b/kernel/timer.c
@@ -768,7 +768,7 @@ static int timekeeping_suspended;
  * @dev:	unused
  *
  * This is for the generic clocksource timekeeping.
- * xtime/wall_to_monotonic/jiffies/wall_jiffies/etc are
+ * xtime/wall_to_monotonic/jiffies/etc are
  * still managed by arch specific suspend/resume code.
  */
 static int timekeeping_resume(struct sys_device *dev)
@@ -1016,9 +1016,6 @@ static inline void calc_load(unsigned long ticks)
 	}
 }
 
-/* jiffies at the most recent update of wall time */
-unsigned long wall_jiffies = INITIAL_JIFFIES;
-
 /*
  * This read-write spinlock protects us from races in SMP while
  * playing with xtime and avenrun.
@@ -1056,7 +1053,6 @@ void run_local_timers(void)
  */
 static inline void update_times(unsigned long ticks)
 {
-	wall_jiffies += ticks;
 	update_wall_time();
 	calc_load(ticks);
 }
-- 
cgit v1.2.3


From 0ae646845b603e9df5711084436d389f8371ffb3 Mon Sep 17 00:00:00 2001
From: Balbir Singh <balbir@in.ibm.com>
Date: Sat, 30 Sep 2006 23:28:53 -0700
Subject: [PATCH] Fix taskstats size calculation (use the new genetlink utility
 functions)

The addition of the CSA patch pushed the size of struct taskstats to 256
bytes.  This exposed a problem with prepare_reply(), we were not allocating
space for the netlink and genetlink header.  It worked earlier because
alloc_skb() would align the skb to SMP_CACHE_BYTES, which added some additonal
bytes.

Signed-off-by: Balbir Singh <balbir@in.ibm.com>
Cc: Jamal Hadi <hadi@cyberus.ca>
Cc: Shailabh Nagar <nagar@watson.ibm.com>
Cc: Thomas Graf <tgraf@suug.ch>
Cc: "David S. Miller" <davem@davemloft.net>
Cc: Jay Lan <jlan@engr.sgi.com>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
---
 kernel/taskstats.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'kernel')

diff --git a/kernel/taskstats.c b/kernel/taskstats.c
index 2ed4040d0dc5..c451af2ddb50 100644
--- a/kernel/taskstats.c
+++ b/kernel/taskstats.c
@@ -75,7 +75,7 @@ static int prepare_reply(struct genl_info *info, u8 cmd, struct sk_buff **skbp,
 	/*
 	 * If new attributes are added, please revisit this allocation
 	 */
-	skb = nlmsg_new(size, GFP_KERNEL);
+	skb = nlmsg_new(genlmsg_total_size(size), GFP_KERNEL);
 	if (!skb)
 		return -ENOMEM;
 
-- 
cgit v1.2.3


From f3cef7a99469afc159fec3a61b42dc7ca5b6824f Mon Sep 17 00:00:00 2001
From: Jay Lan <jlan@engr.sgi.com>
Date: Sat, 30 Sep 2006 23:28:55 -0700
Subject: [PATCH] csa: basic accounting over taskstats

Add some basic accounting fields to the taskstats struct, add a new
kernel/tsacct.c to handle basic accounting data handling upon exit.  A handle
is added to taskstats.c to invoke the basic accounting data handling.

Signed-off-by: Jay Lan <jlan@sgi.com>
Cc: Shailabh Nagar <nagar@watson.ibm.com>
Cc: Balbir Singh <balbir@in.ibm.com>
Cc: Jes Sorensen <jes@sgi.com>
Cc: Chris Sturtivant <csturtiv@sgi.com>
Cc: Tony Ernst <tee@sgi.com>
Cc: Guillaume Thouvenin <guillaume.thouvenin@bull.net>
Cc: "Michal Piotrowski" <michal.k.k.piotrowski@gmail.com>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
---
 include/linux/taskstats.h   | 29 ++++++++++++++----
 include/linux/tsacct_kern.h | 19 ++++++++++++
 kernel/Makefile             |  2 +-
 kernel/taskstats.c          |  4 +++
 kernel/tsacct.c             | 72 +++++++++++++++++++++++++++++++++++++++++++++
 5 files changed, 120 insertions(+), 6 deletions(-)
 create mode 100644 include/linux/tsacct_kern.h
 create mode 100644 kernel/tsacct.c

(limited to 'kernel')

diff --git a/include/linux/taskstats.h b/include/linux/taskstats.h
index f1cb6cddd19d..af93a63a5092 100644
--- a/include/linux/taskstats.h
+++ b/include/linux/taskstats.h
@@ -2,6 +2,7 @@
  *
  * Copyright (C) Shailabh Nagar, IBM Corp. 2006
  *           (C) Balbir Singh,   IBM Corp. 2006
+ *           (C) Jay Lan,        SGI, 2006
  *
  * This program is free software; you can redistribute it and/or modify it
  * under the terms of version 2.1 of the GNU Lesser General Public License
@@ -29,16 +30,18 @@
  *	c) add new fields after version comment; maintain 64-bit alignment
  */
 
-#define TASKSTATS_VERSION	1
+
+#define TASKSTATS_VERSION	2
+#define TS_COMM_LEN		16	/* should sync up with TASK_COMM_LEN
+					 * in linux/sched.h */
 
 struct taskstats {
 
 	/* Version 1 */
 	__u16	version;
-	__u16	padding[3];	/* Userspace should not interpret the padding
-				 * field which can be replaced by useful
-				 * fields if struct taskstats is extended.
-				 */
+	__u32	ac_exitcode;	/* Exit status */
+	__u8	ac_flag;		/* Record flags */
+	__u8	ac_nice;		/* task_nice */
 
 	/* Delay accounting fields start
 	 *
@@ -88,6 +91,22 @@ struct taskstats {
 	__u64	cpu_run_virtual_total;
 	/* Delay accounting fields end */
 	/* version 1 ends here */
+
+	/* Basic Accounting Fields start */
+	char	ac_comm[TS_COMM_LEN];	/* Command name */
+	__u8	ac_sched;		/* Scheduling discipline */
+	__u8	ac_pad[3];
+	__u32	ac_uid;			/* User ID */
+	__u32	ac_gid;			/* Group ID */
+	__u32	ac_pid;			/* Process ID */
+	__u32	ac_ppid;		/* Parent process ID */
+	__u32	ac_btime;		/* Begin time [sec since 1970] */
+	__u64	ac_etime;		/* Elapsed time [usec] */
+	__u64	ac_utime;		/* User CPU time [usec] */
+	__u64	ac_stime;		/* SYstem CPU time [usec] */
+	__u64	ac_minflt;		/* Minor Page Fault */
+	__u64	ac_majflt;		/* Major Page Fault */
+	/* Basic Accounting Fields end */
 };
 
 
diff --git a/include/linux/tsacct_kern.h b/include/linux/tsacct_kern.h
new file mode 100644
index 000000000000..7e8196a02118
--- /dev/null
+++ b/include/linux/tsacct_kern.h
@@ -0,0 +1,19 @@
+/*
+ * tsacct_kern.h - kernel header for system accounting over taskstats interface
+ *
+ * Copyright (C) Jay Lan	SGI
+ */
+
+#ifndef _LINUX_TSACCT_KERN_H
+#define _LINUX_TSACCT_KERN_H
+
+#include <linux/taskstats.h>
+
+#ifdef CONFIG_TASKSTATS
+extern void bacct_add_tsk(struct taskstats *stats, struct task_struct *tsk);
+#else
+static inline void bacct_add_tsk(struct taskstats *stats, struct task_struct *tsk)
+{}
+#endif /* CONFIG_TASKSTATS */
+
+#endif
diff --git a/kernel/Makefile b/kernel/Makefile
index e210e8cf7237..aacaafb28b9d 100644
--- a/kernel/Makefile
+++ b/kernel/Makefile
@@ -49,7 +49,7 @@ obj-$(CONFIG_SECCOMP) += seccomp.o
 obj-$(CONFIG_RCU_TORTURE_TEST) += rcutorture.o
 obj-$(CONFIG_RELAY) += relay.o
 obj-$(CONFIG_TASK_DELAY_ACCT) += delayacct.o
-obj-$(CONFIG_TASKSTATS) += taskstats.o
+obj-$(CONFIG_TASKSTATS) += taskstats.o tsacct.o
 
 ifneq ($(CONFIG_SCHED_NO_NO_OMIT_FRAME_POINTER),y)
 # According to Alan Modra <alan@linuxcare.com.au>, the -fno-omit-frame-pointer is
diff --git a/kernel/taskstats.c b/kernel/taskstats.c
index c451af2ddb50..6c38dce88e8c 100644
--- a/kernel/taskstats.c
+++ b/kernel/taskstats.c
@@ -18,6 +18,7 @@
 
 #include <linux/kernel.h>
 #include <linux/taskstats_kern.h>
+#include <linux/tsacct_kern.h>
 #include <linux/delayacct.h>
 #include <linux/cpumask.h>
 #include <linux/percpu.h>
@@ -198,7 +199,10 @@ static int fill_pid(pid_t pid, struct task_struct *pidtsk,
 	 */
 
 	delayacct_add_tsk(stats, tsk);
+
+	/* fill in basic acct fields */
 	stats->version = TASKSTATS_VERSION;
+	bacct_add_tsk(stats, tsk);
 
 	/* Define err: label here if needed */
 	put_task_struct(tsk);
diff --git a/kernel/tsacct.c b/kernel/tsacct.c
new file mode 100644
index 000000000000..899067950a88
--- /dev/null
+++ b/kernel/tsacct.c
@@ -0,0 +1,72 @@
+/*
+ * tsacct.c - System accounting over taskstats interface
+ *
+ * Copyright (C) Jay Lan,	<jlan@sgi.com>
+ *
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ */
+
+#include <linux/kernel.h>
+#include <linux/sched.h>
+#include <linux/tsacct_kern.h>
+#include <linux/acct.h>
+
+
+#define USEC_PER_TICK	(USEC_PER_SEC/HZ)
+/*
+ * fill in basic accounting fields
+ */
+void bacct_add_tsk(struct taskstats *stats, struct task_struct *tsk)
+{
+	struct timespec uptime, ts;
+	s64 ac_etime;
+
+	BUILD_BUG_ON(TS_COMM_LEN < TASK_COMM_LEN);
+
+	/* calculate task elapsed time in timespec */
+	do_posix_clock_monotonic_gettime(&uptime);
+	ts = timespec_sub(uptime, current->group_leader->start_time);
+	/* rebase elapsed time to usec */
+	ac_etime = timespec_to_ns(&ts);
+	do_div(ac_etime, NSEC_PER_USEC);
+	stats->ac_etime = ac_etime;
+	stats->ac_btime = xtime.tv_sec - ts.tv_sec;
+	if (thread_group_leader(tsk)) {
+		stats->ac_exitcode = tsk->exit_code;
+		if (tsk->flags & PF_FORKNOEXEC)
+			stats->ac_flag |= AFORK;
+	}
+	if (tsk->flags & PF_SUPERPRIV)
+		stats->ac_flag |= ASU;
+	if (tsk->flags & PF_DUMPCORE)
+		stats->ac_flag |= ACORE;
+	if (tsk->flags & PF_SIGNALED)
+		stats->ac_flag |= AXSIG;
+	stats->ac_nice	 = task_nice(tsk);
+	stats->ac_sched	 = tsk->policy;
+	stats->ac_uid	 = tsk->uid;
+	stats->ac_gid	 = tsk->gid;
+	stats->ac_pid	 = tsk->pid;
+	stats->ac_ppid	 = (tsk->parent) ? tsk->parent->pid : 0;
+	stats->ac_utime	 = cputime_to_msecs(tsk->utime) * USEC_PER_MSEC;
+	stats->ac_stime	 = cputime_to_msecs(tsk->stime) * USEC_PER_MSEC;
+	stats->ac_minflt = tsk->min_flt;
+	stats->ac_majflt = tsk->maj_flt;
+	/* Each process gets a minimum of one usec cpu time */
+	if ((stats->ac_utime == 0) && (stats->ac_stime == 0)) {
+		stats->ac_stime = 1;
+	}
+
+	strncpy(stats->ac_comm, tsk->comm, sizeof(stats->ac_comm));
+}
+
-- 
cgit v1.2.3


From 9acc1853519a0473620d424105f9d49ea5b4e62e Mon Sep 17 00:00:00 2001
From: Jay Lan <jlan@engr.sgi.com>
Date: Sat, 30 Sep 2006 23:28:58 -0700
Subject: [PATCH] csa: Extended system accounting over taskstats

Add extended system accounting handling over taskstats interface.  A
CONFIG_TASK_XACCT flag is created to enable the extended accounting code.

Signed-off-by: Jay Lan <jlan@sgi.com>
Cc: Shailabh Nagar <nagar@watson.ibm.com>
Cc: Balbir Singh <balbir@in.ibm.com>
Cc: Jes Sorensen <jes@sgi.com>
Cc: Chris Sturtivant <csturtiv@sgi.com>
Cc: Tony Ernst <tee@sgi.com>
Cc: Guillaume Thouvenin <guillaume.thouvenin@bull.net>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
---
 include/linux/taskstats.h   | 11 +++++++++++
 include/linux/tsacct_kern.h |  9 +++++++++
 init/Kconfig                |  9 +++++++++
 kernel/taskstats.c          |  4 ++++
 kernel/tsacct.c             | 19 +++++++++++++++++++
 5 files changed, 52 insertions(+)

(limited to 'kernel')

diff --git a/include/linux/taskstats.h b/include/linux/taskstats.h
index af93a63a5092..3d2c304886b0 100644
--- a/include/linux/taskstats.h
+++ b/include/linux/taskstats.h
@@ -107,6 +107,17 @@ struct taskstats {
 	__u64	ac_minflt;		/* Minor Page Fault */
 	__u64	ac_majflt;		/* Major Page Fault */
 	/* Basic Accounting Fields end */
+
+ 	/* Extended accounting fields start */
+ 	__u64	acct_rss_mem1;		/* accumulated rss usage */
+ 	__u64	acct_vm_mem1;		/* accumulated virtual memory usage */
+ 	__u64	hiwater_rss;		/* High-watermark of RSS usage */
+ 	__u64	hiwater_vm;		/* High-water virtual memory usage */
+ 	__u64	read_char;		/* bytes read */
+ 	__u64	write_char;		/* bytes written */
+ 	__u64	read_syscalls;		/* read syscalls */
+ 	__u64	write_syscalls;		/* write syscalls */
+ 	/* Extended accounting fields end */
 };
 
 
diff --git a/include/linux/tsacct_kern.h b/include/linux/tsacct_kern.h
index 7e8196a02118..74102dcae67a 100644
--- a/include/linux/tsacct_kern.h
+++ b/include/linux/tsacct_kern.h
@@ -16,4 +16,13 @@ static inline void bacct_add_tsk(struct taskstats *stats, struct task_struct *ts
 {}
 #endif /* CONFIG_TASKSTATS */
 
+#ifdef CONFIG_TASK_XACCT
+extern void xacct_add_tsk(struct taskstats *stats, struct task_struct *p);
+#else
+static inline void xacct_add_tsk(struct taskstats *stats, struct task_struct *p)
+{}
+#endif /* CONFIG_TASK_XACCT */
+
 #endif
+
+
diff --git a/init/Kconfig b/init/Kconfig
index d2d72704f875..f7a04d0daf07 100644
--- a/init/Kconfig
+++ b/init/Kconfig
@@ -257,6 +257,15 @@ config CC_OPTIMIZE_FOR_SIZE
 
 	  If unsure, say N.
 
+config TASK_XACCT
+	bool "Enable extended accounting over taskstats (EXPERIMENTAL)"
+	depends on TASKSTATS
+	help
+	  Collect extended task accounting data and send the data
+	  to userland for processing over the taskstats interface.
+
+	  Say N if unsure.
+
 config SYSCTL
 	bool
 
diff --git a/kernel/taskstats.c b/kernel/taskstats.c
index 6c38dce88e8c..5d6a8c54ee85 100644
--- a/kernel/taskstats.c
+++ b/kernel/taskstats.c
@@ -20,6 +20,7 @@
 #include <linux/taskstats_kern.h>
 #include <linux/tsacct_kern.h>
 #include <linux/delayacct.h>
+#include <linux/tsacct_kern.h>
 #include <linux/cpumask.h>
 #include <linux/percpu.h>
 #include <net/genetlink.h>
@@ -204,6 +205,9 @@ static int fill_pid(pid_t pid, struct task_struct *pidtsk,
 	stats->version = TASKSTATS_VERSION;
 	bacct_add_tsk(stats, tsk);
 
+	/* fill in extended acct fields */
+	xacct_add_tsk(stats, tsk);
+
 	/* Define err: label here if needed */
 	put_task_struct(tsk);
 	return rc;
diff --git a/kernel/tsacct.c b/kernel/tsacct.c
index 899067950a88..410483490cf6 100644
--- a/kernel/tsacct.c
+++ b/kernel/tsacct.c
@@ -70,3 +70,22 @@ void bacct_add_tsk(struct taskstats *stats, struct task_struct *tsk)
 	strncpy(stats->ac_comm, tsk->comm, sizeof(stats->ac_comm));
 }
 
+
+#ifdef CONFIG_TASK_XACCT
+/*
+ * fill in extended accounting fields
+ */
+void xacct_add_tsk(struct taskstats *stats, struct task_struct *p)
+{
+	stats->acct_rss_mem1 = p->acct_rss_mem1;
+	stats->acct_vm_mem1  = p->acct_vm_mem1;
+	if (p->mm) {
+		stats->hiwater_rss   = p->mm->hiwater_rss;
+		stats->hiwater_vm    = p->mm->hiwater_vm;
+	}
+	stats->read_char	= p->rchar;
+	stats->write_char	= p->wchar;
+	stats->read_syscalls	= p->syscr;
+	stats->write_syscalls	= p->syscw;
+}
+#endif
-- 
cgit v1.2.3


From 8f0ab5147951267134612570604cf8341901a80c Mon Sep 17 00:00:00 2001
From: Jay Lan <jlan@engr.sgi.com>
Date: Sat, 30 Sep 2006 23:28:59 -0700
Subject: [PATCH] csa: convert CONFIG tag for extended accounting routines

There were a few accounting data/macros that are used in CSA but are #ifdef'ed
inside CONFIG_BSD_PROCESS_ACCT.  This patch is to change those ifdef's from
CONFIG_BSD_PROCESS_ACCT to CONFIG_TASK_XACCT.  A few defines are moved from
kernel/acct.c and include/linux/acct.h to kernel/tsacct.c and
include/linux/tsacct_kern.h.

Signed-off-by: Jay Lan <jlan@sgi.com>
Cc: Shailabh Nagar <nagar@watson.ibm.com>
Cc: Balbir Singh <balbir@in.ibm.com>
Cc: Jes Sorensen <jes@sgi.com>
Cc: Chris Sturtivant <csturtiv@sgi.com>
Cc: Tony Ernst <tee@sgi.com>
Cc: Guillaume Thouvenin <guillaume.thouvenin@bull.net>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
---
 fs/compat.c                 |  2 +-
 fs/exec.c                   |  2 +-
 include/linux/acct.h        |  4 ----
 include/linux/sched.h       |  2 +-
 include/linux/tsacct_kern.h |  6 ++++++
 kernel/acct.c               | 30 ------------------------------
 kernel/exit.c               |  1 +
 kernel/fork.c               |  1 +
 kernel/sched.c              |  2 +-
 kernel/tsacct.c             | 30 ++++++++++++++++++++++++++++++
 10 files changed, 42 insertions(+), 38 deletions(-)

(limited to 'kernel')

diff --git a/fs/compat.c b/fs/compat.c
index 6b90bf35f61d..13fb08d096c4 100644
--- a/fs/compat.c
+++ b/fs/compat.c
@@ -44,7 +44,7 @@
 #include <linux/nfsd/syscall.h>
 #include <linux/personality.h>
 #include <linux/rwsem.h>
-#include <linux/acct.h>
+#include <linux/tsacct_kern.h>
 #include <linux/mm.h>
 
 #include <net/sock.h>		/* siocdevprivate_ioctl */
diff --git a/fs/exec.c b/fs/exec.c
index a8efe35176b0..0db3fc3c5f0f 100644
--- a/fs/exec.c
+++ b/fs/exec.c
@@ -46,7 +46,7 @@
 #include <linux/security.h>
 #include <linux/syscalls.h>
 #include <linux/rmap.h>
-#include <linux/acct.h>
+#include <linux/tsacct_kern.h>
 #include <linux/cn_proc.h>
 #include <linux/audit.h>
 
diff --git a/include/linux/acct.h b/include/linux/acct.h
index e86bae7324d2..0496d1f09952 100644
--- a/include/linux/acct.h
+++ b/include/linux/acct.h
@@ -124,16 +124,12 @@ extern void acct_auto_close(struct super_block *sb);
 extern void acct_init_pacct(struct pacct_struct *pacct);
 extern void acct_collect(long exitcode, int group_dead);
 extern void acct_process(void);
-extern void acct_update_integrals(struct task_struct *tsk);
-extern void acct_clear_integrals(struct task_struct *tsk);
 #else
 #define acct_auto_close_mnt(x)	do { } while (0)
 #define acct_auto_close(x)	do { } while (0)
 #define acct_init_pacct(x)	do { } while (0)
 #define acct_collect(x,y)	do { } while (0)
 #define acct_process()		do { } while (0)
-#define acct_update_integrals(x)		do { } while (0)
-#define acct_clear_integrals(task)	do { } while (0)
 #endif
 
 /*
diff --git a/include/linux/sched.h b/include/linux/sched.h
index fc4a9873ec10..4ddeb0f982fb 100644
--- a/include/linux/sched.h
+++ b/include/linux/sched.h
@@ -981,7 +981,7 @@ struct task_struct {
 	wait_queue_t *io_wait;
 /* i/o counters(bytes read/written, #syscalls */
 	u64 rchar, wchar, syscr, syscw;
-#if defined(CONFIG_BSD_PROCESS_ACCT)
+#if defined(CONFIG_TASK_XACCT)
 	u64 acct_rss_mem1;	/* accumulated rss usage */
 	u64 acct_vm_mem1;	/* accumulated virtual memory usage */
 	clock_t acct_stimexpd;	/* clock_t-converted stime since last update */
diff --git a/include/linux/tsacct_kern.h b/include/linux/tsacct_kern.h
index 74102dcae67a..7e50ac795b0b 100644
--- a/include/linux/tsacct_kern.h
+++ b/include/linux/tsacct_kern.h
@@ -18,9 +18,15 @@ static inline void bacct_add_tsk(struct taskstats *stats, struct task_struct *ts
 
 #ifdef CONFIG_TASK_XACCT
 extern void xacct_add_tsk(struct taskstats *stats, struct task_struct *p);
+extern void acct_update_integrals(struct task_struct *tsk);
+extern void acct_clear_integrals(struct task_struct *tsk);
 #else
 static inline void xacct_add_tsk(struct taskstats *stats, struct task_struct *p)
 {}
+static inline void acct_update_integrals(struct task_struct *tsk)
+{}
+static inline void acct_clear_integrals(struct task_struct *tsk)
+{}
 #endif /* CONFIG_TASK_XACCT */
 
 #endif
diff --git a/kernel/acct.c b/kernel/acct.c
index f4330acead46..0aad5ca36a81 100644
--- a/kernel/acct.c
+++ b/kernel/acct.c
@@ -602,33 +602,3 @@ void acct_process(void)
 	do_acct_process(file);
 	fput(file);
 }
-
-
-/**
- * acct_update_integrals - update mm integral fields in task_struct
- * @tsk: task_struct for accounting
- */
-void acct_update_integrals(struct task_struct *tsk)
-{
-	if (likely(tsk->mm)) {
-		long delta =
-			cputime_to_jiffies(tsk->stime) - tsk->acct_stimexpd;
-
-		if (delta == 0)
-			return;
-		tsk->acct_stimexpd = tsk->stime;
-		tsk->acct_rss_mem1 += delta * get_mm_rss(tsk->mm);
-		tsk->acct_vm_mem1 += delta * tsk->mm->total_vm;
-	}
-}
-
-/**
- * acct_clear_integrals - clear the mm integral fields in task_struct
- * @tsk: task_struct whose accounting fields are cleared
- */
-void acct_clear_integrals(struct task_struct *tsk)
-{
-	tsk->acct_stimexpd = 0;
-	tsk->acct_rss_mem1 = 0;
-	tsk->acct_vm_mem1 = 0;
-}
diff --git a/kernel/exit.c b/kernel/exit.c
index c189de2927ab..3b47f26985f2 100644
--- a/kernel/exit.c
+++ b/kernel/exit.c
@@ -18,6 +18,7 @@
 #include <linux/security.h>
 #include <linux/cpu.h>
 #include <linux/acct.h>
+#include <linux/tsacct_kern.h>
 #include <linux/file.h>
 #include <linux/binfmts.h>
 #include <linux/ptrace.h>
diff --git a/kernel/fork.c b/kernel/fork.c
index 1c999f3e0b47..89f666491d1f 100644
--- a/kernel/fork.c
+++ b/kernel/fork.c
@@ -42,6 +42,7 @@
 #include <linux/profile.h>
 #include <linux/rmap.h>
 #include <linux/acct.h>
+#include <linux/tsacct_kern.h>
 #include <linux/cn_proc.h>
 #include <linux/delayacct.h>
 #include <linux/taskstats_kern.h>
diff --git a/kernel/sched.c b/kernel/sched.c
index 74f169ac0773..2bbd948f0169 100644
--- a/kernel/sched.c
+++ b/kernel/sched.c
@@ -49,7 +49,7 @@
 #include <linux/seq_file.h>
 #include <linux/syscalls.h>
 #include <linux/times.h>
-#include <linux/acct.h>
+#include <linux/tsacct_kern.h>
 #include <linux/kprobes.h>
 #include <linux/delayacct.h>
 #include <asm/tlb.h>
diff --git a/kernel/tsacct.c b/kernel/tsacct.c
index 410483490cf6..47c71daa416f 100644
--- a/kernel/tsacct.c
+++ b/kernel/tsacct.c
@@ -88,4 +88,34 @@ void xacct_add_tsk(struct taskstats *stats, struct task_struct *p)
 	stats->read_syscalls	= p->syscr;
 	stats->write_syscalls	= p->syscw;
 }
+
+
+/**
+ * acct_update_integrals - update mm integral fields in task_struct
+ * @tsk: task_struct for accounting
+ */
+void acct_update_integrals(struct task_struct *tsk)
+{
+	if (likely(tsk->mm)) {
+		long delta =
+			cputime_to_jiffies(tsk->stime) - tsk->acct_stimexpd;
+
+		if (delta == 0)
+			return;
+		tsk->acct_stimexpd = tsk->stime;
+		tsk->acct_rss_mem1 += delta * get_mm_rss(tsk->mm);
+		tsk->acct_vm_mem1 += delta * tsk->mm->total_vm;
+	}
+}
+
+/**
+ * acct_clear_integrals - clear the mm integral fields in task_struct
+ * @tsk: task_struct whose accounting fields are cleared
+ */
+void acct_clear_integrals(struct task_struct *tsk)
+{
+	tsk->acct_stimexpd = 0;
+	tsk->acct_rss_mem1 = 0;
+	tsk->acct_vm_mem1 = 0;
+}
 #endif
-- 
cgit v1.2.3


From db5fed26b2e0beed939b773dd5896077a1794d65 Mon Sep 17 00:00:00 2001
From: Jay Lan <jlan@sgi.com>
Date: Sat, 30 Sep 2006 23:29:00 -0700
Subject: [PATCH] csa accounting taskstats update

ChangeLog:
   Feedbacks from Andrew Morton:
   - define TS_COMM_LEN to 32
   - change acct_stimexpd field of task_struct to be of
     cputime_t, which is to be used to save the tsk->stime
     of last timer interrupt update.
   - a new Documentation/accounting/taskstats-struct.txt
     to describe fields of taskstats struct.

   Feedback from Balbir Singh:
   - keep the stime of a task to be zero when both stime
     and utime are zero as recoreded in task_struct.

   Misc:
   - convert accumulated RSS/VM from platform dependent
     pages-ticks to MBytes-usecs in the kernel

Cc: Shailabh Nagar <nagar@watson.ibm.com>
Cc: Balbir Singh <balbir@in.ibm.com>
Cc: Jes Sorensen <jes@sgi.com>
Cc: Chris Sturtivant <csturtiv@sgi.com>
Cc: Tony Ernst <tee@sgi.com>
Cc: Guillaume Thouvenin <guillaume.thouvenin@bull.net>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
---
 Documentation/accounting/taskstats-struct.txt | 161 ++++++++++++++++++++++++++
 include/linux/sched.h                         |   2 +-
 include/linux/taskstats.h                     |  40 +++++--
 kernel/tsacct.c                               |  25 ++--
 4 files changed, 207 insertions(+), 21 deletions(-)
 create mode 100644 Documentation/accounting/taskstats-struct.txt

(limited to 'kernel')

diff --git a/Documentation/accounting/taskstats-struct.txt b/Documentation/accounting/taskstats-struct.txt
new file mode 100644
index 000000000000..661c797eaf79
--- /dev/null
+++ b/Documentation/accounting/taskstats-struct.txt
@@ -0,0 +1,161 @@
+The struct taskstats
+--------------------
+
+This document contains an explanation of the struct taskstats fields.
+
+There are three different groups of fields in the struct taskstats:
+
+1) Common and basic accounting fields
+    If CONFIG_TASKSTATS is set, the taskstats inteface is enabled and
+    the common fields and basic accounting fields are collected for
+    delivery at do_exit() of a task.
+2) Delay accounting fields
+    These fields are placed between
+    /* Delay accounting fields start */
+    and
+    /* Delay accounting fields end */
+    Their values are collected if CONFIG_TASK_DELAY_ACCT is set.
+3) Extended accounting fields
+    These fields are placed between
+    /* Extended accounting fields start */
+    and
+    /* Extended accounting fields end */
+    Their values are collected if CONFIG_TASK_XACCT is set.
+
+Future extension should add fields to the end of the taskstats struct, and
+should not change the relative position of each field within the struct.
+
+
+struct taskstats {
+
+1) Common and basic accounting fields:
+	/* The version number of this struct. This field is always set to
+	 * TAKSTATS_VERSION, which is defined in <linux/taskstats.h>.
+	 * Each time the struct is changed, the value should be incremented.
+	 */
+	__u16	version;
+
+  	/* The exit code of a task. */
+	__u32	ac_exitcode;		/* Exit status */
+
+  	/* The accounting flags of a task as defined in <linux/acct.h>
+	 * Defined values are AFORK, ASU, ACOMPAT, ACORE, and AXSIG.
+	 */
+	__u8	ac_flag;		/* Record flags */
+
+  	/* The value of task_nice() of a task. */
+	__u8	ac_nice;		/* task_nice */
+
+  	/* The name of the command that started this task. */
+	char	ac_comm[TS_COMM_LEN];	/* Command name */
+
+  	/* The scheduling discipline as set in task->policy field. */
+	__u8	ac_sched;		/* Scheduling discipline */
+
+	__u8	ac_pad[3];
+	__u32	ac_uid;			/* User ID */
+	__u32	ac_gid;			/* Group ID */
+	__u32	ac_pid;			/* Process ID */
+	__u32	ac_ppid;		/* Parent process ID */
+
+  	/* The time when a task begins, in [secs] since 1970. */
+	__u32	ac_btime;		/* Begin time [sec since 1970] */
+
+  	/* The elapsed time of a task, in [usec]. */
+	__u64	ac_etime;		/* Elapsed time [usec] */
+
+  	/* The user CPU time of a task, in [usec]. */
+	__u64	ac_utime;		/* User CPU time [usec] */
+
+  	/* The system CPU time of a task, in [usec]. */
+	__u64	ac_stime;		/* System CPU time [usec] */
+
+  	/* The minor page fault count of a task, as set in task->min_flt. */
+	__u64	ac_minflt;		/* Minor Page Fault Count */
+
+	/* The major page fault count of a task, as set in task->maj_flt. */
+	__u64	ac_majflt;		/* Major Page Fault Count */
+
+
+2) Delay accounting fields:
+	/* Delay accounting fields start
+	 *
+	 * All values, until the comment "Delay accounting fields end" are
+	 * available only if delay accounting is enabled, even though the last
+	 * few fields are not delays
+	 *
+	 * xxx_count is the number of delay values recorded
+	 * xxx_delay_total is the corresponding cumulative delay in nanoseconds
+	 *
+	 * xxx_delay_total wraps around to zero on overflow
+	 * xxx_count incremented regardless of overflow
+	 */
+
+	/* Delay waiting for cpu, while runnable
+	 * count, delay_total NOT updated atomically
+	 */
+	__u64	cpu_count;
+	__u64	cpu_delay_total;
+
+	/* Following four fields atomically updated using task->delays->lock */
+
+	/* Delay waiting for synchronous block I/O to complete
+	 * does not account for delays in I/O submission
+	 */
+	__u64	blkio_count;
+	__u64	blkio_delay_total;
+
+	/* Delay waiting for page fault I/O (swap in only) */
+	__u64	swapin_count;
+	__u64	swapin_delay_total;
+
+	/* cpu "wall-clock" running time
+	 * On some architectures, value will adjust for cpu time stolen
+	 * from the kernel in involuntary waits due to virtualization.
+	 * Value is cumulative, in nanoseconds, without a corresponding count
+	 * and wraps around to zero silently on overflow
+	 */
+	__u64	cpu_run_real_total;
+
+	/* cpu "virtual" running time
+	 * Uses time intervals seen by the kernel i.e. no adjustment
+	 * for kernel's involuntary waits due to virtualization.
+	 * Value is cumulative, in nanoseconds, without a corresponding count
+	 * and wraps around to zero silently on overflow
+	 */
+	__u64	cpu_run_virtual_total;
+	/* Delay accounting fields end */
+	/* version 1 ends here */
+
+
+3) Extended accounting fields
+	/* Extended accounting fields start */
+
+	/* Accumulated RSS usage in duration of a task, in MBytes-usecs.
+	 * The current rss usage is added to this counter every time
+	 * a tick is charged to a task's system time. So, at the end we
+	 * will have memory usage multiplied by system time. Thus an
+	 * average usage per system time unit can be calculated.
+	 */
+	__u64	coremem;		/* accumulated RSS usage in MB-usec */
+
+  	/* Accumulated virtual memory usage in duration of a task.
+	 * Same as acct_rss_mem1 above except that we keep track of VM usage.
+	 */
+	__u64	virtmem;		/* accumulated VM usage in MB-usec */
+
+  	/* High watermark of RSS usage in duration of a task, in KBytes. */
+	__u64	hiwater_rss;		/* High-watermark of RSS usage */
+
+  	/* High watermark of VM  usage in duration of a task, in KBytes. */
+	__u64	hiwater_vm;		/* High-water virtual memory usage */
+
+	/* The following four fields are I/O statistics of a task. */
+	__u64	read_char;		/* bytes read */
+	__u64	write_char;		/* bytes written */
+	__u64	read_syscalls;		/* read syscalls */
+	__u64	write_syscalls;		/* write syscalls */
+
+	/* Extended accounting fields end */
+
+}
diff --git a/include/linux/sched.h b/include/linux/sched.h
index 4ddeb0f982fb..7ef899c47c29 100644
--- a/include/linux/sched.h
+++ b/include/linux/sched.h
@@ -984,7 +984,7 @@ struct task_struct {
 #if defined(CONFIG_TASK_XACCT)
 	u64 acct_rss_mem1;	/* accumulated rss usage */
 	u64 acct_vm_mem1;	/* accumulated virtual memory usage */
-	clock_t acct_stimexpd;	/* clock_t-converted stime since last update */
+	cputime_t acct_stimexpd;/* stime since last update */
 #endif
 #ifdef CONFIG_NUMA
   	struct mempolicy *mempolicy;
diff --git a/include/linux/taskstats.h b/include/linux/taskstats.h
index 3d2c304886b0..45248806ae9c 100644
--- a/include/linux/taskstats.h
+++ b/include/linux/taskstats.h
@@ -32,14 +32,21 @@
 
 
 #define TASKSTATS_VERSION	2
-#define TS_COMM_LEN		16	/* should sync up with TASK_COMM_LEN
+#define TS_COMM_LEN		32	/* should be >= TASK_COMM_LEN
 					 * in linux/sched.h */
 
 struct taskstats {
 
-	/* Version 1 */
+	/* The version number of this struct. This field is always set to
+	 * TAKSTATS_VERSION, which is defined in <linux/taskstats.h>.
+	 * Each time the struct is changed, the value should be incremented.
+	 */
 	__u16	version;
-	__u32	ac_exitcode;	/* Exit status */
+	__u32	ac_exitcode;		/* Exit status */
+
+	/* The accounting flags of a task as defined in <linux/acct.h>
+	 * Defined values are AFORK, ASU, ACOMPAT, ACORE, and AXSIG.
+	 */
 	__u8	ac_flag;		/* Record flags */
 	__u8	ac_nice;		/* task_nice */
 
@@ -104,15 +111,30 @@ struct taskstats {
 	__u64	ac_etime;		/* Elapsed time [usec] */
 	__u64	ac_utime;		/* User CPU time [usec] */
 	__u64	ac_stime;		/* SYstem CPU time [usec] */
-	__u64	ac_minflt;		/* Minor Page Fault */
-	__u64	ac_majflt;		/* Major Page Fault */
+	__u64	ac_minflt;		/* Minor Page Fault Count */
+	__u64	ac_majflt;		/* Major Page Fault Count */
 	/* Basic Accounting Fields end */
 
  	/* Extended accounting fields start */
- 	__u64	acct_rss_mem1;		/* accumulated rss usage */
- 	__u64	acct_vm_mem1;		/* accumulated virtual memory usage */
- 	__u64	hiwater_rss;		/* High-watermark of RSS usage */
- 	__u64	hiwater_vm;		/* High-water virtual memory usage */
+	/* Accumulated RSS usage in duration of a task, in MBytes-usecs.
+	 * The current rss usage is added to this counter every time
+	 * a tick is charged to a task's system time. So, at the end we
+	 * will have memory usage multiplied by system time. Thus an
+	 * average usage per system time unit can be calculated.
+	 */
+ 	__u64	coremem;		/* accumulated RSS usage in MB-usec */
+	/* Accumulated virtual memory usage in duration of a task.
+	 * Same as acct_rss_mem1 above except that we keep track of VM usage.
+	 */
+ 	__u64	virtmem;		/* accumulated VM  usage in MB-usec */
+
+	/* High watermark of RSS and virtual memory usage in duration of
+	 * a task, in KBytes.
+	 */
+ 	__u64	hiwater_rss;		/* High-watermark of RSS usage, in KB */
+ 	__u64	hiwater_vm;		/* High-water VM usage, in KB */
+
+	/* The following four fields are I/O statistics of a task. */
  	__u64	read_char;		/* bytes read */
  	__u64	write_char;		/* bytes written */
  	__u64	read_syscalls;		/* read syscalls */
diff --git a/kernel/tsacct.c b/kernel/tsacct.c
index 47c71daa416f..db443221ba5b 100644
--- a/kernel/tsacct.c
+++ b/kernel/tsacct.c
@@ -20,6 +20,7 @@
 #include <linux/sched.h>
 #include <linux/tsacct_kern.h>
 #include <linux/acct.h>
+#include <linux/jiffies.h>
 
 
 #define USEC_PER_TICK	(USEC_PER_SEC/HZ)
@@ -62,33 +63,35 @@ void bacct_add_tsk(struct taskstats *stats, struct task_struct *tsk)
 	stats->ac_stime	 = cputime_to_msecs(tsk->stime) * USEC_PER_MSEC;
 	stats->ac_minflt = tsk->min_flt;
 	stats->ac_majflt = tsk->maj_flt;
-	/* Each process gets a minimum of one usec cpu time */
-	if ((stats->ac_utime == 0) && (stats->ac_stime == 0)) {
-		stats->ac_stime = 1;
-	}
 
 	strncpy(stats->ac_comm, tsk->comm, sizeof(stats->ac_comm));
 }
 
 
 #ifdef CONFIG_TASK_XACCT
+
+#define KB 1024
+#define MB (1024*KB)
 /*
  * fill in extended accounting fields
  */
 void xacct_add_tsk(struct taskstats *stats, struct task_struct *p)
 {
-	stats->acct_rss_mem1 = p->acct_rss_mem1;
-	stats->acct_vm_mem1  = p->acct_vm_mem1;
+	/* convert pages-jiffies to Mbyte-usec */
+	stats->coremem = jiffies_to_usecs(p->acct_rss_mem1) * PAGE_SIZE / MB;
+	stats->virtmem = jiffies_to_usecs(p->acct_vm_mem1) * PAGE_SIZE / MB;
 	if (p->mm) {
-		stats->hiwater_rss   = p->mm->hiwater_rss;
-		stats->hiwater_vm    = p->mm->hiwater_vm;
+		/* adjust to KB unit */
+		stats->hiwater_rss   = p->mm->hiwater_rss * PAGE_SIZE / KB;
+		stats->hiwater_vm    = p->mm->hiwater_vm * PAGE_SIZE / KB;
 	}
 	stats->read_char	= p->rchar;
 	stats->write_char	= p->wchar;
 	stats->read_syscalls	= p->syscr;
 	stats->write_syscalls	= p->syscw;
 }
-
+#undef KB
+#undef MB
 
 /**
  * acct_update_integrals - update mm integral fields in task_struct
@@ -97,8 +100,8 @@ void xacct_add_tsk(struct taskstats *stats, struct task_struct *p)
 void acct_update_integrals(struct task_struct *tsk)
 {
 	if (likely(tsk->mm)) {
-		long delta =
-			cputime_to_jiffies(tsk->stime) - tsk->acct_stimexpd;
+		long delta = cputime_to_jiffies(
+			cputime_sub(tsk->stime, tsk->acct_stimexpd));
 
 		if (delta == 0)
 			return;
-- 
cgit v1.2.3


From d8c76e6f45c111c32a4b3e50a2adc9210737b0d8 Mon Sep 17 00:00:00 2001
From: Dave Hansen <haveblue@us.ibm.com>
Date: Sat, 30 Sep 2006 23:29:04 -0700
Subject: [PATCH] r/o bind mount prepwork: inc_nlink() helper

This is mostly included for parity with dec_nlink(), where we will have some
more hooks.  This one should stay pretty darn straightforward for now.

Signed-off-by: Dave Hansen <haveblue@us.ibm.com>
Acked-by: Christoph Hellwig <hch@lst.de>
Cc: Al Viro <viro@zeniv.linux.org.uk>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
---
 drivers/infiniband/hw/ipath/ipath_fs.c | 4 ++--
 drivers/usb/core/inode.c               | 4 ++--
 fs/9p/vfs_inode.c                      | 2 +-
 fs/autofs/root.c                       | 2 +-
 fs/autofs4/root.c                      | 2 +-
 fs/bfs/dir.c                           | 2 +-
 fs/cifs/inode.c                        | 2 +-
 fs/coda/dir.c                          | 2 +-
 fs/configfs/dir.c                      | 4 ++--
 fs/configfs/mount.c                    | 2 +-
 fs/debugfs/inode.c                     | 4 ++--
 fs/ext3/namei.c                        | 6 +++---
 fs/fuse/control.c                      | 2 +-
 fs/hfsplus/dir.c                       | 2 +-
 fs/hpfs/namei.c                        | 4 ++--
 fs/hugetlbfs/inode.c                   | 4 ++--
 fs/jffs2/dir.c                         | 6 +++---
 fs/jffs2/fs.c                          | 6 +++---
 fs/jfs/namei.c                         | 6 +++---
 fs/libfs.c                             | 4 ++--
 fs/msdos/namei.c                       | 4 ++--
 fs/ocfs2/dlm/dlmfs.c                   | 6 +++---
 fs/ocfs2/namei.c                       | 8 ++++----
 fs/ramfs/inode.c                       | 4 ++--
 fs/reiserfs/namei.c                    | 6 +++---
 fs/sysfs/dir.c                         | 4 ++--
 fs/sysfs/mount.c                       | 2 +-
 fs/udf/inode.c                         | 2 +-
 fs/udf/namei.c                         | 6 +++---
 fs/vfat/namei.c                        | 4 ++--
 include/linux/fs.h                     | 7 ++++++-
 ipc/mqueue.c                           | 2 +-
 kernel/cpuset.c                        | 8 ++++----
 mm/shmem.c                             | 8 ++++----
 net/sunrpc/rpc_pipe.c                  | 6 +++---
 security/inode.c                       | 4 ++--
 security/selinux/selinuxfs.c           | 4 ++--
 37 files changed, 80 insertions(+), 75 deletions(-)

(limited to 'kernel')

diff --git a/drivers/infiniband/hw/ipath/ipath_fs.c b/drivers/infiniband/hw/ipath/ipath_fs.c
index a507d0b5be6c..d9ff283f725e 100644
--- a/drivers/infiniband/hw/ipath/ipath_fs.c
+++ b/drivers/infiniband/hw/ipath/ipath_fs.c
@@ -66,8 +66,8 @@ static int ipathfs_mknod(struct inode *dir, struct dentry *dentry,
 	inode->i_private = data;
 	if ((mode & S_IFMT) == S_IFDIR) {
 		inode->i_op = &simple_dir_inode_operations;
-		inode->i_nlink++;
-		dir->i_nlink++;
+		inc_nlink(inode);
+		inc_nlink(dir);
 	}
 
 	inode->i_fop = fops;
diff --git a/drivers/usb/core/inode.c b/drivers/usb/core/inode.c
index 88002e45a6b4..7c77c2d8d300 100644
--- a/drivers/usb/core/inode.c
+++ b/drivers/usb/core/inode.c
@@ -263,7 +263,7 @@ static struct inode *usbfs_get_inode (struct super_block *sb, int mode, dev_t de
 			inode->i_fop = &simple_dir_operations;
 
 			/* directory inodes start off with i_nlink == 2 (for "." entry) */
-			inode->i_nlink++;
+			inc_nlink(inode);
 			break;
 		}
 	}
@@ -295,7 +295,7 @@ static int usbfs_mkdir (struct inode *dir, struct dentry *dentry, int mode)
 	mode = (mode & (S_IRWXUGO | S_ISVTX)) | S_IFDIR;
 	res = usbfs_mknod (dir, dentry, mode, 0);
 	if (!res)
-		dir->i_nlink++;
+		inc_nlink(dir);
 	return res;
 }
 
diff --git a/fs/9p/vfs_inode.c b/fs/9p/vfs_inode.c
index 7a7ec2d1d2f4..5241c600ce28 100644
--- a/fs/9p/vfs_inode.c
+++ b/fs/9p/vfs_inode.c
@@ -233,7 +233,7 @@ struct inode *v9fs_get_inode(struct super_block *sb, int mode)
 			inode->i_op = &v9fs_symlink_inode_operations;
 			break;
 		case S_IFDIR:
-			inode->i_nlink++;
+			inc_nlink(inode);
 			if(v9ses->extended)
 				inode->i_op = &v9fs_dir_inode_operations_ext;
 			else
diff --git a/fs/autofs/root.c b/fs/autofs/root.c
index 54ad70731927..368a1c33a3c8 100644
--- a/fs/autofs/root.c
+++ b/fs/autofs/root.c
@@ -466,7 +466,7 @@ static int autofs_root_mkdir(struct inode *dir, struct dentry *dentry, int mode)
 	ent->dentry = dentry;
 	autofs_hash_insert(dh,ent);
 
-	dir->i_nlink++;
+	inc_nlink(dir);
 	d_instantiate(dentry, iget(dir->i_sb,ino));
 	unlock_kernel();
 
diff --git a/fs/autofs4/root.c b/fs/autofs4/root.c
index 348bec0982b0..e21bb4668201 100644
--- a/fs/autofs4/root.c
+++ b/fs/autofs4/root.c
@@ -713,7 +713,7 @@ static int autofs4_dir_mkdir(struct inode *dir, struct dentry *dentry, int mode)
 	if (p_ino && dentry->d_parent != dentry)
 		atomic_inc(&p_ino->count);
 	ino->inode = inode;
-	dir->i_nlink++;
+	inc_nlink(dir);
 	dir->i_mtime = CURRENT_TIME;
 
 	return 0;
diff --git a/fs/bfs/dir.c b/fs/bfs/dir.c
index ce05d1643dd1..a650f1d0b85e 100644
--- a/fs/bfs/dir.c
+++ b/fs/bfs/dir.c
@@ -163,7 +163,7 @@ static int bfs_link(struct dentry * old, struct inode * dir, struct dentry * new
 		unlock_kernel();
 		return err;
 	}
-	inode->i_nlink++;
+	inc_nlink(inode);
 	inode->i_ctime = CURRENT_TIME_SEC;
 	mark_inode_dirty(inode);
 	atomic_inc(&inode->i_count);
diff --git a/fs/cifs/inode.c b/fs/cifs/inode.c
index 74441a17e186..76b7fb34101a 100644
--- a/fs/cifs/inode.c
+++ b/fs/cifs/inode.c
@@ -735,7 +735,7 @@ int cifs_mkdir(struct inode *inode, struct dentry *direntry, int mode)
 		cFYI(1, ("cifs_mkdir returned 0x%x", rc));
 		d_drop(direntry);
 	} else {
-		inode->i_nlink++;
+		inc_nlink(inode);
 		if (pTcon->ses->capabilities & CAP_UNIX)
 			rc = cifs_get_inode_info_unix(&newinode, full_path,
 						      inode->i_sb,xid);
diff --git a/fs/coda/dir.c b/fs/coda/dir.c
index 0a2fd8bb7579..0102b28a15fb 100644
--- a/fs/coda/dir.c
+++ b/fs/coda/dir.c
@@ -304,7 +304,7 @@ static int coda_link(struct dentry *source_de, struct inode *dir_inode,
 	coda_dir_changed(dir_inode, 0);
 	atomic_inc(&inode->i_count);
 	d_instantiate(de, inode);
-	inode->i_nlink++;
+	inc_nlink(inode);
         
 out:
 	unlock_kernel();
diff --git a/fs/configfs/dir.c b/fs/configfs/dir.c
index 816e8ef64560..8a3b6a1a6ad1 100644
--- a/fs/configfs/dir.c
+++ b/fs/configfs/dir.c
@@ -139,7 +139,7 @@ static int init_dir(struct inode * inode)
 	inode->i_fop = &configfs_dir_operations;
 
 	/* directory inodes start off with i_nlink == 2 (for "." entry) */
-	inode->i_nlink++;
+	inc_nlink(inode);
 	return 0;
 }
 
@@ -169,7 +169,7 @@ static int create_dir(struct config_item * k, struct dentry * p,
 	if (!error) {
 		error = configfs_create(d, mode, init_dir);
 		if (!error) {
-			p->d_inode->i_nlink++;
+			inc_nlink(p->d_inode);
 			(d)->d_op = &configfs_dentry_ops;
 		} else {
 			struct configfs_dirent *sd = d->d_fsdata;
diff --git a/fs/configfs/mount.c b/fs/configfs/mount.c
index 3e5fe843e1df..68bd5c93ca52 100644
--- a/fs/configfs/mount.c
+++ b/fs/configfs/mount.c
@@ -84,7 +84,7 @@ static int configfs_fill_super(struct super_block *sb, void *data, int silent)
 		inode->i_op = &configfs_dir_inode_operations;
 		inode->i_fop = &configfs_dir_operations;
 		/* directory inodes start off with i_nlink == 2 (for "." entry) */
-		inode->i_nlink++;
+		inc_nlink(inode);
 	} else {
 		pr_debug("configfs: could not get root inode\n");
 		return -ENOMEM;
diff --git a/fs/debugfs/inode.c b/fs/debugfs/inode.c
index 269e649e6dc6..ecf3da9edf21 100644
--- a/fs/debugfs/inode.c
+++ b/fs/debugfs/inode.c
@@ -54,7 +54,7 @@ static struct inode *debugfs_get_inode(struct super_block *sb, int mode, dev_t d
 			inode->i_fop = &simple_dir_operations;
 
 			/* directory inodes start off with i_nlink == 2 (for "." entry) */
-			inode->i_nlink++;
+			inc_nlink(inode);
 			break;
 		}
 	}
@@ -87,7 +87,7 @@ static int debugfs_mkdir(struct inode *dir, struct dentry *dentry, int mode)
 	mode = (mode & (S_IRWXUGO | S_ISVTX)) | S_IFDIR;
 	res = debugfs_mknod(dir, dentry, mode, 0);
 	if (!res)
-		dir->i_nlink++;
+		inc_nlink(dir);
 	return res;
 }
 
diff --git a/fs/ext3/namei.c b/fs/ext3/namei.c
index 14c55adfae83..b45c88bd5f73 100644
--- a/fs/ext3/namei.c
+++ b/fs/ext3/namei.c
@@ -1616,7 +1616,7 @@ static int ext3_delete_entry (handle_t *handle,
  */
 static inline void ext3_inc_count(handle_t *handle, struct inode *inode)
 {
-	inode->i_nlink++;
+	inc_nlink(inode);
 }
 
 static inline void ext3_dec_count(handle_t *handle, struct inode *inode)
@@ -1775,7 +1775,7 @@ retry:
 		iput (inode);
 		goto out_stop;
 	}
-	dir->i_nlink++;
+	inc_nlink(dir);
 	ext3_update_dx_flag(dir);
 	ext3_mark_inode_dirty(handle, dir);
 	d_instantiate(dentry, inode);
@@ -2341,7 +2341,7 @@ static int ext3_rename (struct inode * old_dir, struct dentry *old_dentry,
 		if (new_inode) {
 			drop_nlink(new_inode);
 		} else {
-			new_dir->i_nlink++;
+			inc_nlink(new_dir);
 			ext3_update_dx_flag(new_dir);
 			ext3_mark_inode_dirty(handle, new_dir);
 		}
diff --git a/fs/fuse/control.c b/fs/fuse/control.c
index 79ec1f23d4d2..16b39c053d47 100644
--- a/fs/fuse/control.c
+++ b/fs/fuse/control.c
@@ -116,7 +116,7 @@ int fuse_ctl_add_conn(struct fuse_conn *fc)
 		return 0;
 
 	parent = fuse_control_sb->s_root;
-	parent->d_inode->i_nlink++;
+	inc_nlink(parent->d_inode);
 	sprintf(name, "%llu", (unsigned long long) fc->id);
 	parent = fuse_ctl_add_dentry(parent, fc, name, S_IFDIR | 0500, 2,
 				     &simple_dir_inode_operations,
diff --git a/fs/hfsplus/dir.c b/fs/hfsplus/dir.c
index 9ceb0dfaa1cc..99b4ed1b87d2 100644
--- a/fs/hfsplus/dir.c
+++ b/fs/hfsplus/dir.c
@@ -298,7 +298,7 @@ static int hfsplus_link(struct dentry *src_dentry, struct inode *dst_dir,
 	if (res)
 		return res;
 
-	inode->i_nlink++;
+	inc_nlink(inode);
 	hfsplus_instantiate(dst_dentry, inode, cnid);
 	atomic_inc(&inode->i_count);
 	inode->i_ctime = CURRENT_TIME_SEC;
diff --git a/fs/hpfs/namei.c b/fs/hpfs/namei.c
index 4078b0becc5e..25dd6f81eca7 100644
--- a/fs/hpfs/namei.c
+++ b/fs/hpfs/namei.c
@@ -89,7 +89,7 @@ static int hpfs_mkdir(struct inode *dir, struct dentry *dentry, int mode)
 	brelse(bh);
 	hpfs_mark_4buffers_dirty(&qbh0);
 	hpfs_brelse4(&qbh0);
-	dir->i_nlink++;
+	inc_nlink(dir);
 	insert_inode_hash(result);
 
 	if (result->i_uid != current->fsuid ||
@@ -635,7 +635,7 @@ static int hpfs_rename(struct inode *old_dir, struct dentry *old_dentry,
 	end:
 	hpfs_i(i)->i_parent_dir = new_dir->i_ino;
 	if (S_ISDIR(i->i_mode)) {
-		new_dir->i_nlink++;
+		inc_nlink(new_dir);
 		drop_nlink(old_dir);
 	}
 	if ((fnode = hpfs_map_fnode(i->i_sb, i->i_ino, &bh))) {
diff --git a/fs/hugetlbfs/inode.c b/fs/hugetlbfs/inode.c
index f5b8f329aca6..5e03b2f67b93 100644
--- a/fs/hugetlbfs/inode.c
+++ b/fs/hugetlbfs/inode.c
@@ -377,7 +377,7 @@ static struct inode *hugetlbfs_get_inode(struct super_block *sb, uid_t uid,
 			inode->i_fop = &simple_dir_operations;
 
 			/* directory inodes start off with i_nlink == 2 (for "." entry) */
-			inode->i_nlink++;
+			inc_nlink(inode);
 			break;
 		case S_IFLNK:
 			inode->i_op = &page_symlink_inode_operations;
@@ -418,7 +418,7 @@ static int hugetlbfs_mkdir(struct inode *dir, struct dentry *dentry, int mode)
 {
 	int retval = hugetlbfs_mknod(dir, dentry, mode | S_IFDIR, 0);
 	if (!retval)
-		dir->i_nlink++;
+		inc_nlink(dir);
 	return retval;
 }
 
diff --git a/fs/jffs2/dir.c b/fs/jffs2/dir.c
index a5e9f2205b33..9def6adf4a5d 100644
--- a/fs/jffs2/dir.c
+++ b/fs/jffs2/dir.c
@@ -588,7 +588,7 @@ static int jffs2_mkdir (struct inode *dir_i, struct dentry *dentry, int mode)
 	}
 
 	dir_i->i_mtime = dir_i->i_ctime = ITIME(je32_to_cpu(rd->mctime));
-	dir_i->i_nlink++;
+	inc_nlink(dir_i);
 
 	jffs2_free_raw_dirent(rd);
 
@@ -836,7 +836,7 @@ static int jffs2_rename (struct inode *old_dir_i, struct dentry *old_dentry,
 	/* If it was a directory we moved, and there was no victim,
 	   increase i_nlink on its new parent */
 	if (S_ISDIR(old_dentry->d_inode->i_mode) && !victim_f)
-		new_dir_i->i_nlink++;
+		inc_nlink(new_dir_i);
 
 	/* Unlink the original */
 	ret = jffs2_do_unlink(c, JFFS2_INODE_INFO(old_dir_i),
@@ -848,7 +848,7 @@ static int jffs2_rename (struct inode *old_dir_i, struct dentry *old_dentry,
 		/* Oh shit. We really ought to make a single node which can do both atomically */
 		struct jffs2_inode_info *f = JFFS2_INODE_INFO(old_dentry->d_inode);
 		down(&f->sem);
-		old_dentry->d_inode->i_nlink++;
+		inc_nlink(old_dentry->d_inode);
 		if (f->inocache)
 			f->inocache->nlink++;
 		up(&f->sem);
diff --git a/fs/jffs2/fs.c b/fs/jffs2/fs.c
index 72d9909d95ff..7bc1a4201c0c 100644
--- a/fs/jffs2/fs.c
+++ b/fs/jffs2/fs.c
@@ -277,13 +277,13 @@ void jffs2_read_inode (struct inode *inode)
 
 		for (fd=f->dents; fd; fd = fd->next) {
 			if (fd->type == DT_DIR && fd->ino)
-				inode->i_nlink++;
+				inc_nlink(inode);
 		}
 		/* and '..' */
-		inode->i_nlink++;
+		inc_nlink(inode);
 		/* Root dir gets i_nlink 3 for some reason */
 		if (inode->i_ino == 1)
-			inode->i_nlink++;
+			inc_nlink(inode);
 
 		inode->i_op = &jffs2_dir_inode_operations;
 		inode->i_fop = &jffs2_dir_operations;
diff --git a/fs/jfs/namei.c b/fs/jfs/namei.c
index 088b85976ac0..8cef88170aa4 100644
--- a/fs/jfs/namei.c
+++ b/fs/jfs/namei.c
@@ -292,7 +292,7 @@ static int jfs_mkdir(struct inode *dip, struct dentry *dentry, int mode)
 	mark_inode_dirty(ip);
 
 	/* update parent directory inode */
-	dip->i_nlink++;		/* for '..' from child directory */
+	inc_nlink(dip);		/* for '..' from child directory */
 	dip->i_ctime = dip->i_mtime = CURRENT_TIME;
 	mark_inode_dirty(dip);
 
@@ -822,7 +822,7 @@ static int jfs_link(struct dentry *old_dentry,
 		goto free_dname;
 
 	/* update object inode */
-	ip->i_nlink++;		/* for new link */
+	inc_nlink(ip);		/* for new link */
 	ip->i_ctime = CURRENT_TIME;
 	dir->i_ctime = dir->i_mtime = CURRENT_TIME;
 	mark_inode_dirty(dir);
@@ -1206,7 +1206,7 @@ static int jfs_rename(struct inode *old_dir, struct dentry *old_dentry,
 			goto out4;
 		}
 		if (S_ISDIR(old_ip->i_mode))
-			new_dir->i_nlink++;
+			inc_nlink(new_dir);
 	}
 	/*
 	 * Remove old directory entry
diff --git a/fs/libfs.c b/fs/libfs.c
index 9204feba75ac..bd08e0e64a8c 100644
--- a/fs/libfs.c
+++ b/fs/libfs.c
@@ -243,7 +243,7 @@ int simple_link(struct dentry *old_dentry, struct inode *dir, struct dentry *den
 	struct inode *inode = old_dentry->d_inode;
 
 	inode->i_ctime = dir->i_ctime = dir->i_mtime = CURRENT_TIME;
-	inode->i_nlink++;
+	inc_nlink(inode);
 	atomic_inc(&inode->i_count);
 	dget(dentry);
 	d_instantiate(dentry, inode);
@@ -306,7 +306,7 @@ int simple_rename(struct inode *old_dir, struct dentry *old_dentry,
 			drop_nlink(old_dir);
 	} else if (they_are_dirs) {
 		drop_nlink(old_dir);
-		new_dir->i_nlink++;
+		inc_nlink(new_dir);
 	}
 
 	old_dir->i_ctime = old_dir->i_mtime = new_dir->i_ctime =
diff --git a/fs/msdos/namei.c b/fs/msdos/namei.c
index 635613f2f65a..fa868c755907 100644
--- a/fs/msdos/namei.c
+++ b/fs/msdos/namei.c
@@ -389,7 +389,7 @@ static int msdos_mkdir(struct inode *dir, struct dentry *dentry, int mode)
 	err = msdos_add_entry(dir, msdos_name, 1, is_hid, cluster, &ts, &sinfo);
 	if (err)
 		goto out_free;
-	dir->i_nlink++;
+	inc_nlink(dir);
 
 	inode = fat_build_inode(sb, sinfo.de, sinfo.i_pos);
 	brelse(sinfo.bh);
@@ -551,7 +551,7 @@ static int do_msdos_rename(struct inode *old_dir, unsigned char *old_name,
 		}
 		drop_nlink(old_dir);
 		if (!new_inode)
-			new_dir->i_nlink++;
+			inc_nlink(new_dir);
 	}
 
 	err = fat_remove_entries(old_dir, &old_sinfo);	/* and releases bh */
diff --git a/fs/ocfs2/dlm/dlmfs.c b/fs/ocfs2/dlm/dlmfs.c
index 0368c6402182..16b8d1ba7066 100644
--- a/fs/ocfs2/dlm/dlmfs.c
+++ b/fs/ocfs2/dlm/dlmfs.c
@@ -338,7 +338,7 @@ static struct inode *dlmfs_get_root_inode(struct super_block *sb)
 		inode->i_blocks = 0;
 		inode->i_mapping->backing_dev_info = &dlmfs_backing_dev_info;
 		inode->i_atime = inode->i_mtime = inode->i_ctime = CURRENT_TIME;
-		inode->i_nlink++;
+		inc_nlink(inode);
 
 		inode->i_fop = &simple_dir_operations;
 		inode->i_op = &dlmfs_root_inode_operations;
@@ -395,7 +395,7 @@ static struct inode *dlmfs_get_inode(struct inode *parent,
 
 		/* directory inodes start off with i_nlink ==
 		 * 2 (for "." entry) */
-		inode->i_nlink++;
+		inc_nlink(inode);
 		break;
 	}
 
@@ -449,7 +449,7 @@ static int dlmfs_mkdir(struct inode * dir,
 	}
 	ip->ip_dlm = dlm;
 
-	dir->i_nlink++;
+	inc_nlink(dir);
 	d_instantiate(dentry, inode);
 	dget(dentry);	/* Extra count - pin the dentry in core */
 
diff --git a/fs/ocfs2/namei.c b/fs/ocfs2/namei.c
index 40f83f53053a..8c370a39e0c1 100644
--- a/fs/ocfs2/namei.c
+++ b/fs/ocfs2/namei.c
@@ -429,7 +429,7 @@ static int ocfs2_mknod(struct inode *dir,
 			mlog_errno(status);
 			goto leave;
 		}
-		dir->i_nlink++;
+		inc_nlink(dir);
 	}
 
 	status = ocfs2_add_entry(handle, dentry, inode,
@@ -730,7 +730,7 @@ static int ocfs2_link(struct dentry *old_dentry,
 		goto bail;
 	}
 
-	inode->i_nlink++;
+	inc_nlink(inode);
 	inode->i_ctime = CURRENT_TIME;
 	fe->i_links_count = cpu_to_le16(inode->i_nlink);
 	fe->i_ctime = cpu_to_le64(inode->i_ctime.tv_sec);
@@ -952,7 +952,7 @@ static int ocfs2_unlink(struct inode *dir,
 						parent_node_bh);
 		if (status < 0) {
 			mlog_errno(status);
-			dir->i_nlink++;
+			inc_nlink(dir);
 		}
 	}
 
@@ -1382,7 +1382,7 @@ static int ocfs2_rename(struct inode *old_dir,
 		if (new_inode) {
 			new_inode->i_nlink--;
 		} else {
-			new_dir->i_nlink++;
+			inc_nlink(new_dir);
 			mark_inode_dirty(new_dir);
 		}
 	}
diff --git a/fs/ramfs/inode.c b/fs/ramfs/inode.c
index bc0e51662424..2faf4cdf61b0 100644
--- a/fs/ramfs/inode.c
+++ b/fs/ramfs/inode.c
@@ -75,7 +75,7 @@ struct inode *ramfs_get_inode(struct super_block *sb, int mode, dev_t dev)
 			inode->i_fop = &simple_dir_operations;
 
 			/* directory inodes start off with i_nlink == 2 (for "." entry) */
-			inode->i_nlink++;
+			inc_nlink(inode);
 			break;
 		case S_IFLNK:
 			inode->i_op = &page_symlink_inode_operations;
@@ -113,7 +113,7 @@ static int ramfs_mkdir(struct inode * dir, struct dentry * dentry, int mode)
 {
 	int retval = ramfs_mknod(dir, dentry, mode | S_IFDIR, 0);
 	if (!retval)
-		dir->i_nlink++;
+		inc_nlink(dir);
 	return retval;
 }
 
diff --git a/fs/reiserfs/namei.c b/fs/reiserfs/namei.c
index c76d427e027b..cf92e89515f2 100644
--- a/fs/reiserfs/namei.c
+++ b/fs/reiserfs/namei.c
@@ -19,7 +19,7 @@
 #include <linux/smp_lock.h>
 #include <linux/quotaops.h>
 
-#define INC_DIR_INODE_NLINK(i) if (i->i_nlink != 1) { i->i_nlink++; if (i->i_nlink >= REISERFS_LINK_MAX) i->i_nlink=1; }
+#define INC_DIR_INODE_NLINK(i) if (i->i_nlink != 1) { inc_nlink(i); if (i->i_nlink >= REISERFS_LINK_MAX) i->i_nlink=1; }
 #define DEC_DIR_INODE_NLINK(i) if (i->i_nlink != 1) drop_nlink(i);
 
 // directory item contains array of entry headers. This performs
@@ -1006,7 +1006,7 @@ static int reiserfs_unlink(struct inode *dir, struct dentry *dentry)
 	    reiserfs_cut_from_item(&th, &path, &(de.de_entry_key), dir, NULL,
 				   0);
 	if (retval < 0) {
-		inode->i_nlink++;
+		inc_nlink(inode);
 		goto end_unlink;
 	}
 	inode->i_ctime = CURRENT_TIME_SEC;
@@ -1143,7 +1143,7 @@ static int reiserfs_link(struct dentry *old_dentry, struct inode *dir,
 	}
 
 	/* inc before scheduling so reiserfs_unlink knows we are here */
-	inode->i_nlink++;
+	inc_nlink(inode);
 
 	retval = journal_begin(&th, dir->i_sb, jbegin_count);
 	if (retval) {
diff --git a/fs/sysfs/dir.c b/fs/sysfs/dir.c
index 5f3d725d1125..3aa3434621ca 100644
--- a/fs/sysfs/dir.c
+++ b/fs/sysfs/dir.c
@@ -103,7 +103,7 @@ static int init_dir(struct inode * inode)
 	inode->i_fop = &sysfs_dir_operations;
 
 	/* directory inodes start off with i_nlink == 2 (for "." entry) */
-	inode->i_nlink++;
+	inc_nlink(inode);
 	return 0;
 }
 
@@ -137,7 +137,7 @@ static int create_dir(struct kobject * k, struct dentry * p,
 		if (!error) {
 			error = sysfs_create(*d, mode, init_dir);
 			if (!error) {
-				p->d_inode->i_nlink++;
+				inc_nlink(p->d_inode);
 				(*d)->d_op = &sysfs_dentry_ops;
 				d_rehash(*d);
 			}
diff --git a/fs/sysfs/mount.c b/fs/sysfs/mount.c
index 40190c489271..20551a1b8a56 100644
--- a/fs/sysfs/mount.c
+++ b/fs/sysfs/mount.c
@@ -49,7 +49,7 @@ static int sysfs_fill_super(struct super_block *sb, void *data, int silent)
 		inode->i_op = &sysfs_dir_inode_operations;
 		inode->i_fop = &sysfs_dir_operations;
 		/* directory inodes start off with i_nlink == 2 (for "." entry) */
-		inode->i_nlink++;	
+		inc_nlink(inode);
 	} else {
 		pr_debug("sysfs: could not get root inode\n");
 		return -ENOMEM;
diff --git a/fs/udf/inode.c b/fs/udf/inode.c
index b223b32db991..ae21a0e59e95 100644
--- a/fs/udf/inode.c
+++ b/fs/udf/inode.c
@@ -1165,7 +1165,7 @@ static void udf_fill_inode(struct inode *inode, struct buffer_head *bh)
 			inode->i_op = &udf_dir_inode_operations;
 			inode->i_fop = &udf_dir_operations;
 			inode->i_mode |= S_IFDIR;
-			inode->i_nlink ++;
+			inc_nlink(inode);
 			break;
 		}
 		case ICBTAG_FILE_TYPE_REALTIME:
diff --git a/fs/udf/namei.c b/fs/udf/namei.c
index d14d25534aa8..e40c95e65117 100644
--- a/fs/udf/namei.c
+++ b/fs/udf/namei.c
@@ -762,7 +762,7 @@ static int udf_mkdir(struct inode * dir, struct dentry * dentry, int mode)
 		cpu_to_le32(UDF_I_UNIQUE(inode) & 0x00000000FFFFFFFFUL);
 	cfi.fileCharacteristics |= FID_FILE_CHAR_DIRECTORY;
 	udf_write_fi(dir, &cfi, fi, &fibh, NULL, NULL);
-	dir->i_nlink++;
+	inc_nlink(dir);
 	mark_inode_dirty(dir);
 	d_instantiate(dentry, inode);
 	if (fibh.sbh != fibh.ebh)
@@ -1147,7 +1147,7 @@ static int udf_link(struct dentry * old_dentry, struct inode * dir,
 	if (fibh.sbh != fibh.ebh)
 		udf_release_data(fibh.ebh);
 	udf_release_data(fibh.sbh);
-	inode->i_nlink ++;
+	inc_nlink(inode);
 	inode->i_ctime = current_fs_time(inode->i_sb);
 	mark_inode_dirty(inode);
 	atomic_inc(&inode->i_count);
@@ -1282,7 +1282,7 @@ static int udf_rename (struct inode * old_dir, struct dentry * old_dentry,
 		}
 		else
 		{
-			new_dir->i_nlink ++;
+			inc_nlink(new_dir);
 			mark_inode_dirty(new_dir);
 		}
 	}
diff --git a/fs/vfat/namei.c b/fs/vfat/namei.c
index 090d74ffa061..5846ba2d5d9f 100644
--- a/fs/vfat/namei.c
+++ b/fs/vfat/namei.c
@@ -837,7 +837,7 @@ static int vfat_mkdir(struct inode *dir, struct dentry *dentry, int mode)
 	if (err)
 		goto out_free;
 	dir->i_version++;
-	dir->i_nlink++;
+	inc_nlink(dir);
 
 	inode = fat_build_inode(sb, sinfo.de, sinfo.i_pos);
 	brelse(sinfo.bh);
@@ -932,7 +932,7 @@ static int vfat_rename(struct inode *old_dir, struct dentry *old_dentry,
 		}
 		drop_nlink(old_dir);
 		if (!new_inode)
- 			new_dir->i_nlink++;
+ 			inc_nlink(new_dir);
 	}
 
 	err = fat_remove_entries(old_dir, &old_sinfo);	/* and releases bh */
diff --git a/include/linux/fs.h b/include/linux/fs.h
index 26d3c61116c0..6a5267da565f 100644
--- a/include/linux/fs.h
+++ b/include/linux/fs.h
@@ -1219,9 +1219,14 @@ static inline void mark_inode_dirty_sync(struct inode *inode)
 	__mark_inode_dirty(inode, I_DIRTY_SYNC);
 }
 
-static inline void inode_inc_link_count(struct inode *inode)
+static inline void inc_nlink(struct inode *inode)
 {
 	inode->i_nlink++;
+}
+
+static inline void inode_inc_link_count(struct inode *inode)
+{
+	inc_nlink(inode);
 	mark_inode_dirty(inode);
 }
 
diff --git a/ipc/mqueue.c b/ipc/mqueue.c
index 10aa8eeeb112..d75d0ba83360 100644
--- a/ipc/mqueue.c
+++ b/ipc/mqueue.c
@@ -168,7 +168,7 @@ static struct inode *mqueue_get_inode(struct super_block *sb, int mode,
 			/* all is ok */
 			info->user = get_uid(u);
 		} else if (S_ISDIR(mode)) {
-			inode->i_nlink++;
+			inc_nlink(inode);
 			/* Some things misbehave if size == 0 on a directory */
 			inode->i_size = 2 * DIRENT_SIZE;
 			inode->i_op = &mqueue_dir_inode_operations;
diff --git a/kernel/cpuset.c b/kernel/cpuset.c
index 8c3c400cce91..9d850ae13b1b 100644
--- a/kernel/cpuset.c
+++ b/kernel/cpuset.c
@@ -377,7 +377,7 @@ static int cpuset_fill_super(struct super_block *sb, void *unused_data,
 		inode->i_op = &simple_dir_inode_operations;
 		inode->i_fop = &simple_dir_operations;
 		/* directories start off with i_nlink == 2 (for "." entry) */
-		inode->i_nlink++;
+		inc_nlink(inode);
 	} else {
 		return -ENOMEM;
 	}
@@ -1565,7 +1565,7 @@ static int cpuset_create_file(struct dentry *dentry, int mode)
 		inode->i_fop = &simple_dir_operations;
 
 		/* start off with i_nlink == 2 (for "." entry) */
-		inode->i_nlink++;
+		inc_nlink(inode);
 	} else if (S_ISREG(mode)) {
 		inode->i_size = 0;
 		inode->i_fop = &cpuset_file_operations;
@@ -1598,7 +1598,7 @@ static int cpuset_create_dir(struct cpuset *cs, const char *name, int mode)
 	error = cpuset_create_file(dentry, S_IFDIR | mode);
 	if (!error) {
 		dentry->d_fsdata = cs;
-		parent->d_inode->i_nlink++;
+		inc_nlink(parent->d_inode);
 		cs->dentry = dentry;
 	}
 	dput(dentry);
@@ -2033,7 +2033,7 @@ int __init cpuset_init(void)
 	}
 	root = cpuset_mount->mnt_sb->s_root;
 	root->d_fsdata = &top_cpuset;
-	root->d_inode->i_nlink++;
+	inc_nlink(root->d_inode);
 	top_cpuset.dentry = root;
 	root->d_inode->i_op = &cpuset_dir_inode_operations;
 	number_of_cpusets = 1;
diff --git a/mm/shmem.c b/mm/shmem.c
index 908dd947b1ea..bb8ca7ef7094 100644
--- a/mm/shmem.c
+++ b/mm/shmem.c
@@ -1379,7 +1379,7 @@ shmem_get_inode(struct super_block *sb, int mode, dev_t dev)
 							&sbinfo->policy_nodes);
 			break;
 		case S_IFDIR:
-			inode->i_nlink++;
+			inc_nlink(inode);
 			/* Some things misbehave if size == 0 on a directory */
 			inode->i_size = 2 * BOGO_DIRENT_SIZE;
 			inode->i_op = &shmem_dir_inode_operations;
@@ -1715,7 +1715,7 @@ static int shmem_mkdir(struct inode *dir, struct dentry *dentry, int mode)
 
 	if ((error = shmem_mknod(dir, dentry, mode | S_IFDIR, 0)))
 		return error;
-	dir->i_nlink++;
+	inc_nlink(dir);
 	return 0;
 }
 
@@ -1750,7 +1750,7 @@ static int shmem_link(struct dentry *old_dentry, struct inode *dir, struct dentr
 
 	dir->i_size += BOGO_DIRENT_SIZE;
 	inode->i_ctime = dir->i_ctime = dir->i_mtime = CURRENT_TIME;
-	inode->i_nlink++;
+	inc_nlink(inode);
 	atomic_inc(&inode->i_count);	/* New dentry reference */
 	dget(dentry);		/* Extra pinning count for the created dentry */
 	d_instantiate(dentry, inode);
@@ -1807,7 +1807,7 @@ static int shmem_rename(struct inode *old_dir, struct dentry *old_dentry, struct
 			drop_nlink(old_dir);
 	} else if (they_are_dirs) {
 		drop_nlink(old_dir);
-		new_dir->i_nlink++;
+		inc_nlink(new_dir);
 	}
 
 	old_dir->i_size -= BOGO_DIRENT_SIZE;
diff --git a/net/sunrpc/rpc_pipe.c b/net/sunrpc/rpc_pipe.c
index 700c6e061a04..9a0b41a97f90 100644
--- a/net/sunrpc/rpc_pipe.c
+++ b/net/sunrpc/rpc_pipe.c
@@ -494,7 +494,7 @@ rpc_get_inode(struct super_block *sb, int mode)
 		case S_IFDIR:
 			inode->i_fop = &simple_dir_operations;
 			inode->i_op = &simple_dir_inode_operations;
-			inode->i_nlink++;
+			inc_nlink(inode);
 		default:
 			break;
 	}
@@ -571,7 +571,7 @@ rpc_populate(struct dentry *parent,
 		if (private)
 			rpc_inode_setowner(inode, private);
 		if (S_ISDIR(mode))
-			dir->i_nlink++;
+			inc_nlink(dir);
 		d_add(dentry, inode);
 	}
 	mutex_unlock(&dir->i_mutex);
@@ -593,7 +593,7 @@ __rpc_mkdir(struct inode *dir, struct dentry *dentry)
 		goto out_err;
 	inode->i_ino = iunique(dir->i_sb, 100);
 	d_instantiate(dentry, inode);
-	dir->i_nlink++;
+	inc_nlink(dir);
 	inode_dir_notify(dir, DN_CREATE);
 	return 0;
 out_err:
diff --git a/security/inode.c b/security/inode.c
index 49ee51529396..9b16e14f3a80 100644
--- a/security/inode.c
+++ b/security/inode.c
@@ -78,7 +78,7 @@ static struct inode *get_inode(struct super_block *sb, int mode, dev_t dev)
 			inode->i_fop = &simple_dir_operations;
 
 			/* directory inodes start off with i_nlink == 2 (for "." entry) */
-			inode->i_nlink++;
+			inc_nlink(inode);
 			break;
 		}
 	}
@@ -111,7 +111,7 @@ static int mkdir(struct inode *dir, struct dentry *dentry, int mode)
 	mode = (mode & (S_IRWXUGO | S_ISVTX)) | S_IFDIR;
 	res = mknod(dir, dentry, mode, 0);
 	if (!res)
-		dir->i_nlink++;
+		inc_nlink(dir);
 	return res;
 }
 
diff --git a/security/selinux/selinuxfs.c b/security/selinux/selinuxfs.c
index bab7b386cb8d..cd244419c980 100644
--- a/security/selinux/selinuxfs.c
+++ b/security/selinux/selinuxfs.c
@@ -1253,10 +1253,10 @@ static int sel_make_dir(struct inode *dir, struct dentry *dentry)
 	inode->i_op = &simple_dir_inode_operations;
 	inode->i_fop = &simple_dir_operations;
 	/* directory inodes start off with i_nlink == 2 (for "." entry) */
-	inode->i_nlink++;
+	inc_nlink(inode);
 	d_add(dentry, inode);
 	/* bump link count on parent directory, too */
-	dir->i_nlink++;
+	inc_nlink(dir);
 out:
 	return ret;
 }
-- 
cgit v1.2.3


From e239ca540594cff00adcce163dc332b27015d8e5 Mon Sep 17 00:00:00 2001
From: Andi Kleen <ak@suse.de>
Date: Sat, 30 Sep 2006 23:29:27 -0700
Subject: [PATCH] Create call_usermodehelper_pipe()

A new member in the ever growing family of call_usermode* functions is
born.  The new call_usermodehelper_pipe() function allows to pipe data to
the stdin of the called user mode progam and behaves otherwise like the
normal call_usermodehelp() (except that it always waits for the child to
finish)

Signed-off-by: Andi Kleen <ak@suse.de>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
---
 include/linux/kmod.h |  4 ++++
 kernel/kmod.c        | 55 +++++++++++++++++++++++++++++++++++++++++++++++++++-
 2 files changed, 58 insertions(+), 1 deletion(-)

(limited to 'kernel')

diff --git a/include/linux/kmod.h b/include/linux/kmod.h
index 0db22a1ab474..10f505c8431d 100644
--- a/include/linux/kmod.h
+++ b/include/linux/kmod.h
@@ -47,4 +47,8 @@ call_usermodehelper(char *path, char **argv, char **envp, int wait)
 
 extern void usermodehelper_init(void);
 
+struct file;
+extern int call_usermodehelper_pipe(char *path, char *argv[], char *envp[],
+				    struct file **filp);
+
 #endif /* __LINUX_KMOD_H__ */
diff --git a/kernel/kmod.c b/kernel/kmod.c
index 842f8015d7fd..5c63c53014a9 100644
--- a/kernel/kmod.c
+++ b/kernel/kmod.c
@@ -122,6 +122,7 @@ struct subprocess_info {
 	struct key *ring;
 	int wait;
 	int retval;
+	struct file *stdin;
 };
 
 /*
@@ -145,12 +146,26 @@ static int ____call_usermodehelper(void *data)
 
 	key_put(old_session);
 
+	/* Install input pipe when needed */
+	if (sub_info->stdin) {
+		struct files_struct *f = current->files;
+		struct fdtable *fdt;
+		/* no races because files should be private here */
+		sys_close(0);
+		fd_install(0, sub_info->stdin);
+		spin_lock(&f->file_lock);
+		fdt = files_fdtable(f);
+		FD_SET(0, fdt->open_fds);
+		FD_CLR(0, fdt->close_on_exec);
+		spin_unlock(&f->file_lock);
+	}
+
 	/* We can run anywhere, unlike our parent keventd(). */
 	set_cpus_allowed(current, CPU_MASK_ALL);
 
 	retval = -EPERM;
 	if (current->fs->root)
-		retval = execve(sub_info->path, sub_info->argv,sub_info->envp);
+		retval = execve(sub_info->path, sub_info->argv, sub_info->envp);
 
 	/* Exec failed? */
 	sub_info->retval = retval;
@@ -268,6 +283,44 @@ int call_usermodehelper_keys(char *path, char **argv, char **envp,
 }
 EXPORT_SYMBOL(call_usermodehelper_keys);
 
+int call_usermodehelper_pipe(char *path, char **argv, char **envp,
+			     struct file **filp)
+{
+	DECLARE_COMPLETION(done);
+	struct subprocess_info sub_info = {
+		.complete	= &done,
+		.path		= path,
+		.argv		= argv,
+		.envp		= envp,
+		.retval		= 0,
+	};
+	struct file *f;
+	DECLARE_WORK(work, __call_usermodehelper, &sub_info);
+
+	if (!khelper_wq)
+		return -EBUSY;
+
+	if (path[0] == '\0')
+		return 0;
+
+	f = create_write_pipe();
+	if (!f)
+		return -ENOMEM;
+	*filp = f;
+
+	f = create_read_pipe(f);
+	if (!f) {
+		free_write_pipe(*filp);
+		return -ENOMEM;
+	}
+	sub_info.stdin = f;
+
+	queue_work(khelper_wq, &work);
+	wait_for_completion(&done);
+	return sub_info.retval;
+}
+EXPORT_SYMBOL(call_usermodehelper_pipe);
+
 void __init usermodehelper_init(void)
 {
 	khelper_wq = create_singlethread_workqueue("khelper");
-- 
cgit v1.2.3


From d025c9db7f31fc0554ce7fb2dfc78d35a77f3487 Mon Sep 17 00:00:00 2001
From: Andi Kleen <ak@suse.de>
Date: Sat, 30 Sep 2006 23:29:28 -0700
Subject: [PATCH] Support piping into commands in /proc/sys/kernel/core_pattern

Using the infrastructure created in previous patches implement support to
pipe core dumps into programs.

This is done by overloading the existing core_pattern sysctl
with a new syntax:

|program

When the first character of the pattern is a '|' the kernel will instead
threat the rest of the pattern as a command to run.  The core dump will be
written to the standard input of that program instead of to a file.

This is useful for having automatic core dump analysis without filling up
disks.  The program can do some simple analysis and save only a summary of
the core dump.

The core dump proces will run with the privileges and in the name space of
the process that caused the core dump.

I also increased the core pattern size to 128 bytes so that longer command
lines fit.

Most of the changes comes from allowing core dumps without seeks.  They are
fairly straight forward though.

One small incompatibility is that if someone had a core pattern previously
that started with '|' they will get suddenly new behaviour.  I think that's
unlikely to be a real problem though.

Additional background:

> Very nice, do you happen to have a program that can accept this kind of
> input for crash dumps?  I'm guessing that the embedded people will
> really want this functionality.

I had a cheesy demo/prototype.  Basically it wrote the dump to a file again,
ran gdb on it to get a backtrace and wrote the summary to a shared directory.
Then there was a simple CGI script to generate a "top 10" crashes HTML
listing.

Unfortunately this still had the disadvantage to needing full disk space for a
dump except for deleting it afterwards (in fact it was worse because over the
pipe holes didn't work so if you have a holey address map it would require
more space).

Fortunately gdb seems to be happy to handle /proc/pid/fd/xxx input pipes as
cores (at least it worked with zsh's =(cat core) syntax), so it would be
likely possible to do it without temporary space with a simple wrapper that
calls it in the right way.  I ran out of time before doing that though.

The demo prototype scripts weren't very good.  If there is really interest I
can dig them out (they are currently on a laptop disk on the desk with the
laptop itself being in service), but I would recommend to rewrite them for any
serious application of this and fix the disk space problem.

Also to be really useful it should probably find a way to automatically fetch
the debuginfos (I cheated and just installed them in advance).  If nobody else
does it I can probably do the rewrite myself again at some point.

My hope at some point was that desktops would support it in their builtin
crash reporters, but at least the KDE people I talked too seemed to be happy
with their user space only solution.

Alan sayeth:

  I don't believe that piping as such as neccessarily the right model, but
  the ability to intercept and processes core dumps from user space is asked
  for by many enterprise users as well.  They want to know about, capture,
  analyse and process core dumps, often centrally and in automated form.

[akpm@osdl.org: loff_t != unsigned long]
Signed-off-by: Andi Kleen <ak@suse.de>
Cc: Alan Cox <alan@lxorguk.ukuu.org.uk>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
---
 fs/binfmt_elf.c | 77 +++++++++++++++++++++++++++++++++------------------------
 fs/exec.c       | 23 +++++++++++++----
 kernel/kmod.c   |  4 +++
 kernel/sysctl.c |  2 +-
 4 files changed, 68 insertions(+), 38 deletions(-)

(limited to 'kernel')

diff --git a/fs/binfmt_elf.c b/fs/binfmt_elf.c
index bad52433de69..06435f3665f4 100644
--- a/fs/binfmt_elf.c
+++ b/fs/binfmt_elf.c
@@ -1151,11 +1151,23 @@ static int dump_write(struct file *file, const void *addr, int nr)
 
 static int dump_seek(struct file *file, loff_t off)
 {
-	if (file->f_op->llseek) {
-		if (file->f_op->llseek(file, off, 0) != off)
+	if (file->f_op->llseek && file->f_op->llseek != no_llseek) {
+		if (file->f_op->llseek(file, off, 1) != off)
 			return 0;
-	} else
-		file->f_pos = off;
+	} else {
+		char *buf = (char *)get_zeroed_page(GFP_KERNEL);
+		if (!buf)
+			return 0;
+		while (off > 0) {
+			unsigned long n = off;
+			if (n > PAGE_SIZE)
+				n = PAGE_SIZE;
+			if (!dump_write(file, buf, n))
+				return 0;
+			off -= n;
+		}
+		free_page((unsigned long)buf);
+	}
 	return 1;
 }
 
@@ -1203,30 +1215,35 @@ static int notesize(struct memelfnote *en)
 	return sz;
 }
 
-#define DUMP_WRITE(addr, nr)	\
-	do { if (!dump_write(file, (addr), (nr))) return 0; } while(0)
-#define DUMP_SEEK(off)	\
-	do { if (!dump_seek(file, (off))) return 0; } while(0)
+#define DUMP_WRITE(addr, nr, foffset)	\
+	do { if (!dump_write(file, (addr), (nr))) return 0; *foffset += (nr); } while(0)
 
-static int writenote(struct memelfnote *men, struct file *file)
+static int alignfile(struct file *file, loff_t *foffset)
 {
-	struct elf_note en;
+	char buf[4] = { 0, };
+	DUMP_WRITE(buf, roundup(*foffset, 4) - *foffset, foffset);
+	return 1;
+}
 
+static int writenote(struct memelfnote *men, struct file *file,
+			loff_t *foffset)
+{
+	struct elf_note en;
 	en.n_namesz = strlen(men->name) + 1;
 	en.n_descsz = men->datasz;
 	en.n_type = men->type;
 
-	DUMP_WRITE(&en, sizeof(en));
-	DUMP_WRITE(men->name, en.n_namesz);
-	/* XXX - cast from long long to long to avoid need for libgcc.a */
-	DUMP_SEEK(roundup((unsigned long)file->f_pos, 4));	/* XXX */
-	DUMP_WRITE(men->data, men->datasz);
-	DUMP_SEEK(roundup((unsigned long)file->f_pos, 4));	/* XXX */
+	DUMP_WRITE(&en, sizeof(en), foffset);
+	DUMP_WRITE(men->name, en.n_namesz, foffset);
+	if (!alignfile(file, foffset))
+		return 0;
+	DUMP_WRITE(men->data, men->datasz, foffset);
+	if (!alignfile(file, foffset))
+		return 0;
 
 	return 1;
 }
 #undef DUMP_WRITE
-#undef DUMP_SEEK
 
 #define DUMP_WRITE(addr, nr)	\
 	if ((size += (nr)) > limit || !dump_write(file, (addr), (nr))) \
@@ -1426,7 +1443,7 @@ static int elf_core_dump(long signr, struct pt_regs *regs, struct file *file)
 	int i;
 	struct vm_area_struct *vma;
 	struct elfhdr *elf = NULL;
-	loff_t offset = 0, dataoff;
+	loff_t offset = 0, dataoff, foffset;
 	unsigned long limit = current->signal->rlim[RLIMIT_CORE].rlim_cur;
 	int numnote;
 	struct memelfnote *notes = NULL;
@@ -1569,7 +1586,8 @@ static int elf_core_dump(long signr, struct pt_regs *regs, struct file *file)
 		DUMP_WRITE(&phdr, sizeof(phdr));
 	}
 
-	/* Page-align dumped data */
+	foffset = offset;
+
 	dataoff = offset = roundup(offset, ELF_EXEC_PAGESIZE);
 
 	/* Write program headers for segments dump */
@@ -1594,6 +1612,7 @@ static int elf_core_dump(long signr, struct pt_regs *regs, struct file *file)
 		phdr.p_align = ELF_EXEC_PAGESIZE;
 
 		DUMP_WRITE(&phdr, sizeof(phdr));
+		foffset += sizeof(phdr);
 	}
 
 #ifdef ELF_CORE_WRITE_EXTRA_PHDRS
@@ -1602,7 +1621,7 @@ static int elf_core_dump(long signr, struct pt_regs *regs, struct file *file)
 
  	/* write out the notes section */
 	for (i = 0; i < numnote; i++)
-		if (!writenote(notes + i, file))
+		if (!writenote(notes + i, file, &foffset))
 			goto end_coredump;
 
 	/* write out the thread status notes section */
@@ -1611,11 +1630,12 @@ static int elf_core_dump(long signr, struct pt_regs *regs, struct file *file)
 				list_entry(t, struct elf_thread_status, list);
 
 		for (i = 0; i < tmp->num_notes; i++)
-			if (!writenote(&tmp->notes[i], file))
+			if (!writenote(&tmp->notes[i], file, &foffset))
 				goto end_coredump;
 	}
- 
-	DUMP_SEEK(dataoff);
+
+	/* Align to page */
+	DUMP_SEEK(dataoff - foffset);
 
 	for (vma = current->mm->mmap; vma != NULL; vma = vma->vm_next) {
 		unsigned long addr;
@@ -1631,10 +1651,10 @@ static int elf_core_dump(long signr, struct pt_regs *regs, struct file *file)
 
 			if (get_user_pages(current, current->mm, addr, 1, 0, 1,
 						&page, &vma) <= 0) {
-				DUMP_SEEK(file->f_pos + PAGE_SIZE);
+				DUMP_SEEK(PAGE_SIZE);
 			} else {
 				if (page == ZERO_PAGE(addr)) {
-					DUMP_SEEK(file->f_pos + PAGE_SIZE);
+					DUMP_SEEK(PAGE_SIZE);
 				} else {
 					void *kaddr;
 					flush_cache_page(vma, addr,
@@ -1658,13 +1678,6 @@ static int elf_core_dump(long signr, struct pt_regs *regs, struct file *file)
 	ELF_CORE_WRITE_EXTRA_DATA;
 #endif
 
-	if (file->f_pos != offset) {
-		/* Sanity check */
-		printk(KERN_WARNING
-		       "elf_core_dump: file->f_pos (%Ld) != offset (%Ld)\n",
-		       file->f_pos, offset);
-	}
-
 end_coredump:
 	set_fs(fs);
 
diff --git a/fs/exec.c b/fs/exec.c
index 0db3fc3c5f0f..6270f8f20a63 100644
--- a/fs/exec.c
+++ b/fs/exec.c
@@ -58,7 +58,7 @@
 #endif
 
 int core_uses_pid;
-char core_pattern[65] = "core";
+char core_pattern[128] = "core";
 int suid_dumpable = 0;
 
 EXPORT_SYMBOL(suid_dumpable);
@@ -1463,6 +1463,7 @@ int do_coredump(long signr, int exit_code, struct pt_regs * regs)
 	int retval = 0;
 	int fsuid = current->fsuid;
 	int flag = 0;
+	int ispipe = 0;
 
 	binfmt = current->binfmt;
 	if (!binfmt || !binfmt->core_dump)
@@ -1504,22 +1505,34 @@ int do_coredump(long signr, int exit_code, struct pt_regs * regs)
  	lock_kernel();
 	format_corename(corename, core_pattern, signr);
 	unlock_kernel();
-	file = filp_open(corename, O_CREAT | 2 | O_NOFOLLOW | O_LARGEFILE | flag, 0600);
+ 	if (corename[0] == '|') {
+		/* SIGPIPE can happen, but it's just never processed */
+ 		if(call_usermodehelper_pipe(corename+1, NULL, NULL, &file)) {
+ 			printk(KERN_INFO "Core dump to %s pipe failed\n",
+			       corename);
+ 			goto fail_unlock;
+ 		}
+		ispipe = 1;
+ 	} else
+ 		file = filp_open(corename,
+				 O_CREAT | 2 | O_NOFOLLOW | O_LARGEFILE, 0600);
 	if (IS_ERR(file))
 		goto fail_unlock;
 	inode = file->f_dentry->d_inode;
 	if (inode->i_nlink > 1)
 		goto close_fail;	/* multiple links - don't dump */
-	if (d_unhashed(file->f_dentry))
+	if (!ispipe && d_unhashed(file->f_dentry))
 		goto close_fail;
 
-	if (!S_ISREG(inode->i_mode))
+	/* AK: actually i see no reason to not allow this for named pipes etc.,
+	   but keep the previous behaviour for now. */
+	if (!ispipe && !S_ISREG(inode->i_mode))
 		goto close_fail;
 	if (!file->f_op)
 		goto close_fail;
 	if (!file->f_op->write)
 		goto close_fail;
-	if (do_truncate(file->f_dentry, 0, 0, file) != 0)
+	if (!ispipe && do_truncate(file->f_dentry, 0, 0, file) != 0)
 		goto close_fail;
 
 	retval = binfmt->core_dump(signr, regs, file);
diff --git a/kernel/kmod.c b/kernel/kmod.c
index 5c63c53014a9..f8121b95183f 100644
--- a/kernel/kmod.c
+++ b/kernel/kmod.c
@@ -35,6 +35,7 @@
 #include <linux/mount.h>
 #include <linux/kernel.h>
 #include <linux/init.h>
+#include <linux/resource.h>
 #include <asm/uaccess.h>
 
 extern int max_threads;
@@ -158,6 +159,9 @@ static int ____call_usermodehelper(void *data)
 		FD_SET(0, fdt->open_fds);
 		FD_CLR(0, fdt->close_on_exec);
 		spin_unlock(&f->file_lock);
+
+		/* and disallow core files too */
+		current->signal->rlim[RLIMIT_CORE] = (struct rlimit){0, 0};
 	}
 
 	/* We can run anywhere, unlike our parent keventd(). */
diff --git a/kernel/sysctl.c b/kernel/sysctl.c
index c57c4532e296..ba42694f0453 100644
--- a/kernel/sysctl.c
+++ b/kernel/sysctl.c
@@ -294,7 +294,7 @@ static ctl_table kern_table[] = {
 		.ctl_name	= KERN_CORE_PATTERN,
 		.procname	= "core_pattern",
 		.data		= core_pattern,
-		.maxlen		= 64,
+		.maxlen		= 128,
 		.mode		= 0644,
 		.proc_handler	= &proc_dostring,
 		.strategy	= &sysctl_string,
-- 
cgit v1.2.3