summaryrefslogtreecommitdiffstats
path: root/kernel
diff options
context:
space:
mode:
Diffstat (limited to 'kernel')
-rw-r--r--kernel/audit.c14
-rw-r--r--kernel/auditfilter.c2
-rw-r--r--kernel/auditsc.c6
-rw-r--r--kernel/cpu.c138
-rw-r--r--kernel/cpuset.c6
-rw-r--r--kernel/irq/handle.c2
-rw-r--r--kernel/power/Kconfig22
-rw-r--r--kernel/power/Makefile2
-rw-r--r--kernel/power/disk.c11
-rw-r--r--kernel/power/main.c40
-rw-r--r--kernel/power/power.h59
-rw-r--r--kernel/power/smp.c62
-rw-r--r--kernel/power/snapshot.c1155
-rw-r--r--kernel/power/swap.c270
-rw-r--r--kernel/power/swsusp.c14
-rw-r--r--kernel/power/user.c17
-rw-r--r--kernel/printk.c3
-rw-r--r--kernel/profile.c16
-rw-r--r--kernel/sched.c54
-rw-r--r--kernel/sysctl.c11
20 files changed, 1321 insertions, 583 deletions
diff --git a/kernel/audit.c b/kernel/audit.c
index 963fd15c9621..f9889ee77825 100644
--- a/kernel/audit.c
+++ b/kernel/audit.c
@@ -244,7 +244,7 @@ static int audit_set_rate_limit(int limit, uid_t loginuid, u32 sid)
char *ctx = NULL;
u32 len;
int rc;
- if ((rc = selinux_ctxid_to_string(sid, &ctx, &len)))
+ if ((rc = selinux_sid_to_string(sid, &ctx, &len)))
return rc;
else
audit_log(NULL, GFP_KERNEL, AUDIT_CONFIG_CHANGE,
@@ -267,7 +267,7 @@ static int audit_set_backlog_limit(int limit, uid_t loginuid, u32 sid)
char *ctx = NULL;
u32 len;
int rc;
- if ((rc = selinux_ctxid_to_string(sid, &ctx, &len)))
+ if ((rc = selinux_sid_to_string(sid, &ctx, &len)))
return rc;
else
audit_log(NULL, GFP_KERNEL, AUDIT_CONFIG_CHANGE,
@@ -293,7 +293,7 @@ static int audit_set_enabled(int state, uid_t loginuid, u32 sid)
char *ctx = NULL;
u32 len;
int rc;
- if ((rc = selinux_ctxid_to_string(sid, &ctx, &len)))
+ if ((rc = selinux_sid_to_string(sid, &ctx, &len)))
return rc;
else
audit_log(NULL, GFP_KERNEL, AUDIT_CONFIG_CHANGE,
@@ -321,7 +321,7 @@ static int audit_set_failure(int state, uid_t loginuid, u32 sid)
char *ctx = NULL;
u32 len;
int rc;
- if ((rc = selinux_ctxid_to_string(sid, &ctx, &len)))
+ if ((rc = selinux_sid_to_string(sid, &ctx, &len)))
return rc;
else
audit_log(NULL, GFP_KERNEL, AUDIT_CONFIG_CHANGE,
@@ -538,7 +538,7 @@ static int audit_receive_msg(struct sk_buff *skb, struct nlmsghdr *nlh)
if (status_get->mask & AUDIT_STATUS_PID) {
int old = audit_pid;
if (sid) {
- if ((err = selinux_ctxid_to_string(
+ if ((err = selinux_sid_to_string(
sid, &ctx, &len)))
return err;
else
@@ -576,7 +576,7 @@ static int audit_receive_msg(struct sk_buff *skb, struct nlmsghdr *nlh)
"user pid=%d uid=%u auid=%u",
pid, uid, loginuid);
if (sid) {
- if (selinux_ctxid_to_string(
+ if (selinux_sid_to_string(
sid, &ctx, &len)) {
audit_log_format(ab,
" ssid=%u", sid);
@@ -614,7 +614,7 @@ static int audit_receive_msg(struct sk_buff *skb, struct nlmsghdr *nlh)
loginuid, sid);
break;
case AUDIT_SIGNAL_INFO:
- err = selinux_ctxid_to_string(audit_sig_sid, &ctx, &len);
+ err = selinux_sid_to_string(audit_sig_sid, &ctx, &len);
if (err)
return err;
sig_data = kmalloc(sizeof(*sig_data) + len, GFP_KERNEL);
diff --git a/kernel/auditfilter.c b/kernel/auditfilter.c
index a44879b0c72f..1a58a81fb09d 100644
--- a/kernel/auditfilter.c
+++ b/kernel/auditfilter.c
@@ -1398,7 +1398,7 @@ static void audit_log_rule_change(uid_t loginuid, u32 sid, char *action,
if (sid) {
char *ctx = NULL;
u32 len;
- if (selinux_ctxid_to_string(sid, &ctx, &len))
+ if (selinux_sid_to_string(sid, &ctx, &len))
audit_log_format(ab, " ssid=%u", sid);
else
audit_log_format(ab, " subj=%s", ctx);
diff --git a/kernel/auditsc.c b/kernel/auditsc.c
index 1bd8827a0102..fb83c5cb8c32 100644
--- a/kernel/auditsc.c
+++ b/kernel/auditsc.c
@@ -385,7 +385,7 @@ static int audit_filter_rules(struct task_struct *tsk,
logged upon error */
if (f->se_rule) {
if (need_sid) {
- selinux_task_ctxid(tsk, &sid);
+ selinux_get_task_sid(tsk, &sid);
need_sid = 0;
}
result = selinux_audit_rule_match(sid, f->type,
@@ -898,7 +898,7 @@ static void audit_log_exit(struct audit_context *context, struct task_struct *ts
if (axi->osid != 0) {
char *ctx = NULL;
u32 len;
- if (selinux_ctxid_to_string(
+ if (selinux_sid_to_string(
axi->osid, &ctx, &len)) {
audit_log_format(ab, " osid=%u",
axi->osid);
@@ -1005,7 +1005,7 @@ static void audit_log_exit(struct audit_context *context, struct task_struct *ts
if (n->osid != 0) {
char *ctx = NULL;
u32 len;
- if (selinux_ctxid_to_string(
+ if (selinux_sid_to_string(
n->osid, &ctx, &len)) {
audit_log_format(ab, " osid=%u", n->osid);
call_panic = 2;
diff --git a/kernel/cpu.c b/kernel/cpu.c
index f230f9ae01c2..32c96628463e 100644
--- a/kernel/cpu.c
+++ b/kernel/cpu.c
@@ -21,6 +21,11 @@ static DEFINE_MUTEX(cpu_bitmask_lock);
static __cpuinitdata BLOCKING_NOTIFIER_HEAD(cpu_chain);
+/* If set, cpu_up and cpu_down will return -EBUSY and do nothing.
+ * Should always be manipulated under cpu_add_remove_lock
+ */
+static int cpu_hotplug_disabled;
+
#ifdef CONFIG_HOTPLUG_CPU
/* Crappy recursive lock-takers in cpufreq! Complain loudly about idiots */
@@ -108,30 +113,25 @@ static int take_cpu_down(void *unused)
return 0;
}
-int cpu_down(unsigned int cpu)
+/* Requires cpu_add_remove_lock to be held */
+static int _cpu_down(unsigned int cpu)
{
int err;
struct task_struct *p;
cpumask_t old_allowed, tmp;
- mutex_lock(&cpu_add_remove_lock);
- if (num_online_cpus() == 1) {
- err = -EBUSY;
- goto out;
- }
+ if (num_online_cpus() == 1)
+ return -EBUSY;
- if (!cpu_online(cpu)) {
- err = -EINVAL;
- goto out;
- }
+ if (!cpu_online(cpu))
+ return -EINVAL;
err = blocking_notifier_call_chain(&cpu_chain, CPU_DOWN_PREPARE,
(void *)(long)cpu);
if (err == NOTIFY_BAD) {
printk("%s: attempt to take down CPU %u failed\n",
__FUNCTION__, cpu);
- err = -EINVAL;
- goto out;
+ return -EINVAL;
}
/* Ensure that we are not runnable on dying cpu */
@@ -179,22 +179,32 @@ out_thread:
err = kthread_stop(p);
out_allowed:
set_cpus_allowed(current, old_allowed);
-out:
+ return err;
+}
+
+int cpu_down(unsigned int cpu)
+{
+ int err = 0;
+
+ mutex_lock(&cpu_add_remove_lock);
+ if (cpu_hotplug_disabled)
+ err = -EBUSY;
+ else
+ err = _cpu_down(cpu);
+
mutex_unlock(&cpu_add_remove_lock);
return err;
}
#endif /*CONFIG_HOTPLUG_CPU*/
-int __devinit cpu_up(unsigned int cpu)
+/* Requires cpu_add_remove_lock to be held */
+static int __devinit _cpu_up(unsigned int cpu)
{
int ret;
void *hcpu = (void *)(long)cpu;
- mutex_lock(&cpu_add_remove_lock);
- if (cpu_online(cpu) || !cpu_present(cpu)) {
- ret = -EINVAL;
- goto out;
- }
+ if (cpu_online(cpu) || !cpu_present(cpu))
+ return -EINVAL;
ret = blocking_notifier_call_chain(&cpu_chain, CPU_UP_PREPARE, hcpu);
if (ret == NOTIFY_BAD) {
@@ -219,7 +229,95 @@ out_notify:
if (ret != 0)
blocking_notifier_call_chain(&cpu_chain,
CPU_UP_CANCELED, hcpu);
+
+ return ret;
+}
+
+int __devinit cpu_up(unsigned int cpu)
+{
+ int err = 0;
+
+ mutex_lock(&cpu_add_remove_lock);
+ if (cpu_hotplug_disabled)
+ err = -EBUSY;
+ else
+ err = _cpu_up(cpu);
+
+ mutex_unlock(&cpu_add_remove_lock);
+ return err;
+}
+
+#ifdef CONFIG_SUSPEND_SMP
+static cpumask_t frozen_cpus;
+
+int disable_nonboot_cpus(void)
+{
+ int cpu, first_cpu, error;
+
+ mutex_lock(&cpu_add_remove_lock);
+ first_cpu = first_cpu(cpu_present_map);
+ if (!cpu_online(first_cpu)) {
+ error = _cpu_up(first_cpu);
+ if (error) {
+ printk(KERN_ERR "Could not bring CPU%d up.\n",
+ first_cpu);
+ goto out;
+ }
+ }
+ error = set_cpus_allowed(current, cpumask_of_cpu(first_cpu));
+ if (error) {
+ printk(KERN_ERR "Could not run on CPU%d\n", first_cpu);
+ goto out;
+ }
+ /* We take down all of the non-boot CPUs in one shot to avoid races
+ * with the userspace trying to use the CPU hotplug at the same time
+ */
+ cpus_clear(frozen_cpus);
+ printk("Disabling non-boot CPUs ...\n");
+ for_each_online_cpu(cpu) {
+ if (cpu == first_cpu)
+ continue;
+ error = _cpu_down(cpu);
+ if (!error) {
+ cpu_set(cpu, frozen_cpus);
+ printk("CPU%d is down\n", cpu);
+ } else {
+ printk(KERN_ERR "Error taking CPU%d down: %d\n",
+ cpu, error);
+ break;
+ }
+ }
+ if (!error) {
+ BUG_ON(num_online_cpus() > 1);
+ /* Make sure the CPUs won't be enabled by someone else */
+ cpu_hotplug_disabled = 1;
+ } else {
+ printk(KERN_ERR "Non-boot CPUs are not disabled");
+ }
out:
mutex_unlock(&cpu_add_remove_lock);
- return ret;
+ return error;
+}
+
+void enable_nonboot_cpus(void)
+{
+ int cpu, error;
+
+ /* Allow everyone to use the CPU hotplug again */
+ mutex_lock(&cpu_add_remove_lock);
+ cpu_hotplug_disabled = 0;
+ mutex_unlock(&cpu_add_remove_lock);
+
+ printk("Enabling non-boot CPUs ...\n");
+ for_each_cpu_mask(cpu, frozen_cpus) {
+ error = cpu_up(cpu);
+ if (!error) {
+ printk("CPU%d is up\n", cpu);
+ continue;
+ }
+ printk(KERN_WARNING "Error taking CPU%d up: %d\n",
+ cpu, error);
+ }
+ cpus_clear(frozen_cpus);
}
+#endif
diff --git a/kernel/cpuset.c b/kernel/cpuset.c
index 4ea6f0dc2fc5..cff41511269f 100644
--- a/kernel/cpuset.c
+++ b/kernel/cpuset.c
@@ -2245,7 +2245,7 @@ int cpuset_zonelist_valid_mems_allowed(struct zonelist *zl)
int i;
for (i = 0; zl->zones[i]; i++) {
- int nid = zl->zones[i]->zone_pgdat->node_id;
+ int nid = zone_to_nid(zl->zones[i]);
if (node_isset(nid, current->mems_allowed))
return 1;
@@ -2316,9 +2316,9 @@ int __cpuset_zone_allowed(struct zone *z, gfp_t gfp_mask)
const struct cpuset *cs; /* current cpuset ancestors */
int allowed; /* is allocation in zone z allowed? */
- if (in_interrupt())
+ if (in_interrupt() || (gfp_mask & __GFP_THISNODE))
return 1;
- node = z->zone_pgdat->node_id;
+ node = zone_to_nid(z);
might_sleep_if(!(gfp_mask & __GFP_HARDWALL));
if (node_isset(node, current->mems_allowed))
return 1;
diff --git a/kernel/irq/handle.c b/kernel/irq/handle.c
index 48a53f68af96..4c6cdbaed661 100644
--- a/kernel/irq/handle.c
+++ b/kernel/irq/handle.c
@@ -154,6 +154,7 @@ irqreturn_t handle_IRQ_event(unsigned int irq, struct pt_regs *regs,
return retval;
}
+#ifndef CONFIG_GENERIC_HARDIRQS_NO__DO_IRQ
/**
* __do_IRQ - original all in one highlevel IRQ handler
* @irq: the interrupt number
@@ -253,6 +254,7 @@ out:
return 1;
}
+#endif
#ifdef CONFIG_TRACE_IRQFLAGS
diff --git a/kernel/power/Kconfig b/kernel/power/Kconfig
index 619ecabf7c58..825068ca3479 100644
--- a/kernel/power/Kconfig
+++ b/kernel/power/Kconfig
@@ -36,6 +36,17 @@ config PM_DEBUG
code. This is helpful when debugging and reporting various PM bugs,
like suspend support.
+config DISABLE_CONSOLE_SUSPEND
+ bool "Keep console(s) enabled during suspend/resume (DANGEROUS)"
+ depends on PM && PM_DEBUG
+ default n
+ ---help---
+ This option turns off the console suspend mechanism that prevents
+ debug messages from reaching the console during the suspend/resume
+ operations. This may be helpful when debugging device drivers'
+ suspend/resume routines, but may itself lead to problems, for example
+ if netconsole is used.
+
config PM_TRACE
bool "Suspend/resume event tracing"
depends on PM && PM_DEBUG && X86_32 && EXPERIMENTAL
@@ -53,6 +64,17 @@ config PM_TRACE
CAUTION: this option will cause your machine's real-time clock to be
set to an invalid time after a resume.
+config PM_SYSFS_DEPRECATED
+ bool "Driver model /sys/devices/.../power/state files (DEPRECATED)"
+ depends on PM && SYSFS
+ default n
+ help
+ The driver model started out with a sysfs file intended to provide
+ a userspace hook for device power management. This feature has never
+ worked very well, except for limited testing purposes, and so it will
+ be removed. It's not clear that a generic mechanism could really
+ handle the wide variability of device power states; any replacements
+ are likely to be bus or driver specific.
config SOFTWARE_SUSPEND
bool "Software Suspend"
diff --git a/kernel/power/Makefile b/kernel/power/Makefile
index 8d0af3d37a4b..38725f526afc 100644
--- a/kernel/power/Makefile
+++ b/kernel/power/Makefile
@@ -7,6 +7,4 @@ obj-y := main.o process.o console.o
obj-$(CONFIG_PM_LEGACY) += pm.o
obj-$(CONFIG_SOFTWARE_SUSPEND) += swsusp.o disk.o snapshot.o swap.o user.o
-obj-$(CONFIG_SUSPEND_SMP) += smp.o
-
obj-$(CONFIG_MAGIC_SYSRQ) += poweroff.o
diff --git a/kernel/power/disk.c b/kernel/power/disk.c
index e13e74067845..d72234942798 100644
--- a/kernel/power/disk.c
+++ b/kernel/power/disk.c
@@ -18,6 +18,7 @@
#include <linux/fs.h>
#include <linux/mount.h>
#include <linux/pm.h>
+#include <linux/cpu.h>
#include "power.h"
@@ -72,7 +73,10 @@ static int prepare_processes(void)
int error;
pm_prepare_console();
- disable_nonboot_cpus();
+
+ error = disable_nonboot_cpus();
+ if (error)
+ goto enable_cpus;
if (freeze_processes()) {
error = -EBUSY;
@@ -84,6 +88,7 @@ static int prepare_processes(void)
return 0;
thaw:
thaw_processes();
+enable_cpus:
enable_nonboot_cpus();
pm_restore_console();
return error;
@@ -98,7 +103,7 @@ static void unprepare_processes(void)
}
/**
- * pm_suspend_disk - The granpappy of power management.
+ * pm_suspend_disk - The granpappy of hibernation power management.
*
* If we're going through the firmware, then get it over with quickly.
*
@@ -207,7 +212,7 @@ static int software_resume(void)
pr_debug("PM: Preparing devices for restore.\n");
- if ((error = device_suspend(PMSG_FREEZE))) {
+ if ((error = device_suspend(PMSG_PRETHAW))) {
printk("Some devices failed to suspend\n");
swsusp_free();
goto Thaw;
diff --git a/kernel/power/main.c b/kernel/power/main.c
index 6d295c776794..873228c71dab 100644
--- a/kernel/power/main.c
+++ b/kernel/power/main.c
@@ -16,6 +16,8 @@
#include <linux/init.h>
#include <linux/pm.h>
#include <linux/console.h>
+#include <linux/cpu.h>
+#include <linux/resume-trace.h>
#include "power.h"
@@ -51,7 +53,7 @@ void pm_set_ops(struct pm_ops * ops)
static int suspend_prepare(suspend_state_t state)
{
- int error = 0;
+ int error;
unsigned int free_pages;
if (!pm_ops || !pm_ops->enter)
@@ -59,12 +61,9 @@ static int suspend_prepare(suspend_state_t state)
pm_prepare_console();
- disable_nonboot_cpus();
-
- if (num_online_cpus() != 1) {
- error = -EPERM;
+ error = disable_nonboot_cpus();
+ if (error)
goto Enable_cpu;
- }
if (freeze_processes()) {
error = -EAGAIN;
@@ -283,10 +282,39 @@ static ssize_t state_store(struct subsystem * subsys, const char * buf, size_t n
power_attr(state);
+#ifdef CONFIG_PM_TRACE
+int pm_trace_enabled;
+
+static ssize_t pm_trace_show(struct subsystem * subsys, char * buf)
+{
+ return sprintf(buf, "%d\n", pm_trace_enabled);
+}
+
+static ssize_t
+pm_trace_store(struct subsystem * subsys, const char * buf, size_t n)
+{
+ int val;
+
+ if (sscanf(buf, "%d", &val) == 1) {
+ pm_trace_enabled = !!val;
+ return n;
+ }
+ return -EINVAL;
+}
+
+power_attr(pm_trace);
+
+static struct attribute * g[] = {
+ &state_attr.attr,
+ &pm_trace_attr.attr,
+ NULL,
+};
+#else
static struct attribute * g[] = {
&state_attr.attr,
NULL,
};
+#endif /* CONFIG_PM_TRACE */
static struct attribute_group attr_group = {
.attrs = g,
diff --git a/kernel/power/power.h b/kernel/power/power.h
index 57a792982fb9..bfe999f7b272 100644
--- a/kernel/power/power.h
+++ b/kernel/power/power.h
@@ -38,8 +38,6 @@ extern struct subsystem power_subsys;
/* References to section boundaries */
extern const void __nosave_begin, __nosave_end;
-extern struct pbe *pagedir_nosave;
-
/* Preferred image size in bytes (default 500 MB) */
extern unsigned long image_size;
extern int in_suspend;
@@ -50,21 +48,62 @@ extern asmlinkage int swsusp_arch_resume(void);
extern unsigned int count_data_pages(void);
+/**
+ * Auxiliary structure used for reading the snapshot image data and
+ * metadata from and writing them to the list of page backup entries
+ * (PBEs) which is the main data structure of swsusp.
+ *
+ * Using struct snapshot_handle we can transfer the image, including its
+ * metadata, as a continuous sequence of bytes with the help of
+ * snapshot_read_next() and snapshot_write_next().
+ *
+ * The code that writes the image to a storage or transfers it to
+ * the user land is required to use snapshot_read_next() for this
+ * purpose and it should not make any assumptions regarding the internal
+ * structure of the image. Similarly, the code that reads the image from
+ * a storage or transfers it from the user land is required to use
+ * snapshot_write_next().
+ *
+ * This may allow us to change the internal structure of the image
+ * in the future with considerably less effort.
+ */
+
struct snapshot_handle {
- loff_t offset;
- unsigned int page;
- unsigned int page_offset;
- unsigned int prev;
- struct pbe *pbe, *last_pbe;
- void *buffer;
- unsigned int buf_offset;
+ loff_t offset; /* number of the last byte ready for reading
+ * or writing in the sequence
+ */
+ unsigned int cur; /* number of the block of PAGE_SIZE bytes the
+ * next operation will refer to (ie. current)
+ */
+ unsigned int cur_offset; /* offset with respect to the current
+ * block (for the next operation)
+ */
+ unsigned int prev; /* number of the block of PAGE_SIZE bytes that
+ * was the current one previously
+ */
+ void *buffer; /* address of the block to read from
+ * or write to
+ */
+ unsigned int buf_offset; /* location to read from or write to,
+ * given as a displacement from 'buffer'
+ */
+ int sync_read; /* Set to one to notify the caller of
+ * snapshot_write_next() that it may
+ * need to call wait_on_bio_chain()
+ */
};
+/* This macro returns the address from/to which the caller of
+ * snapshot_read_next()/snapshot_write_next() is allowed to
+ * read/write data after the function returns
+ */
#define data_of(handle) ((handle).buffer + (handle).buf_offset)
+extern unsigned int snapshot_additional_pages(struct zone *zone);
extern int snapshot_read_next(struct snapshot_handle *handle, size_t count);
extern int snapshot_write_next(struct snapshot_handle *handle, size_t count);
-int snapshot_image_loaded(struct snapshot_handle *handle);
+extern int snapshot_image_loaded(struct snapshot_handle *handle);
+extern void snapshot_free_unused_memory(struct snapshot_handle *handle);
#define SNAPSHOT_IOC_MAGIC '3'
#define SNAPSHOT_FREEZE _IO(SNAPSHOT_IOC_MAGIC, 1)
diff --git a/kernel/power/smp.c b/kernel/power/smp.c
deleted file mode 100644
index 5957312b2d68..000000000000
--- a/kernel/power/smp.c
+++ /dev/null
@@ -1,62 +0,0 @@
-/*
- * drivers/power/smp.c - Functions for stopping other CPUs.
- *
- * Copyright 2004 Pavel Machek <pavel@suse.cz>
- * Copyright (C) 2002-2003 Nigel Cunningham <ncunningham@clear.net.nz>
- *
- * This file is released under the GPLv2.
- */
-
-#undef DEBUG
-
-#include <linux/smp_lock.h>
-#include <linux/interrupt.h>
-#include <linux/suspend.h>
-#include <linux/module.h>
-#include <linux/cpu.h>
-#include <asm/atomic.h>
-#include <asm/tlbflush.h>
-
-/* This is protected by pm_sem semaphore */
-static cpumask_t frozen_cpus;
-
-void disable_nonboot_cpus(void)
-{
- int cpu, error;
-
- error = 0;
- cpus_clear(frozen_cpus);
- printk("Freezing cpus ...\n");
- for_each_online_cpu(cpu) {
- if (cpu == 0)
- continue;
- error = cpu_down(cpu);
- if (!error) {
- cpu_set(cpu, frozen_cpus);
- printk("CPU%d is down\n", cpu);
- continue;
- }
- printk("Error taking cpu %d down: %d\n", cpu, error);
- }
- BUG_ON(raw_smp_processor_id() != 0);
- if (error)
- panic("cpus not sleeping");
-}
-
-void enable_nonboot_cpus(void)
-{
- int cpu, error;
-
- printk("Thawing cpus ...\n");
- for_each_cpu_mask(cpu, frozen_cpus) {
- error = cpu_up(cpu);
- if (!error) {
- printk("CPU%d is up\n", cpu);
- continue;
- }
- printk("Error taking cpu %d up: %d\n", cpu, error);
- panic("Not enough cpus");
- }
- cpus_clear(frozen_cpus);
-}
-
diff --git a/kernel/power/snapshot.c b/kernel/power/snapshot.c
index 75d4886e648e..1b84313cbab5 100644
--- a/kernel/power/snapshot.c
+++ b/kernel/power/snapshot.c
@@ -34,10 +34,12 @@
#include "power.h"
-struct pbe *pagedir_nosave;
+/* List of PBEs used for creating and restoring the suspend image */
+struct pbe *restore_pblist;
+
static unsigned int nr_copy_pages;
static unsigned int nr_meta_pages;
-static unsigned long *buffer;
+static void *buffer;
#ifdef CONFIG_HIGHMEM
unsigned int count_highmem_pages(void)
@@ -156,240 +158,637 @@ static inline int save_highmem(void) {return 0;}
static inline int restore_highmem(void) {return 0;}
#endif
-static int pfn_is_nosave(unsigned long pfn)
+/**
+ * @safe_needed - on resume, for storing the PBE list and the image,
+ * we can only use memory pages that do not conflict with the pages
+ * used before suspend.
+ *
+ * The unsafe pages are marked with the PG_nosave_free flag
+ * and we count them using unsafe_pages
+ */
+
+#define PG_ANY 0
+#define PG_SAFE 1
+#define PG_UNSAFE_CLEAR 1
+#define PG_UNSAFE_KEEP 0
+
+static unsigned int allocated_unsafe_pages;
+
+static void *alloc_image_page(gfp_t gfp_mask, int safe_needed)
{
- unsigned long nosave_begin_pfn = __pa(&__nosave_begin) >> PAGE_SHIFT;
- unsigned long nosave_end_pfn = PAGE_ALIGN(__pa(&__nosave_end)) >> PAGE_SHIFT;
- return (pfn >= nosave_begin_pfn) && (pfn < nosave_end_pfn);
+ void *res;
+
+ res = (void *)get_zeroed_page(gfp_mask);
+ if (safe_needed)
+ while (res && PageNosaveFree(virt_to_page(res))) {
+ /* The page is unsafe, mark it for swsusp_free() */
+ SetPageNosave(virt_to_page(res));
+ allocated_unsafe_pages++;
+ res = (void *)get_zeroed_page(gfp_mask);
+ }
+ if (res) {
+ SetPageNosave(virt_to_page(res));
+ SetPageNosaveFree(virt_to_page(res));
+ }
+ return res;
+}
+
+unsigned long get_safe_page(gfp_t gfp_mask)
+{
+ return (unsigned long)alloc_image_page(gfp_mask, PG_SAFE);
}
/**
- * saveable - Determine whether a page should be cloned or not.
- * @pfn: The page
- *
- * We save a page if it's Reserved, and not in the range of pages
- * statically defined as 'unsaveable', or if it isn't reserved, and
- * isn't part of a free chunk of pages.
+ * free_image_page - free page represented by @addr, allocated with
+ * alloc_image_page (page flags set by it must be cleared)
*/
-static int saveable(struct zone *zone, unsigned long *zone_pfn)
+static inline void free_image_page(void *addr, int clear_nosave_free)
{
- unsigned long pfn = *zone_pfn + zone->zone_start_pfn;
- struct page *page;
+ ClearPageNosave(virt_to_page(addr));
+ if (clear_nosave_free)
+ ClearPageNosaveFree(virt_to_page(addr));
+ free_page((unsigned long)addr);
+}
- if (!pfn_valid(pfn))
- return 0;
+/* struct linked_page is used to build chains of pages */
- page = pfn_to_page(pfn);
- BUG_ON(PageReserved(page) && PageNosave(page));
- if (PageNosave(page))
- return 0;
- if (PageReserved(page) && pfn_is_nosave(pfn))
- return 0;
- if (PageNosaveFree(page))
- return 0;
+#define LINKED_PAGE_DATA_SIZE (PAGE_SIZE - sizeof(void *))
- return 1;
-}
+struct linked_page {
+ struct linked_page *next;
+ char data[LINKED_PAGE_DATA_SIZE];
+} __attribute__((packed));
-unsigned int count_data_pages(void)
+static inline void
+free_list_of_pages(struct linked_page *list, int clear_page_nosave)
{
- struct zone *zone;
- unsigned long zone_pfn;
- unsigned int n = 0;
+ while (list) {
+ struct linked_page *lp = list->next;
- for_each_zone (zone) {
- if (is_highmem(zone))
- continue;
- mark_free_pages(zone);
- for (zone_pfn = 0; zone_pfn < zone->spanned_pages; ++zone_pfn)
- n += saveable(zone, &zone_pfn);
+ free_image_page(list, clear_page_nosave);
+ list = lp;
}
- return n;
}
-static void copy_data_pages(struct pbe *pblist)
+/**
+ * struct chain_allocator is used for allocating small objects out of
+ * a linked list of pages called 'the chain'.
+ *
+ * The chain grows each time when there is no room for a new object in
+ * the current page. The allocated objects cannot be freed individually.
+ * It is only possible to free them all at once, by freeing the entire
+ * chain.
+ *
+ * NOTE: The chain allocator may be inefficient if the allocated objects
+ * are not much smaller than PAGE_SIZE.
+ */
+
+struct chain_allocator {
+ struct linked_page *chain; /* the chain */
+ unsigned int used_space; /* total size of objects allocated out
+ * of the current page
+ */
+ gfp_t gfp_mask; /* mask for allocating pages */
+ int safe_needed; /* if set, only "safe" pages are allocated */
+};
+
+static void
+chain_init(struct chain_allocator *ca, gfp_t gfp_mask, int safe_needed)
{
- struct zone *zone;
- unsigned long zone_pfn;
- struct pbe *pbe, *p;
+ ca->chain = NULL;
+ ca->used_space = LINKED_PAGE_DATA_SIZE;
+ ca->gfp_mask = gfp_mask;
+ ca->safe_needed = safe_needed;
+}
- pbe = pblist;
- for_each_zone (zone) {
- if (is_highmem(zone))
- continue;
- mark_free_pages(zone);
- /* This is necessary for swsusp_free() */
- for_each_pb_page (p, pblist)
- SetPageNosaveFree(virt_to_page(p));
- for_each_pbe (p, pblist)
- SetPageNosaveFree(virt_to_page(p->address));
- for (zone_pfn = 0; zone_pfn < zone->spanned_pages; ++zone_pfn) {
- if (saveable(zone, &zone_pfn)) {
- struct page *page;
- long *src, *dst;
- int n;
-
- page = pfn_to_page(zone_pfn + zone->zone_start_pfn);
- BUG_ON(!pbe);
- pbe->orig_address = (unsigned long)page_address(page);
- /* copy_page and memcpy are not usable for copying task structs. */
- dst = (long *)pbe->address;
- src = (long *)pbe->orig_address;
- for (n = PAGE_SIZE / sizeof(long); n; n--)
- *dst++ = *src++;
- pbe = pbe->next;
- }
- }
+static void *chain_alloc(struct chain_allocator *ca, unsigned int size)
+{
+ void *ret;
+
+ if (LINKED_PAGE_DATA_SIZE - ca->used_space < size) {
+ struct linked_page *lp;
+
+ lp = alloc_image_page(ca->gfp_mask, ca->safe_needed);
+ if (!lp)
+ return NULL;
+
+ lp->next = ca->chain;
+ ca->chain = lp;
+ ca->used_space = 0;
}
- BUG_ON(pbe);
+ ret = ca->chain->data + ca->used_space;
+ ca->used_space += size;
+ return ret;
}
+static void chain_free(struct chain_allocator *ca, int clear_page_nosave)
+{
+ free_list_of_pages(ca->chain, clear_page_nosave);
+ memset(ca, 0, sizeof(struct chain_allocator));
+}
/**
- * free_pagedir - free pages allocated with alloc_pagedir()
+ * Data types related to memory bitmaps.
+ *
+ * Memory bitmap is a structure consiting of many linked lists of
+ * objects. The main list's elements are of type struct zone_bitmap
+ * and each of them corresonds to one zone. For each zone bitmap
+ * object there is a list of objects of type struct bm_block that
+ * represent each blocks of bit chunks in which information is
+ * stored.
+ *
+ * struct memory_bitmap contains a pointer to the main list of zone
+ * bitmap objects, a struct bm_position used for browsing the bitmap,
+ * and a pointer to the list of pages used for allocating all of the
+ * zone bitmap objects and bitmap block objects.
+ *
+ * NOTE: It has to be possible to lay out the bitmap in memory
+ * using only allocations of order 0. Additionally, the bitmap is
+ * designed to work with arbitrary number of zones (this is over the
+ * top for now, but let's avoid making unnecessary assumptions ;-).
+ *
+ * struct zone_bitmap contains a pointer to a list of bitmap block
+ * objects and a pointer to the bitmap block object that has been
+ * most recently used for setting bits. Additionally, it contains the
+ * pfns that correspond to the start and end of the represented zone.
+ *
+ * struct bm_block contains a pointer to the memory page in which
+ * information is stored (in the form of a block of bit chunks
+ * of type unsigned long each). It also contains the pfns that
+ * correspond to the start and end of the represented memory area and
+ * the number of bit chunks in the block.
+ *
+ * NOTE: Memory bitmaps are used for two types of operations only:
+ * "set a bit" and "find the next bit set". Moreover, the searching
+ * is always carried out after all of the "set a bit" operations
+ * on given bitmap.
*/
-static void free_pagedir(struct pbe *pblist, int clear_nosave_free)
+#define BM_END_OF_MAP (~0UL)
+
+#define BM_CHUNKS_PER_BLOCK (PAGE_SIZE / sizeof(long))
+#define BM_BITS_PER_CHUNK (sizeof(long) << 3)
+#define BM_BITS_PER_BLOCK (PAGE_SIZE << 3)
+
+struct bm_block {
+ struct bm_block *next; /* next element of the list */
+ unsigned long start_pfn; /* pfn represented by the first bit */
+ unsigned long end_pfn; /* pfn represented by the last bit plus 1 */
+ unsigned int size; /* number of bit chunks */
+ unsigned long *data; /* chunks of bits representing pages */
+};
+
+struct zone_bitmap {
+ struct zone_bitmap *next; /* next element of the list */
+ unsigned long start_pfn; /* minimal pfn in this zone */
+ unsigned long end_pfn; /* maximal pfn in this zone plus 1 */
+ struct bm_block *bm_blocks; /* list of bitmap blocks */
+ struct bm_block *cur_block; /* recently used bitmap block */
+};
+
+/* strcut bm_position is used for browsing memory bitmaps */
+
+struct bm_position {
+ struct zone_bitmap *zone_bm;
+ struct bm_block *block;
+ int chunk;
+ int bit;
+};
+
+struct memory_bitmap {
+ struct zone_bitmap *zone_bm_list; /* list of zone bitmaps */
+ struct linked_page *p_list; /* list of pages used to store zone
+ * bitmap objects and bitmap block
+ * objects
+ */
+ struct bm_position cur; /* most recently used bit position */
+};
+
+/* Functions that operate on memory bitmaps */
+
+static inline void memory_bm_reset_chunk(struct memory_bitmap *bm)
{
- struct pbe *pbe;
+ bm->cur.chunk = 0;
+ bm->cur.bit = -1;
+}
- while (pblist) {
- pbe = (pblist + PB_PAGE_SKIP)->next;
- ClearPageNosave(virt_to_page(pblist));
- if (clear_nosave_free)
- ClearPageNosaveFree(virt_to_page(pblist));
- free_page((unsigned long)pblist);
- pblist = pbe;
- }
+static void memory_bm_position_reset(struct memory_bitmap *bm)
+{
+ struct zone_bitmap *zone_bm;
+
+ zone_bm = bm->zone_bm_list;
+ bm->cur.zone_bm = zone_bm;
+ bm->cur.block = zone_bm->bm_blocks;
+ memory_bm_reset_chunk(bm);
}
+static void memory_bm_free(struct memory_bitmap *bm, int clear_nosave_free);
+
/**
- * fill_pb_page - Create a list of PBEs on a given memory page
+ * create_bm_block_list - create a list of block bitmap objects
*/
-static inline void fill_pb_page(struct pbe *pbpage)
+static inline struct bm_block *
+create_bm_block_list(unsigned int nr_blocks, struct chain_allocator *ca)
{
- struct pbe *p;
+ struct bm_block *bblist = NULL;
+
+ while (nr_blocks-- > 0) {
+ struct bm_block *bb;
- p = pbpage;
- pbpage += PB_PAGE_SKIP;
- do
- p->next = p + 1;
- while (++p < pbpage);
+ bb = chain_alloc(ca, sizeof(struct bm_block));
+ if (!bb)
+ return NULL;
+
+ bb->next = bblist;
+ bblist = bb;
+ }
+ return bblist;
}
/**
- * create_pbe_list - Create a list of PBEs on top of a given chain
- * of memory pages allocated with alloc_pagedir()
+ * create_zone_bm_list - create a list of zone bitmap objects
*/
-static inline void create_pbe_list(struct pbe *pblist, unsigned int nr_pages)
+static inline struct zone_bitmap *
+create_zone_bm_list(unsigned int nr_zones, struct chain_allocator *ca)
{
- struct pbe *pbpage, *p;
- unsigned int num = PBES_PER_PAGE;
+ struct zone_bitmap *zbmlist = NULL;
- for_each_pb_page (pbpage, pblist) {
- if (num >= nr_pages)
- break;
+ while (nr_zones-- > 0) {
+ struct zone_bitmap *zbm;
+
+ zbm = chain_alloc(ca, sizeof(struct zone_bitmap));
+ if (!zbm)
+ return NULL;
+
+ zbm->next = zbmlist;
+ zbmlist = zbm;
+ }
+ return zbmlist;
+}
+
+/**
+ * memory_bm_create - allocate memory for a memory bitmap
+ */
+
+static int
+memory_bm_create(struct memory_bitmap *bm, gfp_t gfp_mask, int safe_needed)
+{
+ struct chain_allocator ca;
+ struct zone *zone;
+ struct zone_bitmap *zone_bm;
+ struct bm_block *bb;
+ unsigned int nr;
+
+ chain_init(&ca, gfp_mask, safe_needed);
- fill_pb_page(pbpage);
- num += PBES_PER_PAGE;
+ /* Compute the number of zones */
+ nr = 0;
+ for_each_zone (zone)
+ if (populated_zone(zone) && !is_highmem(zone))
+ nr++;
+
+ /* Allocate the list of zones bitmap objects */
+ zone_bm = create_zone_bm_list(nr, &ca);
+ bm->zone_bm_list = zone_bm;
+ if (!zone_bm) {
+ chain_free(&ca, PG_UNSAFE_CLEAR);
+ return -ENOMEM;
}
- if (pbpage) {
- for (num -= PBES_PER_PAGE - 1, p = pbpage; num < nr_pages; p++, num++)
- p->next = p + 1;
- p->next = NULL;
+
+ /* Initialize the zone bitmap objects */
+ for_each_zone (zone) {
+ unsigned long pfn;
+
+ if (!populated_zone(zone) || is_highmem(zone))
+ continue;
+
+ zone_bm->start_pfn = zone->zone_start_pfn;
+ zone_bm->end_pfn = zone->zone_start_pfn + zone->spanned_pages;
+ /* Allocate the list of bitmap block objects */
+ nr = DIV_ROUND_UP(zone->spanned_pages, BM_BITS_PER_BLOCK);
+ bb = create_bm_block_list(nr, &ca);
+ zone_bm->bm_blocks = bb;
+ zone_bm->cur_block = bb;
+ if (!bb)
+ goto Free;
+
+ nr = zone->spanned_pages;
+ pfn = zone->zone_start_pfn;
+ /* Initialize the bitmap block objects */
+ while (bb) {
+ unsigned long *ptr;
+
+ ptr = alloc_image_page(gfp_mask, safe_needed);
+ bb->data = ptr;
+ if (!ptr)
+ goto Free;
+
+ bb->start_pfn = pfn;
+ if (nr >= BM_BITS_PER_BLOCK) {
+ pfn += BM_BITS_PER_BLOCK;
+ bb->size = BM_CHUNKS_PER_BLOCK;
+ nr -= BM_BITS_PER_BLOCK;
+ } else {
+ /* This is executed only once in the loop */
+ pfn += nr;
+ bb->size = DIV_ROUND_UP(nr, BM_BITS_PER_CHUNK);
+ }
+ bb->end_pfn = pfn;
+ bb = bb->next;
+ }
+ zone_bm = zone_bm->next;
}
+ bm->p_list = ca.chain;
+ memory_bm_position_reset(bm);
+ return 0;
+
+Free:
+ bm->p_list = ca.chain;
+ memory_bm_free(bm, PG_UNSAFE_CLEAR);
+ return -ENOMEM;
}
-static unsigned int unsafe_pages;
+/**
+ * memory_bm_free - free memory occupied by the memory bitmap @bm
+ */
+
+static void memory_bm_free(struct memory_bitmap *bm, int clear_nosave_free)
+{
+ struct zone_bitmap *zone_bm;
+
+ /* Free the list of bit blocks for each zone_bitmap object */
+ zone_bm = bm->zone_bm_list;
+ while (zone_bm) {
+ struct bm_block *bb;
+
+ bb = zone_bm->bm_blocks;
+ while (bb) {
+ if (bb->data)
+ free_image_page(bb->data, clear_nosave_free);
+ bb = bb->next;
+ }
+ zone_bm = zone_bm->next;
+ }
+ free_list_of_pages(bm->p_list, clear_nosave_free);
+ bm->zone_bm_list = NULL;
+}
/**
- * @safe_needed - on resume, for storing the PBE list and the image,
- * we can only use memory pages that do not conflict with the pages
- * used before suspend.
+ * memory_bm_set_bit - set the bit in the bitmap @bm that corresponds
+ * to given pfn. The cur_zone_bm member of @bm and the cur_block member
+ * of @bm->cur_zone_bm are updated.
*
- * The unsafe pages are marked with the PG_nosave_free flag
- * and we count them using unsafe_pages
+ * If the bit cannot be set, the function returns -EINVAL .
*/
-static inline void *alloc_image_page(gfp_t gfp_mask, int safe_needed)
+static int
+memory_bm_set_bit(struct memory_bitmap *bm, unsigned long pfn)
{
- void *res;
-
- res = (void *)get_zeroed_page(gfp_mask);
- if (safe_needed)
- while (res && PageNosaveFree(virt_to_page(res))) {
- /* The page is unsafe, mark it for swsusp_free() */
- SetPageNosave(virt_to_page(res));
- unsafe_pages++;
- res = (void *)get_zeroed_page(gfp_mask);
+ struct zone_bitmap *zone_bm;
+ struct bm_block *bb;
+
+ /* Check if the pfn is from the current zone */
+ zone_bm = bm->cur.zone_bm;
+ if (pfn < zone_bm->start_pfn || pfn >= zone_bm->end_pfn) {
+ zone_bm = bm->zone_bm_list;
+ /* We don't assume that the zones are sorted by pfns */
+ while (pfn < zone_bm->start_pfn || pfn >= zone_bm->end_pfn) {
+ zone_bm = zone_bm->next;
+ if (unlikely(!zone_bm))
+ return -EINVAL;
}
- if (res) {
- SetPageNosave(virt_to_page(res));
- SetPageNosaveFree(virt_to_page(res));
+ bm->cur.zone_bm = zone_bm;
}
- return res;
+ /* Check if the pfn corresponds to the current bitmap block */
+ bb = zone_bm->cur_block;
+ if (pfn < bb->start_pfn)
+ bb = zone_bm->bm_blocks;
+
+ while (pfn >= bb->end_pfn) {
+ bb = bb->next;
+ if (unlikely(!bb))
+ return -EINVAL;
+ }
+ zone_bm->cur_block = bb;
+ pfn -= bb->start_pfn;
+ set_bit(pfn % BM_BITS_PER_CHUNK, bb->data + pfn / BM_BITS_PER_CHUNK);
+ return 0;
}
-unsigned long get_safe_page(gfp_t gfp_mask)
+/* Two auxiliary functions for memory_bm_next_pfn */
+
+/* Find the first set bit in the given chunk, if there is one */
+
+static inline int next_bit_in_chunk(int bit, unsigned long *chunk_p)
{
- return (unsigned long)alloc_image_page(gfp_mask, 1);
+ bit++;
+ while (bit < BM_BITS_PER_CHUNK) {
+ if (test_bit(bit, chunk_p))
+ return bit;
+
+ bit++;
+ }
+ return -1;
+}
+
+/* Find a chunk containing some bits set in given block of bits */
+
+static inline int next_chunk_in_block(int n, struct bm_block *bb)
+{
+ n++;
+ while (n < bb->size) {
+ if (bb->data[n])
+ return n;
+
+ n++;
+ }
+ return -1;
}
/**
- * alloc_pagedir - Allocate the page directory.
- *
- * First, determine exactly how many pages we need and
- * allocate them.
+ * memory_bm_next_pfn - find the pfn that corresponds to the next set bit
+ * in the bitmap @bm. If the pfn cannot be found, BM_END_OF_MAP is
+ * returned.
*
- * We arrange the pages in a chain: each page is an array of PBES_PER_PAGE
- * struct pbe elements (pbes) and the last element in the page points
- * to the next page.
+ * It is required to run memory_bm_position_reset() before the first call to
+ * this function.
+ */
+
+static unsigned long memory_bm_next_pfn(struct memory_bitmap *bm)
+{
+ struct zone_bitmap *zone_bm;
+ struct bm_block *bb;
+ int chunk;
+ int bit;
+
+ do {
+ bb = bm->cur.block;
+ do {
+ chunk = bm->cur.chunk;
+ bit = bm->cur.bit;
+ do {
+ bit = next_bit_in_chunk(bit, bb->data + chunk);
+ if (bit >= 0)
+ goto Return_pfn;
+
+ chunk = next_chunk_in_block(chunk, bb);
+ bit = -1;
+ } while (chunk >= 0);
+ bb = bb->next;
+ bm->cur.block = bb;
+ memory_bm_reset_chunk(bm);
+ } while (bb);
+ zone_bm = bm->cur.zone_bm->next;
+ if (zone_bm) {
+ bm->cur.zone_bm = zone_bm;
+ bm->cur.block = zone_bm->bm_blocks;
+ memory_bm_reset_chunk(bm);
+ }
+ } while (zone_bm);
+ memory_bm_position_reset(bm);
+ return BM_END_OF_MAP;
+
+Return_pfn:
+ bm->cur.chunk = chunk;
+ bm->cur.bit = bit;
+ return bb->start_pfn + chunk * BM_BITS_PER_CHUNK + bit;
+}
+
+/**
+ * snapshot_additional_pages - estimate the number of additional pages
+ * be needed for setting up the suspend image data structures for given
+ * zone (usually the returned value is greater than the exact number)
+ */
+
+unsigned int snapshot_additional_pages(struct zone *zone)
+{
+ unsigned int res;
+
+ res = DIV_ROUND_UP(zone->spanned_pages, BM_BITS_PER_BLOCK);
+ res += DIV_ROUND_UP(res * sizeof(struct bm_block), PAGE_SIZE);
+ return res;
+}
+
+/**
+ * pfn_is_nosave - check if given pfn is in the 'nosave' section
+ */
+
+static inline int pfn_is_nosave(unsigned long pfn)
+{
+ unsigned long nosave_begin_pfn = __pa(&__nosave_begin) >> PAGE_SHIFT;
+ unsigned long nosave_end_pfn = PAGE_ALIGN(__pa(&__nosave_end)) >> PAGE_SHIFT;
+ return (pfn >= nosave_begin_pfn) && (pfn < nosave_end_pfn);
+}
+
+/**
+ * saveable - Determine whether a page should be cloned or not.
+ * @pfn: The page
*
- * On each page we set up a list of struct_pbe elements.
+ * We save a page if it isn't Nosave, and is not in the range of pages
+ * statically defined as 'unsaveable', and it
+ * isn't a part of a free chunk of pages.
*/
-static struct pbe *alloc_pagedir(unsigned int nr_pages, gfp_t gfp_mask,
- int safe_needed)
+static struct page *saveable_page(unsigned long pfn)
{
- unsigned int num;
- struct pbe *pblist, *pbe;
+ struct page *page;
+
+ if (!pfn_valid(pfn))
+ return NULL;
- if (!nr_pages)
+ page = pfn_to_page(pfn);
+
+ if (PageNosave(page))
+ return NULL;
+ if (PageReserved(page) && pfn_is_nosave(pfn))
return NULL;
+ if (PageNosaveFree(page))
+ return NULL;
+
+ return page;
+}
+
+unsigned int count_data_pages(void)
+{
+ struct zone *zone;
+ unsigned long pfn, max_zone_pfn;
+ unsigned int n = 0;
- pblist = alloc_image_page(gfp_mask, safe_needed);
- /* FIXME: rewrite this ugly loop */
- for (pbe = pblist, num = PBES_PER_PAGE; pbe && num < nr_pages;
- pbe = pbe->next, num += PBES_PER_PAGE) {
- pbe += PB_PAGE_SKIP;
- pbe->next = alloc_image_page(gfp_mask, safe_needed);
+ for_each_zone (zone) {
+ if (is_highmem(zone))
+ continue;
+ mark_free_pages(zone);
+ max_zone_pfn = zone->zone_start_pfn + zone->spanned_pages;
+ for (pfn = zone->zone_start_pfn; pfn < max_zone_pfn; pfn++)
+ n += !!saveable_page(pfn);
}
- if (!pbe) { /* get_zeroed_page() failed */
- free_pagedir(pblist, 1);
- pblist = NULL;
- } else
- create_pbe_list(pblist, nr_pages);
- return pblist;
+ return n;
+}
+
+static inline void copy_data_page(long *dst, long *src)
+{
+ int n;
+
+ /* copy_page and memcpy are not usable for copying task structs. */
+ for (n = PAGE_SIZE / sizeof(long); n; n--)
+ *dst++ = *src++;
+}
+
+static void
+copy_data_pages(struct memory_bitmap *copy_bm, struct memory_bitmap *orig_bm)
+{
+ struct zone *zone;
+ unsigned long pfn;
+
+ for_each_zone (zone) {
+ unsigned long max_zone_pfn;
+
+ if (is_highmem(zone))
+ continue;
+
+ mark_free_pages(zone);
+ max_zone_pfn = zone->zone_start_pfn + zone->spanned_pages;
+ for (pfn = zone->zone_start_pfn; pfn < max_zone_pfn; pfn++)
+ if (saveable_page(pfn))
+ memory_bm_set_bit(orig_bm, pfn);
+ }
+ memory_bm_position_reset(orig_bm);
+ memory_bm_position_reset(copy_bm);
+ do {
+ pfn = memory_bm_next_pfn(orig_bm);
+ if (likely(pfn != BM_END_OF_MAP)) {
+ struct page *page;
+ void *src;
+
+ page = pfn_to_page(pfn);
+ src = page_address(page);
+ page = pfn_to_page(memory_bm_next_pfn(copy_bm));
+ copy_data_page(page_address(page), src);
+ }
+ } while (pfn != BM_END_OF_MAP);
}
/**
- * Free pages we allocated for suspend. Suspend pages are alocated
- * before atomic copy, so we need to free them after resume.
+ * swsusp_free - free pages allocated for the suspend.
+ *
+ * Suspend pages are alocated before the atomic copy is made, so we
+ * need to release them after the resume.
*/
void swsusp_free(void)
{
struct zone *zone;
- unsigned long zone_pfn;
+ unsigned long pfn, max_zone_pfn;
for_each_zone(zone) {
- for (zone_pfn = 0; zone_pfn < zone->spanned_pages; ++zone_pfn)
- if (pfn_valid(zone_pfn + zone->zone_start_pfn)) {
- struct page *page;
- page = pfn_to_page(zone_pfn + zone->zone_start_pfn);
+ max_zone_pfn = zone->zone_start_pfn + zone->spanned_pages;
+ for (pfn = zone->zone_start_pfn; pfn < max_zone_pfn; pfn++)
+ if (pfn_valid(pfn)) {
+ struct page *page = pfn_to_page(pfn);
+
if (PageNosave(page) && PageNosaveFree(page)) {
ClearPageNosave(page);
ClearPageNosaveFree(page);
@@ -399,7 +798,7 @@ void swsusp_free(void)
}
nr_copy_pages = 0;
nr_meta_pages = 0;
- pagedir_nosave = NULL;
+ restore_pblist = NULL;
buffer = NULL;
}
@@ -414,46 +813,57 @@ void swsusp_free(void)
static int enough_free_mem(unsigned int nr_pages)
{
struct zone *zone;
- unsigned int n = 0;
+ unsigned int free = 0, meta = 0;
for_each_zone (zone)
- if (!is_highmem(zone))
- n += zone->free_pages;
- pr_debug("swsusp: available memory: %u pages\n", n);
- return n > (nr_pages + PAGES_FOR_IO +
- (nr_pages + PBES_PER_PAGE - 1) / PBES_PER_PAGE);
-}
+ if (!is_highmem(zone)) {
+ free += zone->free_pages;
+ meta += snapshot_additional_pages(zone);
+ }
-static int alloc_data_pages(struct pbe *pblist, gfp_t gfp_mask, int safe_needed)
-{
- struct pbe *p;
+ pr_debug("swsusp: pages needed: %u + %u + %u, available pages: %u\n",
+ nr_pages, PAGES_FOR_IO, meta, free);
- for_each_pbe (p, pblist) {
- p->address = (unsigned long)alloc_image_page(gfp_mask, safe_needed);
- if (!p->address)
- return -ENOMEM;
- }
- return 0;
+ return free > nr_pages + PAGES_FOR_IO + meta;
}
-static struct pbe *swsusp_alloc(unsigned int nr_pages)
+static int
+swsusp_alloc(struct memory_bitmap *orig_bm, struct memory_bitmap *copy_bm,
+ unsigned int nr_pages)
{
- struct pbe *pblist;
+ int error;
- if (!(pblist = alloc_pagedir(nr_pages, GFP_ATOMIC | __GFP_COLD, 0))) {
- printk(KERN_ERR "suspend: Allocating pagedir failed.\n");
- return NULL;
- }
+ error = memory_bm_create(orig_bm, GFP_ATOMIC | __GFP_COLD, PG_ANY);
+ if (error)
+ goto Free;
- if (alloc_data_pages(pblist, GFP_ATOMIC | __GFP_COLD, 0)) {
- printk(KERN_ERR "suspend: Allocating image pages failed.\n");
- swsusp_free();
- return NULL;
+ error = memory_bm_create(copy_bm, GFP_ATOMIC | __GFP_COLD, PG_ANY);
+ if (error)
+ goto Free;
+
+ while (nr_pages-- > 0) {
+ struct page *page = alloc_page(GFP_ATOMIC | __GFP_COLD);
+ if (!page)
+ goto Free;
+
+ SetPageNosave(page);
+ SetPageNosaveFree(page);
+ memory_bm_set_bit(copy_bm, page_to_pfn(page));
}
+ return 0;
- return pblist;
+Free:
+ swsusp_free();
+ return -ENOMEM;
}
+/* Memory bitmap used for marking saveable pages */
+static struct memory_bitmap orig_bm;
+/* Memory bitmap used for marking allocated pages that will contain the copies
+ * of saveable pages
+ */
+static struct memory_bitmap copy_bm;
+
asmlinkage int swsusp_save(void)
{
unsigned int nr_pages;
@@ -464,25 +874,19 @@ asmlinkage int swsusp_save(void)
nr_pages = count_data_pages();
printk("swsusp: Need to copy %u pages\n", nr_pages);
- pr_debug("swsusp: pages needed: %u + %lu + %u, free: %u\n",
- nr_pages,
- (nr_pages + PBES_PER_PAGE - 1) / PBES_PER_PAGE,
- PAGES_FOR_IO, nr_free_pages());
-
if (!enough_free_mem(nr_pages)) {
printk(KERN_ERR "swsusp: Not enough free memory\n");
return -ENOMEM;
}
- pagedir_nosave = swsusp_alloc(nr_pages);
- if (!pagedir_nosave)
+ if (swsusp_alloc(&orig_bm, &copy_bm, nr_pages))
return -ENOMEM;
/* During allocating of suspend pagedir, new cold pages may appear.
* Kill them.
*/
drain_local_pages();
- copy_data_pages(pagedir_nosave);
+ copy_data_pages(&copy_bm, &orig_bm);
/*
* End of critical section. From now on, we can write to memory,
@@ -511,22 +915,20 @@ static void init_header(struct swsusp_info *info)
}
/**
- * pack_orig_addresses - the .orig_address fields of the PBEs from the
- * list starting at @pbe are stored in the array @buf[] (1 page)
+ * pack_pfns - pfns corresponding to the set bits found in the bitmap @bm
+ * are stored in the array @buf[] (1 page at a time)
*/
-static inline struct pbe *pack_orig_addresses(unsigned long *buf, struct pbe *pbe)
+static inline void
+pack_pfns(unsigned long *buf, struct memory_bitmap *bm)
{
int j;
- for (j = 0; j < PAGE_SIZE / sizeof(long) && pbe; j++) {
- buf[j] = pbe->orig_address;
- pbe = pbe->next;
+ for (j = 0; j < PAGE_SIZE / sizeof(long); j++) {
+ buf[j] = memory_bm_next_pfn(bm);
+ if (unlikely(buf[j] == BM_END_OF_MAP))
+ break;
}
- if (!pbe)
- for (; j < PAGE_SIZE / sizeof(long); j++)
- buf[j] = 0;
- return pbe;
}
/**
@@ -553,37 +955,39 @@ static inline struct pbe *pack_orig_addresses(unsigned long *buf, struct pbe *pb
int snapshot_read_next(struct snapshot_handle *handle, size_t count)
{
- if (handle->page > nr_meta_pages + nr_copy_pages)
+ if (handle->cur > nr_meta_pages + nr_copy_pages)
return 0;
+
if (!buffer) {
/* This makes the buffer be freed by swsusp_free() */
- buffer = alloc_image_page(GFP_ATOMIC, 0);
+ buffer = alloc_image_page(GFP_ATOMIC, PG_ANY);
if (!buffer)
return -ENOMEM;
}
if (!handle->offset) {
init_header((struct swsusp_info *)buffer);
handle->buffer = buffer;
- handle->pbe = pagedir_nosave;
+ memory_bm_position_reset(&orig_bm);
+ memory_bm_position_reset(&copy_bm);
}
- if (handle->prev < handle->page) {
- if (handle->page <= nr_meta_pages) {
- handle->pbe = pack_orig_addresses(buffer, handle->pbe);
- if (!handle->pbe)
- handle->pbe = pagedir_nosave;
+ if (handle->prev < handle->cur) {
+ if (handle->cur <= nr_meta_pages) {
+ memset(buffer, 0, PAGE_SIZE);
+ pack_pfns(buffer, &orig_bm);
} else {
- handle->buffer = (void *)handle->pbe->address;
- handle->pbe = handle->pbe->next;
+ unsigned long pfn = memory_bm_next_pfn(&copy_bm);
+
+ handle->buffer = page_address(pfn_to_page(pfn));
}
- handle->prev = handle->page;
+ handle->prev = handle->cur;
}
- handle->buf_offset = handle->page_offset;
- if (handle->page_offset + count >= PAGE_SIZE) {
- count = PAGE_SIZE - handle->page_offset;
- handle->page_offset = 0;
- handle->page++;
+ handle->buf_offset = handle->cur_offset;
+ if (handle->cur_offset + count >= PAGE_SIZE) {
+ count = PAGE_SIZE - handle->cur_offset;
+ handle->cur_offset = 0;
+ handle->cur++;
} else {
- handle->page_offset += count;
+ handle->cur_offset += count;
}
handle->offset += count;
return count;
@@ -595,47 +999,50 @@ int snapshot_read_next(struct snapshot_handle *handle, size_t count)
* had been used before suspend
*/
-static int mark_unsafe_pages(struct pbe *pblist)
+static int mark_unsafe_pages(struct memory_bitmap *bm)
{
struct zone *zone;
- unsigned long zone_pfn;
- struct pbe *p;
-
- if (!pblist) /* a sanity check */
- return -EINVAL;
+ unsigned long pfn, max_zone_pfn;
/* Clear page flags */
for_each_zone (zone) {
- for (zone_pfn = 0; zone_pfn < zone->spanned_pages; ++zone_pfn)
- if (pfn_valid(zone_pfn + zone->zone_start_pfn))
- ClearPageNosaveFree(pfn_to_page(zone_pfn +
- zone->zone_start_pfn));
+ max_zone_pfn = zone->zone_start_pfn + zone->spanned_pages;
+ for (pfn = zone->zone_start_pfn; pfn < max_zone_pfn; pfn++)
+ if (pfn_valid(pfn))
+ ClearPageNosaveFree(pfn_to_page(pfn));
}
- /* Mark orig addresses */
- for_each_pbe (p, pblist) {
- if (virt_addr_valid(p->orig_address))
- SetPageNosaveFree(virt_to_page(p->orig_address));
- else
- return -EFAULT;
- }
+ /* Mark pages that correspond to the "original" pfns as "unsafe" */
+ memory_bm_position_reset(bm);
+ do {
+ pfn = memory_bm_next_pfn(bm);
+ if (likely(pfn != BM_END_OF_MAP)) {
+ if (likely(pfn_valid(pfn)))
+ SetPageNosaveFree(pfn_to_page(pfn));
+ else
+ return -EFAULT;
+ }
+ } while (pfn != BM_END_OF_MAP);
- unsafe_pages = 0;
+ allocated_unsafe_pages = 0;
return 0;
}
-static void copy_page_backup_list(struct pbe *dst, struct pbe *src)
+static void
+duplicate_memory_bitmap(struct memory_bitmap *dst, struct memory_bitmap *src)
{
- /* We assume both lists contain the same number of elements */
- while (src) {
- dst->orig_address = src->orig_address;
- dst = dst->next;
- src = src->next;
+ unsigned long pfn;
+
+ memory_bm_position_reset(src);
+ pfn = memory_bm_next_pfn(src);
+ while (pfn != BM_END_OF_MAP) {
+ memory_bm_set_bit(dst, pfn);
+ pfn = memory_bm_next_pfn(src);
}
}
-static int check_header(struct swsusp_info *info)
+static inline int check_header(struct swsusp_info *info)
{
char *reason = NULL;
@@ -662,19 +1069,14 @@ static int check_header(struct swsusp_info *info)
* load header - check the image header and copy data from it
*/
-static int load_header(struct snapshot_handle *handle,
- struct swsusp_info *info)
+static int
+load_header(struct swsusp_info *info)
{
int error;
- struct pbe *pblist;
+ restore_pblist = NULL;
error = check_header(info);
if (!error) {
- pblist = alloc_pagedir(info->image_pages, GFP_ATOMIC, 0);
- if (!pblist)
- return -ENOMEM;
- pagedir_nosave = pblist;
- handle->pbe = pblist;
nr_copy_pages = info->image_pages;
nr_meta_pages = info->pages - info->image_pages - 1;
}
@@ -682,113 +1084,137 @@ static int load_header(struct snapshot_handle *handle,
}
/**
- * unpack_orig_addresses - copy the elements of @buf[] (1 page) to
- * the PBEs in the list starting at @pbe
+ * unpack_orig_pfns - for each element of @buf[] (1 page at a time) set
+ * the corresponding bit in the memory bitmap @bm
*/
-static inline struct pbe *unpack_orig_addresses(unsigned long *buf,
- struct pbe *pbe)
+static inline void
+unpack_orig_pfns(unsigned long *buf, struct memory_bitmap *bm)
{
int j;
- for (j = 0; j < PAGE_SIZE / sizeof(long) && pbe; j++) {
- pbe->orig_address = buf[j];
- pbe = pbe->next;
+ for (j = 0; j < PAGE_SIZE / sizeof(long); j++) {
+ if (unlikely(buf[j] == BM_END_OF_MAP))
+ break;
+
+ memory_bm_set_bit(bm, buf[j]);
}
- return pbe;
}
/**
- * prepare_image - use metadata contained in the PBE list
- * pointed to by pagedir_nosave to mark the pages that will
- * be overwritten in the process of restoring the system
- * memory state from the image ("unsafe" pages) and allocate
- * memory for the image
+ * prepare_image - use the memory bitmap @bm to mark the pages that will
+ * be overwritten in the process of restoring the system memory state
+ * from the suspend image ("unsafe" pages) and allocate memory for the
+ * image.
*
- * The idea is to allocate the PBE list first and then
- * allocate as many pages as it's needed for the image data,
- * but not to assign these pages to the PBEs initially.
- * Instead, we just mark them as allocated and create a list
- * of "safe" which will be used later
+ * The idea is to allocate a new memory bitmap first and then allocate
+ * as many pages as needed for the image data, but not to assign these
+ * pages to specific tasks initially. Instead, we just mark them as
+ * allocated and create a list of "safe" pages that will be used later.
*/
-struct safe_page {
- struct safe_page *next;
- char padding[PAGE_SIZE - sizeof(void *)];
-};
+#define PBES_PER_LINKED_PAGE (LINKED_PAGE_DATA_SIZE / sizeof(struct pbe))
-static struct safe_page *safe_pages;
+static struct linked_page *safe_pages_list;
-static int prepare_image(struct snapshot_handle *handle)
+static int
+prepare_image(struct memory_bitmap *new_bm, struct memory_bitmap *bm)
{
- int error = 0;
- unsigned int nr_pages = nr_copy_pages;
- struct pbe *p, *pblist = NULL;
+ unsigned int nr_pages;
+ struct linked_page *sp_list, *lp;
+ int error;
- p = pagedir_nosave;
- error = mark_unsafe_pages(p);
- if (!error) {
- pblist = alloc_pagedir(nr_pages, GFP_ATOMIC, 1);
- if (pblist)
- copy_page_backup_list(pblist, p);
- free_pagedir(p, 0);
- if (!pblist)
+ error = mark_unsafe_pages(bm);
+ if (error)
+ goto Free;
+
+ error = memory_bm_create(new_bm, GFP_ATOMIC, PG_SAFE);
+ if (error)
+ goto Free;
+
+ duplicate_memory_bitmap(new_bm, bm);
+ memory_bm_free(bm, PG_UNSAFE_KEEP);
+ /* Reserve some safe pages for potential later use.
+ *
+ * NOTE: This way we make sure there will be enough safe pages for the
+ * chain_alloc() in get_buffer(). It is a bit wasteful, but
+ * nr_copy_pages cannot be greater than 50% of the memory anyway.
+ */
+ sp_list = NULL;
+ /* nr_copy_pages cannot be lesser than allocated_unsafe_pages */
+ nr_pages = nr_copy_pages - allocated_unsafe_pages;
+ nr_pages = DIV_ROUND_UP(nr_pages, PBES_PER_LINKED_PAGE);
+ while (nr_pages > 0) {
+ lp = alloc_image_page(GFP_ATOMIC, PG_SAFE);
+ if (!lp) {
error = -ENOMEM;
+ goto Free;
+ }
+ lp->next = sp_list;
+ sp_list = lp;
+ nr_pages--;
}
- safe_pages = NULL;
- if (!error && nr_pages > unsafe_pages) {
- nr_pages -= unsafe_pages;
- while (nr_pages--) {
- struct safe_page *ptr;
-
- ptr = (struct safe_page *)get_zeroed_page(GFP_ATOMIC);
- if (!ptr) {
- error = -ENOMEM;
- break;
- }
- if (!PageNosaveFree(virt_to_page(ptr))) {
- /* The page is "safe", add it to the list */
- ptr->next = safe_pages;
- safe_pages = ptr;
- }
- /* Mark the page as allocated */
- SetPageNosave(virt_to_page(ptr));
- SetPageNosaveFree(virt_to_page(ptr));
+ /* Preallocate memory for the image */
+ safe_pages_list = NULL;
+ nr_pages = nr_copy_pages - allocated_unsafe_pages;
+ while (nr_pages > 0) {
+ lp = (struct linked_page *)get_zeroed_page(GFP_ATOMIC);
+ if (!lp) {
+ error = -ENOMEM;
+ goto Free;
+ }
+ if (!PageNosaveFree(virt_to_page(lp))) {
+ /* The page is "safe", add it to the list */
+ lp->next = safe_pages_list;
+ safe_pages_list = lp;
}
+ /* Mark the page as allocated */
+ SetPageNosave(virt_to_page(lp));
+ SetPageNosaveFree(virt_to_page(lp));
+ nr_pages--;
}
- if (!error) {
- pagedir_nosave = pblist;
- } else {
- handle->pbe = NULL;
- swsusp_free();
+ /* Free the reserved safe pages so that chain_alloc() can use them */
+ while (sp_list) {
+ lp = sp_list->next;
+ free_image_page(sp_list, PG_UNSAFE_CLEAR);
+ sp_list = lp;
}
+ return 0;
+
+Free:
+ swsusp_free();
return error;
}
-static void *get_buffer(struct snapshot_handle *handle)
+/**
+ * get_buffer - compute the address that snapshot_write_next() should
+ * set for its caller to write to.
+ */
+
+static void *get_buffer(struct memory_bitmap *bm, struct chain_allocator *ca)
{
- struct pbe *pbe = handle->pbe, *last = handle->last_pbe;
- struct page *page = virt_to_page(pbe->orig_address);
+ struct pbe *pbe;
+ struct page *page = pfn_to_page(memory_bm_next_pfn(bm));
- if (PageNosave(page) && PageNosaveFree(page)) {
- /*
- * We have allocated the "original" page frame and we can
- * use it directly to store the read page
+ if (PageNosave(page) && PageNosaveFree(page))
+ /* We have allocated the "original" page frame and we can
+ * use it directly to store the loaded page.
*/
- pbe->address = 0;
- if (last && last->next)
- last->next = NULL;
- return (void *)pbe->orig_address;
- }
- /*
- * The "original" page frame has not been allocated and we have to
- * use a "safe" page frame to store the read page
+ return page_address(page);
+
+ /* The "original" page frame has not been allocated and we have to
+ * use a "safe" page frame to store the loaded page.
*/
- pbe->address = (unsigned long)safe_pages;
- safe_pages = safe_pages->next;
- if (last)
- last->next = pbe;
- handle->last_pbe = pbe;
+ pbe = chain_alloc(ca, sizeof(struct pbe));
+ if (!pbe) {
+ swsusp_free();
+ return NULL;
+ }
+ pbe->orig_address = (unsigned long)page_address(page);
+ pbe->address = (unsigned long)safe_pages_list;
+ safe_pages_list = safe_pages_list->next;
+ pbe->next = restore_pblist;
+ restore_pblist = pbe;
return (void *)pbe->address;
}
@@ -816,46 +1242,60 @@ static void *get_buffer(struct snapshot_handle *handle)
int snapshot_write_next(struct snapshot_handle *handle, size_t count)
{
+ static struct chain_allocator ca;
int error = 0;
- if (handle->prev && handle->page > nr_meta_pages + nr_copy_pages)
+ /* Check if we have already loaded the entire image */
+ if (handle->prev && handle->cur > nr_meta_pages + nr_copy_pages)
return 0;
+
if (!buffer) {
/* This makes the buffer be freed by swsusp_free() */
- buffer = alloc_image_page(GFP_ATOMIC, 0);
+ buffer = alloc_image_page(GFP_ATOMIC, PG_ANY);
if (!buffer)
return -ENOMEM;
}
if (!handle->offset)
handle->buffer = buffer;
- if (handle->prev < handle->page) {
- if (!handle->prev) {
- error = load_header(handle, (struct swsusp_info *)buffer);
+ handle->sync_read = 1;
+ if (handle->prev < handle->cur) {
+ if (handle->prev == 0) {
+ error = load_header(buffer);
if (error)
return error;
+
+ error = memory_bm_create(&copy_bm, GFP_ATOMIC, PG_ANY);
+ if (error)
+ return error;
+
} else if (handle->prev <= nr_meta_pages) {
- handle->pbe = unpack_orig_addresses(buffer, handle->pbe);
- if (!handle->pbe) {
- error = prepare_image(handle);
+ unpack_orig_pfns(buffer, &copy_bm);
+ if (handle->prev == nr_meta_pages) {
+ error = prepare_image(&orig_bm, &copy_bm);
if (error)
return error;
- handle->pbe = pagedir_nosave;
- handle->last_pbe = NULL;
- handle->buffer = get_buffer(handle);
+
+ chain_init(&ca, GFP_ATOMIC, PG_SAFE);
+ memory_bm_position_reset(&orig_bm);
+ restore_pblist = NULL;
+ handle->buffer = get_buffer(&orig_bm, &ca);
+ handle->sync_read = 0;
+ if (!handle->buffer)
+ return -ENOMEM;
}
} else {
- handle->pbe = handle->pbe->next;
- handle->buffer = get_buffer(handle);
+ handle->buffer = get_buffer(&orig_bm, &ca);
+ handle->sync_read = 0;
}
- handle->prev = handle->page;
+ handle->prev = handle->cur;
}
- handle->buf_offset = handle->page_offset;
- if (handle->page_offset + count >= PAGE_SIZE) {
- count = PAGE_SIZE - handle->page_offset;
- handle->page_offset = 0;
- handle->page++;
+ handle->buf_offset = handle->cur_offset;
+ if (handle->cur_offset + count >= PAGE_SIZE) {
+ count = PAGE_SIZE - handle->cur_offset;
+ handle->cur_offset = 0;
+ handle->cur++;
} else {
- handle->page_offset += count;
+ handle->cur_offset += count;
}
handle->offset += count;
return count;
@@ -863,6 +1303,13 @@ int snapshot_write_next(struct snapshot_handle *handle, size_t count)
int snapshot_image_loaded(struct snapshot_handle *handle)
{
- return !(!handle->pbe || handle->pbe->next || !nr_copy_pages ||
- handle->page <= nr_meta_pages + nr_copy_pages);
+ return !(!nr_copy_pages ||
+ handle->cur <= nr_meta_pages + nr_copy_pages);
+}
+
+void snapshot_free_unused_memory(struct snapshot_handle *handle)
+{
+ /* Free only if we have loaded the image entirely */
+ if (handle->prev && handle->cur > nr_meta_pages + nr_copy_pages)
+ memory_bm_free(&orig_bm, PG_UNSAFE_CLEAR);
}
diff --git a/kernel/power/swap.c b/kernel/power/swap.c
index f1dd146bd64d..9b2ee5344dee 100644
--- a/kernel/power/swap.c
+++ b/kernel/power/swap.c
@@ -22,6 +22,7 @@
#include <linux/device.h>
#include <linux/buffer_head.h>
#include <linux/bio.h>
+#include <linux/blkdev.h>
#include <linux/swap.h>
#include <linux/swapops.h>
#include <linux/pm.h>
@@ -49,18 +50,16 @@ static int mark_swapfiles(swp_entry_t start)
{
int error;
- rw_swap_page_sync(READ,
- swp_entry(root_swap, 0),
- virt_to_page((unsigned long)&swsusp_header));
+ rw_swap_page_sync(READ, swp_entry(root_swap, 0),
+ virt_to_page((unsigned long)&swsusp_header), NULL);
if (!memcmp("SWAP-SPACE",swsusp_header.sig, 10) ||
!memcmp("SWAPSPACE2",swsusp_header.sig, 10)) {
memcpy(swsusp_header.orig_sig,swsusp_header.sig, 10);
memcpy(swsusp_header.sig,SWSUSP_SIG, 10);
swsusp_header.image = start;
- error = rw_swap_page_sync(WRITE,
- swp_entry(root_swap, 0),
- virt_to_page((unsigned long)
- &swsusp_header));
+ error = rw_swap_page_sync(WRITE, swp_entry(root_swap, 0),
+ virt_to_page((unsigned long)&swsusp_header),
+ NULL);
} else {
pr_debug("swsusp: Partition is not swap space.\n");
error = -ENODEV;
@@ -88,16 +87,37 @@ static int swsusp_swap_check(void) /* This is called before saving image */
* write_page - Write one page to given swap location.
* @buf: Address we're writing.
* @offset: Offset of the swap page we're writing to.
+ * @bio_chain: Link the next write BIO here
*/
-static int write_page(void *buf, unsigned long offset)
+static int write_page(void *buf, unsigned long offset, struct bio **bio_chain)
{
swp_entry_t entry;
int error = -ENOSPC;
if (offset) {
+ struct page *page = virt_to_page(buf);
+
+ if (bio_chain) {
+ /*
+ * Whether or not we successfully allocated a copy page,
+ * we take a ref on the page here. It gets undone in
+ * wait_on_bio_chain().
+ */
+ struct page *page_copy;
+ page_copy = alloc_page(GFP_ATOMIC);
+ if (page_copy == NULL) {
+ WARN_ON_ONCE(1);
+ bio_chain = NULL; /* Go synchronous */
+ get_page(page);
+ } else {
+ memcpy(page_address(page_copy),
+ page_address(page), PAGE_SIZE);
+ page = page_copy;
+ }
+ }
entry = swp_entry(root_swap, offset);
- error = rw_swap_page_sync(WRITE, entry, virt_to_page(buf));
+ error = rw_swap_page_sync(WRITE, entry, page, bio_chain);
}
return error;
}
@@ -146,6 +166,26 @@ static void release_swap_writer(struct swap_map_handle *handle)
handle->bitmap = NULL;
}
+static void show_speed(struct timeval *start, struct timeval *stop,
+ unsigned nr_pages, char *msg)
+{
+ s64 elapsed_centisecs64;
+ int centisecs;
+ int k;
+ int kps;
+
+ elapsed_centisecs64 = timeval_to_ns(stop) - timeval_to_ns(start);
+ do_div(elapsed_centisecs64, NSEC_PER_SEC / 100);
+ centisecs = elapsed_centisecs64;
+ if (centisecs == 0)
+ centisecs = 1; /* avoid div-by-zero */
+ k = nr_pages * (PAGE_SIZE / 1024);
+ kps = (k * 100) / centisecs;
+ printk("%s %d kbytes in %d.%02d seconds (%d.%02d MB/s)\n", msg, k,
+ centisecs / 100, centisecs % 100,
+ kps / 1000, (kps % 1000) / 10);
+}
+
static int get_swap_writer(struct swap_map_handle *handle)
{
handle->cur = (struct swap_map_page *)get_zeroed_page(GFP_KERNEL);
@@ -165,37 +205,70 @@ static int get_swap_writer(struct swap_map_handle *handle)
return 0;
}
-static int swap_write_page(struct swap_map_handle *handle, void *buf)
+static int wait_on_bio_chain(struct bio **bio_chain)
{
- int error;
+ struct bio *bio;
+ struct bio *next_bio;
+ int ret = 0;
+
+ if (bio_chain == NULL)
+ return 0;
+
+ bio = *bio_chain;
+ if (bio == NULL)
+ return 0;
+ while (bio) {
+ struct page *page;
+
+ next_bio = bio->bi_private;
+ page = bio->bi_io_vec[0].bv_page;
+ wait_on_page_locked(page);
+ if (!PageUptodate(page) || PageError(page))
+ ret = -EIO;
+ put_page(page);
+ bio_put(bio);
+ bio = next_bio;
+ }
+ *bio_chain = NULL;
+ return ret;
+}
+
+static int swap_write_page(struct swap_map_handle *handle, void *buf,
+ struct bio **bio_chain)
+{
+ int error = 0;
unsigned long offset;
if (!handle->cur)
return -EINVAL;
offset = alloc_swap_page(root_swap, handle->bitmap);
- error = write_page(buf, offset);
+ error = write_page(buf, offset, bio_chain);
if (error)
return error;
handle->cur->entries[handle->k++] = offset;
if (handle->k >= MAP_PAGE_ENTRIES) {
+ error = wait_on_bio_chain(bio_chain);
+ if (error)
+ goto out;
offset = alloc_swap_page(root_swap, handle->bitmap);
if (!offset)
return -ENOSPC;
handle->cur->next_swap = offset;
- error = write_page(handle->cur, handle->cur_swap);
+ error = write_page(handle->cur, handle->cur_swap, NULL);
if (error)
- return error;
+ goto out;
memset(handle->cur, 0, PAGE_SIZE);
handle->cur_swap = offset;
handle->k = 0;
}
- return 0;
+out:
+ return error;
}
static int flush_swap_writer(struct swap_map_handle *handle)
{
if (handle->cur && handle->cur_swap)
- return write_page(handle->cur, handle->cur_swap);
+ return write_page(handle->cur, handle->cur_swap, NULL);
else
return -EINVAL;
}
@@ -206,21 +279,29 @@ static int flush_swap_writer(struct swap_map_handle *handle)
static int save_image(struct swap_map_handle *handle,
struct snapshot_handle *snapshot,
- unsigned int nr_pages)
+ unsigned int nr_to_write)
{
unsigned int m;
int ret;
int error = 0;
+ int nr_pages;
+ int err2;
+ struct bio *bio;
+ struct timeval start;
+ struct timeval stop;
- printk("Saving image data pages (%u pages) ... ", nr_pages);
- m = nr_pages / 100;
+ printk("Saving image data pages (%u pages) ... ", nr_to_write);
+ m = nr_to_write / 100;
if (!m)
m = 1;
nr_pages = 0;
+ bio = NULL;
+ do_gettimeofday(&start);
do {
ret = snapshot_read_next(snapshot, PAGE_SIZE);
if (ret > 0) {
- error = swap_write_page(handle, data_of(*snapshot));
+ error = swap_write_page(handle, data_of(*snapshot),
+ &bio);
if (error)
break;
if (!(nr_pages % m))
@@ -228,8 +309,13 @@ static int save_image(struct swap_map_handle *handle,
nr_pages++;
}
} while (ret > 0);
+ err2 = wait_on_bio_chain(&bio);
+ do_gettimeofday(&stop);
+ if (!error)
+ error = err2;
if (!error)
printk("\b\b\b\bdone\n");
+ show_speed(&start, &stop, nr_to_write, "Wrote");
return error;
}
@@ -245,8 +331,7 @@ static int enough_swap(unsigned int nr_pages)
unsigned int free_swap = count_swap_pages(root_swap, 1);
pr_debug("swsusp: free swap pages: %u\n", free_swap);
- return free_swap > (nr_pages + PAGES_FOR_IO +
- (nr_pages + PBES_PER_PAGE - 1) / PBES_PER_PAGE);
+ return free_swap > nr_pages + PAGES_FOR_IO;
}
/**
@@ -266,7 +351,8 @@ int swsusp_write(void)
int error;
if ((error = swsusp_swap_check())) {
- printk(KERN_ERR "swsusp: Cannot find swap device, try swapon -a.\n");
+ printk(KERN_ERR "swsusp: Cannot find swap device, try "
+ "swapon -a.\n");
return error;
}
memset(&snapshot, 0, sizeof(struct snapshot_handle));
@@ -281,7 +367,7 @@ int swsusp_write(void)
error = get_swap_writer(&handle);
if (!error) {
unsigned long start = handle.cur_swap;
- error = swap_write_page(&handle, header);
+ error = swap_write_page(&handle, header, NULL);
if (!error)
error = save_image(&handle, &snapshot,
header->pages - 1);
@@ -298,27 +384,6 @@ int swsusp_write(void)
return error;
}
-/*
- * Using bio to read from swap.
- * This code requires a bit more work than just using buffer heads
- * but, it is the recommended way for 2.5/2.6.
- * The following are to signal the beginning and end of I/O. Bios
- * finish asynchronously, while we want them to happen synchronously.
- * A simple atomic_t, and a wait loop take care of this problem.
- */
-
-static atomic_t io_done = ATOMIC_INIT(0);
-
-static int end_io(struct bio *bio, unsigned int num, int err)
-{
- if (!test_bit(BIO_UPTODATE, &bio->bi_flags)) {
- printk(KERN_ERR "I/O error reading swsusp image.\n");
- return -EIO;
- }
- atomic_set(&io_done, 0);
- return 0;
-}
-
static struct block_device *resume_bdev;
/**
@@ -326,15 +391,15 @@ static struct block_device *resume_bdev;
* @rw: READ or WRITE.
* @off physical offset of page.
* @page: page we're reading or writing.
+ * @bio_chain: list of pending biod (for async reading)
*
* Straight from the textbook - allocate and initialize the bio.
- * If we're writing, make sure the page is marked as dirty.
- * Then submit it and wait.
+ * If we're reading, make sure the page is marked as dirty.
+ * Then submit it and, if @bio_chain == NULL, wait.
*/
-
-static int submit(int rw, pgoff_t page_off, void *page)
+static int submit(int rw, pgoff_t page_off, struct page *page,
+ struct bio **bio_chain)
{
- int error = 0;
struct bio *bio;
bio = bio_alloc(GFP_ATOMIC, 1);
@@ -342,33 +407,40 @@ static int submit(int rw, pgoff_t page_off, void *page)
return -ENOMEM;
bio->bi_sector = page_off * (PAGE_SIZE >> 9);
bio->bi_bdev = resume_bdev;
- bio->bi_end_io = end_io;
+ bio->bi_end_io = end_swap_bio_read;
- if (bio_add_page(bio, virt_to_page(page), PAGE_SIZE, 0) < PAGE_SIZE) {
- printk("swsusp: ERROR: adding page to bio at %ld\n",page_off);
- error = -EFAULT;
- goto Done;
+ if (bio_add_page(bio, page, PAGE_SIZE, 0) < PAGE_SIZE) {
+ printk("swsusp: ERROR: adding page to bio at %ld\n", page_off);
+ bio_put(bio);
+ return -EFAULT;
}
- atomic_set(&io_done, 1);
- submit_bio(rw | (1 << BIO_RW_SYNC), bio);
- while (atomic_read(&io_done))
- yield();
- if (rw == READ)
- bio_set_pages_dirty(bio);
- Done:
- bio_put(bio);
- return error;
+ lock_page(page);
+ bio_get(bio);
+
+ if (bio_chain == NULL) {
+ submit_bio(rw | (1 << BIO_RW_SYNC), bio);
+ wait_on_page_locked(page);
+ if (rw == READ)
+ bio_set_pages_dirty(bio);
+ bio_put(bio);
+ } else {
+ get_page(page);
+ bio->bi_private = *bio_chain;
+ *bio_chain = bio;
+ submit_bio(rw | (1 << BIO_RW_SYNC), bio);
+ }
+ return 0;
}
-static int bio_read_page(pgoff_t page_off, void *page)
+static int bio_read_page(pgoff_t page_off, void *addr, struct bio **bio_chain)
{
- return submit(READ, page_off, page);
+ return submit(READ, page_off, virt_to_page(addr), bio_chain);
}
-static int bio_write_page(pgoff_t page_off, void *page)
+static int bio_write_page(pgoff_t page_off, void *addr)
{
- return submit(WRITE, page_off, page);
+ return submit(WRITE, page_off, virt_to_page(addr), NULL);
}
/**
@@ -393,7 +465,7 @@ static int get_swap_reader(struct swap_map_handle *handle,
handle->cur = (struct swap_map_page *)get_zeroed_page(GFP_ATOMIC);
if (!handle->cur)
return -ENOMEM;
- error = bio_read_page(swp_offset(start), handle->cur);
+ error = bio_read_page(swp_offset(start), handle->cur, NULL);
if (error) {
release_swap_reader(handle);
return error;
@@ -402,7 +474,8 @@ static int get_swap_reader(struct swap_map_handle *handle,
return 0;
}
-static int swap_read_page(struct swap_map_handle *handle, void *buf)
+static int swap_read_page(struct swap_map_handle *handle, void *buf,
+ struct bio **bio_chain)
{
unsigned long offset;
int error;
@@ -412,16 +485,17 @@ static int swap_read_page(struct swap_map_handle *handle, void *buf)
offset = handle->cur->entries[handle->k];
if (!offset)
return -EFAULT;
- error = bio_read_page(offset, buf);
+ error = bio_read_page(offset, buf, bio_chain);
if (error)
return error;
if (++handle->k >= MAP_PAGE_ENTRIES) {
+ error = wait_on_bio_chain(bio_chain);
handle->k = 0;
offset = handle->cur->next_swap;
if (!offset)
release_swap_reader(handle);
- else
- error = bio_read_page(offset, handle->cur);
+ else if (!error)
+ error = bio_read_page(offset, handle->cur, NULL);
}
return error;
}
@@ -434,33 +508,49 @@ static int swap_read_page(struct swap_map_handle *handle, void *buf)
static int load_image(struct swap_map_handle *handle,
struct snapshot_handle *snapshot,
- unsigned int nr_pages)
+ unsigned int nr_to_read)
{
unsigned int m;
- int ret;
int error = 0;
+ struct timeval start;
+ struct timeval stop;
+ struct bio *bio;
+ int err2;
+ unsigned nr_pages;
- printk("Loading image data pages (%u pages) ... ", nr_pages);
- m = nr_pages / 100;
+ printk("Loading image data pages (%u pages) ... ", nr_to_read);
+ m = nr_to_read / 100;
if (!m)
m = 1;
nr_pages = 0;
- do {
- ret = snapshot_write_next(snapshot, PAGE_SIZE);
- if (ret > 0) {
- error = swap_read_page(handle, data_of(*snapshot));
- if (error)
- break;
- if (!(nr_pages % m))
- printk("\b\b\b\b%3d%%", nr_pages / m);
- nr_pages++;
- }
- } while (ret > 0);
+ bio = NULL;
+ do_gettimeofday(&start);
+ for ( ; ; ) {
+ error = snapshot_write_next(snapshot, PAGE_SIZE);
+ if (error <= 0)
+ break;
+ error = swap_read_page(handle, data_of(*snapshot), &bio);
+ if (error)
+ break;
+ if (snapshot->sync_read)
+ error = wait_on_bio_chain(&bio);
+ if (error)
+ break;
+ if (!(nr_pages % m))
+ printk("\b\b\b\b%3d%%", nr_pages / m);
+ nr_pages++;
+ }
+ err2 = wait_on_bio_chain(&bio);
+ do_gettimeofday(&stop);
+ if (!error)
+ error = err2;
if (!error) {
printk("\b\b\b\bdone\n");
+ snapshot_free_unused_memory(snapshot);
if (!snapshot_image_loaded(snapshot))
error = -ENODATA;
}
+ show_speed(&start, &stop, nr_to_read, "Read");
return error;
}
@@ -483,7 +573,7 @@ int swsusp_read(void)
header = (struct swsusp_info *)data_of(snapshot);
error = get_swap_reader(&handle, swsusp_header.image);
if (!error)
- error = swap_read_page(&handle, header);
+ error = swap_read_page(&handle, header, NULL);
if (!error)
error = load_image(&handle, &snapshot, header->pages - 1);
release_swap_reader(&handle);
@@ -509,7 +599,7 @@ int swsusp_check(void)
if (!IS_ERR(resume_bdev)) {
set_blocksize(resume_bdev, PAGE_SIZE);
memset(&swsusp_header, 0, sizeof(swsusp_header));
- if ((error = bio_read_page(0, &swsusp_header)))
+ if ((error = bio_read_page(0, &swsusp_header, NULL)))
return error;
if (!memcmp(SWSUSP_SIG, swsusp_header.sig, 10)) {
memcpy(swsusp_header.sig, swsusp_header.orig_sig, 10);
diff --git a/kernel/power/swsusp.c b/kernel/power/swsusp.c
index 17f669c83012..0b66659dc516 100644
--- a/kernel/power/swsusp.c
+++ b/kernel/power/swsusp.c
@@ -193,14 +193,13 @@ int swsusp_shrink_memory(void)
printk("Shrinking memory... ");
do {
size = 2 * count_highmem_pages();
- size += size / 50 + count_data_pages();
- size += (size + PBES_PER_PAGE - 1) / PBES_PER_PAGE +
- PAGES_FOR_IO;
+ size += size / 50 + count_data_pages() + PAGES_FOR_IO;
tmp = size;
for_each_zone (zone)
if (!is_highmem(zone) && populated_zone(zone)) {
tmp -= zone->free_pages;
tmp += zone->lowmem_reserve[ZONE_NORMAL];
+ tmp += snapshot_additional_pages(zone);
}
if (tmp > 0) {
tmp = __shrink_memory(tmp);
@@ -248,6 +247,9 @@ int swsusp_suspend(void)
restore_processor_state();
Restore_highmem:
restore_highmem();
+ /* NOTE: device_power_up() is just a resume() for devices
+ * that suspended with irqs off ... no overall powerup.
+ */
device_power_up();
Enable_irqs:
local_irq_enable();
@@ -257,8 +259,12 @@ Enable_irqs:
int swsusp_resume(void)
{
int error;
+
local_irq_disable();
- if (device_power_down(PMSG_FREEZE))
+ /* NOTE: device_power_down() is just a suspend() with irqs off;
+ * it has no special "power things down" semantics
+ */
+ if (device_power_down(PMSG_PRETHAW))
printk(KERN_ERR "Some devices failed to power down, very bad\n");
/* We'll ignore saved state, but this gets preempt count (etc) right */
save_processor_state();
diff --git a/kernel/power/user.c b/kernel/power/user.c
index 3f1539fbe48a..72825c853cd7 100644
--- a/kernel/power/user.c
+++ b/kernel/power/user.c
@@ -19,6 +19,7 @@
#include <linux/swapops.h>
#include <linux/pm.h>
#include <linux/fs.h>
+#include <linux/cpu.h>
#include <asm/uaccess.h>
@@ -139,12 +140,15 @@ static int snapshot_ioctl(struct inode *inode, struct file *filp,
if (data->frozen)
break;
down(&pm_sem);
- disable_nonboot_cpus();
- if (freeze_processes()) {
- thaw_processes();
- enable_nonboot_cpus();
- error = -EBUSY;
+ error = disable_nonboot_cpus();
+ if (!error) {
+ error = freeze_processes();
+ if (error) {
+ thaw_processes();
+ error = -EBUSY;
+ }
}
+ enable_nonboot_cpus();
up(&pm_sem);
if (!error)
data->frozen = 1;
@@ -189,9 +193,10 @@ static int snapshot_ioctl(struct inode *inode, struct file *filp,
error = -EPERM;
break;
}
+ snapshot_free_unused_memory(&data->handle);
down(&pm_sem);
pm_prepare_console();
- error = device_suspend(PMSG_FREEZE);
+ error = device_suspend(PMSG_PRETHAW);
if (!error) {
error = swsusp_resume();
device_resume();
diff --git a/kernel/printk.c b/kernel/printk.c
index 1149365e989e..771f5e861bcd 100644
--- a/kernel/printk.c
+++ b/kernel/printk.c
@@ -721,6 +721,7 @@ int __init add_preferred_console(char *name, int idx, char *options)
return 0;
}
+#ifndef CONFIG_DISABLE_CONSOLE_SUSPEND
/**
* suspend_console - suspend the console subsystem
*
@@ -728,6 +729,7 @@ int __init add_preferred_console(char *name, int idx, char *options)
*/
void suspend_console(void)
{
+ printk("Suspending console(s)\n");
acquire_console_sem();
console_suspended = 1;
}
@@ -737,6 +739,7 @@ void resume_console(void)
console_suspended = 0;
release_console_sem();
}
+#endif /* CONFIG_DISABLE_CONSOLE_SUSPEND */
/**
* acquire_console_sem - lock the console system for exclusive use.
diff --git a/kernel/profile.c b/kernel/profile.c
index d5bd75e7501c..fb660c7d35ba 100644
--- a/kernel/profile.c
+++ b/kernel/profile.c
@@ -309,13 +309,17 @@ static int __devinit profile_cpu_callback(struct notifier_block *info,
node = cpu_to_node(cpu);
per_cpu(cpu_profile_flip, cpu) = 0;
if (!per_cpu(cpu_profile_hits, cpu)[1]) {
- page = alloc_pages_node(node, GFP_KERNEL | __GFP_ZERO, 0);
+ page = alloc_pages_node(node,
+ GFP_KERNEL | __GFP_ZERO | GFP_THISNODE,
+ 0);
if (!page)
return NOTIFY_BAD;
per_cpu(cpu_profile_hits, cpu)[1] = page_address(page);
}
if (!per_cpu(cpu_profile_hits, cpu)[0]) {
- page = alloc_pages_node(node, GFP_KERNEL | __GFP_ZERO, 0);
+ page = alloc_pages_node(node,
+ GFP_KERNEL | __GFP_ZERO | GFP_THISNODE,
+ 0);
if (!page)
goto out_free;
per_cpu(cpu_profile_hits, cpu)[0] = page_address(page);
@@ -491,12 +495,16 @@ static int __init create_hash_tables(void)
int node = cpu_to_node(cpu);
struct page *page;
- page = alloc_pages_node(node, GFP_KERNEL | __GFP_ZERO, 0);
+ page = alloc_pages_node(node,
+ GFP_KERNEL | __GFP_ZERO | GFP_THISNODE,
+ 0);
if (!page)
goto out_cleanup;
per_cpu(cpu_profile_hits, cpu)[1]
= (struct profile_hit *)page_address(page);
- page = alloc_pages_node(node, GFP_KERNEL | __GFP_ZERO, 0);
+ page = alloc_pages_node(node,
+ GFP_KERNEL | __GFP_ZERO | GFP_THISNODE,
+ 0);
if (!page)
goto out_cleanup;
per_cpu(cpu_profile_hits, cpu)[0]
diff --git a/kernel/sched.c b/kernel/sched.c
index a234fbee1238..5c848fd4e461 100644
--- a/kernel/sched.c
+++ b/kernel/sched.c
@@ -238,6 +238,7 @@ struct rq {
/* For active balancing */
int active_balance;
int push_cpu;
+ int cpu; /* cpu of this runqueue */
struct task_struct *migration_thread;
struct list_head migration_queue;
@@ -267,6 +268,15 @@ struct rq {
static DEFINE_PER_CPU(struct rq, runqueues);
+static inline int cpu_of(struct rq *rq)
+{
+#ifdef CONFIG_SMP
+ return rq->cpu;
+#else
+ return 0;
+#endif
+}
+
/*
* The domain tree (rq->sd) is protected by RCU's quiescent state transition.
* See detach_destroy_domains: synchronize_sched for details.
@@ -2211,7 +2221,8 @@ out:
*/
static struct sched_group *
find_busiest_group(struct sched_domain *sd, int this_cpu,
- unsigned long *imbalance, enum idle_type idle, int *sd_idle)
+ unsigned long *imbalance, enum idle_type idle, int *sd_idle,
+ cpumask_t *cpus)
{
struct sched_group *busiest = NULL, *this = NULL, *group = sd->groups;
unsigned long max_load, avg_load, total_load, this_load, total_pwr;
@@ -2248,7 +2259,12 @@ find_busiest_group(struct sched_domain *sd, int this_cpu,
sum_weighted_load = sum_nr_running = avg_load = 0;
for_each_cpu_mask(i, group->cpumask) {
- struct rq *rq = cpu_rq(i);
+ struct rq *rq;
+
+ if (!cpu_isset(i, *cpus))
+ continue;
+
+ rq = cpu_rq(i);
if (*sd_idle && !idle_cpu(i))
*sd_idle = 0;
@@ -2466,13 +2482,17 @@ ret:
*/
static struct rq *
find_busiest_queue(struct sched_group *group, enum idle_type idle,
- unsigned long imbalance)
+ unsigned long imbalance, cpumask_t *cpus)
{
struct rq *busiest = NULL, *rq;
unsigned long max_load = 0;
int i;
for_each_cpu_mask(i, group->cpumask) {
+
+ if (!cpu_isset(i, *cpus))
+ continue;
+
rq = cpu_rq(i);
if (rq->nr_running == 1 && rq->raw_weighted_load > imbalance)
@@ -2511,6 +2531,7 @@ static int load_balance(int this_cpu, struct rq *this_rq,
struct sched_group *group;
unsigned long imbalance;
struct rq *busiest;
+ cpumask_t cpus = CPU_MASK_ALL;
if (idle != NOT_IDLE && sd->flags & SD_SHARE_CPUPOWER &&
!sched_smt_power_savings)
@@ -2518,13 +2539,15 @@ static int load_balance(int this_cpu, struct rq *this_rq,
schedstat_inc(sd, lb_cnt[idle]);
- group = find_busiest_group(sd, this_cpu, &imbalance, idle, &sd_idle);
+redo:
+ group = find_busiest_group(sd, this_cpu, &imbalance, idle, &sd_idle,
+ &cpus);
if (!group) {
schedstat_inc(sd, lb_nobusyg[idle]);
goto out_balanced;
}
- busiest = find_busiest_queue(group, idle, imbalance);
+ busiest = find_busiest_queue(group, idle, imbalance, &cpus);
if (!busiest) {
schedstat_inc(sd, lb_nobusyq[idle]);
goto out_balanced;
@@ -2549,8 +2572,12 @@ static int load_balance(int this_cpu, struct rq *this_rq,
double_rq_unlock(this_rq, busiest);
/* All tasks on this runqueue were pinned by CPU affinity */
- if (unlikely(all_pinned))
+ if (unlikely(all_pinned)) {
+ cpu_clear(cpu_of(busiest), cpus);
+ if (!cpus_empty(cpus))
+ goto redo;
goto out_balanced;
+ }
}
if (!nr_moved) {
@@ -2639,18 +2666,22 @@ load_balance_newidle(int this_cpu, struct rq *this_rq, struct sched_domain *sd)
unsigned long imbalance;
int nr_moved = 0;
int sd_idle = 0;
+ cpumask_t cpus = CPU_MASK_ALL;
if (sd->flags & SD_SHARE_CPUPOWER && !sched_smt_power_savings)
sd_idle = 1;
schedstat_inc(sd, lb_cnt[NEWLY_IDLE]);
- group = find_busiest_group(sd, this_cpu, &imbalance, NEWLY_IDLE, &sd_idle);
+redo:
+ group = find_busiest_group(sd, this_cpu, &imbalance, NEWLY_IDLE,
+ &sd_idle, &cpus);
if (!group) {
schedstat_inc(sd, lb_nobusyg[NEWLY_IDLE]);
goto out_balanced;
}
- busiest = find_busiest_queue(group, NEWLY_IDLE, imbalance);
+ busiest = find_busiest_queue(group, NEWLY_IDLE, imbalance,
+ &cpus);
if (!busiest) {
schedstat_inc(sd, lb_nobusyq[NEWLY_IDLE]);
goto out_balanced;
@@ -2668,6 +2699,12 @@ load_balance_newidle(int this_cpu, struct rq *this_rq, struct sched_domain *sd)
minus_1_or_zero(busiest->nr_running),
imbalance, sd, NEWLY_IDLE, NULL);
spin_unlock(&busiest->lock);
+
+ if (!nr_moved) {
+ cpu_clear(cpu_of(busiest), cpus);
+ if (!cpus_empty(cpus))
+ goto redo;
+ }
}
if (!nr_moved) {
@@ -6747,6 +6784,7 @@ void __init sched_init(void)
rq->cpu_load[j] = 0;
rq->active_balance = 0;
rq->push_cpu = 0;
+ rq->cpu = i;
rq->migration_thread = NULL;
INIT_LIST_HEAD(&rq->migration_queue);
#endif
diff --git a/kernel/sysctl.c b/kernel/sysctl.c
index 220e20564124..bcb3a181dbb2 100644
--- a/kernel/sysctl.c
+++ b/kernel/sysctl.c
@@ -960,6 +960,17 @@ static ctl_table vm_table[] = {
.extra1 = &zero,
.extra2 = &one_hundred,
},
+ {
+ .ctl_name = VM_MIN_SLAB,
+ .procname = "min_slab_ratio",
+ .data = &sysctl_min_slab_ratio,
+ .maxlen = sizeof(sysctl_min_slab_ratio),
+ .mode = 0644,
+ .proc_handler = &sysctl_min_slab_ratio_sysctl_handler,
+ .strategy = &sysctl_intvec,
+ .extra1 = &zero,
+ .extra2 = &one_hundred,
+ },
#endif
#ifdef CONFIG_X86_32
{