diff options
Diffstat (limited to 'init')
-rw-r--r-- | init/Kconfig | 439 | ||||
-rw-r--r-- | init/do_mounts.c | 87 | ||||
-rw-r--r-- | init/do_mounts_initrd.c | 11 | ||||
-rw-r--r-- | init/do_mounts_md.c | 2 | ||||
-rw-r--r-- | init/do_mounts_rd.c | 5 | ||||
-rw-r--r-- | init/initramfs.c | 29 | ||||
-rw-r--r-- | init/main.c | 149 | ||||
-rw-r--r-- | init/noinitramfs.c | 6 |
8 files changed, 461 insertions, 267 deletions
diff --git a/init/Kconfig b/init/Kconfig index 313506d8be6e..4e337906016e 100644 --- a/init/Kconfig +++ b/init/Kconfig @@ -21,6 +21,13 @@ config CONSTRUCTORS depends on !UML default y +config HAVE_IRQ_WORK + bool + +config IRQ_WORK + bool + depends on HAVE_IRQ_WORK + menu "General setup" config EXPERIMENTAL @@ -64,7 +71,7 @@ config BROKEN_ON_SMP config LOCK_KERNEL bool - depends on SMP || PREEMPT + depends on (SMP || PREEMPT) && BKL default y config INIT_ENV_ARG_LIMIT @@ -123,13 +130,16 @@ config HAVE_KERNEL_BZIP2 config HAVE_KERNEL_LZMA bool +config HAVE_KERNEL_XZ + bool + config HAVE_KERNEL_LZO bool choice prompt "Kernel compression mode" default KERNEL_GZIP - depends on HAVE_KERNEL_GZIP || HAVE_KERNEL_BZIP2 || HAVE_KERNEL_LZMA || HAVE_KERNEL_LZO + depends on HAVE_KERNEL_GZIP || HAVE_KERNEL_BZIP2 || HAVE_KERNEL_LZMA || HAVE_KERNEL_XZ || HAVE_KERNEL_LZO help The linux kernel is a kind of self-extracting executable. Several compression algorithms are available, which differ @@ -174,12 +184,27 @@ config KERNEL_LZMA two. Compression is slowest. The kernel size is about 33% smaller with LZMA in comparison to gzip. +config KERNEL_XZ + bool "XZ" + depends on HAVE_KERNEL_XZ + help + XZ uses the LZMA2 algorithm and instruction set specific + BCJ filters which can improve compression ratio of executable + code. The size of the kernel is about 30% smaller with XZ in + comparison to gzip. On architectures for which there is a BCJ + filter (i386, x86_64, ARM, IA-64, PowerPC, and SPARC), XZ + will create a few percent smaller kernel than plain LZMA. + + The speed is about the same as with LZMA: The decompression + speed of XZ is better than that of bzip2 but worse than gzip + and LZO. Compression is slow. + config KERNEL_LZO bool "LZO" depends on HAVE_KERNEL_LZO help Its compression ratio is the poorest among the 4. 
The kernel - size is about about 10% bigger than gzip; however its speed + size is about 10% bigger than gzip; however its speed (both compression and decompression) is the fastest. endchoice @@ -320,13 +345,19 @@ config AUDITSYSCALL help Enable low-overhead system-call auditing infrastructure that can be used independently or with another kernel subsystem, - such as SELinux. To use audit's filesystem watch feature, please - ensure that INOTIFY is configured. + such as SELinux. + +config AUDIT_WATCH + def_bool y + depends on AUDITSYSCALL + select FSNOTIFY config AUDIT_TREE def_bool y depends on AUDITSYSCALL - select INOTIFY + select FSNOTIFY + +source "kernel/irq/Kconfig" menu "RCU Subsystem" @@ -336,6 +367,7 @@ choice config TREE_RCU bool "Tree-based hierarchical RCU" + depends on !PREEMPT && SMP help This option selects the RCU implementation that is designed for very large SMP system with hundreds or @@ -343,7 +375,7 @@ config TREE_RCU smaller systems. config TREE_PREEMPT_RCU - bool "Preemptable tree-based hierarchical RCU" + bool "Preemptible tree-based hierarchical RCU" depends on PREEMPT help This option selects the RCU implementation that is @@ -361,11 +393,24 @@ config TINY_RCU is not required. This option greatly reduces the memory footprint of RCU. +config TINY_PREEMPT_RCU + bool "Preemptible UP-only small-memory-footprint RCU" + depends on !SMP && PREEMPT + help + This option selects the RCU implementation that is designed + for real-time UP systems. This option greatly reduces the + memory footprint of RCU. + endchoice +config PREEMPT_RCU + def_bool ( TREE_PREEMPT_RCU || TINY_PREEMPT_RCU ) + help + This option enables preemptible-RCU code that is common between + the TREE_PREEMPT_RCU and TINY_PREEMPT_RCU implementations. + config RCU_TRACE bool "Enable tracing for RCU" - depends on TREE_RCU || TREE_PREEMPT_RCU help This option provides tracing in RCU which presents stats in debugfs for debugging RCU implementation. 
@@ -383,9 +428,12 @@ config RCU_FANOUT help This option controls the fanout of hierarchical implementations of RCU, allowing RCU to work efficiently on machines with - large numbers of CPUs. This value must be at least the cube - root of NR_CPUS, which allows NR_CPUS up to 32,768 for 32-bit - systems and up to 262,144 for 64-bit systems. + large numbers of CPUs. This value must be at least the fourth + root of NR_CPUS, which allows NR_CPUS to be insanely large. + The default value of RCU_FANOUT should be used for production + systems, but if you are stress-testing the RCU implementation + itself, small RCU_FANOUT values allow you to test large-system + code paths on small(er) systems. Select a specific number if testing RCU itself. Take the default if unsure. @@ -404,6 +452,22 @@ config RCU_FANOUT_EXACT Say N if unsure. +config RCU_FAST_NO_HZ + bool "Accelerate last non-dyntick-idle CPU's grace periods" + depends on TREE_RCU && NO_HZ && SMP + default n + help + This option causes RCU to attempt to accelerate grace periods + in order to allow the final CPU to enter dynticks-idle state + more quickly. On the other hand, this option increases the + overhead of the dynticks-idle checking, particularly on systems + with large numbers of CPUs. + + Say Y if energy efficiency is critically important, particularly + if you have relatively few CPUs. + + Say N if you are unsure. + config TREE_RCU_TRACE def_bool RCU_TRACE && ( TREE_RCU || TREE_PREEMPT_RCU ) select DEBUG_FS @@ -412,6 +476,45 @@ config TREE_RCU_TRACE TREE_PREEMPT_RCU implementations, permitting Makefile to trivially select kernel/rcutree_trace.c. +config RCU_BOOST + bool "Enable RCU priority boosting" + depends on RT_MUTEXES && TINY_PREEMPT_RCU + default n + help + This option boosts the priority of preempted RCU readers that + block the current preemptible RCU grace period for too long. + This option also prevents heavy loads from blocking RCU + callback invocation for all flavors of RCU. 
+ + Say Y here if you are working with real-time apps or heavy loads. + Say N here if you are unsure. + +config RCU_BOOST_PRIO + int "Real-time priority to boost RCU readers to" + range 1 99 + depends on RCU_BOOST + default 1 + help + This option specifies the real-time priority to which preempted + RCU readers are to be boosted. If you are working with CPU-bound + real-time applications, you should specify a priority higher than + the highest-priority CPU-bound application. + + Specify the real-time priority, or take the default if unsure. + +config RCU_BOOST_DELAY + int "Milliseconds to delay boosting after RCU grace-period start" + range 0 3000 + depends on RCU_BOOST + default 500 + help + This option specifies the time to wait after the beginning of + a given grace period before priority-boosting preempted RCU + readers blocking that grace period. Note that any RCU reader + blocking an expedited RCU grace period is boosted immediately. + + Accept the default if unsure. + endmenu # "RCU Subsystem" config IKCONFIG @@ -453,59 +556,9 @@ config LOG_BUF_SHIFT config HAVE_UNSTABLE_SCHED_CLOCK bool -config GROUP_SCHED - bool "Group CPU scheduler" - depends on EXPERIMENTAL - default n - help - This feature lets CPU scheduler recognize task groups and control CPU - bandwidth allocation to such task groups. - In order to create a group from arbitrary set of processes, use - CONFIG_CGROUPS. (See Control Group support.) - -config FAIR_GROUP_SCHED - bool "Group scheduling for SCHED_OTHER" - depends on GROUP_SCHED - default GROUP_SCHED - -config RT_GROUP_SCHED - bool "Group scheduling for SCHED_RR/FIFO" - depends on EXPERIMENTAL - depends on GROUP_SCHED - default n - help - This feature lets you explicitly allocate real CPU bandwidth - to users or control groups (depending on the "Basis for grouping tasks" - setting below. If enabled, it will also make it impossible to - schedule realtime tasks for non-root users until you allocate - realtime bandwidth for them.
- See Documentation/scheduler/sched-rt-group.txt for more information. - -choice - depends on GROUP_SCHED - prompt "Basis for grouping tasks" - default USER_SCHED - -config USER_SCHED - bool "user id" - help - This option will choose userid as the basis for grouping - tasks, thus providing equal CPU bandwidth to each user. - -config CGROUP_SCHED - bool "Control groups" - depends on CGROUPS - help - This option allows you to create arbitrary task groups - using the "cgroup" pseudo filesystem and control - the cpu bandwidth allocated to each such task group. - Refer to Documentation/cgroups/cgroups.txt for more - information on "cgroup" pseudo filesystem. - -endchoice - menuconfig CGROUPS boolean "Control Group support" + depends on EVENTFD help This option adds support for grouping sets of processes together, for use with process control subsystems such as Cpusets, CFS, memory @@ -521,7 +574,6 @@ if CGROUPS config CGROUP_DEBUG bool "Example debug cgroup subsystem" - depends on CGROUPS default n help This option enables a simple cgroup subsystem that @@ -532,7 +584,6 @@ config CGROUP_DEBUG config CGROUP_NS bool "Namespace cgroup subsystem" - depends on CGROUPS help Provides a simple namespace cgroup subsystem to provide hierarchical naming of sets of namespaces, @@ -541,21 +592,18 @@ config CGROUP_NS config CGROUP_FREEZER bool "Freezer cgroup subsystem" - depends on CGROUPS help Provides a way to freeze and unfreeze all tasks in a cgroup. config CGROUP_DEVICE bool "Device controller for cgroups" - depends on CGROUPS && EXPERIMENTAL help Provides a cgroup implementing whitelists for devices which a process in the cgroup can mknod or open. 
config CPUSETS bool "Cpuset support" - depends on CGROUPS help This option will let you create and manage CPUSETs which allow dynamically partitioning a system into sets of CPUs and @@ -571,7 +619,6 @@ config PROC_PID_CPUSET config CGROUP_CPUACCT bool "Simple CPU accounting cgroup subsystem" - depends on CGROUPS help Provides a simple Resource Controller for monitoring the total CPU consumed by the tasks in a cgroup. @@ -581,11 +628,10 @@ config RESOURCE_COUNTERS help This option enables controller independent resource accounting infrastructure that works with cgroups. - depends on CGROUPS config CGROUP_MEM_RES_CTLR bool "Memory Resource Controller for Control Groups" - depends on CGROUPS && RESOURCE_COUNTERS + depends on RESOURCE_COUNTERS select MM_OWNER help Provides a memory resource controller that manages both anonymous @@ -607,8 +653,8 @@ config CGROUP_MEM_RES_CTLR could in turn add some fork/exit overhead. config CGROUP_MEM_RES_CTLR_SWAP - bool "Memory Resource Controller Swap Extension(EXPERIMENTAL)" - depends on CGROUP_MEM_RES_CTLR && SWAP && EXPERIMENTAL + bool "Memory Resource Controller Swap Extension" + depends on CGROUP_MEM_RES_CTLR && SWAP help Add swap management feature to memory resource controller. When you enable this, you can limit mem+swap usage per cgroup. In other words, @@ -623,60 +669,82 @@ config CGROUP_MEM_RES_CTLR_SWAP if boot option "noswapaccount" is set, swap will not be accounted. Now, memory usage of swap_cgroup is 2 bytes per entry. If swap page size is 4096bytes, 512k per 1Gbytes of swap. +config CGROUP_MEM_RES_CTLR_SWAP_ENABLED + bool "Memory Resource Controller Swap Extension enabled by default" + depends on CGROUP_MEM_RES_CTLR_SWAP + default y + help + Memory Resource Controller Swap Extension comes with its price in + a bigger memory consumption. 
General purpose distribution kernels + which want to enable the feature but keep it disabled by default + and let the user enable it by swapaccount boot command line + parameter should have this option unselected. + For those who want to have the feature enabled by default should + select this option (if, for some reason, they need to disable it + then noswapaccount does the trick). -endif # CGROUPS - -config MM_OWNER - bool +menuconfig CGROUP_SCHED + bool "Group CPU scheduler" + depends on EXPERIMENTAL + default n + help + This feature lets CPU scheduler recognize task groups and control CPU + bandwidth allocation to such task groups. It uses cgroups to group + tasks. -config SYSFS_DEPRECATED - bool +if CGROUP_SCHED +config FAIR_GROUP_SCHED + bool "Group scheduling for SCHED_OTHER" + depends on CGROUP_SCHED + default CGROUP_SCHED -config SYSFS_DEPRECATED_V2 - bool "enable deprecated sysfs features to support old userspace tools" - depends on SYSFS +config RT_GROUP_SCHED + bool "Group scheduling for SCHED_RR/FIFO" + depends on EXPERIMENTAL + depends on CGROUP_SCHED default n - select SYSFS_DEPRECATED - help - This option switches the layout of sysfs to the deprecated - version. Do not use it on recent distributions. - - The current sysfs layout features a unified device tree at - /sys/devices/, which is able to express a hierarchy between - class devices. If the deprecated option is set to Y, the - unified device tree is split into a bus device tree at - /sys/devices/ and several individual class device trees at - /sys/class/. The class and bus devices will be connected by - "<subsystem>:<name>" and the "device" links. The "block" - class devices, will not show up in /sys/class/block/. Some - subsystems will suppress the creation of some devices which - depend on the unified device tree. - - This option is not a pure compatibility option that can - be safely enabled on newer distributions. 
It will change the - layout of sysfs to the non-extensible deprecated version, - and disable some features, which can not be exported without - confusing older userspace tools. Since 2007/2008 all major - distributions do not enable this option, and ship no tools which - depend on the deprecated layout or this option. - - If you are using a new kernel on an older distribution, or use - older userspace tools, you might need to say Y here. Do not say Y, - if the original kernel, that came with your distribution, has - this option set to N. - -config RELAY - bool "Kernel->user space relay support (formerly relayfs)" help - This option enables support for relay interface support in - certain file systems (such as debugfs). - It is designed to provide an efficient mechanism for tools and - facilities to relay large amounts of data from kernel space to - user space. + This feature lets you explicitly allocate real CPU bandwidth + to task groups. If enabled, it will also make it impossible to + schedule realtime tasks for non-root users until you allocate + realtime bandwidth for them. + See Documentation/scheduler/sched-rt-group.txt for more information. - If unsure, say N. +endif #CGROUP_SCHED + +config BLK_CGROUP + tristate "Block IO controller" + depends on BLOCK + default n + ---help--- + Generic block IO controller cgroup interface. This is the common + cgroup interface which should be used by various IO controlling + policies. + + Currently, CFQ IO scheduler uses it to recognize task groups and + control disk bandwidth allocation (proportional time slice allocation) + to such task groups. It is also used by bio throttling logic in + block layer to implement upper limit in IO rates on a device. + + This option only enables generic Block IO controller infrastructure. + One needs to also enable actual IO controlling logic/policy. 
For + enabling proportional weight division of disk bandwidth in CFQ set + CONFIG_CFQ_GROUP_IOSCHED=y and for enabling throttling policy set + CONFIG_BLK_THROTTLE=y. + + See Documentation/cgroups/blkio-controller.txt for more information. + +config DEBUG_BLK_CGROUP + bool "Enable Block IO controller debugging" + depends on BLK_CGROUP + default n + ---help--- + Enable some debugging help. Currently it exports additional stat + files in a cgroup which can be useful for debugging. -config NAMESPACES +endif # CGROUPS + +menuconfig NAMESPACES bool "Namespaces support" if EMBEDDED default !EMBEDDED help @@ -685,48 +753,115 @@ config NAMESPACES or same user id or pid may refer to different tasks when used in different namespaces. +if NAMESPACES + config UTS_NS bool "UTS namespace" - depends on NAMESPACES + default y help In this namespace tasks see different info provided with the uname() system call config IPC_NS bool "IPC namespace" - depends on NAMESPACES && (SYSVIPC || POSIX_MQUEUE) + depends on (SYSVIPC || POSIX_MQUEUE) + default y help In this namespace tasks work with IPC ids which correspond to different IPC objects in different namespaces. config USER_NS bool "User namespace (EXPERIMENTAL)" - depends on NAMESPACES && EXPERIMENTAL + depends on EXPERIMENTAL + default y help This allows containers, i.e. vservers, to use user namespaces to provide different user info for different servers. If unsure, say N. config PID_NS - bool "PID Namespaces (EXPERIMENTAL)" - default n - depends on NAMESPACES && EXPERIMENTAL + bool "PID Namespaces" + default y help Support process id namespaces. This allows having multiple processes with the same pid as long as they are in different pid namespaces. This is a building block of containers. - Unless you want to work with an experimental feature - say N here.
- config NET_NS bool "Network namespace" - default n - depends on NAMESPACES && EXPERIMENTAL && NET + depends on NET + default y help Allow user space to create what appear to be multiple instances of the network stack. +endif # NAMESPACES + +config SCHED_AUTOGROUP + bool "Automatic process group scheduling" + select EVENTFD + select CGROUPS + select CGROUP_SCHED + select FAIR_GROUP_SCHED + help + This option optimizes the scheduler for common desktop workloads by + automatically creating and populating task groups. This separation + of workloads isolates aggressive CPU burners (like build jobs) from + desktop applications. Task group autogeneration is currently based + upon task session. + +config MM_OWNER + bool + +config SYSFS_DEPRECATED + bool "enable deprecated sysfs features to support old userspace tools" + depends on SYSFS + default n + help + This option adds code that switches the layout of the "block" class + devices, to not show up in /sys/class/block/, but only in + /sys/block/. + + This switch is only active when the sysfs.deprecated=1 boot option is + passed or the SYSFS_DEPRECATED_V2 option is set. + + This option allows new kernels to run on old distributions and tools, + which might get confused by /sys/class/block/. Since 2007/2008 all + major distributions and tools handle this just fine. + + Recent distributions and userspace tools after 2009/2010 depend on + the existence of /sys/class/block/, and will not work with this + option enabled. + + Only if you are using a new kernel on an old distribution, you might + need to say Y here. + +config SYSFS_DEPRECATED_V2 + bool "enabled deprecated sysfs features by default" + default n + depends on SYSFS + depends on SYSFS_DEPRECATED + help + Enable deprecated sysfs by default. + + See the CONFIG_SYSFS_DEPRECATED option for more details about this + option. + + Only if you are using a new kernel on an old distribution, you might + need to say Y here. 
Even then, odds are you would not need it + enabled, you can always pass the boot option if absolutely necessary. + +config RELAY + bool "Kernel->user space relay support (formerly relayfs)" + help + This option enables support for relay interface support in + certain file systems (such as debugfs). + It is designed to provide an efficient mechanism for tools and + facilities to relay large amounts of data from kernel space to + user space. + + If unsure, say N. + config BLK_DEV_INITRD bool "Initial RAM filesystem and RAM disk (initramfs/initrd) support" depends on BROKEN || !FRV @@ -961,6 +1096,7 @@ config PERF_EVENTS default y if (PROFILING || PERF_COUNTERS) depends on HAVE_PERF_EVENTS select ANON_INODES + select IRQ_WORK help Enable kernel support for various performance events provided by software and hardware. @@ -984,19 +1120,6 @@ config PERF_EVENTS Say Y if unsure. -config EVENT_PROFILE - bool "Tracepoint profiling sources" - depends on PERF_EVENTS && EVENT_TRACING - default y - help - Allow the use of tracepoints as software performance events. - - When this is enabled, you can create perf events based on - tracepoints using PERF_TYPE_TRACEPOINT and the tracepoint ID - found in debugfs://tracing/events/*/*/id. (The -e/--events - option to the perf tool can parse and interpret symbolic - tracepoints, in the subsystem:tracepoint_name format.) - config PERF_COUNTERS bool "Kernel performance counters (old config option)" depends on HAVE_PERF_EVENTS @@ -1120,7 +1243,7 @@ config MMAP_ALLOW_UNINITIALIZED See Documentation/nommu-mmap.txt for more information. config PROFILING - bool "Profiling support (EXPERIMENTAL)" + bool "Profiling support" help Say Y here to enable the extended profiling support mechanisms used by profilers such as OProfile. 
@@ -1134,30 +1257,6 @@ config TRACEPOINTS source "arch/Kconfig" -config SLOW_WORK - default n - bool - help - The slow work thread pool provides a number of dynamically allocated - threads that can be used by the kernel to perform operations that - take a relatively long time. - - An example of this would be CacheFiles doing a path lookup followed - by a series of mkdirs and a create call, all of which have to touch - disk. - - See Documentation/slow-work.txt. - -config SLOW_WORK_DEBUG - bool "Slow work debugging through debugfs" - default n - depends on SLOW_WORK && DEBUG_FS - help - Display the contents of the slow work run queue through debugfs, - including items currently executing. - - See Documentation/slow-work.txt. - endmenu # General setup config HAVE_GENERIC_DMA_COHERENT @@ -1270,4 +1369,8 @@ source "block/Kconfig" config PREEMPT_NOTIFIERS bool +config PADATA + depends on SMP + bool + source "kernel/Kconfig.locks" diff --git a/init/do_mounts.c b/init/do_mounts.c index bb008d064c1a..2b54bef33b55 100644 --- a/init/do_mounts.c +++ b/init/do_mounts.c @@ -15,6 +15,7 @@ #include <linux/initrd.h> #include <linux/async.h> #include <linux/fs_struct.h> +#include <linux/slab.h> #include <linux/nfs_fs.h> #include <linux/nfs_fs_sb.h> @@ -57,6 +58,62 @@ static int __init readwrite(char *str) __setup("ro", readonly); __setup("rw", readwrite); +#ifdef CONFIG_BLOCK +/** + * match_dev_by_uuid - callback for finding a partition using its uuid + * @dev: device passed in by the caller + * @data: opaque pointer to a 36 byte char array with a UUID + * + * Returns 1 if the device matches, and 0 otherwise. 
+ */ +static int match_dev_by_uuid(struct device *dev, void *data) +{ + u8 *uuid = data; + struct hd_struct *part = dev_to_part(dev); + + if (!part->info) + goto no_match; + + if (memcmp(uuid, part->info->uuid, sizeof(part->info->uuid))) + goto no_match; + + return 1; +no_match: + return 0; +} + + +/** + * devt_from_partuuid - looks up the dev_t of a partition by its UUID + * @uuid_str: 36 byte char array containing a hex ascii UUID + * + * The function will return the first partition which contains a matching + * UUID value in its partition_meta_info struct. This does not search + * by filesystem UUIDs. + * + * Returns the matching dev_t on success or 0 on failure. + */ +static dev_t devt_from_partuuid(char *uuid_str) +{ + dev_t res = 0; + struct device *dev = NULL; + u8 uuid[16]; + + /* Pack the requested UUID in the expected format. */ + part_pack_uuid(uuid_str, uuid); + + dev = class_find_device(&block_class, NULL, uuid, &match_dev_by_uuid); + if (!dev) + goto done; + + res = dev->devt; + put_device(dev); + +done: + return res; +} +#endif + /* * Convert a name into device number. We accept the following variants: * @@ -67,6 +124,8 @@ __setup("rw", readwrite); * of partition - device number of disk plus the partition number * 5) /dev/<disk_name>p<decimal> - same as the above, that form is * used when disk name of partitioned disk ends on a digit. + * 6) PARTUUID=00112233-4455-6677-8899-AABBCCDDEEFF representing the + * unique id of a partition if the partition table provides it. * * If name doesn't have fall into the categories above, we return (0,0). * block_class is used to check if something is a disk name.
If the disk @@ -81,6 +140,18 @@ dev_t name_to_dev_t(char *name) dev_t res = 0; int part; +#ifdef CONFIG_BLOCK + if (strncmp(name, "PARTUUID=", 9) == 0) { + name += 9; + if (strlen(name) != 36) + goto fail; + res = devt_from_partuuid(name); + if (!res) + goto fail; + goto done; + } +#endif + if (strncmp(name, "/dev/", 5) != 0) { unsigned maj, min; @@ -220,7 +291,7 @@ static int __init do_mount_root(char *name, char *fs, int flags, void *data) if (err) return err; - sys_chdir("/root"); + sys_chdir((const char __user __force *)"/root"); ROOT_DEV = current->fs->pwd.mnt->mnt_sb->s_dev; printk("VFS: Mounted root (%s filesystem)%s on device %u:%u.\n", current->fs->pwd.mnt->mnt_sb->s_type->name, @@ -290,13 +361,13 @@ out: #ifdef CONFIG_ROOT_NFS static int __init mount_nfs_root(void) { - void *data = nfs_root_data(); + char *root_dev, *root_data; - create_dev("/dev/root", ROOT_DEV); - if (data && - do_mount_root("/dev/root", "nfs", root_mountflags, data) == 0) - return 1; - return 0; + if (nfs_root_data(&root_dev, &root_data) != 0) + return 0; + if (do_mount_root(root_dev, "nfs", root_mountflags, root_data) != 0) + return 0; + return 1; } #endif @@ -417,5 +488,5 @@ void __init prepare_namespace(void) out: devtmpfs_mount("dev"); sys_mount(".", "/", NULL, MS_MOVE, NULL); - sys_chroot("."); + sys_chroot((const char __user __force *)"."); } diff --git a/init/do_mounts_initrd.c b/init/do_mounts_initrd.c index 614241b5200c..3098a38f3ae1 100644 --- a/init/do_mounts_initrd.c +++ b/init/do_mounts_initrd.c @@ -24,17 +24,14 @@ static int __init no_initrd(char *str) __setup("noinitrd", no_initrd); -static int __init do_linuxrc(void * shell) +static int __init do_linuxrc(void *_shell) { - static char *argv[] = { "linuxrc", NULL, }; - extern char * envp_init[]; + static const char *argv[] = { "linuxrc", NULL, }; + extern const char *envp_init[]; + const char *shell = _shell; sys_close(old_fd);sys_close(root_fd); - sys_close(0);sys_close(1);sys_close(2); sys_setsid(); - (void) 
sys_open("/dev/console",O_RDWR,0); - (void) sys_dup(0); - (void) sys_dup(0); return kernel_execve(shell, argv, envp_init); } diff --git a/init/do_mounts_md.c b/init/do_mounts_md.c index 69aebbf8fd2d..32c4799b8c91 100644 --- a/init/do_mounts_md.c +++ b/init/do_mounts_md.c @@ -283,7 +283,7 @@ static void __init autodetect_raid(void) wait_for_device_probe(); - fd = sys_open("/dev/md0", 0, 0); + fd = sys_open((const char __user __force *) "/dev/md0", 0, 0); if (fd >= 0) { sys_ioctl(fd, RAID_AUTORUN, raid_autopart); sys_close(fd); diff --git a/init/do_mounts_rd.c b/init/do_mounts_rd.c index 027a402708de..6e1ee6987c78 100644 --- a/init/do_mounts_rd.c +++ b/init/do_mounts_rd.c @@ -7,6 +7,7 @@ #include <linux/cramfs_fs.h> #include <linux/initrd.h> #include <linux/string.h> +#include <linux/slab.h> #include "do_mounts.h" #include "../fs/squashfs/squashfs_fs.h" @@ -167,7 +168,7 @@ int __init rd_load_image(char *from) char rotator[4] = { '|' , '/' , '-' , '\\' }; #endif - out_fd = sys_open("/dev/ram", O_RDWR, 0); + out_fd = sys_open((const char __user __force *) "/dev/ram", O_RDWR, 0); if (out_fd < 0) goto out; @@ -266,7 +267,7 @@ noclose_input: sys_close(out_fd); out: kfree(buf); - sys_unlink("/dev/ram"); + sys_unlink((const char __user __force *) "/dev/ram"); return res; } diff --git a/init/initramfs.c b/init/initramfs.c index b37d34beb90b..2531811d42cb 100644 --- a/init/initramfs.c +++ b/init/initramfs.c @@ -457,7 +457,8 @@ static char * __init unpack_to_rootfs(char *buf, unsigned len) compress_name); message = msg_buf; } - } + } else + error("junk in compressed archive"); if (state != Reset) error("junk in compressed archive"); this_header = saved_offset + my_inptr; @@ -482,7 +483,8 @@ static int __init retain_initrd_param(char *str) } __setup("retain_initrd", retain_initrd_param); -extern char __initramfs_start[], __initramfs_end[]; +extern char __initramfs_start[]; +extern unsigned long __initramfs_size; #include <linux/initrd.h> #include <linux/kexec.h> @@ -525,9 
+527,9 @@ static void __init clean_rootfs(void) int fd; void *buf; struct linux_dirent64 *dirp; - int count; + int num; - fd = sys_open("/", O_RDONLY, 0); + fd = sys_open((const char __user __force *) "/", O_RDONLY, 0); WARN_ON(fd < 0); if (fd < 0) return; @@ -539,9 +541,9 @@ static void __init clean_rootfs(void) } dirp = buf; - count = sys_getdents64(fd, dirp, BUF_SIZE); - while (count > 0) { - while (count > 0) { + num = sys_getdents64(fd, dirp, BUF_SIZE); + while (num > 0) { + while (num > 0) { struct stat st; int ret; @@ -554,12 +556,12 @@ static void __init clean_rootfs(void) sys_unlink(dirp->d_name); } - count -= dirp->d_reclen; + num -= dirp->d_reclen; dirp = (void *)dirp + dirp->d_reclen; } dirp = buf; memset(buf, 0, BUF_SIZE); - count = sys_getdents64(fd, dirp, BUF_SIZE); + num = sys_getdents64(fd, dirp, BUF_SIZE); } sys_close(fd); @@ -569,8 +571,7 @@ static void __init clean_rootfs(void) static int __init populate_rootfs(void) { - char *err = unpack_to_rootfs(__initramfs_start, - __initramfs_end - __initramfs_start); + char *err = unpack_to_rootfs(__initramfs_start, __initramfs_size); if (err) panic(err); /* Failed to decompress INTERNAL initramfs */ if (initrd_start) { @@ -584,12 +585,12 @@ static int __init populate_rootfs(void) return 0; } else { clean_rootfs(); - unpack_to_rootfs(__initramfs_start, - __initramfs_end - __initramfs_start); + unpack_to_rootfs(__initramfs_start, __initramfs_size); } printk(KERN_INFO "rootfs image is not initramfs (%s)" "; looks like an initrd\n", err); - fd = sys_open("/initrd.image", O_WRONLY|O_CREAT, 0700); + fd = sys_open((const char __user __force *) "/initrd.image", + O_WRONLY|O_CREAT, 0700); if (fd >= 0) { sys_write(fd, (char *)initrd_start, initrd_end - initrd_start); diff --git a/init/main.c b/init/main.c index dac44a9356a5..00799c1d4628 100644 --- a/init/main.c +++ b/init/main.c @@ -20,12 +20,10 @@ #include <linux/delay.h> #include <linux/ioport.h> #include <linux/init.h> -#include <linux/smp_lock.h> #include 
<linux/initrd.h> #include <linux/bootmem.h> #include <linux/acpi.h> #include <linux/tty.h> -#include <linux/gfp.h> #include <linux/percpu.h> #include <linux/kmod.h> #include <linux/vmalloc.h> @@ -33,7 +31,6 @@ #include <linux/start_kernel.h> #include <linux/security.h> #include <linux/smp.h> -#include <linux/workqueue.h> #include <linux/profile.h> #include <linux/rcupdate.h> #include <linux/moduleparam.h> @@ -63,13 +60,14 @@ #include <linux/sched.h> #include <linux/signal.h> #include <linux/idr.h> +#include <linux/kgdb.h> #include <linux/ftrace.h> #include <linux/async.h> #include <linux/kmemcheck.h> -#include <linux/kmemtrace.h> #include <linux/sfi.h> #include <linux/shmem_fs.h> -#include <trace/boot.h> +#include <linux/slab.h> +#include <linux/perf_event.h> #include <asm/io.h> #include <asm/bugs.h> @@ -124,7 +122,9 @@ static char *ramdisk_execute_command; #ifdef CONFIG_SMP /* Setup configured maximum number of CPUs to activate */ -unsigned int __initdata setup_max_cpus = NR_CPUS; +unsigned int setup_max_cpus = NR_CPUS; +EXPORT_SYMBOL(setup_max_cpus); + /* * Setup routine for controlling SMP activation @@ -149,6 +149,20 @@ static int __init nosmp(char *str) early_param("nosmp", nosmp); +/* this is hard limit */ +static int __init nrcpus(char *str) +{ + int nr_cpus; + + get_option(&str, &nr_cpus); + if (nr_cpus > 0 && nr_cpus < nr_cpu_ids) + nr_cpu_ids = nr_cpus; + + return 0; +} + +early_param("nr_cpus", nrcpus); + static int __init maxcpus(char *str) { get_option(&str, &setup_max_cpus); @@ -160,7 +174,7 @@ static int __init maxcpus(char *str) early_param("maxcpus", maxcpus); #else -const unsigned int setup_max_cpus = NR_CPUS; +static const unsigned int setup_max_cpus = NR_CPUS; #endif /* @@ -183,15 +197,15 @@ static int __init set_reset_devices(char *str) __setup("reset_devices", set_reset_devices); -static char * argv_init[MAX_INIT_ARGS+2] = { "init", NULL, }; -char * envp_init[MAX_INIT_ENVS+2] = { "HOME=/", "TERM=linux", NULL, }; +static const char * 
argv_init[MAX_INIT_ARGS+2] = { "init", NULL, }; +const char * envp_init[MAX_INIT_ENVS+2] = { "HOME=/", "TERM=linux", NULL, }; static const char *panic_later, *panic_param; -extern struct obs_kernel_param __setup_start[], __setup_end[]; +extern const struct obs_kernel_param __setup_start[], __setup_end[]; static int __init obsolete_checksetup(char *line) { - struct obs_kernel_param *p; + const struct obs_kernel_param *p; int had_early_param = 0; p = __setup_start; @@ -407,17 +421,25 @@ static void __init setup_command_line(char *command_line) * gcc-3.4 accidentally inlines this function, so use noinline. */ +static __initdata DECLARE_COMPLETION(kthreadd_done); + static noinline void __init_refok rest_init(void) - __releases(kernel_lock) { int pid; rcu_scheduler_starting(); + /* + * We need to spawn init first so that it obtains pid 1, however + * the init task will end up wanting to create kthreads, which, if + * we schedule it before we create kthreadd, will OOPS. + */ kernel_thread(kernel_init, NULL, CLONE_FS | CLONE_SIGHAND); numa_default_policy(); pid = kernel_thread(kthreadd, NULL, CLONE_FS | CLONE_FILES); + rcu_read_lock(); kthreadd_task = find_task_by_pid_ns(pid, &init_pid_ns); - unlock_kernel(); + rcu_read_unlock(); + complete(&kthreadd_done); /* * The boot idle thread must execute schedule() @@ -435,7 +457,7 @@ static noinline void __init_refok rest_init(void) /* Check for early params. 
*/ static int __init do_early_param(char *param, char *val) { - struct obs_kernel_param *p; + const struct obs_kernel_param *p; for (p = __setup_start; p < __setup_end; p++) { if ((p->early && strcmp(param, p->str) == 0) || @@ -505,6 +527,7 @@ static void __init mm_init(void) page_cgroup_init_flatmem(); mem_init(); kmem_cache_init(); + percpu_init_late(); pgtable_cache_init(); vmalloc_init(); } @@ -512,7 +535,7 @@ static void __init mm_init(void) asmlinkage void __init start_kernel(void) { char * command_line; - extern struct kernel_param __start___param[], __stop___param[]; + extern const struct kernel_param __start___param[], __stop___param[]; smp_setup_processor_id(); @@ -532,13 +555,11 @@ asmlinkage void __init start_kernel(void) local_irq_disable(); early_boot_irqs_off(); - early_init_irq_lock_class(); /* * Interrupts are still disabled. Do necessary setups, then * enable them */ - lock_kernel(); tick_init(); boot_cpu_init(); page_address_init(); @@ -550,7 +571,7 @@ asmlinkage void __init start_kernel(void) setup_per_cpu_areas(); smp_prepare_boot_cpu(); /* arch-specific boot-cpu hooks */ - build_all_zonelists(); + build_all_zonelists(NULL); page_alloc_init(); printk(KERN_NOTICE "Kernel command line: %s\n", boot_command_line); @@ -583,7 +604,10 @@ asmlinkage void __init start_kernel(void) "enabled *very* early, fixing it\n"); local_irq_disable(); } + idr_init_cache(); + perf_event_init(); rcu_init(); + radix_tree_init(); /* init some links before init_ISA_irqs() */ early_irq_init(); init_IRQ(); @@ -601,7 +625,7 @@ asmlinkage void __init start_kernel(void) local_irq_enable(); /* Interrupts are enabled now so all GFP allocations are safe. 
*/ - set_gfp_allowed_mask(__GFP_BITS_MASK); + gfp_allowed_mask = __GFP_BITS_MASK; kmem_cache_init_late(); @@ -635,10 +659,8 @@ asmlinkage void __init start_kernel(void) #endif page_cgroup_init(); enable_debug_pagealloc(); - kmemtrace_init(); kmemleak_init(); debug_objects_mem_init(); - idr_init_cache(); setup_per_cpu_pageset(); numa_policy_init(); if (late_time_init) @@ -658,8 +680,8 @@ asmlinkage void __init start_kernel(void) buffer_init(); key_init(); security_init(); + dbg_late_init(); vfs_caches_init(totalram_pages); - radix_tree_init(); signals_init(); /* rootfs populating might need page-writeback */ page_writeback_init(); @@ -697,38 +719,39 @@ int initcall_debug; core_param(initcall_debug, initcall_debug, bool, 0644); static char msgbuf[64]; -static struct boot_trace_call call; -static struct boot_trace_ret ret; -int do_one_initcall(initcall_t fn) +static int __init_or_module do_one_initcall_debug(initcall_t fn) { - int count = preempt_count(); ktime_t calltime, delta, rettime; + unsigned long long duration; + int ret; + + printk(KERN_DEBUG "calling %pF @ %i\n", fn, task_pid_nr(current)); + calltime = ktime_get(); + ret = fn(); + rettime = ktime_get(); + delta = ktime_sub(rettime, calltime); + duration = (unsigned long long) ktime_to_ns(delta) >> 10; + printk(KERN_DEBUG "initcall %pF returned %d after %lld usecs\n", fn, + ret, duration); + + return ret; +} - if (initcall_debug) { - call.caller = task_pid_nr(current); - printk("calling %pF @ %i\n", fn, call.caller); - calltime = ktime_get(); - trace_boot_call(&call, fn); - enable_boot_trace(); - } - - ret.result = fn(); +int __init_or_module do_one_initcall(initcall_t fn) +{ + int count = preempt_count(); + int ret; - if (initcall_debug) { - disable_boot_trace(); - rettime = ktime_get(); - delta = ktime_sub(rettime, calltime); - ret.duration = (unsigned long long) ktime_to_ns(delta) >> 10; - trace_boot_ret(&ret, fn); - printk("initcall %pF returned %d after %Ld usecs\n", fn, - ret.result, ret.duration); - } 
+ if (initcall_debug) + ret = do_one_initcall_debug(fn); + else + ret = fn(); msgbuf[0] = 0; - if (ret.result && ret.result != -ENODEV && initcall_debug) - sprintf(msgbuf, "error code %d ", ret.result); + if (ret && ret != -ENODEV && initcall_debug) + sprintf(msgbuf, "error code %d ", ret); if (preempt_count() != count) { strlcat(msgbuf, "preemption imbalance ", sizeof(msgbuf)); @@ -742,7 +765,7 @@ int do_one_initcall(initcall_t fn) printk("initcall %pF returned with %s\n", fn, msgbuf); } - return ret.result; + return ret; } @@ -754,9 +777,6 @@ static void __init do_initcalls(void) for (fn = __early_initcall_end; fn < __initcall_end; fn++) do_one_initcall(*fn); - - /* Make sure there is no pending stuff from the initcall sequence */ - flush_scheduled_work(); } /* @@ -768,7 +788,6 @@ static void __init do_initcalls(void) */ static void __init do_basic_setup(void) { - init_workqueues(); cpuset_init_smp(); usermodehelper_init(); init_tmpfs(); @@ -786,7 +805,7 @@ static void __init do_pre_smp_initcalls(void) do_one_initcall(*fn); } -static void run_init_process(char *init_filename) +static void run_init_process(const char *init_filename) { argv_init[0] = init_filename; kernel_execve(init_filename, argv_init, envp_init); @@ -796,21 +815,14 @@ static void run_init_process(char *init_filename) * makes it inline to init() and it becomes part of init.text section */ static noinline int init_post(void) - __releases(kernel_lock) { /* need to finish all async __init code before freeing the memory */ async_synchronize_full(); free_initmem(); - unlock_kernel(); mark_rodata_ro(); system_state = SYSTEM_RUNNING; numa_default_policy(); - if (sys_open((const char __user *) "/dev/console", O_RDWR, 0) < 0) - printk(KERN_WARNING "Warning: unable to open an initial console.\n"); - - (void) sys_dup(0); - (void) sys_dup(0); current->signal->flags |= SIGNAL_UNKILLABLE; @@ -836,17 +848,20 @@ static noinline int init_post(void) run_init_process("/bin/init"); run_init_process("/bin/sh"); - 
panic("No init found. Try passing init= option to kernel."); + panic("No init found. Try passing init= option to kernel. " + "See Linux Documentation/init.txt for guidance."); } static int __init kernel_init(void * unused) { - lock_kernel(); - + /* + * Wait until kthreadd is all set-up. + */ + wait_for_completion(&kthreadd_done); /* * init can allocate pages on any node */ - set_mems_allowed(node_possible_map); + set_mems_allowed(node_states[N_HIGH_MEMORY]); /* * init can run on any cpu. */ @@ -866,13 +881,19 @@ static int __init kernel_init(void * unused) smp_prepare_cpus(setup_max_cpus); do_pre_smp_initcalls(); - start_boot_trace(); + lockup_detector_init(); smp_init(); sched_init_smp(); do_basic_setup(); + /* Open the /dev/console on the rootfs, this should never fail */ + if (sys_open((const char __user *) "/dev/console", O_RDWR, 0) < 0) + printk(KERN_WARNING "Warning: unable to open an initial console.\n"); + + (void) sys_dup(0); + (void) sys_dup(0); /* * check if there is an early userspace init. If yes, let it do all * the work diff --git a/init/noinitramfs.c b/init/noinitramfs.c index f4c1a3a1b8c5..267739d85179 100644 --- a/init/noinitramfs.c +++ b/init/noinitramfs.c @@ -29,17 +29,17 @@ static int __init default_rootfs(void) { int err; - err = sys_mkdir("/dev", 0755); + err = sys_mkdir((const char __user __force *) "/dev", 0755); if (err < 0) goto out; - err = sys_mknod((const char __user *) "/dev/console", + err = sys_mknod((const char __user __force *) "/dev/console", S_IFCHR | S_IRUSR | S_IWUSR, new_encode_dev(MKDEV(5, 1))); if (err < 0) goto out; - err = sys_mkdir("/root", 0700); + err = sys_mkdir((const char __user __force *) "/root", 0700); if (err < 0) goto out; |