From c643401218be0f4ab3522e0c0a63016596d6e9ca Mon Sep 17 00:00:00 2001 From: Roman Gushchin Date: Fri, 17 Nov 2017 15:26:45 -0800 Subject: proc, coredump: add CoreDumping flag to /proc/pid/status Right now there is no convenient way to check if a process is being coredumped at the moment. It might be necessary to recognize such state to prevent killing the process and getting a broken coredump. Writing a large core might take significant time, and the process is unresponsive during it, so it might be killed by timeout, if another process is monitoring and killing/restarting hanging tasks. We're getting a significant number of corrupted coredump files on machines in our fleet, just because processes are being killed by timeout in the middle of the core writing process. We do have a process health check, and some agent is responsible for restarting processes which are not responding for health check requests. Writing a large coredump to the disk can easily exceed the reasonable timeout (especially on an overloaded machine). This flag will allow the agent to distinguish processes which are being coredumped, extend the timeout for them, and let them produce a full coredump file. To provide an ability to detect if a process is in the state of being coredumped, we can expose a boolean CoreDumping flag in /proc/pid/status. Example: $ cat core.sh #!/bin/sh echo "|/usr/bin/sleep 10" > /proc/sys/kernel/core_pattern sleep 1000 & PID=$! 
cat /proc/$PID/status | grep CoreDumping kill -ABRT $PID sleep 1 cat /proc/$PID/status | grep CoreDumping $ ./core.sh CoreDumping: 0 CoreDumping: 1 [guro@fb.com: document CoreDumping flag in /proc//status] Link: http://lkml.kernel.org/r/20170928135357.GA8470@castle.DHCP.thefacebook.com Link: http://lkml.kernel.org/r/20170920230634.31572-1-guro@fb.com Signed-off-by: Roman Gushchin Cc: Alexander Viro Cc: Ingo Molnar Cc: Konstantin Khlebnikov Cc: Oleg Nesterov Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- fs/proc/array.c | 6 ++++++ 1 file changed, 6 insertions(+) (limited to 'fs') diff --git a/fs/proc/array.c b/fs/proc/array.c index 6f6fc1672ad1..79375fc115d2 100644 --- a/fs/proc/array.c +++ b/fs/proc/array.c @@ -366,6 +366,11 @@ static void task_cpus_allowed(struct seq_file *m, struct task_struct *task) cpumask_pr_args(&task->cpus_allowed)); } +static inline void task_core_dumping(struct seq_file *m, struct mm_struct *mm) +{ + seq_printf(m, "CoreDumping:\t%d\n", !!mm->core_state); +} + int proc_pid_status(struct seq_file *m, struct pid_namespace *ns, struct pid *pid, struct task_struct *task) { @@ -376,6 +381,7 @@ int proc_pid_status(struct seq_file *m, struct pid_namespace *ns, if (mm) { task_mem(m, mm); + task_core_dumping(m, mm); mmput(mm); } task_sig(m, task); -- cgit v1.2.3 From 3ee2a19908f27b8fea8ff14ffa8b755585eb7b4a Mon Sep 17 00:00:00 2001 From: Alexey Dobriyan Date: Fri, 17 Nov 2017 15:26:49 -0800 Subject: proc: : uninline name_to_int() Save ~360 bytes. 
add/remove: 1/0 grow/shrink: 0/4 up/down: 104/-463 (-359) function old new delta name_to_int - 104 +104 proc_pid_lookup 217 126 -91 proc_lookupfd_common 212 121 -91 proc_task_lookup 289 194 -95 __proc_create 588 402 -186 Link: http://lkml.kernel.org/r/20170912194850.GA17730@avx2 Signed-off-by: Alexey Dobriyan Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- fs/proc/Makefile | 1 + fs/proc/internal.h | 23 +---------------------- fs/proc/util.c | 23 +++++++++++++++++++++++ 3 files changed, 25 insertions(+), 22 deletions(-) create mode 100644 fs/proc/util.c (limited to 'fs') diff --git a/fs/proc/Makefile b/fs/proc/Makefile index f7456c4e7d0f..ead487e80510 100644 --- a/fs/proc/Makefile +++ b/fs/proc/Makefile @@ -21,6 +21,7 @@ proc-y += loadavg.o proc-y += meminfo.o proc-y += stat.o proc-y += uptime.o +proc-y += util.o proc-y += version.o proc-y += softirqs.o proc-y += namespaces.o diff --git a/fs/proc/internal.h b/fs/proc/internal.h index a34195e92b20..9aad373cf11d 100644 --- a/fs/proc/internal.h +++ b/fs/proc/internal.h @@ -103,28 +103,7 @@ static inline struct task_struct *get_proc_task(struct inode *inode) void task_dump_owner(struct task_struct *task, mode_t mode, kuid_t *ruid, kgid_t *rgid); -static inline unsigned name_to_int(const struct qstr *qstr) -{ - const char *name = qstr->name; - int len = qstr->len; - unsigned n = 0; - - if (len > 1 && *name == '0') - goto out; - while (len-- > 0) { - unsigned c = *name++ - '0'; - if (c > 9) - goto out; - if (n >= (~0U-9)/10) - goto out; - n *= 10; - n += c; - } - return n; -out: - return ~0U; -} - +unsigned name_to_int(const struct qstr *qstr); /* * Offset of the first process in the /proc root directory.. 
*/ diff --git a/fs/proc/util.c b/fs/proc/util.c new file mode 100644 index 000000000000..c29aa497394b --- /dev/null +++ b/fs/proc/util.c @@ -0,0 +1,23 @@ +#include + +unsigned name_to_int(const struct qstr *qstr) +{ + const char *name = qstr->name; + int len = qstr->len; + unsigned n = 0; + + if (len > 1 && *name == '0') + goto out; + while (len-- > 0) { + unsigned c = *name++ - '0'; + if (c > 9) + goto out; + if (n >= (~0U-9)/10) + goto out; + n *= 10; + n += c; + } + return n; +out: + return ~0U; +} -- cgit v1.2.3 From 0746a0bc6e6e76444098cf944848554d21d28cae Mon Sep 17 00:00:00 2001 From: Alexey Dobriyan Date: Fri, 17 Nov 2017 15:26:52 -0800 Subject: proc: use do-while in name_to_int() Gcc doesn't know that "len" is guaranteed to be >=1 by dcache and generates standard while-loop prologue duplicating loop condition. add/remove: 0/0 grow/shrink: 0/1 up/down: 0/-27 (-27) function old new delta name_to_int 104 77 -27 Link: http://lkml.kernel.org/r/20170912195213.GB17730@avx2 Signed-off-by: Alexey Dobriyan Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- fs/proc/util.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'fs') diff --git a/fs/proc/util.c b/fs/proc/util.c index c29aa497394b..b161cfa0f9fa 100644 --- a/fs/proc/util.c +++ b/fs/proc/util.c @@ -8,7 +8,7 @@ unsigned name_to_int(const struct qstr *qstr) if (len > 1 && *name == '0') goto out; - while (len-- > 0) { + do { unsigned c = *name++ - '0'; if (c > 9) goto out; @@ -16,7 +16,7 @@ unsigned name_to_int(const struct qstr *qstr) goto out; n *= 10; n += c; - } + } while (--len > 0); return n; out: return ~0U; -- cgit v1.2.3 From 2ae928a9441a3b5f13952e1e8a97d03cb23ea603 Mon Sep 17 00:00:00 2001 From: Shakeel Butt Date: Fri, 17 Nov 2017 15:28:59 -0800 Subject: epoll: account epitem and eppoll_entry to kmemcg A userspace application can directly trigger the allocations from eventpoll_epi and eventpoll_pwq slabs. 
A buggy or malicious application can consume a significant amount of system memory by triggering such allocations. Indeed we have seen in production where a buggy application was leaking the epoll references and causing a burst of eventpoll_epi and eventpoll_pwq slab allocations. This patch opts in to the charging of eventpoll_epi and eventpoll_pwq slabs. There is a per-user limit (~4% of total memory if no highmem) on these caches. I think it is too generous particularly in the scenario where jobs of multiple users are running on the system and the administrator is reducing cost by overcommitting the memory. This is unaccounted kernel memory and will not be considered by the oom-killer. I think by accounting it to kmemcg, for systems with kmem accounting enabled, we can provide better isolation between jobs of different users. Link: http://lkml.kernel.org/r/20171003021519.23907-1-shakeelb@google.com Signed-off-by: Shakeel Butt Acked-by: Michal Hocko Cc: Alexander Viro Cc: Vladimir Davydov Cc: Johannes Weiner Cc: Greg Thelen Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- fs/eventpoll.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'fs') diff --git a/fs/eventpoll.c b/fs/eventpoll.c index 2fabd19cdeea..a45360444895 100644 --- a/fs/eventpoll.c +++ b/fs/eventpoll.c @@ -2329,11 +2329,11 @@ static int __init eventpoll_init(void) /* Allocates slab cache used to allocate "struct epitem" items */ epi_cache = kmem_cache_create("eventpoll_epi", sizeof(struct epitem), - 0, SLAB_HWCACHE_ALIGN | SLAB_PANIC, NULL); + 0, SLAB_HWCACHE_ALIGN|SLAB_PANIC|SLAB_ACCOUNT, NULL); /* Allocates slab cache used to allocate "struct eppoll_entry" */ pwq_cache = kmem_cache_create("eventpoll_pwq", - sizeof(struct eppoll_entry), 0, SLAB_PANIC, NULL); + sizeof(struct eppoll_entry), 0, SLAB_PANIC|SLAB_ACCOUNT, NULL); return 0; } -- cgit v1.2.3 From 57a173bdf5baab48e8e78825c7366c634acd087c Mon Sep 17 00:00:00 2001 From: Jason Baron Date: Fri, 17 Nov 2017 15:29:02 
-0800 Subject: epoll: avoid calling ep_call_nested() from ep_poll_safewake() ep_poll_safewake() is used to wake up potentially nested epoll file descriptors. The function uses ep_call_nested() to prevent entering the same wake up queue more than once, and to prevent excessively deep wakeup paths (deeper than EP_MAX_NESTS). However, this is not necessary since we are already preventing these conditions during EPOLL_CTL_ADD. This saves extra function calls, and avoids taking a global lock during the ep_call_nested() calls. I have, however, left ep_call_nested() for the CONFIG_DEBUG_LOCK_ALLOC case, since ep_call_nested() keeps track of the nesting level, and this is required by the call to spin_lock_irqsave_nested(). It would be nice to remove the ep_call_nested() calls for the CONFIG_DEBUG_LOCK_ALLOC case as well; however, it's not clear how to simply pass the nesting level through multiple wake_up() levels without more surgery. In any case, I don't think CONFIG_DEBUG_LOCK_ALLOC is generally used for production. This patch also apparently fixes a workload at Google that Salman Qazi reported by completely removing the poll_safewake_ncalls->lock from wakeup paths. 
Link: http://lkml.kernel.org/r/1507920533-8812-1-git-send-email-jbaron@akamai.com Signed-off-by: Jason Baron Acked-by: Davidlohr Bueso Cc: Alexander Viro Cc: Salman Qazi Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- fs/eventpoll.c | 47 ++++++++++++++++++----------------------------- 1 file changed, 18 insertions(+), 29 deletions(-) (limited to 'fs') diff --git a/fs/eventpoll.c b/fs/eventpoll.c index a45360444895..dc15bb02ee2a 100644 --- a/fs/eventpoll.c +++ b/fs/eventpoll.c @@ -276,9 +276,6 @@ static DEFINE_MUTEX(epmutex); /* Used to check for epoll file descriptor inclusion loops */ static struct nested_calls poll_loop_ncalls; -/* Used for safe wake up implementation */ -static struct nested_calls poll_safewake_ncalls; - /* Used to call file's f_op->poll() under the nested calls boundaries */ static struct nested_calls poll_readywalk_ncalls; @@ -551,40 +548,21 @@ out_unlock: * this special case of epoll. */ #ifdef CONFIG_DEBUG_LOCK_ALLOC -static inline void ep_wake_up_nested(wait_queue_head_t *wqueue, - unsigned long events, int subclass) + +static struct nested_calls poll_safewake_ncalls; + +static int ep_poll_wakeup_proc(void *priv, void *cookie, int call_nests) { unsigned long flags; + wait_queue_head_t *wqueue = (wait_queue_head_t *)cookie; - spin_lock_irqsave_nested(&wqueue->lock, flags, subclass); - wake_up_locked_poll(wqueue, events); + spin_lock_irqsave_nested(&wqueue->lock, flags, call_nests + 1); + wake_up_locked_poll(wqueue, POLLIN); spin_unlock_irqrestore(&wqueue->lock, flags); -} -#else -static inline void ep_wake_up_nested(wait_queue_head_t *wqueue, - unsigned long events, int subclass) -{ - wake_up_poll(wqueue, events); -} -#endif -static int ep_poll_wakeup_proc(void *priv, void *cookie, int call_nests) -{ - ep_wake_up_nested((wait_queue_head_t *) cookie, POLLIN, - 1 + call_nests); return 0; } -/* - * Perform a safe wake up of the poll wait list. 
The problem is that - * with the new callback'd wake up system, it is possible that the - * poll callback is reentered from inside the call to wake_up() done - * on the poll wait queue head. The rule is that we cannot reenter the - * wake up code from the same task more than EP_MAX_NESTS times, - * and we cannot reenter the same wait queue head at all. This will - * enable to have a hierarchy of epoll file descriptor of no more than - * EP_MAX_NESTS deep. - */ static void ep_poll_safewake(wait_queue_head_t *wq) { int this_cpu = get_cpu(); @@ -595,6 +573,15 @@ static void ep_poll_safewake(wait_queue_head_t *wq) put_cpu(); } +#else + +static void ep_poll_safewake(wait_queue_head_t *wq) +{ + wake_up_poll(wq, POLLIN); +} + +#endif + static void ep_remove_wait_queue(struct eppoll_entry *pwq) { wait_queue_head_t *whead; @@ -2315,8 +2302,10 @@ static int __init eventpoll_init(void) */ ep_nested_calls_init(&poll_loop_ncalls); +#ifdef CONFIG_DEBUG_LOCK_ALLOC /* Initialize the structure used to perform safe poll wait head wake ups */ ep_nested_calls_init(&poll_safewake_ncalls); +#endif /* Initialize the structure used to perform file's f_op->poll() calls */ ep_nested_calls_init(&poll_readywalk_ncalls); -- cgit v1.2.3 From 37b5e5212a448bac0fe29d2a51f088014fbaaa41 Mon Sep 17 00:00:00 2001 From: Jason Baron Date: Fri, 17 Nov 2017 15:29:06 -0800 Subject: epoll: remove ep_call_nested() from ep_eventpoll_poll() The use of ep_call_nested() in ep_eventpoll_poll(), which is the .poll routine for an epoll fd, is used to prevent excessively deep epoll nesting, and to prevent circular paths. However, we are already preventing these conditions during EPOLL_CTL_ADD. In terms of too deep epoll chains, we do in fact allow deep nesting of the epoll fds themselves (deeper than EP_MAX_NESTS), however we don't allow more than EP_MAX_NESTS when an epoll file descriptor is actually connected to a wakeup source. 
Thus, we do not require the use of ep_call_nested(), since ep_eventpoll_poll(), which is called via ep_scan_ready_list() only continues nesting if there are events available. Since ep_call_nested() is implemented using a global lock, applications that make use of nested epoll can see large performance improvements with this change. Davidlohr said: : Improvements are quite obscene actually, such as for the following : epoll_wait() benchmark with 2 level nesting on a 80 core IvyBridge: : : ncpus vanilla dirty delta : 1 2447092 3028315 +23.75% : 4 231265 2986954 +1191.57% : 8 121631 2898796 +2283.27% : 16 59749 2902056 +4757.07% : 32 26837 2326314 +8568.30% : 64 12926 1341281 +10276.61% : : (http://linux-scalability.org/epoll/epoll-test.c) Link: http://lkml.kernel.org/r/1509430214-5599-1-git-send-email-jbaron@akamai.com Signed-off-by: Jason Baron Cc: Davidlohr Bueso Cc: Alexander Viro Cc: Salman Qazi Cc: Hou Tao Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- fs/eventpoll.c | 80 +++++++++++++++++++++++++--------------------------------- 1 file changed, 35 insertions(+), 45 deletions(-) (limited to 'fs') diff --git a/fs/eventpoll.c b/fs/eventpoll.c index dc15bb02ee2a..1e048144f17c 100644 --- a/fs/eventpoll.c +++ b/fs/eventpoll.c @@ -276,9 +276,6 @@ static DEFINE_MUTEX(epmutex); /* Used to check for epoll file descriptor inclusion loops */ static struct nested_calls poll_loop_ncalls; -/* Used to call file's f_op->poll() under the nested calls boundaries */ -static struct nested_calls poll_readywalk_ncalls; - /* Slab cache used to allocate "struct epitem" */ static struct kmem_cache *epi_cache __read_mostly; @@ -867,11 +864,33 @@ static int ep_eventpoll_release(struct inode *inode, struct file *file) return 0; } -static inline unsigned int ep_item_poll(struct epitem *epi, poll_table *pt) +static int ep_read_events_proc(struct eventpoll *ep, struct list_head *head, + void *priv); +static void ep_ptable_queue_proc(struct file *file, wait_queue_head_t *whead, 
+ poll_table *pt); + +/* + * Differs from ep_eventpoll_poll() in that internal callers already have + * the ep->mtx so we need to start from depth=1, such that mutex_lock_nested() + * is correctly annotated. + */ +static unsigned int ep_item_poll(struct epitem *epi, poll_table *pt, int depth) { + struct eventpoll *ep; + bool locked; + pt->_key = epi->event.events; + if (!is_file_epoll(epi->ffd.file)) + return epi->ffd.file->f_op->poll(epi->ffd.file, pt) & + epi->event.events; - return epi->ffd.file->f_op->poll(epi->ffd.file, pt) & epi->event.events; + ep = epi->ffd.file->private_data; + poll_wait(epi->ffd.file, &ep->poll_wait, pt); + locked = pt && (pt->_qproc == ep_ptable_queue_proc); + + return ep_scan_ready_list(epi->ffd.file->private_data, + ep_read_events_proc, &depth, depth, + locked) & epi->event.events; } static int ep_read_events_proc(struct eventpoll *ep, struct list_head *head, @@ -879,13 +898,15 @@ static int ep_read_events_proc(struct eventpoll *ep, struct list_head *head, { struct epitem *epi, *tmp; poll_table pt; + int depth = *(int *)priv; init_poll_funcptr(&pt, NULL); + depth++; list_for_each_entry_safe(epi, tmp, head, rdllink) { - if (ep_item_poll(epi, &pt)) + if (ep_item_poll(epi, &pt, depth)) { return POLLIN | POLLRDNORM; - else { + } else { /* * Item has been dropped into the ready list by the poll * callback, but it's not actually ready, as far as @@ -899,48 +920,20 @@ static int ep_read_events_proc(struct eventpoll *ep, struct list_head *head, return 0; } -static void ep_ptable_queue_proc(struct file *file, wait_queue_head_t *whead, - poll_table *pt); - -struct readyevents_arg { - struct eventpoll *ep; - bool locked; -}; - -static int ep_poll_readyevents_proc(void *priv, void *cookie, int call_nests) -{ - struct readyevents_arg *arg = priv; - - return ep_scan_ready_list(arg->ep, ep_read_events_proc, NULL, - call_nests + 1, arg->locked); -} - static unsigned int ep_eventpoll_poll(struct file *file, poll_table *wait) { - int pollflags; struct 
eventpoll *ep = file->private_data; - struct readyevents_arg arg; - - /* - * During ep_insert() we already hold the ep->mtx for the tfile. - * Prevent re-aquisition. - */ - arg.locked = wait && (wait->_qproc == ep_ptable_queue_proc); - arg.ep = ep; + int depth = 0; /* Insert inside our poll wait queue */ poll_wait(file, &ep->poll_wait, wait); /* * Proceed to find out if wanted events are really available inside - * the ready list. This need to be done under ep_call_nested() - * supervision, since the call to f_op->poll() done on listed files - * could re-enter here. + * the ready list. */ - pollflags = ep_call_nested(&poll_readywalk_ncalls, EP_MAX_NESTS, - ep_poll_readyevents_proc, &arg, ep, current); - - return pollflags != -1 ? pollflags : 0; + return ep_scan_ready_list(ep, ep_read_events_proc, + &depth, depth, false); } #ifdef CONFIG_PROC_FS @@ -1459,7 +1452,7 @@ static int ep_insert(struct eventpoll *ep, struct epoll_event *event, * this operation completes, the poll callback can start hitting * the new item. */ - revents = ep_item_poll(epi, &epq.pt); + revents = ep_item_poll(epi, &epq.pt, 1); /* * We have to check if something went wrong during the poll wait queue @@ -1593,7 +1586,7 @@ static int ep_modify(struct eventpoll *ep, struct epitem *epi, struct epoll_even * Get current event bits. We can safely use the file* here because * its usage count has been increased by the caller of this function. 
*/ - revents = ep_item_poll(epi, &pt); + revents = ep_item_poll(epi, &pt, 1); /* * If the item is "hot" and it is not registered inside the ready @@ -1661,7 +1654,7 @@ static int ep_send_events_proc(struct eventpoll *ep, struct list_head *head, list_del_init(&epi->rdllink); - revents = ep_item_poll(epi, &pt); + revents = ep_item_poll(epi, &pt, 1); /* * If the event mask intersect the caller-requested one, @@ -2307,9 +2300,6 @@ static int __init eventpoll_init(void) ep_nested_calls_init(&poll_safewake_ncalls); #endif - /* Initialize the structure used to perform file's f_op->poll() calls */ - ep_nested_calls_init(&poll_readywalk_ncalls); - /* * We can have many thousands of epitems, so prevent this from * using an extra cache line on 64-bit (and smaller) CPUs -- cgit v1.2.3 From ecc0c469f27765ed1e2b967be0aa17cee1a60b76 Mon Sep 17 00:00:00 2001 From: NeilBrown Date: Fri, 17 Nov 2017 15:29:13 -0800 Subject: autofs: don't fail mount for transient error Currently if the autofs kernel module gets an error when writing to the pipe which links to the daemon, then it marks the whole mountpoint as catatonic, and it will stop working. It is possible that the error is transient. This can happen if the daemon is slow and more than 16 requests queue up. If a subsequent process tries to queue a request, and is then signalled, the write to the pipe will return -ERESTARTSYS and autofs will take that as total failure. So change the code to assess -ERESTARTSYS and -ENOMEM as transient failures which only abort the current request, not the whole mountpoint. It isn't a crash or a data corruption, but having autofs mountpoints suddenly stop working is rather inconvenient. Ian said: : And given the problems with a half dozen (or so) user space applications : consuming large amounts of CPU under heavy mount and umount activity this : could happen more easily than we expect. 
Link: http://lkml.kernel.org/r/87y3norvgp.fsf@notabene.neil.brown.name Signed-off-by: NeilBrown Acked-by: Ian Kent Cc: Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- fs/autofs4/waitq.c | 15 ++++++++++++++- 1 file changed, 14 insertions(+), 1 deletion(-) (limited to 'fs') diff --git a/fs/autofs4/waitq.c b/fs/autofs4/waitq.c index 4ac49d038bf3..8fc41705c7cd 100644 --- a/fs/autofs4/waitq.c +++ b/fs/autofs4/waitq.c @@ -81,7 +81,8 @@ static int autofs4_write(struct autofs_sb_info *sbi, spin_unlock_irqrestore(&current->sighand->siglock, flags); } - return (bytes > 0); + /* if 'wr' returned 0 (impossible) we assume -EIO (safe) */ + return bytes == 0 ? 0 : wr < 0 ? wr : -EIO; } static void autofs4_notify_daemon(struct autofs_sb_info *sbi, @@ -95,6 +96,7 @@ static void autofs4_notify_daemon(struct autofs_sb_info *sbi, } pkt; struct file *pipe = NULL; size_t pktsz; + int ret; pr_debug("wait id = 0x%08lx, name = %.*s, type=%d\n", (unsigned long) wq->wait_queue_token, @@ -169,7 +171,18 @@ static void autofs4_notify_daemon(struct autofs_sb_info *sbi, mutex_unlock(&sbi->wq_mutex); if (autofs4_write(sbi, pipe, &pkt, pktsz)) + switch (ret = autofs4_write(sbi, pipe, &pkt, pktsz)) { + case 0: + break; + case -ENOMEM: + case -ERESTARTSYS: + /* Just fail this one */ + autofs4_wait_release(sbi, wq->wait_queue_token, ret); + break; + default: autofs4_catatonic_mode(sbi); + break; + } fput(pipe); } -- cgit v1.2.3 From 98159d977f71c3b3dee898d1c34e56f520b094e7 Mon Sep 17 00:00:00 2001 From: Joe Lawrence Date: Fri, 17 Nov 2017 15:29:17 -0800 Subject: pipe: match pipe_max_size data type with procfs Patch series "A few round_pipe_size() and pipe-max-size fixups", v3. While backporting Michael's "pipe: fix limit handling" patchset to a distro-kernel, Mikulas noticed that current upstream pipe limit handling contains a few problems: 1 - procfs signed wrap: echo'ing a large number into /proc/sys/fs/pipe-max-size and then cat'ing it back out shows a negative value. 
2 - round_pipe_size() nr_pages overflow on 32bit: this would subsequently try roundup_pow_of_two(0), which is undefined. 3 - visible non-rounded pipe-max-size value: there is no mutual exclusion or protection between the time pipe_max_size is assigned a raw value from proc_dointvec_minmax() and when it is rounded. 4 - unsigned long -> unsigned int conversion makes for potential odd return errors from do_proc_douintvec_minmax_conv() and do_proc_dopipe_max_size_conv(). This version underwent the same testing as v1: https://marc.info/?l=linux-kernel&m=150643571406022&w=2 This patch (of 4): pipe_max_size is defined as an unsigned int: unsigned int pipe_max_size = 1048576; but its procfs/sysctl representation is an integer: static struct ctl_table fs_table[] = { ... { .procname = "pipe-max-size", .data = &pipe_max_size, .maxlen = sizeof(int), .mode = 0644, .proc_handler = &pipe_proc_fn, .extra1 = &pipe_min_size, }, ... that is signed: int pipe_proc_fn(struct ctl_table *table, int write, void __user *buf, size_t *lenp, loff_t *ppos) { ... ret = proc_dointvec_minmax(table, write, buf, lenp, ppos) This leads to signed results via procfs for large values of pipe_max_size: % echo 2147483647 >/proc/sys/fs/pipe-max-size % cat /proc/sys/fs/pipe-max-size -2147483648 Use unsigned operations on this variable to avoid such negative values. 
Link: http://lkml.kernel.org/r/1507658689-11669-2-git-send-email-joe.lawrence@redhat.com Signed-off-by: Joe Lawrence Reported-by: Mikulas Patocka Reviewed-by: Mikulas Patocka Cc: Michael Kerrisk Cc: Randy Dunlap Cc: Al Viro Cc: Jens Axboe Cc: Josh Poimboeuf Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- fs/pipe.c | 2 +- kernel/sysctl.c | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) (limited to 'fs') diff --git a/fs/pipe.c b/fs/pipe.c index 349c9d56d4b3..3909c55ed389 100644 --- a/fs/pipe.c +++ b/fs/pipe.c @@ -1125,7 +1125,7 @@ int pipe_proc_fn(struct ctl_table *table, int write, void __user *buf, { int ret; - ret = proc_dointvec_minmax(table, write, buf, lenp, ppos); + ret = proc_douintvec_minmax(table, write, buf, lenp, ppos); if (ret < 0 || !write) return ret; diff --git a/kernel/sysctl.c b/kernel/sysctl.c index 4a13a389e99b..2d42183b4c98 100644 --- a/kernel/sysctl.c +++ b/kernel/sysctl.c @@ -1816,7 +1816,7 @@ static struct ctl_table fs_table[] = { { .procname = "pipe-max-size", .data = &pipe_max_size, - .maxlen = sizeof(int), + .maxlen = sizeof(pipe_max_size), .mode = 0644, .proc_handler = &pipe_proc_fn, .extra1 = &pipe_min_size, -- cgit v1.2.3 From d3f14c485867cfb2e0c48aa88c41d0ef4bf5209c Mon Sep 17 00:00:00 2001 From: Joe Lawrence Date: Fri, 17 Nov 2017 15:29:21 -0800 Subject: pipe: avoid round_pipe_size() nr_pages overflow on 32-bit round_pipe_size() contains a right-bit-shift expression which may overflow, which would cause undefined results in a subsequent roundup_pow_of_two() call. 
static inline unsigned int round_pipe_size(unsigned int size) { unsigned long nr_pages; nr_pages = (size + PAGE_SIZE - 1) >> PAGE_SHIFT; return roundup_pow_of_two(nr_pages) << PAGE_SHIFT; } PAGE_SIZE is defined as (1UL << PAGE_SHIFT), so: - 4 bytes wide on 32-bit (0 to 0xffffffff) - 8 bytes wide on 64-bit (0 to 0xffffffffffffffff) That means that 32-bit round_pipe_size(), nr_pages may overflow to 0: size=0x00000000 nr_pages=0x0 size=0x00000001 nr_pages=0x1 size=0xfffff000 nr_pages=0xfffff size=0xfffff001 nr_pages=0x0 << ! size=0xffffffff nr_pages=0x0 << ! This is bad because roundup_pow_of_two(n) is undefined when n == 0! 64-bit is not a problem as the unsigned int size is 4 bytes wide (similar to 32-bit) and the larger, 8 byte wide unsigned long, is sufficient to handle the largest value of the bit shift expression: size=0xffffffff nr_pages=100000 Modify round_pipe_size() to return 0 if n == 0 and updates its callers to handle accordingly. Link: http://lkml.kernel.org/r/1507658689-11669-3-git-send-email-joe.lawrence@redhat.com Signed-off-by: Joe Lawrence Reported-by: Mikulas Patocka Reviewed-by: Mikulas Patocka Cc: Al Viro Cc: Jens Axboe Cc: Michael Kerrisk Cc: Randy Dunlap Cc: Josh Poimboeuf Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- fs/pipe.c | 17 +++++++++++++++-- 1 file changed, 15 insertions(+), 2 deletions(-) (limited to 'fs') diff --git a/fs/pipe.c b/fs/pipe.c index 3909c55ed389..f0f4ab36c444 100644 --- a/fs/pipe.c +++ b/fs/pipe.c @@ -1018,13 +1018,19 @@ const struct file_operations pipefifo_fops = { /* * Currently we rely on the pipe array holding a power-of-2 number - * of pages. + * of pages. Returns 0 on error. 
*/ static inline unsigned int round_pipe_size(unsigned int size) { unsigned long nr_pages; + if (size < pipe_min_size) + size = pipe_min_size; + nr_pages = (size + PAGE_SIZE - 1) >> PAGE_SHIFT; + if (nr_pages == 0) + return 0; + return roundup_pow_of_two(nr_pages) << PAGE_SHIFT; } @@ -1040,6 +1046,8 @@ static long pipe_set_size(struct pipe_inode_info *pipe, unsigned long arg) long ret = 0; size = round_pipe_size(arg); + if (size == 0) + return -EINVAL; nr_pages = size >> PAGE_SHIFT; if (!nr_pages) @@ -1123,13 +1131,18 @@ out_revert_acct: int pipe_proc_fn(struct ctl_table *table, int write, void __user *buf, size_t *lenp, loff_t *ppos) { + unsigned int rounded_pipe_max_size; int ret; ret = proc_douintvec_minmax(table, write, buf, lenp, ppos); if (ret < 0 || !write) return ret; - pipe_max_size = round_pipe_size(pipe_max_size); + rounded_pipe_max_size = round_pipe_size(pipe_max_size); + if (rounded_pipe_max_size == 0) + return -EINVAL; + + pipe_max_size = rounded_pipe_max_size; return ret; } -- cgit v1.2.3 From 7a8d181949fb2c16be00f8cdb354794a30e46b39 Mon Sep 17 00:00:00 2001 From: Joe Lawrence Date: Fri, 17 Nov 2017 15:29:24 -0800 Subject: pipe: add proc_dopipe_max_size() to safely assign pipe_max_size pipe_max_size is assigned directly via procfs sysctl: static struct ctl_table fs_table[] = { ... { .procname = "pipe-max-size", .data = &pipe_max_size, .maxlen = sizeof(int), .mode = 0644, .proc_handler = &pipe_proc_fn, .extra1 = &pipe_min_size, }, ... int pipe_proc_fn(struct ctl_table *table, int write, void __user *buf, size_t *lenp, loff_t *ppos) { ... ret = proc_dointvec_minmax(table, write, buf, lenp, ppos) ... and then later rounded in-place a few statements later: ... pipe_max_size = round_pipe_size(pipe_max_size); ... This leaves a window of time between initial assignment and rounding that may be visible to other threads. (For example, one thread sets a non-rounded value to pipe_max_size while another reads its value.) 
Similar reads of pipe_max_size are potentially racy: pipe.c :: alloc_pipe_info() pipe.c :: pipe_set_size() Add a new proc_dopipe_max_size() that consolidates reading the new value from the user buffer, verifying bounds, and calling round_pipe_size() with a single assignment to pipe_max_size. Link: http://lkml.kernel.org/r/1507658689-11669-4-git-send-email-joe.lawrence@redhat.com Signed-off-by: Joe Lawrence Reported-by: Mikulas Patocka Reviewed-by: Mikulas Patocka Cc: Al Viro Cc: Jens Axboe Cc: Michael Kerrisk Cc: Randy Dunlap Cc: Josh Poimboeuf Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- fs/pipe.c | 18 +++-------------- include/linux/pipe_fs_i.h | 1 + include/linux/sysctl.h | 3 +++ kernel/sysctl.c | 49 +++++++++++++++++++++++++++++++++++++++++++++++ 4 files changed, 56 insertions(+), 15 deletions(-) (limited to 'fs') diff --git a/fs/pipe.c b/fs/pipe.c index f0f4ab36c444..6d98566201ef 100644 --- a/fs/pipe.c +++ b/fs/pipe.c @@ -1020,7 +1020,7 @@ const struct file_operations pipefifo_fops = { * Currently we rely on the pipe array holding a power-of-2 number * of pages. Returns 0 on error. */ -static inline unsigned int round_pipe_size(unsigned int size) +unsigned int round_pipe_size(unsigned int size) { unsigned long nr_pages; @@ -1125,25 +1125,13 @@ out_revert_acct: } /* - * This should work even if CONFIG_PROC_FS isn't set, as proc_dointvec_minmax + * This should work even if CONFIG_PROC_FS isn't set, as proc_dopipe_max_size * will return an error. 
*/ int pipe_proc_fn(struct ctl_table *table, int write, void __user *buf, size_t *lenp, loff_t *ppos) { - unsigned int rounded_pipe_max_size; - int ret; - - ret = proc_douintvec_minmax(table, write, buf, lenp, ppos); - if (ret < 0 || !write) - return ret; - - rounded_pipe_max_size = round_pipe_size(pipe_max_size); - if (rounded_pipe_max_size == 0) - return -EINVAL; - - pipe_max_size = rounded_pipe_max_size; - return ret; + return proc_dopipe_max_size(table, write, buf, lenp, ppos); } /* diff --git a/include/linux/pipe_fs_i.h b/include/linux/pipe_fs_i.h index 6a80cfc63e0c..2dc5e9870fcd 100644 --- a/include/linux/pipe_fs_i.h +++ b/include/linux/pipe_fs_i.h @@ -191,5 +191,6 @@ long pipe_fcntl(struct file *, unsigned int, unsigned long arg); struct pipe_inode_info *get_pipe_info(struct file *file); int create_pipe_files(struct file **, int); +unsigned int round_pipe_size(unsigned int size); #endif diff --git a/include/linux/sysctl.h b/include/linux/sysctl.h index b769ecfcc3bd..992bc9948232 100644 --- a/include/linux/sysctl.h +++ b/include/linux/sysctl.h @@ -51,6 +51,9 @@ extern int proc_dointvec_minmax(struct ctl_table *, int, extern int proc_douintvec_minmax(struct ctl_table *table, int write, void __user *buffer, size_t *lenp, loff_t *ppos); +extern int proc_dopipe_max_size(struct ctl_table *table, int write, + void __user *buffer, size_t *lenp, + loff_t *ppos); extern int proc_dointvec_jiffies(struct ctl_table *, int, void __user *, size_t *, loff_t *); extern int proc_dointvec_userhz_jiffies(struct ctl_table *, int, diff --git a/kernel/sysctl.c b/kernel/sysctl.c index 2d42183b4c98..138b6484f277 100644 --- a/kernel/sysctl.c +++ b/kernel/sysctl.c @@ -66,6 +66,7 @@ #include #include #include +#include #include #include @@ -2620,6 +2621,47 @@ int proc_douintvec_minmax(struct ctl_table *table, int write, do_proc_douintvec_minmax_conv, &param); } +struct do_proc_dopipe_max_size_conv_param { + unsigned int *min; +}; + +static int do_proc_dopipe_max_size_conv(unsigned long 
*lvalp, + unsigned int *valp, + int write, void *data) +{ + struct do_proc_dopipe_max_size_conv_param *param = data; + + if (write) { + unsigned int val = round_pipe_size(*lvalp); + + if (val == 0) + return -EINVAL; + + if (param->min && *param->min > val) + return -ERANGE; + + if (*lvalp > UINT_MAX) + return -EINVAL; + + *valp = val; + } else { + unsigned int val = *valp; + *lvalp = (unsigned long) val; + } + + return 0; +} + +int proc_dopipe_max_size(struct ctl_table *table, int write, + void __user *buffer, size_t *lenp, loff_t *ppos) +{ + struct do_proc_dopipe_max_size_conv_param param = { + .min = (unsigned int *) table->extra1, + }; + return do_proc_douintvec(table, write, buffer, lenp, ppos, + do_proc_dopipe_max_size_conv, &param); +} + static void validate_coredump_safety(void) { #ifdef CONFIG_COREDUMP @@ -3125,6 +3167,12 @@ int proc_douintvec_minmax(struct ctl_table *table, int write, return -ENOSYS; } +int proc_dopipe_max_size(struct ctl_table *table, int write, + void __user *buffer, size_t *lenp, loff_t *ppos) +{ + return -ENOSYS; +} + int proc_dointvec_jiffies(struct ctl_table *table, int write, void __user *buffer, size_t *lenp, loff_t *ppos) { @@ -3168,6 +3216,7 @@ EXPORT_SYMBOL(proc_douintvec); EXPORT_SYMBOL(proc_dointvec_jiffies); EXPORT_SYMBOL(proc_dointvec_minmax); EXPORT_SYMBOL_GPL(proc_douintvec_minmax); +EXPORT_SYMBOL_GPL(proc_dopipe_max_size); EXPORT_SYMBOL(proc_dointvec_userhz_jiffies); EXPORT_SYMBOL(proc_dointvec_ms_jiffies); EXPORT_SYMBOL(proc_dostring); -- cgit v1.2.3 From 7554e9c4cfa208acf3164a86c05aaa967b043425 Mon Sep 17 00:00:00 2001 From: Kees Cook Date: Fri, 17 Nov 2017 15:29:32 -0800 Subject: fs/nilfs2: convert timers to use timer_setup() In preparation for unconditionally passing the struct timer_list pointer to all timer callbacks, switch to using the new timer_setup() and from_timer() to pass the timer pointer explicitly.
This requires adding a pointer to hold the timer's target task, as the lifetime of sc_task doesn't appear to match the timer's task. Link: http://lkml.kernel.org/r/20171016235900.GA102729@beast Signed-off-by: Kees Cook Acked-by: Ryusuke Konishi Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- fs/nilfs2/segment.c | 11 +++++------ fs/nilfs2/segment.h | 1 + 2 files changed, 6 insertions(+), 6 deletions(-) (limited to 'fs') diff --git a/fs/nilfs2/segment.c b/fs/nilfs2/segment.c index f65392fecb5c..472f0b53a724 100644 --- a/fs/nilfs2/segment.c +++ b/fs/nilfs2/segment.c @@ -2400,11 +2400,11 @@ static int nilfs_segctor_construct(struct nilfs_sc_info *sci, int mode) return err; } -static void nilfs_construction_timeout(unsigned long data) +static void nilfs_construction_timeout(struct timer_list *t) { - struct task_struct *p = (struct task_struct *)data; + struct nilfs_sc_info *sci = from_timer(sci, t, sc_timer); - wake_up_process(p); + wake_up_process(sci->sc_timer_task); } static void @@ -2542,8 +2542,7 @@ static int nilfs_segctor_thread(void *arg) struct the_nilfs *nilfs = sci->sc_super->s_fs_info; int timeout = 0; - sci->sc_timer.data = (unsigned long)current; - sci->sc_timer.function = nilfs_construction_timeout; + sci->sc_timer_task = current; /* start sync. 
*/ sci->sc_task = current; @@ -2674,7 +2673,7 @@ static struct nilfs_sc_info *nilfs_segctor_new(struct super_block *sb, INIT_LIST_HEAD(&sci->sc_gc_inodes); INIT_LIST_HEAD(&sci->sc_iput_queue); INIT_WORK(&sci->sc_iput_work, nilfs_iput_work_func); - init_timer(&sci->sc_timer); + timer_setup(&sci->sc_timer, nilfs_construction_timeout, 0); sci->sc_interval = HZ * NILFS_SC_DEFAULT_TIMEOUT; sci->sc_mjcp_freq = HZ * NILFS_SC_DEFAULT_SR_FREQ; diff --git a/fs/nilfs2/segment.h b/fs/nilfs2/segment.h index 1060949d7dd2..84084a4d9b3e 100644 --- a/fs/nilfs2/segment.h +++ b/fs/nilfs2/segment.h @@ -180,6 +180,7 @@ struct nilfs_sc_info { unsigned long sc_watermark; struct timer_list sc_timer; + struct task_struct *sc_timer_task; struct task_struct *sc_task; }; -- cgit v1.2.3 From 31ccb1f7ba3cfe29631587d451cf5bb8ab593550 Mon Sep 17 00:00:00 2001 From: Andreas Rohner Date: Fri, 17 Nov 2017 15:29:35 -0800 Subject: nilfs2: fix race condition that causes file system corruption There is a race condition between nilfs_dirty_inode() and nilfs_set_file_dirty(). When a file is opened, nilfs_dirty_inode() is called to update the access timestamp in the inode. It calls __nilfs_mark_inode_dirty() in a separate transaction. __nilfs_mark_inode_dirty() caches the ifile buffer_head in the i_bh field of the inode info structure and marks it as dirty. After some data was written to the file in another transaction, the function nilfs_set_file_dirty() is called, which adds the inode to the ns_dirty_files list. Then the segment construction calls nilfs_segctor_collect_dirty_files(), which goes through the ns_dirty_files list and checks the i_bh field. If there is a cached buffer_head in i_bh it is not marked as dirty again. Since nilfs_dirty_inode() and nilfs_set_file_dirty() use separate transactions, it is possible that a segment construction that writes out the ifile occurs in-between the two. 
If this happens the inode is not on the ns_dirty_files list, but its ifile block is still marked as dirty and written out. In the next segment construction, the data for the file is written out and nilfs_bmap_propagate() updates the b-tree. Eventually the bmap root is written into the i_bh block, which is not dirty, because it was written out in another segment construction. As a result the bmap update can be lost, which leads to file system corruption. Either the virtual block address points to an unallocated DAT block, or the DAT entry will be reused for something different. The error can remain undetected for a long time. A typical error message would be one of the "bad btree" errors or a warning that a DAT entry could not be found. This bug can be reproduced reliably by a simple benchmark that creates and overwrites millions of 4k files. Link: http://lkml.kernel.org/r/1509367935-3086-2-git-send-email-konishi.ryusuke@lab.ntt.co.jp Signed-off-by: Andreas Rohner Signed-off-by: Ryusuke Konishi Tested-by: Andreas Rohner Tested-by: Ryusuke Konishi Cc: Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- fs/nilfs2/segment.c | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) (limited to 'fs') diff --git a/fs/nilfs2/segment.c b/fs/nilfs2/segment.c index 472f0b53a724..f572538dcc4f 100644 --- a/fs/nilfs2/segment.c +++ b/fs/nilfs2/segment.c @@ -1954,8 +1954,6 @@ static int nilfs_segctor_collect_dirty_files(struct nilfs_sc_info *sci, err, ii->vfs_inode.i_ino); return err; } - mark_buffer_dirty(ibh); - nilfs_mdt_mark_dirty(ifile); spin_lock(&nilfs->ns_inode_lock); if (likely(!ii->i_bh)) ii->i_bh = ibh; @@ -1964,6 +1962,10 @@ static int nilfs_segctor_collect_dirty_files(struct nilfs_sc_info *sci, goto retry; } + // Always redirty the buffer to avoid race condition + mark_buffer_dirty(ii->i_bh); + nilfs_mdt_mark_dirty(ifile); + clear_bit(NILFS_I_QUEUED, &ii->i_state); set_bit(NILFS_I_BUSY, &ii->i_state); list_move_tail(&ii->i_dirty, &sci->sc_dirty_files); -- 
cgit v1.2.3 From d4f0284a5969fd7809ec8df710eb10598b701638 Mon Sep 17 00:00:00 2001 From: Elena Reshetova Date: Fri, 17 Nov 2017 15:29:39 -0800 Subject: fs, nilfs: convert nilfs_root.count from atomic_t to refcount_t atomic_t variables are currently used to implement reference counters with the following properties: - counter is initialized to 1 using atomic_set() - a resource is freed upon counter reaching zero - once counter reaches zero, its further increments aren't allowed - counter schema uses basic atomic operations (set, inc, inc_not_zero, dec_and_test, etc.) Such atomic variables should be converted to a newly provided refcount_t type and API that prevents accidental counter overflows and underflows. This is important since overflows and underflows can lead to use-after-free situation and be exploitable. The variable nilfs_root.count is used as pure reference counter. Convert it to refcount_t and fix up the operations. Link: http://lkml.kernel.org/r/1509367935-3086-3-git-send-email-konishi.ryusuke@lab.ntt.co.jp Signed-off-by: Elena Reshetova Signed-off-by: Ryusuke Konishi Suggested-by: Kees Cook Reviewed-by: David Windsor Reviewed-by: Hans Liljestrand Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- fs/nilfs2/the_nilfs.c | 8 ++++---- fs/nilfs2/the_nilfs.h | 5 +++-- 2 files changed, 7 insertions(+), 6 deletions(-) (limited to 'fs') diff --git a/fs/nilfs2/the_nilfs.c b/fs/nilfs2/the_nilfs.c index 2dd75bf619ad..afebb5067cec 100644 --- a/fs/nilfs2/the_nilfs.c +++ b/fs/nilfs2/the_nilfs.c @@ -737,7 +737,7 @@ struct nilfs_root *nilfs_lookup_root(struct the_nilfs *nilfs, __u64 cno) } else if (cno > root->cno) { n = n->rb_right; } else { - atomic_inc(&root->count); + refcount_inc(&root->count); spin_unlock(&nilfs->ns_cptree_lock); return root; } @@ -776,7 +776,7 @@ nilfs_find_or_create_root(struct the_nilfs *nilfs, __u64 cno) } else if (cno > root->cno) { p = &(*p)->rb_right; } else { - atomic_inc(&root->count); + refcount_inc(&root->count); 
spin_unlock(&nilfs->ns_cptree_lock); kfree(new); return root; @@ -786,7 +786,7 @@ nilfs_find_or_create_root(struct the_nilfs *nilfs, __u64 cno) new->cno = cno; new->ifile = NULL; new->nilfs = nilfs; - atomic_set(&new->count, 1); + refcount_set(&new->count, 1); atomic64_set(&new->inodes_count, 0); atomic64_set(&new->blocks_count, 0); @@ -806,7 +806,7 @@ nilfs_find_or_create_root(struct the_nilfs *nilfs, __u64 cno) void nilfs_put_root(struct nilfs_root *root) { - if (atomic_dec_and_test(&root->count)) { + if (refcount_dec_and_test(&root->count)) { struct the_nilfs *nilfs = root->nilfs; nilfs_sysfs_delete_snapshot_group(root); diff --git a/fs/nilfs2/the_nilfs.h b/fs/nilfs2/the_nilfs.h index b305c6f033e7..883d732b0259 100644 --- a/fs/nilfs2/the_nilfs.h +++ b/fs/nilfs2/the_nilfs.h @@ -27,6 +27,7 @@ #include #include #include +#include struct nilfs_sc_info; struct nilfs_sysfs_dev_subgroups; @@ -246,7 +247,7 @@ struct nilfs_root { __u64 cno; struct rb_node rb_node; - atomic_t count; + refcount_t count; struct the_nilfs *nilfs; struct inode *ifile; @@ -299,7 +300,7 @@ void nilfs_swap_super_block(struct the_nilfs *); static inline void nilfs_get_root(struct nilfs_root *root) { - atomic_inc(&root->count); + refcount_inc(&root->count); } static inline int nilfs_valid_fs(struct the_nilfs *nilfs) -- cgit v1.2.3 From 4d685f930a53632ff6b86efe43b95637006371fe Mon Sep 17 00:00:00 2001 From: Ryusuke Konishi Date: Fri, 17 Nov 2017 15:29:43 -0800 Subject: nilfs2: align block comments of nilfs_sufile_truncate_range() at * Fix the following checkpatch warning: WARNING: Block comments should align the * on each line #633: FILE: sufile.c:633: +/** + * nilfs_sufile_truncate_range - truncate range of segment array Link: http://lkml.kernel.org/r/1509367935-3086-4-git-send-email-konishi.ryusuke@lab.ntt.co.jp Signed-off-by: Ryusuke Konishi Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- fs/nilfs2/sufile.c | 32 ++++++++++++++++---------------- 1 file changed, 16 insertions(+), 16 
deletions(-) (limited to 'fs') diff --git a/fs/nilfs2/sufile.c b/fs/nilfs2/sufile.c index 1541a1e9221a..1341a41e7b43 100644 --- a/fs/nilfs2/sufile.c +++ b/fs/nilfs2/sufile.c @@ -630,22 +630,22 @@ void nilfs_sufile_do_set_error(struct inode *sufile, __u64 segnum, } /** - * nilfs_sufile_truncate_range - truncate range of segment array - * @sufile: inode of segment usage file - * @start: start segment number (inclusive) - * @end: end segment number (inclusive) - * - * Return Value: On success, 0 is returned. On error, one of the - * following negative error codes is returned. - * - * %-EIO - I/O error. - * - * %-ENOMEM - Insufficient amount of memory available. - * - * %-EINVAL - Invalid number of segments specified - * - * %-EBUSY - Dirty or active segments are present in the range - */ + * nilfs_sufile_truncate_range - truncate range of segment array + * @sufile: inode of segment usage file + * @start: start segment number (inclusive) + * @end: end segment number (inclusive) + * + * Return Value: On success, 0 is returned. On error, one of the + * following negative error codes is returned. + * + * %-EIO - I/O error. + * + * %-ENOMEM - Insufficient amount of memory available. 
+ * + * %-EINVAL - Invalid number of segments specified + * + * %-EBUSY - Dirty or active segments are present in the range + */ static int nilfs_sufile_truncate_range(struct inode *sufile, __u64 start, __u64 end) { -- cgit v1.2.3 From 3147db8938c7968b7be07f9b87510e334fe42ce1 Mon Sep 17 00:00:00 2001 From: Ryusuke Konishi Date: Fri, 17 Nov 2017 15:29:46 -0800 Subject: nilfs2: use octal for unreadable permission macro Replace S_IRWXUGO with 0777 because symbolic permissions are considered harmful: https://lwn.net/Articles/696229/ Link: http://lkml.kernel.org/r/1509367935-3086-5-git-send-email-konishi.ryusuke@lab.ntt.co.jp Signed-off-by: Ryusuke Konishi Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- fs/nilfs2/namei.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'fs') diff --git a/fs/nilfs2/namei.c b/fs/nilfs2/namei.c index 515d13c196da..1a2894aa0194 100644 --- a/fs/nilfs2/namei.c +++ b/fs/nilfs2/namei.c @@ -150,7 +150,7 @@ static int nilfs_symlink(struct inode *dir, struct dentry *dentry, if (err) return err; - inode = nilfs_new_inode(dir, S_IFLNK | S_IRWXUGO); + inode = nilfs_new_inode(dir, S_IFLNK | 0777); err = PTR_ERR(inode); if (IS_ERR(inode)) goto out; -- cgit v1.2.3 From 577753cc57b19949b7ce0fc848c669d37e448c20 Mon Sep 17 00:00:00 2001 From: Jeff Layton Date: Fri, 17 Nov 2017 15:29:50 -0800 Subject: nilfs2: remove inode->i_version initialization It's never used in nilfs2. 
Link: http://lkml.kernel.org/r/1510064486-1728-2-git-send-email-konishi.ryusuke@lab.ntt.co.jp Signed-off-by: Jeff Layton Signed-off-by: Ryusuke Konishi Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- fs/nilfs2/super.c | 1 - 1 file changed, 1 deletion(-) (limited to 'fs') diff --git a/fs/nilfs2/super.c b/fs/nilfs2/super.c index 4fc018dfcfae..3ce20cd44a20 100644 --- a/fs/nilfs2/super.c +++ b/fs/nilfs2/super.c @@ -160,7 +160,6 @@ struct inode *nilfs_alloc_inode(struct super_block *sb) ii->i_bh = NULL; ii->i_state = 0; ii->i_cno = 0; - ii->vfs_inode.i_version = 1; nilfs_mapping_init(&ii->i_btnode_cache, &ii->vfs_inode); return &ii->vfs_inode; } -- cgit v1.2.3 From 15ec37185ec66b8e199188bf8df3e7baf50ef77d Mon Sep 17 00:00:00 2001 From: Christos Gkekas Date: Fri, 17 Nov 2017 15:29:54 -0800 Subject: hfs/hfsplus: clean up unused variables in bnode.c Delete variables 'tree' and 'sb', which are set but never used. Link: http://lkml.kernel.org/r/1507977146-15875-1-git-send-email-chris.gekas@gmail.com Signed-off-by: Christos Gkekas Cc: Christoph Hellwig Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- fs/hfs/bnode.c | 4 ---- fs/hfsplus/bnode.c | 4 ---- 2 files changed, 8 deletions(-) (limited to 'fs') diff --git a/fs/hfs/bnode.c b/fs/hfs/bnode.c index 8aec5e732abf..b63a4df7327b 100644 --- a/fs/hfs/bnode.c +++ b/fs/hfs/bnode.c @@ -98,13 +98,11 @@ void hfs_bnode_clear(struct hfs_bnode *node, int off, int len) void hfs_bnode_copy(struct hfs_bnode *dst_node, int dst, struct hfs_bnode *src_node, int src, int len) { - struct hfs_btree *tree; struct page *src_page, *dst_page; hfs_dbg(BNODE_MOD, "copybytes: %u,%u,%u\n", dst, src, len); if (!len) return; - tree = src_node->tree; src += src_node->page_offset; dst += dst_node->page_offset; src_page = src_node->page[0]; @@ -237,7 +235,6 @@ struct hfs_bnode *hfs_bnode_findhash(struct hfs_btree *tree, u32 cnid) static struct hfs_bnode *__hfs_bnode_create(struct hfs_btree *tree, u32 cnid) { - struct super_block 
*sb; struct hfs_bnode *node, *node2; struct address_space *mapping; struct page *page; @@ -249,7 +246,6 @@ static struct hfs_bnode *__hfs_bnode_create(struct hfs_btree *tree, u32 cnid) return NULL; } - sb = tree->inode->i_sb; size = sizeof(struct hfs_bnode) + tree->pages_per_bnode * sizeof(struct page *); node = kzalloc(size, GFP_KERNEL); diff --git a/fs/hfsplus/bnode.c b/fs/hfsplus/bnode.c index d77015c3f22c..177fae4e6581 100644 --- a/fs/hfsplus/bnode.c +++ b/fs/hfsplus/bnode.c @@ -127,14 +127,12 @@ void hfs_bnode_clear(struct hfs_bnode *node, int off, int len) void hfs_bnode_copy(struct hfs_bnode *dst_node, int dst, struct hfs_bnode *src_node, int src, int len) { - struct hfs_btree *tree; struct page **src_page, **dst_page; int l; hfs_dbg(BNODE_MOD, "copybytes: %u,%u,%u\n", dst, src, len); if (!len) return; - tree = src_node->tree; src += src_node->page_offset; dst += dst_node->page_offset; src_page = src_node->page + (src >> PAGE_SHIFT); @@ -401,7 +399,6 @@ struct hfs_bnode *hfs_bnode_findhash(struct hfs_btree *tree, u32 cnid) static struct hfs_bnode *__hfs_bnode_create(struct hfs_btree *tree, u32 cnid) { - struct super_block *sb; struct hfs_bnode *node, *node2; struct address_space *mapping; struct page *page; @@ -414,7 +411,6 @@ static struct hfs_bnode *__hfs_bnode_create(struct hfs_btree *tree, u32 cnid) return NULL; } - sb = tree->inode->i_sb; size = sizeof(struct hfs_bnode) + tree->pages_per_bnode * sizeof(struct page *); node = kzalloc(size, GFP_KERNEL); -- cgit v1.2.3 From eecd7f4f5b9c2021dbde0a361b365f5970db52aa Mon Sep 17 00:00:00 2001 From: Colin Ian King Date: Fri, 17 Nov 2017 15:29:57 -0800 Subject: fat: remove redundant assignment of 0 to slots The variable slots is being assigned a value of zero that is never read, slots is being updated again a few lines later. Remove this redundant assignment. 
Cleans clang warning: Value stored to 'slots' is never read Link: http://lkml.kernel.org/r/20171017140258.22536-1-colin.king@canonical.com Signed-off-by: Colin Ian King Acked-by: OGAWA Hirofumi Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- fs/fat/dir.c | 1 - 1 file changed, 1 deletion(-) (limited to 'fs') diff --git a/fs/fat/dir.c b/fs/fat/dir.c index 81cecbe6d7cf..b833ffeee1e1 100644 --- a/fs/fat/dir.c +++ b/fs/fat/dir.c @@ -291,7 +291,6 @@ static int fat_parse_long(struct inode *dir, loff_t *pos, } } parse_long: - slots = 0; ds = (struct msdos_dir_slot *)*de; id = ds->id; if (!(id & 0x40)) -- cgit v1.2.3 From 95846ecf9dac5089aed4b144d912225f8ef86ae4 Mon Sep 17 00:00:00 2001 From: Gargi Sharma Date: Fri, 17 Nov 2017 15:30:30 -0800 Subject: pid: replace pid bitmap implementation with IDR API Patch series "Replacing PID bitmap implementation with IDR API", v4. This series replaces kernel bitmap implementation of PID allocation with IDR API. These patches are written to simplify the kernel by replacing custom code with calls to generic code. The following are the stats for pid and pid_namespace object files before and after the replacement. There is a noteworthy change between the IDR and bitmap implementation. Before text data bss dec hex filename 8447 3894 64 12405 3075 kernel/pid.o After text data bss dec hex filename 3397 304 0 3701 e75 kernel/pid.o Before text data bss dec hex filename 5692 1842 192 7726 1e2e kernel/pid_namespace.o After text data bss dec hex filename 2854 216 16 3086 c0e kernel/pid_namespace.o The following are the stats for ps, pstree and calling readdir on /proc for 10,000 processes. 
ps: With IDR API With bitmap real 0m1.479s 0m2.319s user 0m0.070s 0m0.060s sys 0m0.289s 0m0.516s pstree: With IDR API With bitmap real 0m1.024s 0m1.794s user 0m0.348s 0m0.612s sys 0m0.184s 0m0.264s proc: With IDR API With bitmap real 0m0.059s 0m0.074s user 0m0.000s 0m0.004s sys 0m0.016s 0m0.016s This patch (of 2): Replace the current bitmap implementation for Process ID allocation. Functions that are no longer required, for example, free_pidmap(), alloc_pidmap(), etc. are removed. The rest of the functions are modified to use the IDR API. The change was made to make the PID allocation less complex by replacing custom code with calls to generic API. [gs051095@gmail.com: v6] Link: http://lkml.kernel.org/r/1507760379-21662-2-git-send-email-gs051095@gmail.com [avagin@openvz.org: restore the old behaviour of the ns_last_pid sysctl] Link: http://lkml.kernel.org/r/20171106183144.16368-1-avagin@openvz.org Link: http://lkml.kernel.org/r/1507583624-22146-2-git-send-email-gs051095@gmail.com Signed-off-by: Gargi Sharma Reviewed-by: Rik van Riel Acked-by: Oleg Nesterov Cc: Julia Lawall Cc: Ingo Molnar Cc: Pavel Tatashin Cc: Kirill Tkhai Cc: Eric W. 
Biederman Cc: Christoph Hellwig Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- arch/powerpc/platforms/cell/spufs/sched.c | 2 +- fs/proc/loadavg.c | 2 +- include/linux/pid_namespace.h | 14 +-- init/main.c | 2 +- kernel/pid.c | 201 ++++++------------------------ kernel/pid_namespace.c | 53 ++++---- 6 files changed, 65 insertions(+), 209 deletions(-) (limited to 'fs') diff --git a/arch/powerpc/platforms/cell/spufs/sched.c b/arch/powerpc/platforms/cell/spufs/sched.c index 1fbb5da17dd2..e47761cdcb98 100644 --- a/arch/powerpc/platforms/cell/spufs/sched.c +++ b/arch/powerpc/platforms/cell/spufs/sched.c @@ -1093,7 +1093,7 @@ static int show_spu_loadavg(struct seq_file *s, void *private) LOAD_INT(c), LOAD_FRAC(c), count_active_contexts(), atomic_read(&nr_spu_contexts), - task_active_pid_ns(current)->last_pid); + idr_get_cursor(&task_active_pid_ns(current)->idr)); return 0; } diff --git a/fs/proc/loadavg.c b/fs/proc/loadavg.c index 9bc5c58c00ee..a000d7547479 100644 --- a/fs/proc/loadavg.c +++ b/fs/proc/loadavg.c @@ -24,7 +24,7 @@ static int loadavg_proc_show(struct seq_file *m, void *v) LOAD_INT(avnrun[1]), LOAD_FRAC(avnrun[1]), LOAD_INT(avnrun[2]), LOAD_FRAC(avnrun[2]), nr_running(), nr_threads, - task_active_pid_ns(current)->last_pid); + idr_get_cursor(&task_active_pid_ns(current)->idr)); return 0; } diff --git a/include/linux/pid_namespace.h b/include/linux/pid_namespace.h index c78af6061644..92c6aa509d2e 100644 --- a/include/linux/pid_namespace.h +++ b/include/linux/pid_namespace.h @@ -10,15 +10,8 @@ #include #include #include +#include -struct pidmap { - atomic_t nr_free; - void *page; -}; - -#define BITS_PER_PAGE (PAGE_SIZE * 8) -#define BITS_PER_PAGE_MASK (BITS_PER_PAGE-1) -#define PIDMAP_ENTRIES ((PID_MAX_LIMIT+BITS_PER_PAGE-1)/BITS_PER_PAGE) struct fs_pin; @@ -30,9 +23,8 @@ enum { /* definitions for pid_namespace's hide_pid field */ struct pid_namespace { struct kref kref; - struct pidmap pidmap[PIDMAP_ENTRIES]; + struct idr idr; struct rcu_head rcu; 
- int last_pid; unsigned int nr_hashed; struct task_struct *child_reaper; struct kmem_cache *pid_cachep; @@ -106,6 +98,6 @@ static inline int reboot_pid_ns(struct pid_namespace *pid_ns, int cmd) extern struct pid_namespace *task_active_pid_ns(struct task_struct *tsk); void pidhash_init(void); -void pidmap_init(void); +void pid_idr_init(void); #endif /* _LINUX_PID_NS_H */ diff --git a/init/main.c b/init/main.c index 859a786f7c0a..d0cbcfc06124 100644 --- a/init/main.c +++ b/init/main.c @@ -669,7 +669,7 @@ asmlinkage __visible void __init start_kernel(void) if (late_time_init) late_time_init(); calibrate_delay(); - pidmap_init(); + pid_idr_init(); anon_vma_init(); #ifdef CONFIG_X86 if (efi_enabled(EFI_RUNTIME_SERVICES)) diff --git a/kernel/pid.c b/kernel/pid.c index 020dedbdf066..0ce59369632f 100644 --- a/kernel/pid.c +++ b/kernel/pid.c @@ -39,6 +39,7 @@ #include #include #include +#include #define pid_hashfn(nr, ns) \ hash_long((unsigned long)nr + (unsigned long)ns, pidhash_shift) @@ -53,14 +54,6 @@ int pid_max = PID_MAX_DEFAULT; int pid_max_min = RESERVED_PIDS + 1; int pid_max_max = PID_MAX_LIMIT; -static inline int mk_pid(struct pid_namespace *pid_ns, - struct pidmap *map, int off) -{ - return (map - pid_ns->pidmap)*BITS_PER_PAGE + off; -} - -#define find_next_offset(map, off) \ - find_next_zero_bit((map)->page, BITS_PER_PAGE, off) /* * PID-map pages start out as NULL, they get allocated upon @@ -70,10 +63,7 @@ static inline int mk_pid(struct pid_namespace *pid_ns, */ struct pid_namespace init_pid_ns = { .kref = KREF_INIT(2), - .pidmap = { - [ 0 ... 
PIDMAP_ENTRIES-1] = { ATOMIC_INIT(BITS_PER_PAGE), NULL } - }, - .last_pid = 0, + .idr = IDR_INIT, .nr_hashed = PIDNS_HASH_ADDING, .level = 0, .child_reaper = &init_task, @@ -101,138 +91,6 @@ EXPORT_SYMBOL_GPL(init_pid_ns); static __cacheline_aligned_in_smp DEFINE_SPINLOCK(pidmap_lock); -static void free_pidmap(struct upid *upid) -{ - int nr = upid->nr; - struct pidmap *map = upid->ns->pidmap + nr / BITS_PER_PAGE; - int offset = nr & BITS_PER_PAGE_MASK; - - clear_bit(offset, map->page); - atomic_inc(&map->nr_free); -} - -/* - * If we started walking pids at 'base', is 'a' seen before 'b'? - */ -static int pid_before(int base, int a, int b) -{ - /* - * This is the same as saying - * - * (a - base + MAXUINT) % MAXUINT < (b - base + MAXUINT) % MAXUINT - * and that mapping orders 'a' and 'b' with respect to 'base'. - */ - return (unsigned)(a - base) < (unsigned)(b - base); -} - -/* - * We might be racing with someone else trying to set pid_ns->last_pid - * at the pid allocation time (there's also a sysctl for this, but racing - * with this one is OK, see comment in kernel/pid_namespace.c about it). - * We want the winner to have the "later" value, because if the - * "earlier" value prevails, then a pid may get reused immediately. - * - * Since pids rollover, it is not sufficient to just pick the bigger - * value. We have to consider where we started counting from. - * - * 'base' is the value of pid_ns->last_pid that we observed when - * we started looking for a pid. - * - * 'pid' is the pid that we eventually found. 
- */ -static void set_last_pid(struct pid_namespace *pid_ns, int base, int pid) -{ - int prev; - int last_write = base; - do { - prev = last_write; - last_write = cmpxchg(&pid_ns->last_pid, prev, pid); - } while ((prev != last_write) && (pid_before(base, last_write, pid))); -} - -static int alloc_pidmap(struct pid_namespace *pid_ns) -{ - int i, offset, max_scan, pid, last = pid_ns->last_pid; - struct pidmap *map; - - pid = last + 1; - if (pid >= pid_max) - pid = RESERVED_PIDS; - offset = pid & BITS_PER_PAGE_MASK; - map = &pid_ns->pidmap[pid/BITS_PER_PAGE]; - /* - * If last_pid points into the middle of the map->page we - * want to scan this bitmap block twice, the second time - * we start with offset == 0 (or RESERVED_PIDS). - */ - max_scan = DIV_ROUND_UP(pid_max, BITS_PER_PAGE) - !offset; - for (i = 0; i <= max_scan; ++i) { - if (unlikely(!map->page)) { - void *page = kzalloc(PAGE_SIZE, GFP_KERNEL); - /* - * Free the page if someone raced with us - * installing it: - */ - spin_lock_irq(&pidmap_lock); - if (!map->page) { - map->page = page; - page = NULL; - } - spin_unlock_irq(&pidmap_lock); - kfree(page); - if (unlikely(!map->page)) - return -ENOMEM; - } - if (likely(atomic_read(&map->nr_free))) { - for ( ; ; ) { - if (!test_and_set_bit(offset, map->page)) { - atomic_dec(&map->nr_free); - set_last_pid(pid_ns, last, pid); - return pid; - } - offset = find_next_offset(map, offset); - if (offset >= BITS_PER_PAGE) - break; - pid = mk_pid(pid_ns, map, offset); - if (pid >= pid_max) - break; - } - } - if (map < &pid_ns->pidmap[(pid_max-1)/BITS_PER_PAGE]) { - ++map; - offset = 0; - } else { - map = &pid_ns->pidmap[0]; - offset = RESERVED_PIDS; - if (unlikely(last == offset)) - break; - } - pid = mk_pid(pid_ns, map, offset); - } - return -EAGAIN; -} - -int next_pidmap(struct pid_namespace *pid_ns, unsigned int last) -{ - int offset; - struct pidmap *map, *end; - - if (last >= PID_MAX_LIMIT) - return -1; - - offset = (last + 1) & BITS_PER_PAGE_MASK; - map = 
&pid_ns->pidmap[(last + 1)/BITS_PER_PAGE]; - end = &pid_ns->pidmap[PIDMAP_ENTRIES]; - for (; map < end; map++, offset = 0) { - if (unlikely(!map->page)) - continue; - offset = find_next_bit((map)->page, BITS_PER_PAGE, offset); - if (offset < BITS_PER_PAGE) - return mk_pid(pid_ns, map, offset); - } - return -1; -} - void put_pid(struct pid *pid) { struct pid_namespace *ns; @@ -266,7 +124,7 @@ void free_pid(struct pid *pid) struct upid *upid = pid->numbers + i; struct pid_namespace *ns = upid->ns; hlist_del_rcu(&upid->pid_chain); - switch(--ns->nr_hashed) { + switch (--ns->nr_hashed) { case 2: case 1: /* When all that is left in the pid namespace @@ -284,12 +142,11 @@ void free_pid(struct pid *pid) schedule_work(&ns->proc_work); break; } + + idr_remove(&ns->idr, upid->nr); } spin_unlock_irqrestore(&pidmap_lock, flags); - for (i = 0; i <= pid->level; i++) - free_pidmap(pid->numbers + i); - call_rcu(&pid->rcu, delayed_put_pid); } @@ -308,8 +165,29 @@ struct pid *alloc_pid(struct pid_namespace *ns) tmp = ns; pid->level = ns->level; + for (i = ns->level; i >= 0; i--) { - nr = alloc_pidmap(tmp); + int pid_min = 1; + + idr_preload(GFP_KERNEL); + spin_lock_irq(&pidmap_lock); + + /* + * init really needs pid 1, but after reaching the maximum + * wrap back to RESERVED_PIDS + */ + if (idr_get_cursor(&tmp->idr) > RESERVED_PIDS) + pid_min = RESERVED_PIDS; + + /* + * Store a null pointer so find_pid_ns does not find + * a partially initialized PID (see below). + */ + nr = idr_alloc_cyclic(&tmp->idr, NULL, pid_min, + pid_max, GFP_ATOMIC); + spin_unlock_irq(&pidmap_lock); + idr_preload_end(); + if (nr < 0) { retval = nr; goto out_free; @@ -339,6 +217,8 @@ struct pid *alloc_pid(struct pid_namespace *ns) for ( ; upid >= pid->numbers; --upid) { hlist_add_head_rcu(&upid->pid_chain, &pid_hash[pid_hashfn(upid->nr, upid->ns)]); + /* Make the PID visible to find_pid_ns. 
*/ + idr_replace(&upid->ns->idr, pid, upid->nr); upid->ns->nr_hashed++; } spin_unlock_irq(&pidmap_lock); @@ -350,8 +230,11 @@ out_unlock: put_pid_ns(ns); out_free: + spin_lock_irq(&pidmap_lock); while (++i <= ns->level) - free_pidmap(pid->numbers + i); + idr_remove(&ns->idr, (pid->numbers + i)->nr); + + spin_unlock_irq(&pidmap_lock); kmem_cache_free(ns->pid_cachep, pid); return ERR_PTR(retval); @@ -553,16 +436,7 @@ EXPORT_SYMBOL_GPL(task_active_pid_ns); */ struct pid *find_ge_pid(int nr, struct pid_namespace *ns) { - struct pid *pid; - - do { - pid = find_pid_ns(nr, ns); - if (pid) - break; - nr = next_pidmap(ns, nr); - } while (nr > 0); - - return pid; + return idr_get_next(&ns->idr, &nr); } /* @@ -578,7 +452,7 @@ void __init pidhash_init(void) 0, 4096); } -void __init pidmap_init(void) +void __init pid_idr_init(void) { /* Verify no one has done anything silly: */ BUILD_BUG_ON(PID_MAX_LIMIT >= PIDNS_HASH_ADDING); @@ -590,10 +464,7 @@ void __init pidmap_init(void) PIDS_PER_CPU_MIN * num_possible_cpus()); pr_info("pid_max: default: %u minimum: %u\n", pid_max, pid_max_min); - init_pid_ns.pidmap[0].page = kzalloc(PAGE_SIZE, GFP_KERNEL); - /* Reserve PID 0. 
We never call free_pidmap(0) */ - set_bit(0, init_pid_ns.pidmap[0].page); - atomic_dec(&init_pid_ns.pidmap[0].nr_free); + idr_init(&init_pid_ns.idr); init_pid_ns.pid_cachep = KMEM_CACHE(pid, SLAB_HWCACHE_ALIGN | SLAB_PANIC | SLAB_ACCOUNT); diff --git a/kernel/pid_namespace.c b/kernel/pid_namespace.c index 4918314893bc..ca7c8a8823b1 100644 --- a/kernel/pid_namespace.c +++ b/kernel/pid_namespace.c @@ -21,6 +21,7 @@ #include #include #include +#include struct pid_cache { int nr_ids; @@ -98,7 +99,6 @@ static struct pid_namespace *create_pid_namespace(struct user_namespace *user_ns struct pid_namespace *ns; unsigned int level = parent_pid_ns->level + 1; struct ucounts *ucounts; - int i; int err; err = -EINVAL; @@ -117,17 +117,15 @@ static struct pid_namespace *create_pid_namespace(struct user_namespace *user_ns if (ns == NULL) goto out_dec; - ns->pidmap[0].page = kzalloc(PAGE_SIZE, GFP_KERNEL); - if (!ns->pidmap[0].page) - goto out_free; + idr_init(&ns->idr); ns->pid_cachep = create_pid_cachep(level + 1); if (ns->pid_cachep == NULL) - goto out_free_map; + goto out_free_idr; err = ns_alloc_inum(&ns->ns); if (err) - goto out_free_map; + goto out_free_idr; ns->ns.ops = &pidns_operations; kref_init(&ns->kref); @@ -138,17 +136,10 @@ static struct pid_namespace *create_pid_namespace(struct user_namespace *user_ns ns->nr_hashed = PIDNS_HASH_ADDING; INIT_WORK(&ns->proc_work, proc_cleanup_work); - set_bit(0, ns->pidmap[0].page); - atomic_set(&ns->pidmap[0].nr_free, BITS_PER_PAGE - 1); - - for (i = 1; i < PIDMAP_ENTRIES; i++) - atomic_set(&ns->pidmap[i].nr_free, BITS_PER_PAGE); - return ns; -out_free_map: - kfree(ns->pidmap[0].page); -out_free: +out_free_idr: + idr_destroy(&ns->idr); kmem_cache_free(pid_ns_cachep, ns); out_dec: dec_pid_namespaces(ucounts); @@ -168,11 +159,9 @@ static void delayed_free_pidns(struct rcu_head *p) static void destroy_pid_namespace(struct pid_namespace *ns) { - int i; - ns_free_inum(&ns->ns); - for (i = 0; i < PIDMAP_ENTRIES; i++) - 
kfree(ns->pidmap[i].page); + + idr_destroy(&ns->idr); call_rcu(&ns->rcu, delayed_free_pidns); } @@ -213,6 +202,7 @@ void zap_pid_ns_processes(struct pid_namespace *pid_ns) int rc; struct task_struct *task, *me = current; int init_pids = thread_group_leader(me) ? 1 : 2; + struct pid *pid; /* Don't allow any more processes into the pid namespace */ disable_pid_allocation(pid_ns); @@ -239,20 +229,16 @@ void zap_pid_ns_processes(struct pid_namespace *pid_ns) * maintain a tasklist for each pid namespace. * */ + rcu_read_lock(); read_lock(&tasklist_lock); - nr = next_pidmap(pid_ns, 1); - while (nr > 0) { - rcu_read_lock(); - - task = pid_task(find_vpid(nr), PIDTYPE_PID); + nr = 2; + idr_for_each_entry_continue(&pid_ns->idr, pid, nr) { + task = pid_task(pid, PIDTYPE_PID); if (task && !__fatal_signal_pending(task)) send_sig_info(SIGKILL, SEND_SIG_FORCED, task); - - rcu_read_unlock(); - - nr = next_pidmap(pid_ns, nr); } read_unlock(&tasklist_lock); + rcu_read_unlock(); /* * Reap the EXIT_ZOMBIE children we had before we ignored SIGCHLD. @@ -301,6 +287,7 @@ static int pid_ns_ctl_handler(struct ctl_table *table, int write, { struct pid_namespace *pid_ns = task_active_pid_ns(current); struct ctl_table tmp = *table; + int ret, next; if (write && !ns_capable(pid_ns->user_ns, CAP_SYS_ADMIN)) return -EPERM; @@ -311,8 +298,14 @@ static int pid_ns_ctl_handler(struct ctl_table *table, int write, * it should synchronize its usage with external means. */ - tmp.data = &pid_ns->last_pid; - return proc_dointvec_minmax(&tmp, write, buffer, lenp, ppos); + next = idr_get_cursor(&pid_ns->idr) - 1; + + tmp.data = &next; + ret = proc_dointvec_minmax(&tmp, write, buffer, lenp, ppos); + if (!ret && write) + idr_set_cursor(&pid_ns->idr, next + 1); + + return ret; } extern int pid_max; -- cgit v1.2.3