Diffstat (limited to 'fs/pipe.c')
-rw-r--r--	fs/pipe.c	155
1 file changed, 105 insertions(+), 50 deletions(-)
diff --git a/fs/pipe.c b/fs/pipe.c
index b901c8eefafd..87109e761fa5 100644
--- a/fs/pipe.c
+++ b/fs/pipe.c
@@ -270,22 +270,41 @@ static bool pipe_buf_can_merge(struct pipe_buffer *buf)
return buf->ops == &anon_pipe_buf_ops;
}
+/* Done while waiting without holding the pipe lock - thus the READ_ONCE() */
+static inline bool pipe_readable(const struct pipe_inode_info *pipe)
+{
+ unsigned int head = READ_ONCE(pipe->head);
+ unsigned int tail = READ_ONCE(pipe->tail);
+ unsigned int writers = READ_ONCE(pipe->writers);
+
+ return !pipe_empty(head, tail) || !writers;
+}
+
static ssize_t
pipe_read(struct kiocb *iocb, struct iov_iter *to)
{
size_t total_len = iov_iter_count(to);
struct file *filp = iocb->ki_filp;
struct pipe_inode_info *pipe = filp->private_data;
- int do_wakeup;
+ bool was_full;
ssize_t ret;
/* Null read succeeds. */
if (unlikely(total_len == 0))
return 0;
- do_wakeup = 0;
ret = 0;
__pipe_lock(pipe);
+
+ /*
+ * We only wake up writers if the pipe was full when we started
+ * reading in order to avoid unnecessary wakeups.
+ *
+ * But when we do wake up writers, we do so using a sync wakeup
+ * (WF_SYNC), because we want them to get going and generate more
+ * data for us.
+ */
+ was_full = pipe_full(pipe->head, pipe->tail, pipe->max_usage);
for (;;) {
unsigned int head = pipe->head;
unsigned int tail = pipe->tail;
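
The new pipe_readable() helper above runs as a wait_event_interruptible() condition without the pipe mutex held, hence the READ_ONCE() on every field it samples. It builds on the head/tail ring helpers from include/linux/pipe_fs_i.h; the sketch below is paraphrased from memory, so treat it as illustrative and check the tree this commit applies to for the authoritative definitions:

	/* Sketch of the ring-index helpers (paraphrased from
	 * include/linux/pipe_fs_i.h of this era; not verbatim).
	 */
	static inline unsigned int pipe_occupancy(unsigned int head, unsigned int tail)
	{
		/* Indices are free-running and may wrap; unsigned
		 * subtraction still yields the occupied slot count. */
		return head - tail;
	}

	static inline bool pipe_empty(unsigned int head, unsigned int tail)
	{
		return head == tail;
	}

	static inline bool pipe_full(unsigned int head, unsigned int tail,
				     unsigned int limit)
	{
		return pipe_occupancy(head, tail) >= limit;
	}
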
@@ -324,19 +343,11 @@ pipe_read(struct kiocb *iocb, struct iov_iter *to)
}
if (!buf->len) {
- bool wake;
pipe_buf_release(pipe, buf);
spin_lock_irq(&pipe->wait.lock);
tail++;
pipe->tail = tail;
- do_wakeup = 1;
- wake = head - (tail - 1) == pipe->max_usage / 2;
- if (wake)
- wake_up_locked_poll(
- &pipe->wait, EPOLLOUT | EPOLLWRNORM);
spin_unlock_irq(&pipe->wait.lock);
- if (wake)
- kill_fasync(&pipe->fasync_writers, SIGIO, POLL_OUT);
}
total_len -= chars;
if (!total_len)
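
The lines deleted above are the old eager wakeup: writers were woken mid-read as soon as the occupancy measured just before a slot's release, head - (tail - 1), dropped to half of max_usage. A worked example of that test, using hypothetical index values (the indices are free-running, so they can exceed the ring size):

	/* Worked example of the removed half-full wake test; the numbers
	 * are hypothetical. Builds and runs as a userspace program.
	 */
	#include <assert.h>

	int main(void)
	{
		unsigned int max_usage = 16;		/* ring holds 16 slots */
		unsigned int head = 4100, tail = 4092;	/* head - tail == 8 in use */

		tail++;		/* reader releases the slot at 'tail', as above */

		/* Occupancy before the release was head - (tail - 1) == 8,
		 * exactly max_usage / 2, so the old code issued the EPOLLOUT
		 * wakeup right here; the new code defers it until the pipe
		 * was actually full when the read started. */
		assert(head - (tail - 1) == max_usage / 2);
		return 0;
	}
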
@@ -347,31 +358,30 @@ pipe_read(struct kiocb *iocb, struct iov_iter *to)
if (!pipe->writers)
break;
- if (!pipe->waiting_writers) {
- /* syscall merging: Usually we must not sleep
- * if O_NONBLOCK is set, or if we got some data.
- * But if a writer sleeps in kernel space, then
- * we can wait for that data without violating POSIX.
- */
- if (ret)
- break;
- if (filp->f_flags & O_NONBLOCK) {
- ret = -EAGAIN;
- break;
- }
+ if (ret)
+ break;
+ if (filp->f_flags & O_NONBLOCK) {
+ ret = -EAGAIN;
+ break;
}
if (signal_pending(current)) {
if (!ret)
ret = -ERESTARTSYS;
break;
}
- pipe_wait(pipe);
+ __pipe_unlock(pipe);
+ if (was_full) {
+ wake_up_interruptible_sync_poll(&pipe->wait, EPOLLOUT | EPOLLWRNORM);
+ kill_fasync(&pipe->fasync_writers, SIGIO, POLL_OUT);
+ }
+ wait_event_interruptible(pipe->wait, pipe_readable(pipe));
+ __pipe_lock(pipe);
+ was_full = pipe_full(pipe->head, pipe->tail, pipe->max_usage);
}
__pipe_unlock(pipe);
- /* Signal writers asynchronously that there is more room. */
- if (do_wakeup) {
- wake_up_interruptible_poll(&pipe->wait, EPOLLOUT | EPOLLWRNORM);
+ if (was_full) {
+ wake_up_interruptible_sync_poll(&pipe->wait, EPOLLOUT | EPOLLWRNORM);
kill_fasync(&pipe->fasync_writers, SIGIO, POLL_OUT);
}
if (ret > 0)
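
The rewritten sleep path above follows one shape: unlock, wake the other side if the state observed under the lock warranted it, sleep on a lock-free condition, relock, resample. Below is a userspace analogue with POSIX threads, purely illustrative; all names (ring, ring_readable, reader_sleep) are made up here, and unlike kernel waitqueues a condvar must re-check its condition under the mutex:

	#include <pthread.h>
	#include <stdatomic.h>
	#include <stdbool.h>

	struct ring {
		pthread_mutex_t lock;		/* stands in for the pipe mutex */
		pthread_cond_t wait;		/* stands in for pipe->wait */
		_Atomic unsigned int head;	/* free-running producer index */
		_Atomic unsigned int tail;	/* free-running consumer index */
	};

	static bool ring_readable(struct ring *r)
	{
		/* Mirrors pipe_readable(); the atomic loads correspond to
		 * READ_ONCE(), though here the caller holds the mutex
		 * anyway because POSIX condvars require it. */
		return atomic_load(&r->head) != atomic_load(&r->tail);
	}

	/* The shape of the new pipe_read() sleep: drop the lock, wake the
	 * other side if the state seen while locked warranted it, then
	 * sleep until the condition holds. Caller holds r->lock. */
	static void reader_sleep(struct ring *r, bool was_full)
	{
		pthread_mutex_unlock(&r->lock);
		if (was_full)
			pthread_cond_broadcast(&r->wait);
		pthread_mutex_lock(&r->lock);
		while (!ring_readable(r))
			pthread_cond_wait(&r->wait, &r->lock);
		/* caller resamples fullness here, like was_full = pipe_full(...) */
	}

	int main(void)
	{
		struct ring r = {
			.lock = PTHREAD_MUTEX_INITIALIZER,
			.wait = PTHREAD_COND_INITIALIZER,
			.head = 1,	/* one slot occupied: readable at once */
			.tail = 0,
		};

		pthread_mutex_lock(&r.lock);
		reader_sleep(&r, true);		/* returns immediately */
		pthread_mutex_unlock(&r.lock);
		return 0;
	}
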
@@ -384,6 +394,17 @@ static inline int is_packetized(struct file *file)
return (file->f_flags & O_DIRECT) != 0;
}
+/* Done while waiting without holding the pipe lock - thus the READ_ONCE() */
+static inline bool pipe_writable(const struct pipe_inode_info *pipe)
+{
+ unsigned int head = READ_ONCE(pipe->head);
+ unsigned int tail = READ_ONCE(pipe->tail);
+ unsigned int max_usage = READ_ONCE(pipe->max_usage);
+
+ return !pipe_full(head, tail, max_usage) ||
+ !READ_ONCE(pipe->readers);
+}
+
static ssize_t
pipe_write(struct kiocb *iocb, struct iov_iter *from)
{
@@ -391,9 +412,9 @@ pipe_write(struct kiocb *iocb, struct iov_iter *from)
struct pipe_inode_info *pipe = filp->private_data;
unsigned int head;
ssize_t ret = 0;
- int do_wakeup = 0;
size_t total_len = iov_iter_count(from);
ssize_t chars;
+ bool was_empty = false;
/* Null write succeeds. */
if (unlikely(total_len == 0))
@@ -407,11 +428,21 @@ pipe_write(struct kiocb *iocb, struct iov_iter *from)
goto out;
}
+ /*
+ * Only wake up if the pipe started out empty, since
+ * otherwise there should be no readers waiting.
+ *
+ * If it wasn't empty we try to merge new data into
+ * the last buffer.
+ *
+ * That naturally merges small writes, but it also
+ * page-aligns the rest of the writes for large writes
+ * spanning multiple pages.
+ */
head = pipe->head;
-
- /* We try to merge small writes */
- chars = total_len & (PAGE_SIZE-1); /* size of the last buffer */
- if (!pipe_empty(head, pipe->tail) && chars != 0) {
+ was_empty = pipe_empty(head, pipe->tail);
+ chars = total_len & (PAGE_SIZE-1);
+ if (chars && !was_empty) {
unsigned int mask = pipe->ring_size - 1;
struct pipe_buffer *buf = &pipe->bufs[(head - 1) & mask];
int offset = buf->offset + buf->len;
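
In the hunk above, chars = total_len & (PAGE_SIZE-1) isolates the sub-page remainder of the write, and the merge path runs only when that remainder is non-zero and the pipe is not empty. A quick worked check of the arithmetic, assuming a 4096-byte PAGE_SIZE:

	#include <assert.h>

	int main(void)
	{
		unsigned long page_size = 4096;		/* assumed PAGE_SIZE */

		/* 100-byte write: entirely sub-page remainder, so the whole
		 * write can merge into the previous buffer's free tail. */
		assert((100UL & (page_size - 1)) == 100);

		/* 5000-byte write: the first 904 bytes (the remainder) are
		 * merge candidates; the 4096 bytes left then fill a fresh,
		 * page-aligned buffer. */
		assert((5000UL & (page_size - 1)) == 904);

		/* Exact page multiples have no remainder and never merge;
		 * this is the "page-aligns large writes" effect described
		 * in the comment above. */
		assert((8192UL & (page_size - 1)) == 0);
		return 0;
	}
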
@@ -426,7 +457,7 @@ pipe_write(struct kiocb *iocb, struct iov_iter *from)
ret = -EFAULT;
goto out;
}
- do_wakeup = 1;
+
buf->len += ret;
if (!iov_iter_count(from))
goto out;
@@ -471,17 +502,7 @@ pipe_write(struct kiocb *iocb, struct iov_iter *from)
}
pipe->head = head + 1;
-
- /* Always wake up, even if the copy fails. Otherwise
- * we lock up (O_NONBLOCK-)readers that sleep due to
- * syscall merging.
- * FIXME! Is this really true?
- */
- wake_up_locked_poll(
- &pipe->wait, EPOLLIN | EPOLLRDNORM);
-
spin_unlock_irq(&pipe->wait.lock);
- kill_fasync(&pipe->fasync_readers, SIGIO, POLL_IN);
/* Insert it into the buffer array */
buf = &pipe->bufs[head & mask];
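
The wakeup deleted above (with its FIXME) existed to keep a reader from sleeping forever after a failed copy. It is safe to drop because the reader's sleep is now guarded by a condition re-evaluated around every schedule. A simplified sketch, from memory, of what wait_event_interruptible(pipe->wait, pipe_readable(pipe)) expands to (see include/linux/wait.h for the real macro):

	/* Simplified sketch; the point is that the condition is tested
	 * after queueing and before every schedule(), so a wakeup between
	 * the test and the sleep cannot be lost.
	 */
	long ret = 0;
	DEFINE_WAIT(wait);
	for (;;) {
		prepare_to_wait(&pipe->wait, &wait, TASK_INTERRUPTIBLE);
		if (pipe_readable(pipe))
			break;
		if (signal_pending(current)) {
			ret = -ERESTARTSYS;
			break;
		}
		schedule();
	}
	finish_wait(&pipe->wait, &wait);
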
@@ -524,14 +545,36 @@ pipe_write(struct kiocb *iocb, struct iov_iter *from)
ret = -ERESTARTSYS;
break;
}
- pipe->waiting_writers++;
- pipe_wait(pipe);
- pipe->waiting_writers--;
+
+ /*
+ * We're going to release the pipe lock and wait for more
+ * space. We wake up any readers if necessary, and then
+ * after waiting we need to re-check whether the pipe
+ * became empty while we dropped the lock.
+ */
+ __pipe_unlock(pipe);
+ if (was_empty) {
+ wake_up_interruptible_sync_poll(&pipe->wait, EPOLLIN | EPOLLRDNORM);
+ kill_fasync(&pipe->fasync_readers, SIGIO, POLL_IN);
+ }
+ wait_event_interruptible(pipe->wait, pipe_writable(pipe));
+ __pipe_lock(pipe);
+ was_empty = pipe_empty(pipe->head, pipe->tail);
}
out:
__pipe_unlock(pipe);
- if (do_wakeup) {
- wake_up_interruptible_poll(&pipe->wait, EPOLLIN | EPOLLRDNORM);
+
+ /*
+ * If we do a wakeup event, we do a 'sync' wakeup, because we
+ * want the reader to start processing things asap, rather than
+ * leave the data pending.
+ *
+ * This is particularly important for small writes, because of
+ * how (for example) the GNU make jobserver uses small writes to
+ * wake up pending jobs.
+ */
+ if (was_empty) {
+ wake_up_interruptible_sync_poll(&pipe->wait, EPOLLIN | EPOLLRDNORM);
kill_fasync(&pipe->fasync_readers, SIGIO, POLL_IN);
}
if (ret > 0 && sb_start_write_trylock(file_inode(filp)->i_sb)) {
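
The GNU make jobserver cited above hands out job slots as single bytes in a shared pipe, so each release is a 1-byte write that should get a blocked job running immediately; that is the case the sync wakeup targets. A minimal illustration of the token protocol (simplified; the real jobserver preloads one byte per parallel slot and passes the descriptors to children via MAKEFLAGS):

	#include <stdio.h>
	#include <unistd.h>

	int main(void)
	{
		int fds[2];
		char token;

		if (pipe(fds))
			return 1;

		write(fds[1], "tt", 2);		/* two job slots available */

		read(fds[0], &token, 1);	/* acquire a slot ... */
		puts("job running");
		write(fds[1], &token, 1);	/* ... and release it: this
						 * 1-byte write is what the
						 * sync wakeup optimizes */
		return 0;
	}
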
@@ -574,12 +617,24 @@ pipe_poll(struct file *filp, poll_table *wait)
{
__poll_t mask;
struct pipe_inode_info *pipe = filp->private_data;
- unsigned int head = READ_ONCE(pipe->head);
- unsigned int tail = READ_ONCE(pipe->tail);
+ unsigned int head, tail;
+ /*
+ * Reading only -- no need to acquire the pipe lock.
+ *
+ * But because this is racy, the code has to add the
+ * entry to the poll table _first_ ..
+ */
poll_wait(filp, &pipe->wait, wait);
- /* Reading only -- no need for acquiring the semaphore. */
+ /*
+ * .. and only then can you do the racy tests. That way,
+ * if something changes and you got it wrong, the poll
+ * table entry will wake you up and fix it.
+ */
+ head = READ_ONCE(pipe->head);
+ tail = READ_ONCE(pipe->tail);
+
mask = 0;
if (filp->f_mode & FMODE_READ) {
if (!pipe_empty(head, tail))
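
From userspace, the mask computed by pipe_poll() surfaces as ordinary poll(2) readiness. A tiny check of the read side (standard POSIX behavior):

	/* An empty pipe is not readable; after one written byte, POLLIN
	 * is set -- exactly the !pipe_empty(head, tail) test above.
	 */
	#include <assert.h>
	#include <poll.h>
	#include <stdio.h>
	#include <unistd.h>

	int main(void)
	{
		int fds[2];
		struct pollfd pfd;

		if (pipe(fds))
			return 1;
		pfd.fd = fds[0];
		pfd.events = POLLIN;

		assert(poll(&pfd, 1, 0) == 0);	/* empty: no events */

		write(fds[1], "x", 1);
		assert(poll(&pfd, 1, 0) == 1 && (pfd.revents & POLLIN));
		puts("POLLIN after 1-byte write");
		return 0;
	}
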